Merge branch 'master' into pr_lazy_fpsimd_2

author: Andrei Vagin <avagin@google.com> 2020-03-06 21:12:32 -0800
committer: GitHub <noreply@github.com> 2020-03-06 21:12:32 -0800
commit: bf87da89d3c43555fd57e8f1d7aed21b6da78de4 (patch)
tree: 744ba15a2f663d64d56bf1c70bdfe4096f6a1af9 /pkg/sentry/kernel
parent: 89957c6c87b5ad5c7bac68f93d9472388db57702 (diff)
parent: ddfc7239be94fa9711df877a66a9718aabff8b96 (diff)
22 files changed, 566 insertions, 412 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index a27628c0a..beba29a09 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -91,6 +91,7 @@ go_library(
         "fs_context.go",
         "ipc_namespace.go",
         "kernel.go",
+        "kernel_opts.go",
         "kernel_state.go",
         "pending_signals.go",
         "pending_signals_list.go",
@@ -156,6 +157,7 @@ go_library(
         "//pkg/context",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/metric",
         "//pkg/refs",
@@ -166,6 +168,7 @@ go_library(
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fs/timerfd",
+        "//pkg/sentry/fsbridge",
         "//pkg/sentry/hostcpu",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel/auth",
@@ -198,6 +201,7 @@ go_library(
         "//pkg/tcpip/stack",
         "//pkg/usermem",
         "//pkg/waiter",
+        "//tools/go_marshal/marshal",
     ],
 )
 
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
index a0d35d350..8e9f200d0 100644
--- a/pkg/sentry/kernel/epoll/epoll_state.go
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() {
 		}
 	}
 
-	for it := e.waitingList.Front(); it != nil; it = it.Next() {
-		if it.id.File.Readiness(it.mask) != 0 {
-			e.waitingList.Remove(it)
-			e.readyList.PushBack(it)
-			it.curList = &e.readyList
+	for it := e.waitingList.Front(); it != nil; {
+		entry := it
+		it = it.Next()
+
+		if entry.id.File.Readiness(entry.mask) != 0 {
+			e.waitingList.Remove(entry)
+			e.readyList.PushBack(entry)
+			entry.curList = &e.readyList
 			e.Notify(waiter.EventIn)
 		}
 	}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 23b88f7a6..58001d56c 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
 	return fds, nil
 }
 
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	if minfd < 0 {
+		// Don't accept negative FDs.
+		return -1, syscall.EINVAL
+	}
+
+	// Default limit.
+	end := int32(math.MaxInt32)
+
+	// Ensure we don't get past the provided limit.
+	if limitSet := limits.FromContext(ctx); limitSet != nil {
+		lim := limitSet.Get(limits.NumberOfFiles)
+		if lim.Cur != limits.Infinity {
+			end = int32(lim.Cur)
+		}
+		if minfd >= end {
+			return -1, syscall.EMFILE
+		}
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// From f.next to find available fd.
+	fd := minfd
+	if fd < f.next {
+		fd = f.next
+	}
+	for fd < end {
+		if d, _, _ := f.get(fd); d == nil {
+			f.setVFS2(fd, file, flags)
+			if fd == f.next {
+				// Update next search start position.
+				f.next = fd + 1
+			}
+			return fd, nil
+		}
+		fd++
+	}
+	return -1, syscall.EMFILE
+}
+
 // NewFDAt sets the file reference for the given FD. If there is an active
 // reference for that FD, the ref count for that existing reference is
 // decremented.
@@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		return syscall.EBADF
 	}
 
-	f.mu.Lock()
-	defer f.mu.Unlock()
-
 	// Check the limit for the provided file.
 	if limitSet := limits.FromContext(ctx); limitSet != nil {
 		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
 	}
 
 	// Install the entry.
+	f.mu.Lock()
+	defer f.mu.Unlock()
 	f.setAll(fd, file, fileVFS2, flags)
 	return nil
 }
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 2448c1d99..47f78df9a 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -19,6 +19,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
@@ -37,10 +38,16 @@ type FSContext struct {
 	// destroyed.
 	root *fs.Dirent
 
+	// rootVFS2 is the filesystem root.
+	rootVFS2 vfs.VirtualDentry
+
 	// cwd is the current working directory. Will be nil iff the FSContext
 	// has been destroyed.
 	cwd *fs.Dirent
 
+	// cwdVFS2 is the current working directory.
+	cwdVFS2 vfs.VirtualDentry
+
 	// umask is the current file mode creation mask. When a thread using this
 	// context invokes a syscall that creates a file, bits set in umask are
 	// removed from the permissions that the file is created with.
@@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 	return &f
 }
 
+// NewFSContextVFS2 returns a new filesystem context.
+func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
+	root.IncRef()
+	cwd.IncRef()
+	f := FSContext{
+		rootVFS2: root,
+		cwdVFS2:  cwd,
+		umask:    umask,
+	}
+	f.EnableLeakCheck("kernel.FSContext")
+	return &f
+}
+
 // destroy is the destructor for an FSContext.
 //
 // This will call DecRef on both root and cwd Dirents.  If either call to
@@ -75,11 +95,17 @@ func (f *FSContext) destroy() {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.root.DecRef()
-	f.root = nil
-
-	f.cwd.DecRef()
-	f.cwd = nil
+	if VFS2Enabled {
+		f.rootVFS2.DecRef()
+		f.rootVFS2 = vfs.VirtualDentry{}
+		f.cwdVFS2.DecRef()
+		f.cwdVFS2 = vfs.VirtualDentry{}
+	} else {
+		f.root.DecRef()
+		f.root = nil
+		f.cwd.DecRef()
+		f.cwd = nil
+	}
 }
 
 // DecRef implements RefCounter.DecRef with destructor f.destroy.
@@ -93,12 +119,21 @@ func (f *FSContext) DecRef() {
 func (f *FSContext) Fork() *FSContext {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	f.cwd.IncRef()
-	f.root.IncRef()
+
+	if VFS2Enabled {
+		f.cwdVFS2.IncRef()
+		f.rootVFS2.IncRef()
+	} else {
+		f.cwd.IncRef()
+		f.root.IncRef()
+	}
+
 	return &FSContext{
-		cwd:   f.cwd,
-		root:  f.root,
-		umask: f.umask,
+		cwd:      f.cwd,
+		root:     f.root,
+		cwdVFS2:  f.cwdVFS2,
+		rootVFS2: f.rootVFS2,
+		umask:    f.umask,
 	}
 }
 
@@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext {
 func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
-	if f.cwd != nil {
-		f.cwd.IncRef()
-	}
+
+	f.cwd.IncRef()
 	return f.cwd
 }
 
+// WorkingDirectoryVFS2 returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.cwdVFS2.IncRef()
+	return f.cwdVFS2
+}
+
 // SetWorkingDirectory sets the current working directory.
 // This will take an extra reference on the Dirent.
 //
@@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetWorkingDirectoryVFS2 sets the current working directory.
+// This will take an extra reference on the VirtualDentry.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	old := f.cwdVFS2
+	f.cwdVFS2 = d
+	d.IncRef()
+	old.DecRef()
+}
+
 // RootDirectory returns the current filesystem root.
 //
 // This will return nil if called after destroy(), otherwise it will return a
@@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 	return f.root
 }
 
+// RootDirectoryVFS2 returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.rootVFS2.IncRef()
+	return f.rootVFS2
+}
+
 // SetRootDirectory sets the root directory.
 // This will take an extra reference on the Dirent.
 //
@@ -172,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
 	old.DecRef()
 }
 
+// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd.
+//
+// This is not a valid call after free.
+func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
+	if !vd.Ok() {
+		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
+	}
+
+	f.mu.Lock()
+
+	if !f.rootVFS2.Ok() {
+		f.mu.Unlock()
+		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
+	}
+
+	old := f.rootVFS2
+	vd.IncRef()
+	f.rootVFS2 = vd
+	f.mu.Unlock()
+	old.DecRef()
+}
+
 // Umask returns the current umask.
 func (f *FSContext) Umask() uint {
 	f.mu.Lock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index dcd6e91c4..1d627564f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -43,11 +43,13 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -71,6 +73,10 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 )
 
+// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow
+// easy access everywhere. To be removed once VFS2 becomes the default.
+var VFS2Enabled = false
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -105,7 +111,7 @@ type Kernel struct {
 	timekeeper                  *Timekeeper
 	tasks                       *TaskSet
 	rootUserNamespace           *auth.UserNamespace
-	networkStack                inet.Stack `state:"nosave"`
+	rootNetworkNamespace        *inet.Namespace
 	applicationCores            uint
 	useHostCores                bool
 	extraAuxv                   []arch.AuxEntry
@@ -235,6 +241,16 @@ type Kernel struct {
 	// events. This is initialized lazily on the first unimplemented
 	// syscall.
 	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
+
+	// SpecialOpts contains special kernel options.
+	SpecialOpts
+
+	// VFS keeps the filesystem state used across the kernel.
+	vfs vfs.VirtualFilesystem
+
+	// If set to true, report address space activation waits as if the task is in
+	// external wait so that the watchdog doesn't report the task stuck.
+	SleepForAddressSpaceActivation bool
 }
 
 // InitKernelArgs holds arguments to Init.
@@ -248,8 +264,9 @@ type InitKernelArgs struct {
 	// RootUserNamespace is the root user namespace.
 	RootUserNamespace *auth.UserNamespace
 
-	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
-	NetworkStack inet.Stack
+	// RootNetworkNamespace is the root network namespace. If nil, no networking
+	// will be available.
+	RootNetworkNamespace *inet.Namespace
 
 	// ApplicationCores is the number of logical CPUs visible to sandboxed
 	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
@@ -308,7 +325,10 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	k.rootUTSNamespace = args.RootUTSNamespace
 	k.rootIPCNamespace = args.RootIPCNamespace
 	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
-	k.networkStack = args.NetworkStack
+	k.rootNetworkNamespace = args.RootNetworkNamespace
+	if k.rootNetworkNamespace == nil {
+		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
+	}
 	k.applicationCores = args.ApplicationCores
 	if args.UseHostCores {
 		k.useHostCores = true
@@ -531,8 +551,6 @@ func (ts *TaskSet) unregisterEpollWaiters() {
 func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
 	loadStart := time.Now()
 
-	k.networkStack = net
-
 	initAppCores := k.applicationCores
 
 	// Load the pre-saved CPUID FeatureSet.
@@ -563,6 +581,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks)
 	log.Infof("Kernel load stats: %s", &stats)
 	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
 
+	// rootNetworkNamespace should be populated after loading the state file.
+	// Restore the root network stack.
+	k.rootNetworkNamespace.RestoreRootStack(net)
+
 	// Load the memory file's state.
 	memoryStart := time.Now()
 	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
@@ -621,7 +643,7 @@ type CreateProcessArgs struct {
 	// File is a passed host FD pointing to a file to load as the init binary.
 	//
 	// This is checked if and only if Filename is "".
-	File *fs.File
+	File fsbridge.File
 
 	// Argvv is a list of arguments.
 	Argv []string
@@ -670,6 +692,13 @@ type CreateProcessArgs struct {
 	// increment it).
 	MountNamespace *fs.MountNamespace
 
+	// MountNamespaceVFS2 optionally contains the mount namespace for this
+	// process. If nil, the init process's mount namespace is used.
+	//
+	// Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
+	// increment it).
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// ContainerID is the container that the process belongs to.
 	ContainerID string
 }
@@ -708,13 +737,26 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
 		return ctx.args.Credentials
 	case fs.CtxRoot:
 		if ctx.args.MountNamespace != nil {
-			// MountNamespace.Root() will take a reference on the root
-			// dirent for us.
+			// MountNamespace.Root() will take a reference on the root dirent for us.
 			return ctx.args.MountNamespace.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.args.MountNamespaceVFS2 == nil {
+			return nil
+		}
+		// MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
+		return ctx.args.MountNamespaceVFS2.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2 takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -754,34 +796,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	defer k.extMu.Unlock()
 	log.Infof("EXEC: %v", args.Argv)
 
-	// Grab the mount namespace.
-	mounts := args.MountNamespace
-	if mounts == nil {
-		mounts = k.GlobalInit().Leader().MountNamespace()
-		mounts.IncRef()
-	}
-
-	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
-	// Get the root directory from the MountNamespace.
-	root := mounts.Root()
-	// The call to newFSContext below will take a reference on root, so we
-	// don't need to hold this one.
-	defer root.DecRef()
-
-	// Grab the working directory.
-	remainingTraversals := uint(args.MaxSymlinkTraversals)
-	wd := root // Default.
-	if args.WorkingDirectory != "" {
-		var err error
-		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
-		if err != nil {
-			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+	var (
+		opener    fsbridge.Lookup
+		fsContext *FSContext
+		mntns     *fs.MountNamespace
+	)
+
+	if VFS2Enabled {
+		mntnsVFS2 := args.MountNamespaceVFS2
+		if mntnsVFS2 == nil {
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
 		}
-		defer wd.DecRef()
+		// Get the root directory from the MountNamespace.
+		root := args.MountNamespaceVFS2.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			pop := vfs.PathOperation{
+				Root:               root,
+				Start:              wd,
+				Path:               fspath.Parse(args.WorkingDirectory),
+				FollowFinalSymlink: true,
+			}
+			var err error
+			wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
+		fsContext = NewFSContextVFS2(root, wd, args.Umask)
+
+	} else {
+		mntns = args.MountNamespace
+		if mntns == nil {
+			mntns = k.GlobalInit().Leader().MountNamespace()
+			mntns.IncRef()
+		}
+		// Get the root directory from the MountNamespace.
+		root := mntns.Root()
+		// The call to newFSContext below will take a reference on root, so we
+		// don't need to hold this one.
+		defer root.DecRef()
+
+		// Grab the working directory.
+		remainingTraversals := args.MaxSymlinkTraversals
+		wd := root // Default.
+		if args.WorkingDirectory != "" {
+			var err error
+			wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+			if err != nil {
+				return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+			}
+			defer wd.DecRef()
+		}
+		opener = fsbridge.NewFSLookup(mntns, root, wd)
+		fsContext = newFSContext(root, wd, args.Umask)
 	}
 
+	tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+
 	// Check which file to start from.
 	switch {
 	case args.Filename != "":
@@ -802,11 +887,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	}
 
 	// Create a fresh task context.
-	remainingTraversals = uint(args.MaxSymlinkTraversals)
+	remainingTraversals := args.MaxSymlinkTraversals
 	loadArgs := loader.LoadArgs{
-		Mounts:              mounts,
-		Root:                root,
-		WorkingDirectory:    wd,
+		Opener:              opener,
 		RemainingTraversals: &remainingTraversals,
 		ResolveFinal:        true,
 		Filename:            args.Filename,
@@ -831,13 +914,15 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		Kernel:                  k,
 		ThreadGroup:             tg,
 		TaskContext:             tc,
-		FSContext:               newFSContext(root, wd, args.Umask),
+		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
+		NetworkNamespace:        k.RootNetworkNamespace(),
 		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
 		UTSNamespace:            args.UTSNamespace,
 		IPCNamespace:            args.IPCNamespace,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
+		MountNamespaceVFS2:      args.MountNamespaceVFS2,
 		ContainerID:             args.ContainerID,
 	}
 	t, err := k.tasks.NewTask(config)
@@ -1097,6 +1182,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
 	k.sendExternalSignal(info, context)
 }
 
+// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
 // SendContainerSignal sends the given signal to all processes inside the
 // namespace that match the given container ID.
 func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
@@ -1175,10 +1268,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
 	return k.rootAbstractSocketNamespace
 }
 
-// NetworkStack returns the network stack. NetworkStack may return nil if no
-// network stack is available.
-func (k *Kernel) NetworkStack() inet.Stack {
-	return k.networkStack
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
 }
 
 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
@@ -1375,8 +1467,24 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
 			return ctx.k.globalInit.mounts.Root()
 		}
 		return nil
+	case vfs.CtxRoot:
+		if ctx.k.globalInit == nil {
+			return vfs.VirtualDentry{}
+		}
+		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+		defer mntns.DecRef()
+		// Root() takes a reference on the root dirent for us.
+		return mntns.Root()
+	case vfs.CtxMountNamespace:
+		if ctx.k.globalInit == nil {
+			return nil
+		}
+		// MountNamespaceVFS2() takes a reference for us.
+		return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
 	case fs.CtxDirentCacheLimiter:
 		return ctx.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return ctx.k.RootNetworkNamespace().Stack()
 	case ktime.CtxRealtimeClock:
 		return ctx.k.RealtimeClock()
 	case limits.CtxLimits:
@@ -1420,3 +1528,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
 		Registers: t.Arch().StateData().Proto(),
 	})
 }
+
+// VFS returns the virtual filesystem for the kernel.
+func (k *Kernel) VFS() *vfs.VirtualFilesystem {
+	return &k.vfs
+}
diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/kernel_opts.go
index 4d54b8b8f..2e66ec587 100644
--- a/pkg/sentry/kernel/pipe/buffer_test.go
+++ b/pkg/sentry/kernel/kernel_opts.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,21 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package pipe
+package kernel
 
-import (
-	"testing"
-	"unsafe"
-
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func TestBufferSize(t *testing.T) {
-	bufferSize := unsafe.Sizeof(buffer{})
-	if bufferSize < usermem.PageSize {
-		t.Errorf("buffer is less than a page")
-	}
-	if bufferSize > (2 * usermem.PageSize) {
-		t.Errorf("buffer is greater than two pages")
-	}
-}
+// SpecialOpts contains non-standard options for the kernel.
+//
+// +stateify savable
+type SpecialOpts struct{}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 4c049d5b4..f29dc0472 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -1,25 +1,10 @@
 load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-go_template_instance(
-    name = "buffer_list",
-    out = "buffer_list.go",
-    package = "pipe",
-    prefix = "buffer",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*buffer",
-        "Linker": "*buffer",
-    },
-)
-
 go_library(
     name = "pipe",
     srcs = [
-        "buffer.go",
-        "buffer_list.go",
         "device.go",
         "node.go",
         "pipe.go",
@@ -33,8 +18,8 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/buffer",
         "//pkg/context",
-        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
@@ -51,7 +36,6 @@ go_test(
     name = "pipe_test",
     size = "small",
     srcs = [
-        "buffer_test.go",
         "node_test.go",
         "pipe_test.go",
     ],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
deleted file mode 100644
index fe3be5dbd..000000000
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pipe
-
-import (
-	"io"
-
-	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-// buffer encapsulates a queueable byte buffer.
-//
-// Note that the total size is slightly less than two pages. This
-// is done intentionally to ensure that the buffer object aligns
-// with runtime internals. We have no hard size or alignment
-// requirements. This two page size will effectively minimize
-// internal fragmentation, but still have a large enough chunk
-// to limit excessive segmentation.
-//
-// +stateify savable
-type buffer struct {
-	data  [8144]byte
-	read  int
-	write int
-	bufferEntry
-}
-
-// Reset resets internal data.
-//
-// This must be called before use.
-func (b *buffer) Reset() {
-	b.read = 0
-	b.write = 0
-}
-
-// Empty indicates the buffer is empty.
-//
-// This indicates there is no data left to read.
-func (b *buffer) Empty() bool {
-	return b.read == b.write
-}
-
-// Full indicates the buffer is full.
-//
-// This indicates there is no capacity left to write.
-func (b *buffer) Full() bool {
-	return b.write == len(b.data)
-}
-
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:]))
-	n, err := safemem.CopySeq(dst, srcs)
-	b.write += int(n)
-	return n, err
-}
-
-// WriteFromReader writes to the buffer from an io.Reader.
-func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) {
-	dst := b.data[b.write:]
-	if count < int64(len(dst)) {
-		dst = b.data[b.write:][:count]
-	}
-	n, err := r.Read(dst)
-	b.write += n
-	return int64(n), err
-}
-
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write]))
-	n, err := safemem.CopySeq(dsts, src)
-	b.read += int(n)
-	return n, err
-}
-
-// ReadToWriter reads from the buffer into an io.Writer.
-func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) {
-	src := b.data[b.read:b.write]
-	if count < int64(len(src)) {
-		src = b.data[b.read:][:count]
-	}
-	n, err := w.Write(src)
-	if !dup {
-		b.read += n
-	}
-	return int64(n), err
-}
-
-// bufferPool is a pool for buffers.
-var bufferPool = sync.Pool{
-	New: func() interface{} {
-		return new(buffer)
-	},
-}
-
-// newBuffer grabs a new buffer from the pool.
-func newBuffer() *buffer {
-	b := bufferPool.Get().(*buffer)
-	b.Reset()
-	return b
-}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 08410283f..725e9db7d 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -20,6 +20,7 @@ import (
 	"sync/atomic"
 	"syscall"
 
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -70,10 +71,10 @@ type Pipe struct {
 	// mu protects all pipe internal state below.
 	mu sync.Mutex `state:"nosave"`
 
-	// data is the buffer queue of pipe contents.
+	// view is the underlying set of buffers.
 	//
 	// This is protected by mu.
-	data bufferList
+	view buffer.View
 
 	// max is the maximum size of the pipe in bytes. When this max has been
 	// reached, writers will get EWOULDBLOCK.
@@ -81,11 +82,6 @@ type Pipe struct {
 	// This is protected by mu.
 	max int64
 
-	// size is the current size of the pipe in bytes.
-	//
-	// This is protected by mu.
-	size int64
-
 	// hadWriter indicates if this pipe ever had a writer. Note that this
 	// does not necessarily indicate there is *currently* a writer, just
 	// that there has been a writer at some point since the pipe was
@@ -196,7 +192,7 @@ type readOps struct {
 	limit func(int64)
 
 	// read performs the actual read operation.
-	read func(*buffer) (int64, error)
+	read func(*buffer.View) (int64, error)
 }
 
 // read reads data from the pipe into dst and returns the number of bytes
@@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	defer p.mu.Unlock()
 
 	// Is the pipe empty?
-	if p.size == 0 {
+	if p.view.Size() == 0 {
 		if !p.HasWriters() {
 			// There are no writers, return EOF.
 			return 0, nil
@@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 	}
 
 	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
+	if ops.left() > p.view.Size() {
+		ops.limit(p.view.Size())
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Pop the first buffer.
-		first := p.data.Front()
-		if first == nil {
-			break
-		}
-
-		// Copy user data.
-		n, err := ops.read(first)
-		done += int64(n)
-		p.size -= n
-
-		// Empty buffer?
-		if first.Empty() {
-			// Push to the free list.
-			p.data.Remove(first)
-			bufferPool.Put(first)
-		}
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
-}
-
-// dup duplicates all data from this pipe into the given writer.
-//
-// There is no blocking behavior implemented here. The writer may propagate
-// some blocking error. All the writes must be complete writes.
-func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Is the pipe empty?
-	if p.size == 0 {
-		if !p.HasWriters() {
-			// See above.
-			return 0, nil
-		}
-		return 0, syserror.ErrWouldBlock
-	}
-
-	// Limit how much we consume.
-	if ops.left() > p.size {
-		ops.limit(p.size)
-	}
-
-	done := int64(0)
-	for buf := p.data.Front(); buf != nil; buf = buf.Next() {
-		n, err := ops.read(buf)
-		done += n
-		if err != nil {
-			return done, err
-		}
-	}
-
-	return done, nil
+	// Copy user data; the read op is responsible for trimming.
+	done, err := ops.read(&p.view)
+	return done, err
 }
 
 type writeOps struct {
@@ -297,7 +235,7 @@ type writeOps struct {
 	limit func(int64)
 
 	// write should write to the provided buffer.
-	write func(*buffer) (int64, error)
+	write func(*buffer.View) (int64, error)
 }
 
 // write writes data from sv into the pipe and returns the number of bytes
@@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	// POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
 	// atomic, but requires no atomicity for writes larger than this.
 	wanted := ops.left()
-	if avail := p.max - p.size; wanted > avail {
+	if avail := p.max - p.view.Size(); wanted > avail {
 		if wanted <= p.atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
 	}
 
-	done := int64(0)
-	for ops.left() > 0 {
-		// Need a new buffer?
-		last := p.data.Back()
-		if last == nil || last.Full() {
-			// Add a new buffer to the data list.
-			last = newBuffer()
-			p.data.PushBack(last)
-		}
-
-		// Copy user data.
-		n, err := ops.write(last)
-		done += int64(n)
-		p.size += n
-
-		// Handle errors.
-		if err != nil {
-			return done, err
-		}
+	// Copy user data.
+	done, err := ops.write(&p.view)
+	if err != nil {
+		return done, err
 	}
+
 	if wanted > done {
 		// Partial write due to full pipe.
 		return done, syserror.ErrWouldBlock
@@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool {
 // Precondition: mu must be held.
 func (p *Pipe) rReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasReaders() && p.data.Front() != nil {
+	if p.HasReaders() && p.view.Size() != 0 {
 		ready |= waiter.EventIn
 	}
 	if !p.HasWriters() && p.hadWriter {
@@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask {
 // Precondition: mu must be held.
 func (p *Pipe) wReadinessLocked() waiter.EventMask {
 	ready := waiter.EventMask(0)
-	if p.HasWriters() && p.size < p.max {
+	if p.HasWriters() && p.view.Size() < p.max {
 		ready |= waiter.EventOut
 	}
 	if !p.HasReaders() {
@@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask {
 func (p *Pipe) queued() int64 {
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	return p.size
+	return p.view.Size()
 }
 
 // FifoSize implements fs.FifoSizer.FifoSize.
@@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) {
 	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
-	if size < p.size {
+	if size < p.view.Size() {
 		return 0, syserror.EBUSY
 	}
 	p.max = size
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 80158239e..5a1d4fd57 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -21,6 +21,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			dst = dst.TakeFirst64(l)
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := dst.CopyOutFrom(ctx, buf)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := dst.CopyOutFrom(ctx, view)
 			dst = dst.DropFirst64(n)
+			view.TrimFront(n)
 			return n, err
 		},
 	})
@@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool)
 		limit: func(l int64) {
 			count = l
 		},
-		read: func(buf *buffer) (int64, error) {
-			n, err := buf.ReadToWriter(w, count, dup)
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToWriter(w, count)
+			if !dup {
+				view.TrimFront(n)
+			}
 			count -= n
 			return n, err
 		},
 	}
-	if dup {
-		// There is no notification for dup operations.
-		return p.dup(ctx, ops)
-	}
 	n, err := p.read(ctx, ops)
 	if n > 0 {
 		p.Notify(waiter.EventOut)
@@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 		limit: func(l int64) {
 			src = src.TakeFirst64(l)
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := src.CopyInTo(ctx, buf)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := src.CopyInTo(ctx, view)
 			src = src.DropFirst64(n)
 			return n, err
 		},
@@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e
 		limit: func(l int64) {
 			count = l
 		},
-		write: func(buf *buffer) (int64, error) {
-			n, err := buf.WriteFromReader(r, count)
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromReader(r, count)
 			count -= n
 			return n, err
 		},
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index efebfd872..ded95f532 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() {
 		return
 	}
 
-	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
-	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+	var cs linux.RSeqCriticalSection
+	if err := cs.CopyIn(t, critAddr); err != nil {
 		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
 		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 		return
 	}
 
-	// Manually marshal RSeqCriticalSection as this is in the hot path when
-	// rseq is enabled. It must be as fast as possible.
-	//
-	// TODO(b/130243041): Replace with go_marshal.
-	cs := linux.RSeqCriticalSection{
-		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
-		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
-		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
-		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
-		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
-	}
-
 	if cs.Version != 0 {
 		t.Debugf("Unknown version in %+v", cs)
 		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 981e8c7fe..c0dbbe890 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -424,6 +425,11 @@ type Task struct {
 	// abstractSockets is protected by mu.
 	abstractSockets *AbstractSocketNamespace
 
+	// mountNamespaceVFS2 is the task's mount namespace.
+	//
+	// It is protected by mu. It is owned by the task goroutine.
+	mountNamespaceVFS2 *vfs.MountNamespace
+
 	// parentDeathSignal is sent to this task's thread group when its parent exits.
 	//
 	// parentDeathSignal is protected by mu.
@@ -481,13 +487,10 @@ type Task struct {
 	numaPolicy   int32
 	numaNodeMask uint64
 
-	// If netns is true, the task is in a non-root network namespace. Network
-	// namespaces aren't currently implemented in full; being in a network
-	// namespace simply prevents the task from observing any network devices
-	// (including loopback) or using abstract socket addresses (see unix(7)).
+	// netns is the task's network namespace. netns is never nil.
 	//
-	// netns is protected by mu. netns is owned by the task goroutine.
-	netns bool
+	// netns is protected by mu.
+	netns *inet.Namespace
 
 	// If rseqPreempted is true, before the next call to p.Switch(),
 	// interrupt rseq critical regions as defined by rseqAddr and
@@ -552,6 +555,13 @@ type Task struct {
 	//
 	// startTime is protected by mu.
 	startTime ktime.Time
+
+	// oomScoreAdj is the task's OOM score adjustment. This is currently not
+	// used but is maintained for consistency.
+	// TODO(gvisor.dev/issue/1967)
+	//
+	// oomScoreAdj is protected by mu, and is owned by the task goroutine.
+	oomScoreAdj int32
 }
 
 func (t *Task) savePtraceTracer() *Task {
@@ -638,6 +648,11 @@ func (t *Task) Value(key interface{}) interface{} {
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
 		return t.fsContext.RootDirectory()
+	case vfs.CtxRoot:
+		return t.fsContext.RootDirectoryVFS2()
+	case vfs.CtxMountNamespace:
+		t.mountNamespaceVFS2.IncRef()
+		return t.mountNamespaceVFS2
 	case fs.CtxDirentCacheLimiter:
 		return t.k.DirentCacheLimiter
 	case inet.CtxStack:
@@ -701,6 +716,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) IsChrooted() bool {
+	if VFS2Enabled {
+		realRoot := t.mountNamespaceVFS2.Root()
+		defer realRoot.DecRef()
+		root := t.fsContext.RootDirectoryVFS2()
+		defer root.DecRef()
+		return root != realRoot
+	}
+
 	realRoot := t.tg.mounts.Root()
 	defer realRoot.DecRef()
 	root := t.fsContext.RootDirectory()
@@ -774,6 +797,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
 	return fds[0], nil
 }
 
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
 //
 // This automatically passes the task as the context.
@@ -783,6 +815,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
 	return t.fdTable.NewFDAt(t, fd, file, flags)
 }
 
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
 // WithMuLocked executes f with t.mu locked.
 func (t *Task) WithMuLocked(f func(*Task)) {
 	t.mu.Lock()
@@ -796,6 +837,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
 	return t.tg.mounts
 }
 
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
 // AbstractSockets returns t's AbstractSocketNamespace.
 func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 	return t.abstractSockets
@@ -805,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace {
 func (t *Task) ContainerID() string {
 	return t.containerID
 }
+
+// OOMScoreAdj gets the task's OOM score adjustment.
+func (t *Task) OOMScoreAdj() (int32, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	return t.oomScoreAdj, nil
+}
+
+// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be
+// between -1000 and 1000 inclusive.
+func (t *Task) SetOOMScoreAdj(adj int32) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.ExitState() == TaskExitDead {
+		return syserror.ESRCH
+	}
+	if adj > 1000 || adj < -1000 {
+		return syserror.EINVAL
+	}
+	t.oomScoreAdj = adj
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 53d4d211b..dda502bb8 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -54,8 +55,7 @@ type SharingOptions struct {
 	NewUserNamespace bool
 
 	// If NewNetworkNamespace is true, the task should have an independent
-	// network namespace. (Note that network namespaces are not really
-	// implemented; see comment on Task.netns for details.)
+	// network namespace.
 	NewNetworkNamespace bool
 
 	// If NewFiles is true, the task should use an independent file descriptor
@@ -199,6 +199,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		ipcns = NewIPCNamespace(userns)
 	}
 
+	netns := t.NetworkNamespace()
+	if opts.NewNetworkNamespace {
+		netns = inet.NewNamespace(netns)
+	}
+
+	// TODO(b/63601033): Implement CLONE_NEWNS.
+	mntnsVFS2 := t.mountNamespaceVFS2
+	if mntnsVFS2 != nil {
+		mntnsVFS2.IncRef()
+	}
+
 	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
@@ -241,7 +252,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	rseqAddr := usermem.Addr(0)
 	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
-		tg.mounts.IncRef()
+		if tg.mounts != nil {
+			tg.mounts.IncRef()
+		}
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
@@ -251,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		rseqSignature = t.rseqSignature
 	}
 
+	adj, err := t.OOMScoreAdj()
+	if err != nil {
+		return 0, nil, err
+	}
+
 	cfg := &TaskConfig{
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
@@ -260,23 +278,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		FDTable:                 fdTable,
 		Credentials:             creds,
 		Niceness:                t.Niceness(),
-		NetworkNamespaced:       t.netns,
+		NetworkNamespace:        netns,
 		AllowedCPUMask:          t.CPUMask(),
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		MountNamespaceVFS2:      mntnsVFS2,
 		RSeqAddr:                rseqAddr,
 		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
+		OOMScoreAdj:             adj,
 	}
 	if opts.NewThreadGroup {
 		cfg.Parent = t
 	} else {
 		cfg.InheritParent = t
 	}
-	if opts.NewNetworkNamespace {
-		cfg.NetworkNamespaced = true
-	}
 	nt, err := t.tg.pidns.owner.NewTask(cfg)
 	if err != nil {
 		if opts.NewThreadGroup {
@@ -473,7 +490,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 			t.mu.Unlock()
 			return syserror.EPERM
 		}
-		t.netns = true
+		t.netns = inet.NewNamespace(t.netns)
 	}
 	if opts.NewUTSNamespace {
 		if !haveCapSysAdmin {
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2d6e7733c..0158b1788 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -136,11 +136,11 @@ func (t *Task) Stack() *arch.Stack {
 func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
 	// If File is not nil, we should load that instead of resolving Filename.
 	if args.File != nil {
-		args.Filename = args.File.MappedName(ctx)
+		args.Filename = args.File.PathnameWithDeleted(ctx)
 	}
 
 	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k)
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
 	defer m.DecUsers(ctx)
 	args.MemoryManager = m
 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 8f57a34a6..00c425cca 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
-	t.MemoryManager().Activate()
+	t.MemoryManager().Activate(t)
 
 	t.ptraceExec(oldTID)
 	return (*runSyscallExit)(nil)
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 435761e5a..c4ade6e8e 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.fsContext.DecRef()
 	t.fdTable.DecRef()
 
+	t.mu.Lock()
+	if t.mountNamespaceVFS2 != nil {
+		t.mountNamespaceVFS2.DecRef()
+		t.mountNamespaceVFS2 = nil
+	}
+	t.mu.Unlock()
+
 	// If this is the last task to exit from the thread group, release the
 	// thread group's resources.
 	if lastExiter {
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 41259210c..eeccaa197 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -32,21 +32,21 @@ const (
 // Infof logs an formatted info message by calling log.Infof.
 func (t *Task) Infof(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Info) {
-		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+		log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Warningf logs a warning string by calling log.Warningf.
 func (t *Task) Warningf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Warning) {
-		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+		log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
 // Debugf creates a debug string that includes the task ID.
 func (t *Task) Debugf(fmt string, v ...interface{}) {
 	if log.IsLogging(log.Debug) {
-		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+		log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...)
 	}
 }
 
@@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
 	if !trace.IsEnabled() {
 		return
 	}
-	d := tc.MemoryManager.Executable()
-	if d == nil {
+	file := tc.MemoryManager.Executable()
+	if file == nil {
 		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
 		return
 	}
-	defer d.DecRef()
-	root := t.fsContext.RootDirectory()
-	if root == nil {
-		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
-		return
-	}
-	defer root.DecRef()
-	n, _ := d.FullName(root)
-	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
+	defer file.DecRef()
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
 }
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
index 172a31e1d..f7711232c 100644
--- a/pkg/sentry/kernel/task_net.go
+++ b/pkg/sentry/kernel/task_net.go
@@ -22,14 +22,23 @@ import (
 func (t *Task) IsNetworkNamespaced() bool {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.netns
+	return !t.netns.IsRoot()
 }
 
 // NetworkContext returns the network stack used by the task. NetworkContext
 // may return nil if no network stack is available.
+//
+// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
+// NetworkNamespace().
 func (t *Task) NetworkContext() inet.Stack {
-	if t.IsNetworkNamespaced() {
-		return nil
-	}
-	return t.k.networkStack
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns.Stack()
+}
+
+// NetworkNamespace returns the network namespace observed by the task.
+func (t *Task) NetworkNamespace() *inet.Namespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.netns
 }
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 5568c91bc..799cbcd93 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -126,13 +126,39 @@ func (t *Task) doStop() {
 	}
 }
 
+func (*runApp) handleCPUIDInstruction(t *Task) error {
+	if len(arch.CPUIDInstruction) == 0 {
+		// CPUID emulation isn't supported, but this code can be
+		// executed, because the ptrace platform returns
+		// ErrContextSignalCPUID on page faults too. Look at
+		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
+		// details.
+		return platform.ErrContextSignal
+	}
+	// Is this a CPUID instruction?
+	region := trace.StartRegion(t.traceContext, cpuidRegion)
+	expected := arch.CPUIDInstruction[:]
+	found := make([]byte, len(expected))
+	_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+	if err == nil && bytes.Equal(expected, found) {
+		// Skip the cpuid instruction.
+		t.Arch().CPUIDEmulate(t)
+		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+		region.End()
+
+		return nil
+	}
+	region.End() // Not an actual CPUID, but required copy-in.
+	return platform.ErrContextSignal
+}
+
 // The runApp state checks for interrupts before executing untrusted
 // application code.
 //
 // +stateify savable
 type runApp struct{}
 
-func (*runApp) execute(t *Task) taskRunState {
+func (app *runApp) execute(t *Task) taskRunState {
 	if t.interrupted() {
 		// Checkpointing instructs tasks to stop by sending an interrupt, so we
 		// must check for stops before entering runInterrupt (instead of
@@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextSignalCPUID:
-		// Is this a CPUID instruction?
-		region := trace.StartRegion(t.traceContext, cpuidRegion)
-		expected := arch.CPUIDInstruction[:]
-		found := make([]byte, len(expected))
-		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
-		if err == nil && bytes.Equal(expected, found) {
-			// Skip the cpuid instruction.
-			t.Arch().CPUIDEmulate(t)
-			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
-			region.End()
-
+		if err := app.handleCPUIDInstruction(t); err == nil {
 			// Resume execution.
 			return (*runApp)(nil)
 		}
-		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index de838beef..2bbf48bb8 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -17,10 +17,12 @@ package kernel
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -64,9 +66,8 @@ type TaskConfig struct {
 	// Niceness is the niceness of the new task.
 	Niceness int
 
-	// If NetworkNamespaced is true, the new task should observe a non-root
-	// network namespace.
-	NetworkNamespaced bool
+	// NetworkNamespace is the network namespace to be used for the new task.
+	NetworkNamespace *inet.Namespace
 
 	// AllowedCPUMask contains the cpus that this task can run on.
 	AllowedCPUMask sched.CPUSet
@@ -80,6 +81,9 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// MountNamespaceVFS2 is the MountNamespace of the new task.
+	MountNamespaceVFS2 *vfs.MountNamespace
+
 	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
 	RSeqAddr usermem.Addr
 
@@ -89,6 +93,9 @@ type TaskConfig struct {
 
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
+
+	// oomScoreAdj is the task's OOM score adjustment.
+	OOMScoreAdj int32
 }
 
 // NewTask creates a new task defined by cfg.
@@ -116,28 +123,30 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 			parent:   cfg.Parent,
 			children: make(map[*Task]struct{}),
 		},
-		runState:        (*runApp)(nil),
-		interruptChan:   make(chan struct{}, 1),
-		signalMask:      cfg.SignalMask,
-		signalStack:     arch.SignalStack{Flags: arch.SignalStackFlagDisable},
-		tc:              *tc,
-		fsContext:       cfg.FSContext,
-		fdTable:         cfg.FDTable,
-		p:               cfg.Kernel.Platform.NewContext(),
-		k:               cfg.Kernel,
-		ptraceTracees:   make(map[*Task]struct{}),
-		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
-		ioUsage:         &usage.IO{},
-		niceness:        cfg.Niceness,
-		netns:           cfg.NetworkNamespaced,
-		utsns:           cfg.UTSNamespace,
-		ipcns:           cfg.IPCNamespace,
-		abstractSockets: cfg.AbstractSocketNamespace,
-		rseqCPU:         -1,
-		rseqAddr:        cfg.RSeqAddr,
-		rseqSignature:   cfg.RSeqSignature,
-		futexWaiter:     futex.NewWaiter(),
-		containerID:     cfg.ContainerID,
+		runState:           (*runApp)(nil),
+		interruptChan:      make(chan struct{}, 1),
+		signalMask:         cfg.SignalMask,
+		signalStack:        arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+		tc:                 *tc,
+		fsContext:          cfg.FSContext,
+		fdTable:            cfg.FDTable,
+		p:                  cfg.Kernel.Platform.NewContext(),
+		k:                  cfg.Kernel,
+		ptraceTracees:      make(map[*Task]struct{}),
+		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
+		ioUsage:            &usage.IO{},
+		niceness:           cfg.Niceness,
+		netns:              cfg.NetworkNamespace,
+		utsns:              cfg.UTSNamespace,
+		ipcns:              cfg.IPCNamespace,
+		abstractSockets:    cfg.AbstractSocketNamespace,
+		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
+		rseqCPU:            -1,
+		rseqAddr:           cfg.RSeqAddr,
+		rseqSignature:      cfg.RSeqSignature,
+		futexWaiter:        futex.NewWaiter(),
+		containerID:        cfg.ContainerID,
+		oomScoreAdj:        cfg.OOMScoreAdj,
 	}
 	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index 2bf3ce8a8..b02044ad2 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
 // Activate ensures that the task has an active address space.
 func (t *Task) Activate() {
 	if mm := t.MemoryManager(); mm != nil {
-		if err := mm.Activate(); err != nil {
+		if err := mm.Activate(t); err != nil {
 			panic("unable to activate mm: " + err.Error())
 		}
 	}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 768e958d2..268f62e9d 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -256,7 +256,7 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
@@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() {
 	for _, it := range its {
 		it.DestroyTimer()
 	}
-	tg.mounts.DecRef()
+	if tg.mounts != nil {
+		tg.mounts.DecRef()
+	}
 }
 
 // forEachChildThreadGroupLocked indicates over all child ThreadGroups.
author	Andrei Vagin <avagin@google.com>	2020-03-06 21:12:32 -0800
committer	GitHub <noreply@github.com>	2020-03-06 21:12:32 -0800
commit	bf87da89d3c43555fd57e8f1d7aed21b6da78de4 (patch)
tree	744ba15a2f663d64d56bf1c70bdfe4096f6a1af9 /pkg/sentry/kernel
parent	89957c6c87b5ad5c7bac68f93d9472388db57702 (diff)
parent	ddfc7239be94fa9711df877a66a9718aabff8b96 (diff)