21 files changed, 867 insertions, 164 deletions
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
new file mode 100644
index 000000000..e54e7371c
--- /dev/null
+++ b/pkg/fdchannel/BUILD
@@ -0,0 +1,17 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "fdchannel",
+    srcs = ["fdchannel_unsafe.go"],
+    importpath = "gvisor.dev/gvisor/pkg/fdchannel",
+    visibility = ["//visibility:public"],
+)
+
+go_test(
+    name = "fdchannel_test",
+    size = "small",
+    srcs = ["fdchannel_test.go"],
+    embed = [":fdchannel"],
+)
diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go
new file mode 100644
index 000000000..5d01dc636
--- /dev/null
+++ b/pkg/fdchannel/fdchannel_test.go
@@ -0,0 +1,131 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fdchannel
+
+import (
+	"io/ioutil"
+	"os"
+	"sync"
+	"syscall"
+	"testing"
+	"time"
+)
+
+func TestSendRecvFD(t *testing.T) {
+	sendFile, err := ioutil.TempFile("", "fdchannel_test_")
+	if err != nil {
+		t.Fatalf("failed to create temporary file: %v", err)
+	}
+	defer sendFile.Close()
+
+	chanFDs, err := NewConnectedSockets()
+	if err != nil {
+		t.Fatalf("failed to create fdchannel sockets: %v", err)
+	}
+	sendEP := NewEndpoint(chanFDs[0])
+	defer sendEP.Destroy()
+	recvEP := NewEndpoint(chanFDs[1])
+	defer recvEP.Destroy()
+
+	recvFD, err := recvEP.RecvFDNonblock()
+	if err != syscall.EAGAIN && err != syscall.EWOULDBLOCK {
+		t.Errorf("RecvFDNonblock before SendFD: got (%d, %v), wanted (<unspecified>, EAGAIN or EWOULDBLOCK)", recvFD, err)
+	}
+
+	if err := sendEP.SendFD(int(sendFile.Fd())); err != nil {
+		t.Fatalf("SendFD failed: %v", err)
+	}
+	recvFD, err = recvEP.RecvFD()
+	if err != nil {
+		t.Fatalf("RecvFD failed: %v", err)
+	}
+	recvFile := os.NewFile(uintptr(recvFD), "received file")
+	defer recvFile.Close()
+
+	sendInfo, err := sendFile.Stat()
+	if err != nil {
+		t.Fatalf("failed to stat sent file: %v", err)
+	}
+	sendInfoSys := sendInfo.Sys()
+	sendStat, ok := sendInfoSys.(*syscall.Stat_t)
+	if !ok {
+		t.Fatalf("sent file's FileInfo is backed by unknown type %T", sendInfoSys)
+	}
+
+	recvInfo, err := recvFile.Stat()
+	if err != nil {
+		t.Fatalf("failed to stat received file: %v", err)
+	}
+	recvInfoSys := recvInfo.Sys()
+	recvStat, ok := recvInfoSys.(*syscall.Stat_t)
+	if !ok {
+		t.Fatalf("received file's FileInfo is backed by unknown type %T", recvInfoSys)
+	}
+
+	if sendStat.Dev != recvStat.Dev || sendStat.Ino != recvStat.Ino {
+		t.Errorf("sent file (dev=%d, ino=%d) does not match received file (dev=%d, ino=%d)", sendStat.Dev, sendStat.Ino, recvStat.Dev, recvStat.Ino)
+	}
+}
+
+func TestShutdownThenRecvFD(t *testing.T) {
+	sendFile, err := ioutil.TempFile("", "fdchannel_test_")
+	if err != nil {
+		t.Fatalf("failed to create temporary file: %v", err)
+	}
+	defer sendFile.Close()
+
+	chanFDs, err := NewConnectedSockets()
+	if err != nil {
+		t.Fatalf("failed to create fdchannel sockets: %v", err)
+	}
+	sendEP := NewEndpoint(chanFDs[0])
+	defer sendEP.Destroy()
+	recvEP := NewEndpoint(chanFDs[1])
+	defer recvEP.Destroy()
+
+	recvEP.Shutdown()
+	if _, err := recvEP.RecvFD(); err == nil {
+		t.Error("RecvFD succeeded unexpectedly")
+	}
+}
+
+func TestRecvFDThenShutdown(t *testing.T) {
+	sendFile, err := ioutil.TempFile("", "fdchannel_test_")
+	if err != nil {
+		t.Fatalf("failed to create temporary file: %v", err)
+	}
+	defer sendFile.Close()
+
+	chanFDs, err := NewConnectedSockets()
+	if err != nil {
+		t.Fatalf("failed to create fdchannel sockets: %v", err)
+	}
+	sendEP := NewEndpoint(chanFDs[0])
+	defer sendEP.Destroy()
+	recvEP := NewEndpoint(chanFDs[1])
+	defer recvEP.Destroy()
+
+	var receiverWG sync.WaitGroup
+	receiverWG.Add(1)
+	go func() {
+		defer receiverWG.Done()
+		if _, err := recvEP.RecvFD(); err == nil {
+			t.Error("RecvFD succeeded unexpectedly")
+		}
+	}()
+	defer receiverWG.Wait()
+	time.Sleep(time.Second) // to ensure recvEP.RecvFD() has blocked
+	recvEP.Shutdown()
+}
diff --git a/pkg/fdchannel/fdchannel_unsafe.go b/pkg/fdchannel/fdchannel_unsafe.go
new file mode 100644
index 000000000..367235be5
--- /dev/null
+++ b/pkg/fdchannel/fdchannel_unsafe.go
@@ -0,0 +1,146 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+
+// Package fdchannel implements passing file descriptors between processes over
+// Unix domain sockets.
+package fdchannel
+
+import (
+	"fmt"
+	"reflect"
+	"sync/atomic"
+	"syscall"
+	"unsafe"
+)
+
+// int32 is the real type of a file descriptor.
+const sizeofInt32 = int(unsafe.Sizeof(int32(0)))
+
+// NewConnectedSockets returns a pair of file descriptors, owned by the caller,
+// representing connected sockets that may be passed to separate calls to
+// NewEndpoint to create connected Endpoints.
+func NewConnectedSockets() ([2]int, error) {
+	return syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
+}
+
+// Endpoint sends file descriptors to, and receives them from, another
+// connected Endpoint.
+//
+// Endpoint is not copyable or movable by value.
+type Endpoint struct {
+	sockfd int32 // accessed using atomic memory operations
+	msghdr syscall.Msghdr
+	cmsg   *syscall.Cmsghdr // followed by sizeofInt32 bytes of data
+}
+
+// Init must be called on zero-value Endpoints before first use. sockfd must be
+// a blocking AF_UNIX SOCK_SEQPACKET socket.
+func (ep *Endpoint) Init(sockfd int) {
+	// "Datagram sockets in various domains (e.g., the UNIX and Internet
+	// domains) permit zero-length datagrams." - recv(2). Experimentally,
+	// sendmsg+recvmsg for a zero-length datagram is slightly faster than
+	// sendmsg+recvmsg for a single byte over a stream socket.
+	cmsgSlice := make([]byte, syscall.CmsgSpace(sizeofInt32))
+	cmsgReflect := (*reflect.SliceHeader)((unsafe.Pointer)(&cmsgSlice))
+	ep.sockfd = int32(sockfd)
+	ep.msghdr.Control = (*byte)((unsafe.Pointer)(cmsgReflect.Data))
+	ep.cmsg = (*syscall.Cmsghdr)((unsafe.Pointer)(cmsgReflect.Data))
+	// ep.msghdr.Controllen and ep.cmsg.* are mutated by recvmsg(2), so they're
+	// set before calling sendmsg/recvmsg.
+}
+
+// NewEndpoint is a convenience function that returns an initialized Endpoint
+// allocated on the heap.
+func NewEndpoint(sockfd int) *Endpoint {
+	ep := &Endpoint{}
+	ep.Init(sockfd)
+	return ep
+}
+
+// Destroy releases resources owned by ep. No other Endpoint methods may be
+// called after Destroy.
+func (ep *Endpoint) Destroy() {
+	// These need not use sync/atomic since there must not be any concurrent
+	// calls to Endpoint methods.
+	if ep.sockfd >= 0 {
+		syscall.Close(int(ep.sockfd))
+		ep.sockfd = -1
+	}
+}
+
+// Shutdown causes concurrent and future calls to ep.SendFD(), ep.RecvFD(), and
+// ep.RecvFDNonblock(), as well as the same calls in the connected Endpoint, to
+// unblock and return errors. It does not wait for concurrent calls to return.
+//
+// Shutdown is the only Endpoint method that may be called concurrently with
+// other methods.
+func (ep *Endpoint) Shutdown() {
+	if sockfd := int(atomic.SwapInt32(&ep.sockfd, -1)); sockfd >= 0 {
+		syscall.Shutdown(sockfd, syscall.SHUT_RDWR)
+		syscall.Close(sockfd)
+	}
+}
+
+// SendFD sends the open file description represented by the given file
+// descriptor to the connected Endpoint.
+func (ep *Endpoint) SendFD(fd int) error {
+	cmsgLen := syscall.CmsgLen(sizeofInt32)
+	ep.cmsg.Level = syscall.SOL_SOCKET
+	ep.cmsg.Type = syscall.SCM_RIGHTS
+	ep.cmsg.SetLen(cmsgLen)
+	*ep.cmsgData() = int32(fd)
+	ep.msghdr.SetControllen(cmsgLen)
+	_, _, e := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), 0)
+	if e != 0 {
+		return e
+	}
+	return nil
+}
+
+// RecvFD receives an open file description from the connected Endpoint and
+// returns a file descriptor representing it, owned by the caller.
+func (ep *Endpoint) RecvFD() (int, error) {
+	return ep.recvFD(0)
+}
+
+// RecvFDNonblock receives an open file description from the connected Endpoint
+// and returns a file descriptor representing it, owned by the caller. If there
+// are no pending receivable open file descriptions, RecvFDNonblock returns
+// (<unspecified>, EAGAIN or EWOULDBLOCK).
+func (ep *Endpoint) RecvFDNonblock() (int, error) {
+	return ep.recvFD(syscall.MSG_DONTWAIT)
+}
+
+func (ep *Endpoint) recvFD(flags uintptr) (int, error) {
+	cmsgLen := syscall.CmsgLen(sizeofInt32)
+	ep.msghdr.SetControllen(cmsgLen)
+	_, _, e := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(atomic.LoadInt32(&ep.sockfd)), uintptr((unsafe.Pointer)(&ep.msghdr)), flags|syscall.MSG_TRUNC)
+	if e != 0 {
+		return -1, e
+	}
+	if int(ep.msghdr.Controllen) != cmsgLen {
+		return -1, fmt.Errorf("received control message has incorrect length: got %d, wanted %d", ep.msghdr.Controllen, cmsgLen)
+	}
+	if ep.cmsg.Level != syscall.SOL_SOCKET || ep.cmsg.Type != syscall.SCM_RIGHTS {
+		return -1, fmt.Errorf("received control message has incorrect (level, type): got (%v, %v), wanted (%v, %v)", ep.cmsg.Level, ep.cmsg.Type, syscall.SOL_SOCKET, syscall.SCM_RIGHTS)
+	}
+	return int(*ep.cmsgData()), nil
+}
+
+func (ep *Endpoint) cmsgData() *int32 {
+	// syscall.CmsgLen(0) == syscall.cmsgAlignOf(syscall.SizeofCmsghdr)
+	return (*int32)((unsafe.Pointer)(uintptr((unsafe.Pointer)(ep.cmsg)) + uintptr(syscall.CmsgLen(0))))
+}
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 0765a1963..ab9ad01ef 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -50,6 +50,19 @@ const (
 	Debug
 )
 
+func (l Level) String() string {
+	switch l {
+	case Warning:
+		return "Warning"
+	case Info:
+		return "Info"
+	case Debug:
+		return "Debug"
+	default:
+		return fmt.Sprintf("Invalid level: %d", l)
+	}
+}
+
 // Emitter is the final destination for logs.
 type Emitter interface {
 	// Emit emits the given log statement. This allows for control over the
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 15a1fe8a9..5dccb8e3c 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "control",
     srcs = [
         "control.go",
+        "logging.go",
         "pprof.go",
         "proc.go",
         "state.go",
@@ -26,8 +27,10 @@ go_library(
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/state",
+        "//pkg/sentry/strace",
         "//pkg/sentry/usage",
         "//pkg/sentry/watchdog",
+        "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
     ],
 )
diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go
new file mode 100644
index 000000000..811f24324
--- /dev/null
+++ b/pkg/sentry/control/logging.go
@@ -0,0 +1,136 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package control
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+)
+
+// LoggingArgs are the arguments to use for changing the logging
+// level and strace list.
+type LoggingArgs struct {
+	// SetLevel is a flag used to indicate that we should update
+	// the logging level. We should be able to change the strace
+	// list without affecting the logging level and vice versa.
+	SetLevel bool
+
+	// Level is the log level that will be set if SetLevel is true.
+	Level log.Level
+
+	// SetLogPackets indicates that we should update the log packets flag.
+	SetLogPackets bool
+
+	// LogPackets is the actual value to set for LogPackets.
+	// SetLogPackets must be enabled to indicate that we're changing
+	// the value.
+	LogPackets bool
+
+	// SetStrace is a flag used to indicate that strace related
+	// arguments were passed in.
+	SetStrace bool
+
+	// EnableStrace is a flag from the CLI that specifies whether to
+	// enable strace at all. If this flag is false then a completely
+	// pristine copy of the syscall table will be swapped in. This
+	// approach is used to remain consistent with an empty strace
+	// whitelist meaning trace all system calls.
+	EnableStrace bool
+
+	// StraceWhitelist is the whitelist of syscalls to trace to log. If
+	// this and StraceEventWhitelist are empty trace all system calls.
+	StraceWhitelist []string
+
+	// SetEventStrace is a flag used to indicate that event strace
+	// related arguments were passed in.
+	SetEventStrace bool
+
+	// StraceEventWhitelist is the whitelist of syscalls to trace
+	// to event log.
+	StraceEventWhitelist []string
+}
+
+// Logging provides functions related to logging.
+type Logging struct{}
+
+// Change will change the log level and strace arguments. This
+// function's signature is required by the URPC interface; an error is
+// returned only if configuring strace or event strace fails.
+// Additionally, it may look odd that this is the only method
+// attached to an empty struct but this is also part of how
+// URPC dispatches.
+func (l *Logging) Change(args *LoggingArgs, code *int) error {
+	if args.SetLevel {
+		// Logging uses an atomic for the level so this is thread safe.
+		log.SetLevel(args.Level)
+	}
+
+	if args.SetLogPackets {
+		if args.LogPackets {
+			atomic.StoreUint32(&sniffer.LogPackets, 1)
+		} else {
+			atomic.StoreUint32(&sniffer.LogPackets, 0)
+		}
+		log.Infof("LogPackets set to: %v", atomic.LoadUint32(&sniffer.LogPackets))
+	}
+
+	if args.SetStrace {
+		if err := l.configureStrace(args); err != nil {
+			return fmt.Errorf("error configuring strace: %v", err)
+		}
+	}
+
+	if args.SetEventStrace {
+		if err := l.configureEventStrace(args); err != nil {
+			return fmt.Errorf("error configuring event strace: %v", err)
+		}
+	}
+
+	return nil
+}
+
+func (l *Logging) configureStrace(args *LoggingArgs) error {
+	if args.EnableStrace {
+		// Install the whitelist specified.
+		if len(args.StraceWhitelist) > 0 {
+			if err := strace.Enable(args.StraceWhitelist, strace.SinkTypeLog); err != nil {
+				return err
+			}
+		} else {
+			// For convenience, if strace is enabled but whitelist
+			// is empty, enable everything to log.
+			strace.EnableAll(strace.SinkTypeLog)
+		}
+	} else {
+		// Uninstall all strace functions.
+		strace.Disable(strace.SinkTypeLog)
+	}
+	return nil
+}
+
+func (l *Logging) configureEventStrace(args *LoggingArgs) error {
+	if len(args.StraceEventWhitelist) > 0 {
+		if err := strace.Enable(args.StraceEventWhitelist, strace.SinkTypeEvent); err != nil {
+			return err
+		}
+	} else {
+		strace.Disable(strace.SinkTypeEvent)
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 55ffe6c0c..8e1f5674d 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -310,9 +310,11 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 		return 0, syserror.ErrInterrupted
 	}
 
+	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	// Handle append mode.
 	if f.Flags().Append {
 		if err := f.offsetForAppend(ctx, &f.offset); err != nil {
+			unlockAppendMu()
 			f.mu.Unlock()
 			return 0, err
 		}
@@ -322,6 +324,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 	limit, ok := f.checkLimit(ctx, f.offset)
 	switch {
 	case ok && limit == 0:
+		unlockAppendMu()
 		f.mu.Unlock()
 		return 0, syserror.ErrExceedsFileSizeLimit
 	case ok:
@@ -333,6 +336,7 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 	if n >= 0 && !f.flags.NonSeekable {
 		atomic.StoreInt64(&f.offset, f.offset+n)
 	}
+	unlockAppendMu()
 	f.mu.Unlock()
 	return n, err
 }
@@ -348,13 +352,11 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	// However, on Linux, if a file is opened with O_APPEND,  pwrite()
 	// appends data to the end of the file, regardless of the value of
 	// offset."
+	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
+	defer unlockAppendMu()
+
 	if f.Flags().Append {
-		if !f.mu.Lock(ctx) {
-			return 0, syserror.ErrInterrupted
-		}
-		defer f.mu.Unlock()
 		if err := f.offsetForAppend(ctx, &offset); err != nil {
-			f.mu.Unlock()
 			return 0, err
 		}
 	}
@@ -373,7 +375,7 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 
 // offsetForAppend sets the given offset to the end of the file.
 //
-// Precondition: the underlying file mutex should be held.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
 func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
 	if err != nil {
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index a889586aa..e4aae1135 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -15,6 +15,8 @@
 package fs
 
 import (
+	"sync"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/metric"
@@ -55,6 +57,12 @@ type Inode struct {
 
 	// overlay is the overlay entry for this Inode.
 	overlay *overlayEntry
+
+	// appendMu is used to synchronize write operations into files which
+	// have been opened with O_APPEND. Operations which change a file size
+	// have to take this lock for read. Write operations to files with
+	// O_APPEND have to take this lock for write.
+	appendMu sync.RWMutex `state:"nosave"`
 }
 
 // LockCtx is an Inode's lock context and contains different personalities of locks; both
@@ -337,6 +345,8 @@ func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error {
 	if i.overlay != nil {
 		return overlayTruncate(ctx, i.overlay, d, size)
 	}
+	i.appendMu.RLock()
+	defer i.appendMu.RUnlock()
 	return i.InodeOperations.Truncate(ctx, i, size)
 }
 
@@ -438,3 +448,12 @@ func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool {
 	}
 	return creds.HasCapability(cp)
 }
+
+func (i *Inode) lockAppendMu(appendMode bool) func() {
+	if appendMode {
+		i.appendMu.Lock()
+		return i.appendMu.Unlock
+	}
+	i.appendMu.RLock()
+	return i.appendMu.RUnlock
+}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index 57b8b14e3..920d86042 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -537,12 +537,6 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
 	if o.upper != nil {
 		err = o.upper.check(ctx, p)
 	} else {
-		if p.Write {
-			// Since writes will be redirected to the upper filesystem, the lower
-			// filesystem need not be writable, but must be readable for copy-up.
-			p.Write = false
-			p.Read = true
-		}
 		err = o.lower.check(ctx, p)
 	}
 	o.copyMu.RUnlock()
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index b70c583f3..da41a10ab 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -31,6 +31,7 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/log",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index caa1a5c4d..37694620c 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -55,9 +56,8 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo
 			"psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))),
 			"ptype":  newStaticProcInode(ctx, msrc, []byte("Type Device      Function")),
 			"route":  newStaticProcInode(ctx, msrc, []byte("Iface   Destination     Gateway         Flags   RefCnt  Use     Metric  Mask            MTU     Window  IRTT")),
-			"tcp":    newStaticProcInode(ctx, msrc, []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode")),
-
-			"udp": newStaticProcInode(ctx, msrc, []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops")),
+			"tcp":    seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc),
+			"udp":    newStaticProcInode(ctx, msrc, []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops")),
 
 			"unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc),
 		}
@@ -210,10 +210,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 	}
 
 	var buf bytes.Buffer
-	// Header
-	fmt.Fprintf(&buf, "Num       RefCount Protocol Flags    Type St Inode Path\n")
-
-	// Entries
 	for _, se := range n.k.ListSockets() {
 		s := se.Sock.Get()
 		if s == nil {
@@ -222,6 +218,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 		}
 		sfile := s.(*fs.File)
 		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+			s.DecRef()
 			// Not a unix socket.
 			continue
 		}
@@ -281,12 +278,160 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
 		}
 		fmt.Fprintf(&buf, "\n")
 
-		sfile.DecRef()
+		s.DecRef()
+	}
+
+	data := []seqfile.SeqData{
+		{
+			Buf:    []byte("Num       RefCount Protocol Flags    Type St Inode Path\n"),
+			Handle: n,
+		},
+		{
+			Buf:    buf.Bytes(),
+			Handle: n,
+		},
+	}
+	return data, 0
+}
+
+// netTCP implements seqfile.SeqSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCP struct {
+	k *kernel.Kernel
+}
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (*netTCP) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData.
+func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	t := kernel.TaskFromContext(ctx)
+
+	if h != nil {
+		return nil, 0
+	}
+
+	var buf bytes.Buffer
+	for _, se := range n.k.ListSockets() {
+		s := se.Sock.Get()
+		if s == nil {
+			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+			continue
+		}
+		sfile := s.(*fs.File)
+		sops, ok := sfile.FileOperations.(socket.Socket)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+		}
+		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
+			s.DecRef()
+			// Not tcp4 sockets.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(&buf, "%4d: ", se.ID)
+
+		portBuf := make([]byte, 2)
+
+		// Field: local_address.
+		var localAddr linux.SockAddrInet
+		if local, _, err := sops.GetSockName(t); err == nil {
+			localAddr = local.(linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
+		fmt.Fprintf(&buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(localAddr.Addr[:]),
+			portBuf)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if remote, _, err := sops.GetPeerName(t); err == nil {
+			remoteAddr = remote.(linux.SockAddrInet)
+		}
+		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
+		fmt.Fprintf(&buf, "%08X:%04X ",
+			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
+			portBuf)
+
+		// Field: state; socket state.
+		fmt.Fprintf(&buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(&buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(&buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(&buf, "%08X ", 0)
+
+		// Field: uid.
+		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+		if err != nil {
+			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+			fmt.Fprintf(&buf, "%5d ", 0)
+		} else {
+			fmt.Fprintf(&buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(&buf, "%8d ", 0)
+
+		// Field: inode.
+		fmt.Fprintf(&buf, "%8d ", sfile.InodeID())
+
+		// Field: refcount. Don't count the ref we obtain while dereferencing
+		// the weakref to this socket.
+		fmt.Fprintf(&buf, "%d ", sfile.ReadRefs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(&buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(&buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(&buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(&buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(&buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(&buf, "%d", -1)
+
+		fmt.Fprintf(&buf, "\n")
+
+		s.DecRef()
 	}
 
-	data := []seqfile.SeqData{{
-		Buf:    buf.Bytes(),
-		Handle: (*netUnix)(nil),
-	}}
+	data := []seqfile.SeqData{
+		{
+			Buf:    []byte("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n"),
+			Handle: n,
+		},
+		{
+			Buf:    buf.Bytes(),
+			Handle: n,
+		},
+	}
 	return data, 0
 }
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 978dc679b..eed1c2854 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -88,6 +88,8 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
 
 	// Check append-only mode and the limit.
 	if !dstPipe {
+		unlock := dst.Dirent.Inode.lockAppendMu(dst.Flags().Append)
+		defer unlock()
 		if dst.Flags().Append {
 			if opts.DstOffset {
 				// We need to acquire the lock.
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index a5fcdf969..881dd89b0 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -133,6 +133,9 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
 	}
 
 	// Construct a mount which will follow the cache options provided.
+	//
+	// TODO(gvisor.dev/issue/179): There should be no reason to disable
+	// caching once bind mounts are properly supported.
 	var msrc *fs.MountSource
 	switch options[cacheKey] {
 	case "", cacheAll:
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 37cb8c8b9..42779baa9 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -4,6 +4,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 load("//tools/go_stateify:defs.bzl", "go_library")
 
 go_template_instance(
+    name = "atomicptr_credentials",
+    out = "atomicptr_credentials.go",
+    package = "auth",
+    suffix = "Credentials",
+    template = "//third_party/gvsync:generic_atomicptr",
+    types = {
+        "Value": "Credentials",
+    },
+)
+
+go_template_instance(
     name = "id_map_range",
     out = "id_map_range.go",
     package = "auth",
@@ -34,6 +45,7 @@ go_template_instance(
 go_library(
     name = "auth",
     srcs = [
+        "atomicptr_credentials.go",
         "auth.go",
         "capability_set.go",
         "context.go",
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index c297c5973..2e3a39d3b 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -386,10 +386,11 @@ type Task struct {
 
 	// creds is the task's credentials.
 	//
-	// creds is protected by mu, however the value itself is immutable and can
-	// only be changed by a copy. After reading the pointer, access will
-	// proceed outside the scope of mu. creds is owned by the task goroutine.
-	creds *auth.Credentials
+	// creds.Load() may be called without synchronization. creds.Store() is
+	// serialized by mu. creds is owned by the task goroutine. All
+	// auth.Credentials objects that creds may point to, or have pointed to
+	// in the past, must be treated as immutable.
+	creds auth.AtomicPtrCredentials
 
 	// utsns is the task's UTS namespace.
 	//
@@ -597,7 +598,7 @@ func (t *Task) Value(key interface{}) interface{} {
 	case CtxTask:
 		return t
 	case auth.CtxCredentials:
-		return t.creds
+		return t.Credentials()
 	case context.CtxThreadGroupID:
 		return int32(t.ThreadGroup().ID())
 	case fs.CtxRoot:
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0e621f0d1..b5cc3860d 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -425,6 +425,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 	if opts.NewAddressSpace || opts.NewSignalHandlers {
 		return syserror.EINVAL
 	}
+	creds := t.Credentials()
 	if opts.NewThreadGroup {
 		t.tg.signalHandlers.mu.Lock()
 		if t.tg.tasksCount != 1 {
@@ -439,8 +440,6 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		if t.IsChrooted() {
 			return syserror.EPERM
 		}
-		// This temporary is needed because Go.
-		creds := t.Credentials()
 		newUserNS, err := creds.NewChildUserNamespace()
 		if err != nil {
 			return err
@@ -449,6 +448,8 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		if err != nil {
 			return err
 		}
+		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
+		creds = t.Credentials()
 	}
 	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
 	if opts.NewPIDNamespace {
@@ -473,7 +474,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		}
 		// Note that this must happen after NewUserNamespace, so the
 		// new user namespace is used if there is one.
-		t.utsns = t.utsns.Clone(t.creds.UserNamespace)
+		t.utsns = t.utsns.Clone(creds.UserNamespace)
 	}
 	if opts.NewIPCNamespace {
 		if !haveCapSysAdmin {
@@ -482,7 +483,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
 		}
 		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
 		// namespace"
-		t.ipcns = NewIPCNamespace(t.creds.UserNamespace)
+		t.ipcns = NewIPCNamespace(creds.UserNamespace)
 	}
 	var oldfds *FDMap
 	if opts.NewFiles {
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 39c138925..78ff14b20 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -25,30 +25,22 @@ import (
 //
 // This value must be considered immutable.
 func (t *Task) Credentials() *auth.Credentials {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds
+	return t.creds.Load()
 }
 
 // UserNamespace returns the user namespace associated with the task.
 func (t *Task) UserNamespace() *auth.UserNamespace {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.UserNamespace
+	return t.Credentials().UserNamespace
 }
 
 // HasCapabilityIn checks if the task has capability cp in user namespace ns.
 func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.HasCapabilityIn(cp, ns)
+	return t.Credentials().HasCapabilityIn(cp, ns)
 }
 
 // HasCapability checks if the task has capability cp in its user namespace.
 func (t *Task) HasCapability(cp linux.Capability) bool {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	return t.creds.HasCapability(cp)
+	return t.Credentials().HasCapability(cp)
 }
 
 // SetUID implements the semantics of setuid(2).
@@ -57,9 +49,12 @@ func (t *Task) SetUID(uid auth.UID) error {
 	if !uid.Ok() {
 		return syserror.EINVAL
 	}
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	kuid := t.creds.UserNamespace.MapToKUID(uid)
+
+	creds := t.Credentials()
+	kuid := creds.UserNamespace.MapToKUID(uid)
 	if !kuid.Ok() {
 		return syserror.EINVAL
 	}
@@ -67,17 +62,17 @@ func (t *Task) SetUID(uid auth.UID) error {
 	// effective UID of the caller is root (more precisely: if the caller has
 	// the CAP_SETUID capability), the real UID and saved set-user-ID are also
 	// set." - setuid(2)
-	if t.creds.HasCapability(linux.CAP_SETUID) {
+	if creds.HasCapability(linux.CAP_SETUID) {
 		t.setKUIDsUncheckedLocked(kuid, kuid, kuid)
 		return nil
 	}
 	// "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID
 	// capability) and uid does not match the real UID or saved set-user-ID of
 	// the calling process."
-	if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID {
+	if kuid != creds.RealKUID && kuid != creds.SavedKUID {
 		return syserror.EPERM
 	}
-	t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID)
+	t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID)
 	return nil
 }
 
@@ -87,37 +82,38 @@ func (t *Task) SetREUID(r, e auth.UID) error {
 	defer t.mu.Unlock()
 	// "Supplying a value of -1 for either the real or effective user ID forces
 	// the system to leave that ID unchanged." - setreuid(2)
-	newR := t.creds.RealKUID
+	creds := t.Credentials()
+	newR := creds.RealKUID
 	if r.Ok() {
-		newR = t.creds.UserNamespace.MapToKUID(r)
+		newR = creds.UserNamespace.MapToKUID(r)
 		if !newR.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	newE := t.creds.EffectiveKUID
+	newE := creds.EffectiveKUID
 	if e.Ok() {
-		newE = t.creds.UserNamespace.MapToKUID(e)
+		newE = creds.UserNamespace.MapToKUID(e)
 		if !newE.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	if !t.creds.HasCapability(linux.CAP_SETUID) {
+	if !creds.HasCapability(linux.CAP_SETUID) {
 		// "Unprivileged processes may only set the effective user ID to the
 		// real user ID, the effective user ID, or the saved set-user-ID."
-		if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
+		if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID {
 			return syserror.EPERM
 		}
 		// "Unprivileged users may only set the real user ID to the real user
 		// ID or the effective user ID."
-		if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
+		if newR != creds.RealKUID && newR != creds.EffectiveKUID {
 			return syserror.EPERM
 		}
 	}
 	// "If the real user ID is set (i.e., ruid is not -1) or the effective user
 	// ID is set to a value not equal to the previous real user ID, the saved
 	// set-user-ID will be set to the new effective user ID."
-	newS := t.creds.SavedKUID
-	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) {
+	newS := creds.SavedKUID
+	if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) {
 		newS = newE
 	}
 	t.setKUIDsUncheckedLocked(newR, newE, newS)
@@ -136,23 +132,24 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error {
 	// arguments equals -1, the corresponding value is not changed." -
 	// setresuid(2)
 	var err error
-	newR := t.creds.RealKUID
+	creds := t.Credentials()
+	newR := creds.RealKUID
 	if r.Ok() {
-		newR, err = t.creds.UseUID(r)
+		newR, err = creds.UseUID(r)
 		if err != nil {
 			return err
 		}
 	}
-	newE := t.creds.EffectiveKUID
+	newE := creds.EffectiveKUID
 	if e.Ok() {
-		newE, err = t.creds.UseUID(e)
+		newE, err = creds.UseUID(e)
 		if err != nil {
 			return err
 		}
 	}
-	newS := t.creds.SavedKUID
+	newS := creds.SavedKUID
 	if s.Ok() {
-		newS, err = t.creds.UseUID(s)
+		newS, err = creds.UseUID(s)
 		if err != nil {
 			return err
 		}
@@ -163,10 +160,10 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error {
 
 // Preconditions: t.mu must be locked.
 func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
-	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
-	oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	root := creds.UserNamespace.MapToKUID(auth.RootUID)
+	oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID
+	creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS
 
 	// "1. If one or more of the real, effective or saved set user IDs was
 	// previously 0, and as a result of the UID changes all of these IDs have a
@@ -184,9 +181,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 		// being cleared." (A thread's effective capability set is always
 		// cleared when such a credential change is made,
 		// regardless of the setting of the "keep capabilities" flag.)
-		if !t.creds.KeepCaps {
-			t.creds.PermittedCaps = 0
-			t.creds.EffectiveCaps = 0
+		if !creds.KeepCaps {
+			creds.PermittedCaps = 0
+			creds.EffectiveCaps = 0
 		}
 	}
 	// """
@@ -197,9 +194,9 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 	// permitted set is copied to the effective set.
 	// """
 	if oldE == root && newE != root {
-		t.creds.EffectiveCaps = 0
+		creds.EffectiveCaps = 0
 	} else if oldE != root && newE == root {
-		t.creds.EffectiveCaps = t.creds.PermittedCaps
+		creds.EffectiveCaps = creds.PermittedCaps
 	}
 	// "4. If the filesystem user ID is changed from 0 to nonzero (see
 	// setfsuid(2)), then the following capabilities are cleared from the
@@ -220,6 +217,7 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
 		// Not documented, but compare Linux's kernel/cred.c:commit_creds().
 		t.parentDeathSignal = 0
 	}
+	t.creds.Store(creds)
 }
 
 // SetGID implements the semantics of setgid(2).
@@ -227,20 +225,23 @@ func (t *Task) SetGID(gid auth.GID) error {
 	if !gid.Ok() {
 		return syserror.EINVAL
 	}
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	kgid := t.creds.UserNamespace.MapToKGID(gid)
+
+	creds := t.Credentials()
+	kgid := creds.UserNamespace.MapToKGID(gid)
 	if !kgid.Ok() {
 		return syserror.EINVAL
 	}
-	if t.creds.HasCapability(linux.CAP_SETGID) {
+	if creds.HasCapability(linux.CAP_SETGID) {
 		t.setKGIDsUncheckedLocked(kgid, kgid, kgid)
 		return nil
 	}
-	if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID {
+	if kgid != creds.RealKGID && kgid != creds.SavedKGID {
 		return syserror.EPERM
 	}
-	t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID)
+	t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID)
 	return nil
 }
 
@@ -248,30 +249,32 @@ func (t *Task) SetGID(gid auth.GID) error {
 func (t *Task) SetREGID(r, e auth.GID) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	newR := t.creds.RealKGID
+
+	creds := t.Credentials()
+	newR := creds.RealKGID
 	if r.Ok() {
-		newR = t.creds.UserNamespace.MapToKGID(r)
+		newR = creds.UserNamespace.MapToKGID(r)
 		if !newR.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	newE := t.creds.EffectiveKGID
+	newE := creds.EffectiveKGID
 	if e.Ok() {
-		newE = t.creds.UserNamespace.MapToKGID(e)
+		newE = creds.UserNamespace.MapToKGID(e)
 		if !newE.Ok() {
 			return syserror.EINVAL
 		}
 	}
-	if !t.creds.HasCapability(linux.CAP_SETGID) {
-		if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
+	if !creds.HasCapability(linux.CAP_SETGID) {
+		if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID {
 			return syserror.EPERM
 		}
-		if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
+		if newR != creds.RealKGID && newR != creds.EffectiveKGID {
 			return syserror.EPERM
 		}
 	}
-	newS := t.creds.SavedKGID
-	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) {
+	newS := creds.SavedKGID
+	if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) {
 		newS = newE
 	}
 	t.setKGIDsUncheckedLocked(newR, newE, newS)
@@ -280,26 +283,29 @@ func (t *Task) SetREGID(r, e auth.GID) error {
 
 // SetRESGID implements the semantics of the setresgid(2) syscall.
 func (t *Task) SetRESGID(r, e, s auth.GID) error {
+	var err error
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	var err error
-	newR := t.creds.RealKGID
+
+	creds := t.Credentials()
+	newR := creds.RealKGID
 	if r.Ok() {
-		newR, err = t.creds.UseGID(r)
+		newR, err = creds.UseGID(r)
 		if err != nil {
 			return err
 		}
 	}
-	newE := t.creds.EffectiveKGID
+	newE := creds.EffectiveKGID
 	if e.Ok() {
-		newE, err = t.creds.UseGID(e)
+		newE, err = creds.UseGID(e)
 		if err != nil {
 			return err
 		}
 	}
-	newS := t.creds.SavedKGID
+	newS := creds.SavedKGID
 	if s.Ok() {
-		newS, err = t.creds.UseGID(s)
+		newS, err = creds.UseGID(s)
 		if err != nil {
 			return err
 		}
@@ -309,9 +315,9 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error {
 }
 
 func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
-	oldE := t.creds.EffectiveKGID
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	oldE := creds.EffectiveKGID
+	creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS
 
 	if oldE != newE {
 		// "[dumpability] is reset to the current value contained in
@@ -327,6 +333,7 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
 		// kernel/cred.c:commit_creds().
 		t.parentDeathSignal = 0
 	}
+	t.creds.Store(creds)
 }
 
 // SetExtraGIDs attempts to change t's supplemental groups. All IDs are
@@ -334,19 +341,21 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
 func (t *Task) SetExtraGIDs(gids []auth.GID) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	if !t.creds.HasCapability(linux.CAP_SETGID) {
+	creds := t.Credentials()
+	if !creds.HasCapability(linux.CAP_SETGID) {
 		return syserror.EPERM
 	}
 	kgids := make([]auth.KGID, len(gids))
 	for i, gid := range gids {
-		kgid := t.creds.UserNamespace.MapToKGID(gid)
+		kgid := creds.UserNamespace.MapToKGID(gid)
 		if !kgid.Ok() {
 			return syserror.EINVAL
 		}
 		kgids[i] = kgid
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.ExtraKGIDs = kgids
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.ExtraKGIDs = kgids
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -360,27 +369,29 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili
 	if effective & ^permitted != 0 {
 		return syserror.EPERM
 	}
+	creds := t.Credentials()
 	// "It is also a limiting superset for the capabilities that may be added
 	// to the inheritable set by a thread that does not have the CAP_SETPCAP
 	// capability in its effective set."
-	if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) {
+	if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) {
 		return syserror.EPERM
 	}
 	// "If a thread drops a capability from its permitted set, it can never
 	// reacquire that capability (unless it execve(2)s ..."
-	if permitted & ^t.creds.PermittedCaps != 0 {
+	if permitted & ^creds.PermittedCaps != 0 {
 		return syserror.EPERM
 	}
 	// "... if a capability is not in the bounding set, then a thread can't add
 	// this capability to its inheritable set, even if it was in its permitted
 	// capabilities ..."
-	if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 {
+	if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 {
 		return syserror.EPERM
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.PermittedCaps = permitted
-	t.creds.InheritableCaps = inheritable
-	t.creds.EffectiveCaps = effective
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.PermittedCaps = permitted
+	creds.InheritableCaps = inheritable
+	creds.EffectiveCaps = effective
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -389,11 +400,13 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili
 func (t *Task) DropBoundingCapability(cp linux.Capability) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	if !t.creds.HasCapability(linux.CAP_SETPCAP) {
+	creds := t.Credentials()
+	if !creds.HasCapability(linux.CAP_SETPCAP) {
 		return syserror.EPERM
 	}
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
+	t.creds.Store(creds)
 	return nil
 }
 
@@ -402,31 +415,33 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
+	creds := t.Credentials()
 	// "A process reassociating itself with a user namespace must have the
 	// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
 	//
 	// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
 	// in ns (by rule 3 in auth.Credentials.HasCapability).
-	if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
 		return syserror.EPERM
 	}
 
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.UserNamespace = ns
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
+	creds.UserNamespace = ns
 	// "The child process created by clone(2) with the CLONE_NEWUSER flag
 	// starts out with a complete set of capabilities in the new user
 	// namespace. Likewise, a process that creates a new user namespace using
 	// unshare(2) or joins an existing user namespace using setns(2) gains a
 	// full set of capabilities in that namespace."
-	t.creds.PermittedCaps = auth.AllCapabilities
-	t.creds.InheritableCaps = 0
-	t.creds.EffectiveCaps = auth.AllCapabilities
-	t.creds.BoundingCaps = auth.AllCapabilities
+	creds.PermittedCaps = auth.AllCapabilities
+	creds.InheritableCaps = 0
+	creds.EffectiveCaps = auth.AllCapabilities
+	creds.BoundingCaps = auth.AllCapabilities
 	// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
 	// flag sets the "securebits" flags (see capabilities(7)) to their default
 	// values (all flags disabled) in the child (for clone(2)) or caller (for
 	// unshare(2), or setns(2)." - user_namespaces(7)
-	t.creds.KeepCaps = false
+	creds.KeepCaps = false
+	t.creds.Store(creds)
 
 	return nil
 }
@@ -435,8 +450,9 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
 func (t *Task) SetKeepCaps(k bool) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.creds = t.creds.Fork() // See doc for creds.
-	t.creds.KeepCaps = k
+	creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds.
+	creds.KeepCaps = k
+	t.creds.Store(creds)
 }
 
 // updateCredsForExec updates t.creds to reflect an execve().
@@ -512,15 +528,16 @@ func (t *Task) updateCredsForExecLocked() {
 	// the effective user ID.
 	var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
 	fileEffective := false
-	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
-	if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
-		newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
-		if t.creds.EffectiveKUID == root {
+	creds := t.Credentials()
+	root := creds.UserNamespace.MapToKUID(auth.RootUID)
+	if creds.EffectiveKUID == root || creds.RealKUID == root {
+		newPermitted = creds.InheritableCaps | creds.BoundingCaps
+		if creds.EffectiveKUID == root {
 			fileEffective = true
 		}
 	}
 
-	t.creds = t.creds.Fork() // See doc for creds.
+	creds = creds.Fork() // The credentials object is immutable. See doc for creds.
 
 	// Now we enter poorly-documented, somewhat confusing territory. (The
 	// accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
@@ -562,27 +579,28 @@ func (t *Task) updateCredsForExecLocked() {
 	// But since no_new_privs is always set (A3 is always true), this becomes
 	// much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
 	// is a no-op. So we can just do C1 and C2 unconditionally.
-	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
-		t.creds.EffectiveKUID = t.creds.RealKUID
-		t.creds.EffectiveKGID = t.creds.RealKGID
+	if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID {
+		creds.EffectiveKUID = creds.RealKUID
+		creds.EffectiveKGID = creds.RealKGID
 		t.parentDeathSignal = 0
 	}
 	// (Saved set-user-ID is always set to the new effective user ID, and saved
 	// set-group-ID is always set to the new effective group ID, regardless of
 	// the above.)
-	t.creds.SavedKUID = t.creds.RealKUID
-	t.creds.SavedKGID = t.creds.RealKGID
-	t.creds.PermittedCaps &= newPermitted
+	creds.SavedKUID = creds.RealKUID
+	creds.SavedKGID = creds.RealKGID
+	creds.PermittedCaps &= newPermitted
 	if fileEffective {
-		t.creds.EffectiveCaps = t.creds.PermittedCaps
+		creds.EffectiveCaps = creds.PermittedCaps
 	} else {
-		t.creds.EffectiveCaps = 0
+		creds.EffectiveCaps = 0
 	}
 
 	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
 	// calls to execve(2).
-	t.creds.KeepCaps = false
+	creds.KeepCaps = false
 
 	// "The bounding set is inherited at fork(2) from the thread's parent, and
 	// is preserved across an execve(2)". So we're done.
+	t.creds.Store(creds)
 }
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 9458f5c2a..72caae537 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -119,7 +119,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		ptraceTracees:   make(map[*Task]struct{}),
 		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
 		ioUsage:         &usage.IO{},
-		creds:           cfg.Credentials,
 		niceness:        cfg.Niceness,
 		netns:           cfg.NetworkNamespaced,
 		utsns:           cfg.UTSNamespace,
@@ -129,6 +128,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		futexWaiter:     futex.NewWaiter(),
 		containerID:     cfg.ContainerID,
 	}
+	t.creds.Store(cfg.Credentials)
 	t.endStopCond.L = &t.tg.signalHandlers.mu
 	t.ptraceTracer.Store((*Task)(nil))
 	// We don't construct t.blockingTimer until Task.run(); see that function
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index dca8e4c0e..f15b3415a 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -370,13 +370,16 @@ func (t *thread) destroy() {
 
 // init initializes trace options.
 func (t *thread) init() {
-	// Set our TRACESYSGOOD option to differeniate real SIGTRAP.
+	// Set our TRACESYSGOOD option to differentiate real SIGTRAP. We also
+	// set PTRACE_O_EXITKILL to ensure that the unexpected exit of the
+	// sentry will immediately kill the associated stubs.
+	const PTRACE_O_EXITKILL = 0x100000
 	_, _, errno := syscall.RawSyscall6(
 		syscall.SYS_PTRACE,
 		syscall.PTRACE_SETOPTIONS,
 		uintptr(t.tid),
 		0,
-		syscall.PTRACE_O_TRACESYSGOOD,
+		syscall.PTRACE_O_TRACESYSGOOD|syscall.PTRACE_O_TRACEEXIT|PTRACE_O_EXITKILL,
 		0, 0)
 	if errno != 0 {
 		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
@@ -419,7 +422,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
 	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
 	if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
-		panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
+		t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
 	}
 
 	// Grab registers.
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 2a41e8176..7f18b1ac8 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -379,7 +379,7 @@ var AMD64 = &kernel.SyscallTable{
 		326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil),
 		327: syscalls.Undocumented("preadv2", Preadv2),
 		328: syscalls.Undocumented("pwritev2", Pwritev2),
-		397: syscalls.Undocumented("statx", Statx),
+		332: syscalls.Supported("statx", Statx),
 	},
 
 	Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index d9ed02c99..04962726a 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -304,44 +304,100 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
 		return 0, syserror.ENOENT
 	}
 
-	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error {
-		if !fs.IsDir(d.Inode.StableAttr) {
-			return syserror.ENOTDIR
-		}
+	fileFlags := linuxToFlags(flags)
+	// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
+	fileFlags.LargeFile = true
+
+	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error {
+		// Resolve the name to see if it exists, and follow any
+		// symlinks along the way. We must do the symlink resolution
+		// manually because if the symlink target does not exist, we
+		// must create the target (and not the symlink itself).
+		var (
+			found *fs.Dirent
+			err   error
+		)
+		for {
+			if !fs.IsDir(parent.Inode.StableAttr) {
+				return syserror.ENOTDIR
+			}
 
-		fileFlags := linuxToFlags(flags)
-		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
-		fileFlags.LargeFile = true
+			// Start by looking up the dirent at 'name'.
+			found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals)
+			if err != nil {
+				break
+			}
+
+			// We found something (possibly a symlink). If the
+			// O_EXCL flag was passed, then we can immediately
+			// return EEXIST.
+			if flags&linux.O_EXCL != 0 {
+				return syserror.EEXIST
+			}
+
+			// If we have a non-symlink, then we can proceed.
+			if !fs.IsSymlink(found.Inode.StableAttr) {
+				break
+			}
+
+			// If O_NOFOLLOW was passed, then don't try to resolve
+			// anything.
+			if flags&linux.O_NOFOLLOW != 0 {
+				return syserror.ELOOP
+			}
+
+			// Try to resolve the symlink directly to a Dirent.
+			resolved, err := found.Inode.Getlink(t)
+			if err == nil || err != fs.ErrResolveViaReadlink {
+				// No more resolution necessary.
+				found.DecRef()
+				found = resolved
+				break
+			}
+
+			// Resolve the symlink to a path via Readlink.
+			path, err := found.Inode.Readlink(t)
+			if err != nil {
+				break
+			}
+			remainingTraversals--
+
+			// Get the new parent from the target path.
+			newParentPath, newName := fs.SplitLast(path)
+			newParent, err := t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
+			if err != nil {
+				break
+			}
+
+			// Repeat the process with the parent and name of the
+			// symlink target.
+			parent.DecRef()
+			parent = newParent
+			name = newName
+		}
 
-		// Does this file exist already?
-		targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
 		var newFile *fs.File
 		switch err {
 		case nil:
 			// The file existed.
-			defer targetDirent.DecRef()
-
-			// Check if we wanted to create.
-			if flags&linux.O_EXCL != 0 {
-				return syserror.EEXIST
-			}
+			defer found.DecRef()
 
 			// Like sys_open, check for a few things about the
 			// filesystem before trying to get a reference to the
 			// fs.File. The same constraints on Check apply.
-			if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
+			if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
 				return err
 			}
 
 			// Should we truncate the file?
 			if flags&linux.O_TRUNC != 0 {
-				if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil {
+				if err := found.Inode.Truncate(t, found, 0); err != nil {
 					return err
 				}
 			}
 
 			// Create a new fs.File.
-			newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags)
+			newFile, err = found.Inode.GetFile(t, found, fileFlags)
 			if err != nil {
 				return syserror.ConvertIntr(err, kernel.ERESTARTSYS)
 			}
@@ -350,19 +406,19 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
 			// File does not exist. Proceed with creation.
 
 			// Do we have write permissions on the parent?
-			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
+			if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
 				return err
 			}
 
 			// Attempt a creation.
 			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
-			newFile, err = d.Create(t, root, name, fileFlags, perms)
+			newFile, err = parent.Create(t, root, name, fileFlags, perms)
 			if err != nil {
 				// No luck, bail.
 				return err
 			}
 			defer newFile.DecRef()
-			targetDirent = newFile.Dirent
+			found = newFile.Dirent
 		default:
 			return err
 		}
@@ -378,10 +434,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod
 		fd = uintptr(newFD)
 
 		// Queue the open inotify event. The creation event is
-		// automatically queued when the dirent is targetDirent. The
-		// open events are implemented at the syscall layer so we need
-		// to manually queue one here.
-		targetDirent.InotifyEvent(linux.IN_OPEN, 0)
+		// automatically queued when the dirent is created. The open
+		// events are implemented at the syscall layer so we need to
+		// manually queue one here.
+		found.InotifyEvent(linux.IN_OPEN, 0)
 
 		return nil
 	})