Check in gVisor.

PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
author: Googler <noreply@google.com> 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com> 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/fs/host
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
19 files changed, 3580 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
new file mode 100644
index 000000000..97b64daed
--- /dev/null
+++ b/pkg/sentry/fs/host/BUILD
@@ -0,0 +1,104 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+# host_state runs the go_stateify rule over the listed sources and writes the
+# result to host_state.go in package "host". The generated file is one of the
+# srcs of the :host library.
+go_stateify(
+    name = "host_state",
+    srcs = [
+        "control.go",
+        "descriptor.go",
+        "descriptor_state.go",
+        "file.go",
+        "fs.go",
+        "inode.go",
+        "inode_state.go",
+        "socket.go",
+        "socket_state.go",
+    ],
+    out = "host_state.go",
+    package = "host",
+)
+
+# host is the Go library for this package. Its srcs are the hand-written
+# sources plus the generated host_state.go; it is visible only to
+# //pkg/sentry:internal.
+go_library(
+    name = "host",
+    srcs = [
+        "control.go",
+        "descriptor.go",
+        "descriptor_state.go",
+        "device.go",
+        "file.go",
+        "fs.go",
+        "host_state.go",
+        "inode.go",
+        "inode_state.go",
+        "ioctl_unsafe.go",
+        "socket.go",
+        "socket_state.go",
+        "socket_unsafe.go",
+        "util.go",
+        "util_unsafe.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/amutex",
+        "//pkg/fd",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/secio",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/context",
+        "//pkg/sentry/device",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/usermem",
+        "//pkg/state",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/transport/unix",
+        "//pkg/unet",
+        "//pkg/waiter",
+        "//pkg/waiter/fdnotifier",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# host_test is the unit test target for this package. It embeds :host, so the
+# test sources are built together with the library sources.
+go_test(
+    name = "host_test",
+    size = "small",
+    srcs = [
+        "fs_test.go",
+        "inode_test.go",
+        "socket_test.go",
+        "wait_test.go",
+    ],
+    embed = [":host"],
+    deps = [
+        "//pkg/fd",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/socket",
+        "//pkg/sentry/usermem",
+        "//pkg/syserr",
+        "//pkg/tcpip",
+        "//pkg/tcpip/transport/unix",
+        "//pkg/waiter",
+        "//pkg/waiter/fdnotifier",
+    ],
+)
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
new file mode 100644
index 000000000..d2b007ab2
--- /dev/null
+++ b/pkg/sentry/fs/host/control.go
@@ -0,0 +1,90 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+)
+
+// scmRights implements control.SCMRights on top of a list of host file
+// descriptors.
+type scmRights struct {
+	// fds are the host file descriptors that Files has not yet converted.
+	// Release closes whatever is still in this slice.
+	fds []int
+}
+
+// newSCMRights wraps the given host file descriptors in a control.SCMRights.
+// The slice is stored as-is, not copied.
+func newSCMRights(fds []int) control.SCMRights {
+	return &scmRights{fds: fds}
+}
+
+// Files implements control.SCMRights.Files.
+//
+// At most max of the pending host FDs are handed to fdsToFiles. The FDs that
+// were actually converted are removed from c; the rest stay pending.
+func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles {
+	limit := len(c.fds)
+	if max < limit {
+		limit = max
+	}
+
+	converted := fdsToFiles(ctx, c.fds[:limit])
+
+	// fdsToFiles may stop early, so drop only as many FDs as it converted.
+	c.fds = c.fds[len(converted):]
+	return control.RightsFiles(converted)
+}
+
+// Clone implements unix.RightsControlMessage.Clone.
+//
+// It always returns nil.
+func (c *scmRights) Clone() unix.RightsControlMessage {
+	// Host rights never need to be cloned.
+	return nil
+}
+
+// Release implements unix.RightsControlMessage.Release.
+//
+// Every host FD still pending in c is closed (errors from close are ignored)
+// and the pending list is cleared.
+func (c *scmRights) Release() {
+	pending := c.fds
+	c.fds = nil
+	for i := range pending {
+		syscall.Close(pending[i])
+	}
+}
+
+// fdsToFiles converts host FDs into fs.Files, in order.
+//
+// Conversion stops at the first FD that fails; only the files created before
+// the failure are returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
+	converted := make([]*fs.File, 0, len(fds))
+	for _, hostFD := range fds {
+		// Read the flags before anything else: the calls below may
+		// modify them.
+		flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(hostFD), syscall.F_GETFL, 0)
+		if errno != 0 {
+			ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+			return converted
+		}
+
+		// Wrap the host FD in a file.
+		f, err := NewFile(ctx, hostFD, fs.FileOwnerFromContext(ctx))
+		if err != nil {
+			ctx.Warningf("Error creating file from host FD: %v", err)
+			return converted
+		}
+
+		// Carry over the flags we know how to represent.
+		nonBlocking := flags&syscall.O_NONBLOCK != 0
+		f.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+
+		converted = append(converted, f)
+	}
+	return converted
+}
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
new file mode 100644
index 000000000..613bd06e8
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -0,0 +1,118 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"path"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+	"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// descriptor wraps a host fd.
+type descriptor struct {
+	// donated is true if the host fd was donated by another process.
+	donated bool
+
+	// If origFD >= 0, it is the host fd that this file was
+	// originally created from, which must be available at time
+	// of restore. Only valid if donated is true.
+	//
+	// newDescriptor sets it to the caller's fd when saveable is true and
+	// to -1 otherwise.
+	origFD int
+
+	// wouldBlock is true if value (below) points to a file that can
+	// return EWOULDBLOCK for operations that would block.
+	wouldBlock bool
+
+	// value is the wrapped host fd. It is never saved or restored
+	// directly. How it is restored depends on whether it was
+	// donated and the fs.MountSource it was originally
+	// opened/created from.
+	//
+	// It is -1 after afterLoad (until initAfterLoad runs) and after
+	// Release.
+	value int `state:"nosave"`
+}
+
+// newDescriptor returns a wrapped host file descriptor. On success, if
+// wouldBlock is true the descriptor is made non-blocking and registered for
+// event notifications with queue.
+//
+// If saveable is true, the descriptor wraps a dup of fd and fd itself is
+// remembered as origFD; otherwise fd is wrapped directly and origFD is -1.
+//
+// On error nothing is retained: a dup made here is closed again, and fd
+// itself is left open and still owned by the caller.
+func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
+	ownedFD := fd
+	origFD := -1
+	if saveable {
+		var err error
+		ownedFD, err = syscall.Dup(fd)
+		if err != nil {
+			return nil, err
+		}
+		origFD = fd
+	}
+
+	// closeDup undoes the Dup above on a failure path. Without it the
+	// duplicate would leak, since the caller never learns its number. The
+	// close error is ignored: the original failure is what gets reported.
+	closeDup := func() {
+		if saveable {
+			syscall.Close(ownedFD)
+		}
+	}
+
+	if wouldBlock {
+		if err := syscall.SetNonblock(ownedFD, true); err != nil {
+			closeDup()
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil {
+			closeDup()
+			return nil, err
+		}
+	}
+	return &descriptor{
+		donated:    donated,
+		origFD:     origFD,
+		wouldBlock: wouldBlock,
+		value:      ownedFD,
+	}, nil
+}
+
+// initAfterLoad initializes the value of the descriptor after Load.
+//
+// A donated descriptor is re-created by dup'ing origFD. Any other descriptor
+// is re-opened at the path that mo.inodeMappings records for inode id,
+// joined onto mo.root. If wouldBlock is set, the new fd is then made
+// non-blocking and registered with queue.
+func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
+	var err error
+	if !d.donated {
+		name, found := mo.inodeMappings[id]
+		if !found {
+			return fmt.Errorf("failed to find path for inode number %d", id)
+		}
+		fullpath := path.Join(mo.root, name)
+		if d.value, err = open(nil, fullpath); err != nil {
+			return fmt.Errorf("failed to open %q: %v", fullpath, err)
+		}
+	} else if d.value, err = syscall.Dup(d.origFD); err != nil {
+		return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
+	}
+
+	if !d.wouldBlock {
+		return nil
+	}
+	if err = syscall.SetNonblock(d.value, true); err != nil {
+		return err
+	}
+	return fdnotifier.AddFD(int32(d.value), queue)
+}
+
+// Release releases all resources held by descriptor: the fd is removed from
+// fdnotifier if it was registered there (wouldBlock), then closed. A close
+// failure is only logged. Afterwards value is -1.
+func (d *descriptor) Release() {
+	fd := d.value
+	if d.wouldBlock {
+		fdnotifier.RemoveFD(int32(fd))
+	}
+	if err := syscall.Close(fd); err != nil {
+		log.Warningf("error closing fd %d: %v", fd, err)
+	}
+	d.value = -1
+}
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
new file mode 100644
index 000000000..7fb274451
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+// beforeSave is invoked by stateify.
+//
+// It panics for a donated descriptor that has no original fd recorded
+// (origFD < 0).
+func (d *descriptor) beforeSave() {
+	if !d.donated || d.origFD >= 0 {
+		return
+	}
+	panic("donated file descriptor cannot be saved")
+}
+
+// afterLoad is invoked by stateify.
+//
+// It only marks value as unset (-1); it does not open any host fd.
+func (d *descriptor) afterLoad() {
+	// value must be manually restored by the descriptor's parent using
+	// initAfterLoad.
+	d.value = -1
+}
diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go
new file mode 100644
index 000000000..f2a0b6b15
--- /dev/null
+++ b/pkg/sentry/fs/host/device.go
@@ -0,0 +1,25 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+)
+
+// Virtual devices used by the host file system.
+var (
+	// hostFileDevice is the host file virtual device.
+	hostFileDevice = device.NewAnonMultiDevice()
+
+	// hostPipeDevice is the host pipe virtual device.
+	hostPipeDevice = device.NewAnonDevice()
+)
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
new file mode 100644
index 000000000..bdf844337
--- /dev/null
+++ b/pkg/sentry/fs/host/file.go
@@ -0,0 +1,371 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/secio"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+	"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// fileOperations implements fs.FileOperations for a host file descriptor.
+type fileOperations struct {
+	fsutil.NoopRelease `state:"nosave"`
+
+	// iops are the Inode operations for this file.
+	iops *inodeOperations `state:"wait"`
+
+	// dirinfo is a scratch buffer for reading directory entries. It is
+	// allocated lazily by IterateDir and is not saved.
+	dirinfo *dirInfo `state:"nosave"`
+
+	// dirCursor is the directory cursor.
+	dirCursor string
+
+	// allowIoctl determines whether ioctls should be passed through to the
+	// host.
+	allowIoctl bool
+}
+
+// Compile-time check that fileOperations implements fs.FileOperations.
+var _ fs.FileOperations = (*fileOperations)(nil)
+
+// NewFile creates a new File backed by the provided host file descriptor. If
+// NewFile succeeds, ownership of the fd is transferred to the returned File.
+//
+// The returned File cannot be saved, since there is no guarantee that the same
+// fd will exist or represent the same file at time of restore. If such a
+// guarantee does exist, use ImportFile instead.
+//
+// Ioctls are never passed through to the host for files created this way.
+func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, mounter, false, false)
+}
+
+// ImportFile creates a new File backed by the provided host file descriptor.
+// Unlike NewFile, the file descriptor used by the File is duped from fd to
+// ensure that later changes to fd are not reflected by the fs.File.
+//
+// If the returned file is saved, it will be restored by re-importing the fd
+// originally passed to ImportFile. It is the restorer's responsibility to
+// ensure that the fd represents the same file.
+//
+// allowIoctl controls whether ioctls on the returned File are passed through
+// to the host.
+func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl)
+}
+
+// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is
+// saveable, then saveable is true.
+//
+// Sockets (S_IFSOCK) are wrapped with newSocket and inherit only the
+// non-blocking flag. Every other file type gets a fresh mount source and
+// inode, and a dirent named "host:[<inode id>]".
+func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) {
+	var stat syscall.Stat_t
+	if err := syscall.Fstat(donated, &stat); err != nil {
+		return nil, err
+	}
+
+	// Both the socket and the non-socket paths begin by reading the flags.
+	flags, err := fileFlagsFromDonatedFD(donated)
+	if err != nil {
+		return nil, err
+	}
+
+	if stat.Mode&syscall.S_IFMT == syscall.S_IFSOCK {
+		sock, err := newSocket(ctx, donated, saveable)
+		if err != nil {
+			return nil, err
+		}
+		sock.SetFlags(fs.SettableFileFlags{
+			NonBlocking: flags.NonBlocking,
+		})
+		return sock, nil
+	}
+
+	msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
+	inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */)
+	if err != nil {
+		return nil, err
+	}
+	iops := inode.InodeOperations.(*inodeOperations)
+
+	dirent := fs.NewDirent(inode, fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID))
+	defer dirent.DecRef()
+
+	return newFile(ctx, dirent, flags, iops, allowIoctl), nil
+}
+
+func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) {
+	flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0)
+	if errno != 0 {
+		log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno)
+		return fs.FileFlags{}, syscall.EIO
+	}
+	accmode := flags & syscall.O_ACCMODE
+	return fs.FileFlags{
+		Direct:      flags&syscall.O_DIRECT != 0,
+		NonBlocking: flags&syscall.O_NONBLOCK != 0,
+		Sync:        flags&syscall.O_SYNC != 0,
+		Append:      flags&syscall.O_APPEND != 0,
+		Read:        accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR,
+		Write:       accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR,
+	}, nil
+}
+
+// newFile returns a new fs.File for dirent backed by iops.
+//
+// Pread and Pwrite are forced on in flags unless iops reports that the file
+// can return EWOULDBLOCK.
+func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File {
+	fops := &fileOperations{
+		iops:       iops,
+		allowIoctl: allowIoctl,
+	}
+	if !iops.ReturnsWouldBlock() {
+		// Files that never block support I/O at an arbitrary offset.
+		flags.Pread, flags.Pwrite = true, true
+	}
+	return fs.NewFile(ctx, dirent, flags, fops)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// The entry is added to the inode's wait queue, after which fdnotifier is
+// told to refresh its state for the host fd.
+func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	f.iops.fileState.queue.EventRegister(e, mask)
+	fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// The entry is removed from the inode's wait queue, after which fdnotifier is
+// told to refresh its state for the host fd.
+func (f *fileOperations) EventUnregister(e *waiter.Entry) {
+	f.iops.fileState.queue.EventUnregister(e)
+	fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// Readiness uses the poll() syscall to check the status of the underlying FD.
+// It returns the result of fdnotifier.NonBlockingPoll for the given mask.
+func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask)
+}
+
+// Readdir implements fs.FileOperations.Readdir.
+//
+// It delegates to fs.DirentReaddir, passing f as the directory iterator and
+// f.dirCursor as the cursor.
+func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+	root := fs.RootFromContext(ctx)
+	defer root.DecRef()
+
+	return fs.DirentReaddir(ctx, file.Dirent, f, root, &fs.DirCtx{
+		Serializer: serializer,
+		DirCursor:  &f.dirCursor,
+	}, file.Offset())
+}
+
+// IterateDir implements fs.DirIterator.IterateDir.
+//
+// All entries are read from the host directory on every call and handed to
+// fs.GenericReaddir. The return value is offset plus the count reported by
+// fs.GenericReaddir.
+func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+	// Allocate the scratch buffer on first use.
+	if f.dirinfo == nil {
+		f.dirinfo = &dirInfo{buf: make([]byte, usermem.PageSize)}
+	}
+
+	entries, err := f.iops.readdirAll(f.dirinfo)
+	if err != nil {
+		return offset, err
+	}
+
+	n, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries))
+	return offset + n, err
+}
+
+// Write implements fs.FileOperations.Write.
+//
+// There are three paths: files that may return EWOULDBLOCK are written
+// through the host fd with the offset ignored; other files are written at
+// offset through the host fd unless the mount forces the page cache, in which
+// case the write goes through cachingInodeOps.
+func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	if !f.iops.ReturnsWouldBlock() {
+		if file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+			return f.iops.cachingInodeOps.Write(ctx, src, offset)
+		}
+		w := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+		return src.CopyInTo(ctx, safemem.FromIOWriter{w})
+	}
+
+	// Files that can block can't be memory mapped, assert this. It also
+	// means that writes need not synchronize with memory mappings nor with
+	// metadata cached by this file's fs.Inode.
+	if canMap(file.Dirent.Inode) {
+		panic("files that can return EWOULDBLOCK cannot be memory mapped")
+	}
+
+	// The offset is ignored: these files don't support writing at an
+	// arbitrary offset.
+	w := fd.NewReadWriter(f.iops.fileState.FD())
+	n, err := src.CopyInTo(ctx, safemem.FromIOWriter{w})
+	if isBlockError(err) {
+		err = syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// Read implements fs.FileOperations.Read.
+//
+// There are three paths: files that may return EWOULDBLOCK are read through
+// the host fd with the offset ignored; other files are read at offset through
+// the host fd unless the mount forces the page cache, in which case the read
+// goes through cachingInodeOps.
+func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	if !f.iops.ReturnsWouldBlock() {
+		if file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+			return f.iops.cachingInodeOps.Read(ctx, file, dst, offset)
+		}
+		r := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+		return dst.CopyOutFrom(ctx, safemem.FromIOReader{r})
+	}
+
+	// Files that can block can't be memory mapped, assert this. It also
+	// means that reads need not synchronize with memory mappings nor with
+	// metadata cached by this file's fs.Inode.
+	if canMap(file.Dirent.Inode) {
+		panic("files that can return EWOULDBLOCK cannot be memory mapped")
+	}
+
+	// The offset is ignored: these files don't support reading at an
+	// arbitrary offset.
+	r := fd.NewReadWriter(f.iops.fileState.FD())
+	n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{r})
+	if isBlockError(err) {
+		// Any data at all counts as a "completed" partial read instead
+		// of being retried until complete.
+		err = syserror.ErrWouldBlock
+		if n != 0 {
+			err = nil
+		}
+	}
+	return n, err
+}
+
+// Fsync implements fs.FileOperations.Fsync.
+//
+// For SyncAll and SyncData the inode is written out first; every valid sync
+// type then ends with fsync(2) on the host fd. Any other sync type panics.
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+	switch syncType {
+	case fs.SyncAll, fs.SyncData, fs.SyncBackingStorage:
+	default:
+		panic("invalid sync type")
+	}
+
+	if syncType != fs.SyncBackingStorage {
+		if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
+			return err
+		}
+	}
+	return syscall.Fsync(f.iops.fileState.FD())
+}
+
+// Flush implements fs.FileOperations.Flush. It always returns nil.
+func (f *fileOperations) Flush(context.Context, *fs.File) error {
+	// This is a no-op because flushing the resource backing this
+	// file would mean closing it. We can't do that because other
+	// open files may depend on the backing host fd.
+	return nil
+}
+
+// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
+//
+// It returns ENODEV for inodes that cannot be mapped.
+func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
+	if canMap(file.Dirent.Inode) {
+		return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts)
+	}
+	return syserror.ENODEV
+}
+
+// Seek implements fs.FileOperations.Seek by delegating to
+// fsutil.SeekWithDirCursor with this file's directory cursor.
+func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+	return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor)
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+//
+// Unless allowIoctl is set, every request fails with ENOTTY. Otherwise a
+// fixed set of terminal requests is handled (TCGETS, TCSETS, TCSETSW,
+// TIOCGPGRP, TIOCSPGRP, TIOCGWINSZ, TIOCSWINSZ) and anything else fails with
+// ENOTTY. The request number is taken from args[1] and the user pointer from
+// args[2].
+func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	if !f.allowIoctl {
+		return 0, syserror.ENOTTY
+	}
+	// Ignore arg[0].  This is the real FD:
+	fd := f.iops.fileState.FD()
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case unix.TCGETS:
+		// Read the termios from the host fd and copy it out to the
+		// application.
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case unix.TCSETS, unix.TCSETSW:
+		// Copy the termios in from the application and apply it to the
+		// host fd using the same request number.
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		return 0, err
+
+	case unix.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on
+		// this terminal.
+		//
+		// NOTE: the host terminal is not queried here; the value copied
+		// out is the calling task's thread ID.
+
+		t := kernel.TaskFromContext(ctx)
+		if t == nil {
+			panic(fmt.Sprintf("cannot get thread group from context %v", ctx))
+		}
+		tid := t.ThreadID()
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case unix.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		// Not much we can do with this one at the moment, so we just
+		// lie and pretend everything is great. Bash and Sh seem fine
+		// with this.
+		log.Warningf("Ignoring application ioctl(TIOCSPGRP) call")
+		return 0, nil
+
+	case unix.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case unix.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+		var winsize unix.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize)
+		return 0, err
+
+	default:
+		// Every other request is refused.
+		return 0, syserror.ENOTTY
+	}
+}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
new file mode 100644
index 000000000..ffd55a5ab
--- /dev/null
+++ b/pkg/sentry/fs/host/fs.go
@@ -0,0 +1,327 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host implements an fs.Filesystem for files backed by host
+// file descriptors.
+package host
+
+import (
+	"fmt"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FilesystemName is the name under which Filesystem is registered.
+const FilesystemName = "whitelistfs"
+
+const (
+	// whitelistKey is the mount option containing a "|"-separated list
+	// of host paths to whitelist. ("," already separates the mount
+	// options themselves.)
+	whitelistKey = "whitelist"
+
+	// rootPathKey is the mount option containing the root path of the
+	// mount.
+	rootPathKey = "root"
+
+	// dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
+	// Its value is parsed with strconv.ParseBool.
+	dontTranslateOwnershipKey = "dont_translate_ownership"
+)
+
+// maxTraversals determines link traversals in building the whitelist.
+const maxTraversals = 10
+
+// Filesystem is a pseudo file system that is only available during the setup
+// to lock down the configurations. This filesystem should only be mounted at root.
+//
+// Think twice before exposing this to applications.
+type Filesystem struct {
+	// paths is the set of host paths to whitelist. It is populated by
+	// Mount from the whitelist mount option.
+	paths []string
+}
+
+// Name is the identifier of this file system. It returns FilesystemName.
+func (*Filesystem) Name() string {
+	return FilesystemName
+}
+
+// AllowUserMount prohibits users from using mount(2) with this file system.
+// It always returns false.
+func (*Filesystem) AllowUserMount() bool {
+	return false
+}
+
+// Flags returns that there is nothing special about this file system: no
+// fs.FilesystemFlags are set.
+func (*Filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns an fs.Inode exposing the host file system.  It is intended to be locked
+// down in PreExec below.
+//
+// data holds comma-separated key=value options. Recognized keys are:
+//
+//   - whitelist: "|"-separated host paths to whitelist, stored in f.
+//   - root: the host path to use as the root of the mount (default "/").
+//     When set, every whitelisted path must be at or below it and is
+//     rewritten to be relative to it.
+//   - dont_translate_ownership: a boolean parsed with strconv.ParseBool.
+//
+// Any other option is an error. All options are validated before the root is
+// opened, so a rejected mount does not leave a host fd open, and f is only
+// updated with rewritten whitelist paths once all of them are valid.
+func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+	// Parse generic comma-separated key=value options.
+	options := fs.GenericMountSourceOptions(data)
+
+	// Grab the whitelist if one was specified.
+	// TODO: require another option "testonly" in order to allow
+	// no whitelist.
+	if wl, ok := options[whitelistKey]; ok {
+		f.paths = strings.Split(wl, "|")
+		delete(options, whitelistKey)
+	}
+
+	// If the rootPath was set, use it. Otherwise default to the root of the
+	// host fs.
+	rootPath := "/"
+	if rp, ok := options[rootPathKey]; ok {
+		rootPath = rp
+		delete(options, rootPathKey)
+
+		// We must relativize the whitelisted paths to the new root.
+		// filepath.Rel happily returns "../x" for a path outside the
+		// root, which path.Join("/", ...) would then silently turn into
+		// "/x", so such results are rejected explicitly.
+		parentPrefix := ".." + string(filepath.Separator)
+		relativized := make([]string, 0, len(f.paths))
+		for _, p := range f.paths {
+			rel, err := filepath.Rel(rootPath, p)
+			if err != nil || rel == ".." || strings.HasPrefix(rel, parentPrefix) {
+				return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
+			}
+			relativized = append(relativized, path.Join("/", rel))
+		}
+		f.paths = relativized
+	}
+
+	var dontTranslateOwnership bool
+	if v, ok := options[dontTranslateOwnershipKey]; ok {
+		b, err := strconv.ParseBool(v)
+		if err != nil {
+			return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
+		}
+		dontTranslateOwnership = b
+		delete(options, dontTranslateOwnershipKey)
+	}
+
+	// Fail if the caller passed us more options than we know about.
+	if len(options) > 0 {
+		return nil, fmt.Errorf("unsupported mount options: %v", options)
+	}
+
+	// Open the root last: every failure above returns before a host fd
+	// exists.
+	fd, err := open(nil, rootPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to find root: %v", err)
+	}
+
+	// The mounting EUID/EGID will be cached by this file system. This will
+	// be used to assign ownership to files that we own.
+	owner := fs.FileOwnerFromContext(ctx)
+
+	// Construct the host file system mount and inode.
+	msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
+	return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
+}
+
+// InstallWhitelist locks down the MountNamespace to only the currently installed
+// Dirents and the given paths. The paths are the ones recorded in f by Mount.
+func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
+	return installWhitelist(ctx, m, f.paths)
+}
+
+// installWhitelist looks up every whitelisted path, and every intermediate
+// prefix of it, in m and then freezes m.
+//
+// paths is a work list that grows during the walk: a whitelisted directory
+// contributes its children and a whitelisted symlink contributes its target.
+// Lookups and readlinks that fail are logged and skipped. The only error
+// returned is for a path that is not absolute.
+//
+// An empty whitelist (no paths, or a single empty path) returns nil without
+// freezing m.
+func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
+	if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
+		// Warning will be logged during filter installation if the empty
+		// whitelist matters (allows for host file access).
+		return nil
+	}
+
+	// Done tracks entries already added.
+	done := make(map[string]bool)
+	root := m.Root()
+	defer root.DecRef()
+
+	// Index-based loop: paths is appended to inside the body.
+	for i := 0; i < len(paths); i++ {
+		// Make sure the path is absolute. This is a sanity check.
+		if !path.IsAbs(paths[i]) {
+			return fmt.Errorf("path %q is not absolute", paths[i])
+		}
+
+		// We need to add all the intermediate paths, in case one of
+		// them is a symlink that needs to be resolved.
+		for j := 1; j <= len(paths[i]); j++ {
+			// Only stop at component boundaries and at the end of
+			// the path.
+			if j < len(paths[i]) && paths[i][j] != '/' {
+				continue
+			}
+			current := paths[i][:j]
+
+			// Lookup the given component in the tree.
+			d, err := m.FindLink(ctx, root, nil, current, maxTraversals)
+			if err != nil {
+				log.Warningf("populate failed for %q: %v", current, err)
+				continue
+			}
+
+			// It's critical that this DecRef happens after the
+			// freeze below. This ensures that the dentry is in
+			// place to be frozen. Otherwise, we freeze without
+			// these entries.
+			defer d.DecRef()
+
+			// Expand the last component if necessary.
+			if current == paths[i] {
+				// Is it a directory or symlink?
+				sattr := d.Inode.StableAttr
+				if fs.IsDir(sattr) {
+					for name := range childDentAttrs(ctx, d) {
+						paths = append(paths, path.Join(current, name))
+					}
+				}
+				if fs.IsSymlink(sattr) {
+					// Only expand symlinks once. The
+					// folder structure may contain
+					// recursive symlinks and we don't want
+					// to end up infinitely expanding this
+					// symlink. This is safe because this
+					// is the last component. If a later
+					// path wants to symlink something
+					// beneath this symlink that will still
+					// be handled by the FindLink above.
+					if done[current] {
+						continue
+					}
+
+					s, err := d.Inode.Readlink(ctx)
+					if err != nil {
+						log.Warningf("readlink failed for %q: %v", current, err)
+						continue
+					}
+					// Relative targets are resolved against
+					// the directory containing the symlink.
+					if path.IsAbs(s) {
+						paths = append(paths, s)
+					} else {
+						target := path.Join(path.Dir(current), s)
+						paths = append(paths, target)
+					}
+				}
+			}
+
+			// Only report this one once even though we may look
+			// it up more than once. If we whitelist /a/b,/a then
+			// /a will be "done" when it is looked up for /a/b,
+			// however we still need to expand all of its contents
+			// when whitelisting /a.
+			if !done[current] {
+				log.Debugf("whitelisted: %s", current)
+			}
+			done[current] = true
+		}
+	}
+
+	// Freeze the mount tree in place. This prevents any new paths from
+	// being opened and any old ones from being removed. If we do provide
+	// tmpfs mounts, we'll want to freeze/thaw those separately.
+	m.Freeze()
+	return nil
+}
+
+// childDentAttrs opens the directory d for reading and returns its entries
+// keyed by name, with "." and ".." removed.
+//
+// Failures to open or iterate the directory are logged and reported as a nil
+// map, so callers can range over the result unconditionally.
+func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
+	// The name is only used in log messages, so the second result of
+	// FullName is deliberately ignored.
+	dirname, _ := d.FullName(nil /* root */)
+	dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+	if err != nil {
+		log.Warningf("failed to open directory %q: %v", dirname, err)
+		return nil
+	}
+	// Hold the file reference until Readdir has finished; dropping it
+	// before the call would use the file after its reference is released.
+	defer dir.DecRef()
+
+	var stubSerializer fs.CollectEntriesSerializer
+	if err := dir.Readdir(ctx, &stubSerializer); err != nil {
+		log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
+		return nil
+	}
+	delete(stubSerializer.Entries, ".")
+	delete(stubSerializer.Entries, "..")
+	return stubSerializer.Entries
+}
+
+// newMountSource builds a host fs.MountSource whose inode mappings are
+// relative to root. The root should match the mount point.
+//
+// The returned source starts with an empty inode mapping table and records
+// the mounter and the dontTranslateOwnership option. ctx is currently unused.
+func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
+	ops := &superOperations{
+		root:                   root,
+		inodeMappings:          make(map[uint64]string),
+		mounter:                mounter,
+		dontTranslateOwnership: dontTranslateOwnership,
+	}
+	return fs.NewMountSource(ops, filesystem, flags)
+}
+
+// superOperations implements fs.MountSourceOperations for the host file
+// system. It carries the per-mount state shared by all inodes of the mount.
+type superOperations struct {
+	fs.SimpleMountSourceOperations `state:"nosave"`
+
+	// root is the path of the mount point. All inode mappings
+	// are relative to this root.
+	root string
+
+	// inodeMappings contains mappings of fs.Inodes associated
+	// with this MountSource to paths under root. It is keyed by the
+	// InodeID of the inode's stable attributes.
+	inodeMappings map[uint64]string
+
+	// mounter is the cached EUID/EGID that mounted this file system.
+	mounter fs.FileOwner
+
+	// dontTranslateOwnership indicates whether to not translate file
+	// ownership.
+	//
+	// By default, files/directories owned by the sandbox use the UID/GID
+	// of the mounter. For files/directories that are not owned by the
+	// sandbox, file UID/GID is translated to a UID/GID which cannot
+	// be mapped in the sandboxed application's user namespace. The
+	// UID/GID will look like the nobody UID/GID (65534) but is not
+	// strictly owned by the user "nobody".
+	//
+	// If whitelistfs is a lower filesystem in an overlay, set
+	// dont_translate_ownership=true in mount options.
+	dontTranslateOwnership bool
+}
+
+// Compile-time check that *superOperations satisfies fs.MountSourceOperations.
+var _ fs.MountSourceOperations = (*superOperations)(nil)
+
+// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
+//
+// It discards all recorded inode-to-path mappings by installing a fresh,
+// empty map.
+func (m *superOperations) ResetInodeMappings() {
+	m.inodeMappings = make(map[uint64]string)
+}
+
+// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
+//
+// It records path under the InodeID held in the host inodeOperations' own
+// file state. The inode must be backed by *inodeOperations; any other
+// implementation makes the type assertion panic.
+func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
+	// The inode's own StableAttr is deliberately not consulted: overlay
+	// copyUp may have replaced it, so only the stable attributes captured
+	// in the host file state are trusted here.
+	iops := inode.InodeOperations.(*inodeOperations)
+	id := iops.fileState.sattr.InodeID
+	m.inodeMappings[id] = path
+}
+
+// Keep implements fs.MountSourceOperations.Keep.
+//
+// It always returns false, for every Dirent.
+//
+// TODO: It is possible to change the permissions on a
+// host file while it is in the dirent cache (say from RO to RW), but it is not
+// possible to re-open the file with more relaxed permissions, since the host
+// FD is already open and stored in the inode.
+//
+// Using the dirent LRU cache increases the odds that this bug is encountered.
+// Since host file access is relatively fast anyway, we disable the LRU cache
+// for host fs files.  Once we can properly deal with permissions changes and
+// re-opening host files, we should revisit whether or not to make use of the
+// LRU cache.
+func (*superOperations) Keep(*fs.Dirent) bool {
+	return false
+}
+
+// init registers a zero-value host Filesystem with the fs package at
+// package load time.
+func init() {
+	fs.RegisterFilesystem(&Filesystem{})
+}
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go
new file mode 100644
index 000000000..c000afc49
--- /dev/null
+++ b/pkg/sentry/fs/host/fs_test.go
@@ -0,0 +1,383 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"reflect"
+	"sort"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// newTestMountNamespace creates a MountNamespace whose root is a host fs
+// inode backed by a freshly created temporary directory.
+//
+// It returns the namespace and the host directory path; the caller should
+// remove the directory when done. On any failure the directory is removed
+// here and a nil namespace and empty path are returned with the error.
+func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) {
+	dir, err := ioutil.TempDir("", "root")
+	if err != nil {
+		return nil, "", err
+	}
+
+	// fail removes the temporary directory and reports cause.
+	fail := func(cause error) (*fs.MountNamespace, string, error) {
+		os.RemoveAll(dir)
+		return nil, "", cause
+	}
+
+	hostFD, err := open(nil, dir)
+	if err != nil {
+		return fail(err)
+	}
+
+	ctx := contexttest.Context(t)
+	msrc := newMountSource(ctx, dir, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false)
+	rootInode, err := newInode(ctx, msrc, hostFD, false, false)
+	if err != nil {
+		return fail(err)
+	}
+
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		return fail(err)
+	}
+	return mns, dir, nil
+}
+
+// createTestDirs populates the root of m with the following tree:
+//
+//	/a/a1.txt
+//	/a/a2.txt
+//	/b/b1.txt
+//	/b/c/c1.txt
+//	/symlinks/normal.txt
+//	/symlinks/to_normal.txt -> /symlinks/normal.txt
+//	/symlinks/recursive -> /symlinks
+//
+// It stops at, and returns, the first error encountered.
+func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error {
+	r := m.Root()
+	defer r.DecRef()
+
+	// mkdir creates the directory name under parent and walks to it. The
+	// caller owns the returned reference.
+	mkdir := func(parent *fs.Dirent, name string) (*fs.Dirent, error) {
+		if err := parent.CreateDirectory(ctx, r, name, fs.FilePermsFromMode(0777)); err != nil {
+			return nil, err
+		}
+		return parent.Walk(ctx, r, name)
+	}
+
+	// touch creates the regular file name under parent and immediately
+	// drops the reference on the opened file.
+	touch := func(parent *fs.Dirent, name string) error {
+		f, err := parent.Create(ctx, r, name, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+		if err != nil {
+			return err
+		}
+		f.DecRef()
+		return nil
+	}
+
+	// /a and its two files.
+	a, err := mkdir(r, "a")
+	if err != nil {
+		return err
+	}
+	defer a.DecRef()
+	for _, name := range []string{"a1.txt", "a2.txt"} {
+		if err := touch(a, name); err != nil {
+			return err
+		}
+	}
+
+	// /b, its file, and the nested /b/c.
+	b, err := mkdir(r, "b")
+	if err != nil {
+		return err
+	}
+	defer b.DecRef()
+	if err := touch(b, "b1.txt"); err != nil {
+		return err
+	}
+
+	c, err := mkdir(b, "c")
+	if err != nil {
+		return err
+	}
+	defer c.DecRef()
+	if err := touch(c, "c1.txt"); err != nil {
+		return err
+	}
+
+	// /symlinks with one regular file and two symlinks.
+	symlinks, err := mkdir(r, "symlinks")
+	if err != nil {
+		return err
+	}
+	defer symlinks.DecRef()
+	if err := touch(symlinks, "normal.txt"); err != nil {
+		return err
+	}
+
+	if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil {
+		return err
+	}
+	return symlinks.CreateLink(ctx, r, "/symlinks", "recursive")
+}
+
+// allPaths returns the paths of all entries visible in m beneath base,
+// recursing into directories. "." and ".." are never reported.
+//
+// base is resolved with at most one link traversal. If a lookup fails while
+// recursing, the paths collected so far are returned together with the
+// error; failures to open or iterate a directory return a nil slice.
+func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) {
+	var paths []string
+	root := m.Root()
+	defer root.DecRef()
+
+	d, err := m.FindLink(ctx, root, nil, base, 1)
+	if err != nil {
+		t.Logf("FindLink failed for %q", base)
+		return paths, err
+	}
+	defer d.DecRef()
+
+	if fs.IsDir(d.Inode.StableAttr) {
+		dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+		if err != nil {
+			return nil, fmt.Errorf("failed to open directory %q: %v", base, err)
+		}
+		// Release the opened directory on every return path; it was
+		// previously never released.
+		defer dir.DecRef()
+
+		iter, ok := dir.FileOperations.(fs.DirIterator)
+		if !ok {
+			return nil, fmt.Errorf("cannot directly iterate on host directory %q", base)
+		}
+		// The serializer discards entries; the names are read back from
+		// the DirCtx afterwards.
+		dirCtx := &fs.DirCtx{
+			Serializer: noopDentrySerializer{},
+		}
+		if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil {
+			return nil, err
+		}
+		for name := range dirCtx.DentAttrs() {
+			if name == "." || name == ".." {
+				continue
+			}
+
+			fullName := path.Join(base, name)
+			paths = append(paths, fullName)
+
+			// Recurse.
+			subpaths, err := allPaths(ctx, t, m, fullName)
+			if err != nil {
+				return paths, err
+			}
+			paths = append(paths, subpaths...)
+		}
+	}
+
+	return paths, nil
+}
+
+// noopDentrySerializer is a dentry serializer that discards every entry it
+// is given. allPaths uses it as the Serializer of an fs.DirCtx.
+type noopDentrySerializer struct{}
+
+// CopyOut accepts and ignores the entry, always reporting success.
+func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error {
+	return nil
+}
+
+// Written always reports a fixed 4096, regardless of how many entries were
+// passed to CopyOut.
+func (noopDentrySerializer) Written() int {
+	return 4096
+}
+
+// pathsEqual reports whether got and want hold the same strings, ignoring
+// order. Both slices are sorted in place as a side effect, even when their
+// lengths differ.
+func pathsEqual(got, want []string) bool {
+	sort.Strings(got)
+	sort.Strings(want)
+
+	if len(want) != len(got) {
+		return false
+	}
+	for idx, g := range got {
+		if want[idx] != g {
+			return false
+		}
+	}
+	return true
+}
+
+// TestWhitelist builds the tree from createTestDirs in a fresh host-backed
+// MountNamespace, installs each whitelist, and checks that exactly the
+// expected set of paths remains visible from "/".
+func TestWhitelist(t *testing.T) {
+	for _, test := range []struct {
+		// description of the test.
+		desc string
+		// paths are the paths to whitelist
+		paths []string
+		// want are all of the directory entries that should be
+		// visible (nothing beyond this set should be visible).
+		want []string
+	}{
+		{
+			desc:  "root",
+			paths: []string{"/"},
+			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"},
+		},
+		{
+			desc:  "top-level directories",
+			paths: []string{"/a", "/b"},
+			want:  []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+		},
+		{
+			desc:  "nested directories (1/2)",
+			paths: []string{"/b", "/b/c"},
+			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+		},
+		{
+			desc:  "nested directories (2/2)",
+			paths: []string{"/b/c", "/b"},
+			want:  []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+		},
+		{
+			desc:  "single file",
+			paths: []string{"/b/c/c1.txt"},
+			want:  []string{"/b", "/b/c", "/b/c/c1.txt"},
+		},
+		{
+			desc:  "single file and directory",
+			paths: []string{"/a/a1.txt", "/b/c"},
+			want:  []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"},
+		},
+		{
+			desc:  "symlink",
+			paths: []string{"/symlinks/to_normal.txt"},
+			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"},
+		},
+		{
+			desc:  "recursive symlink",
+			paths: []string{"/symlinks/recursive/normal.txt"},
+			want:  []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"},
+		},
+	} {
+		t.Run(test.desc, func(t *testing.T) {
+			// Setup failures are fatal: on error m is nil and the
+			// m.Root() call below would dereference it.
+			m, p, err := newTestMountNamespace(t)
+			if err != nil {
+				t.Fatalf("Failed to create MountNamespace: %v", err)
+			}
+			defer os.RemoveAll(p)
+
+			ctx := withRoot(contexttest.RootContext(t), m.Root())
+			if err := createTestDirs(ctx, t, m); err != nil {
+				t.Fatalf("Failed to create test dirs: %v", err)
+			}
+
+			if err := installWhitelist(ctx, m, test.paths); err != nil {
+				t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err)
+			}
+
+			got, err := allPaths(ctx, t, m, "/")
+			if err != nil {
+				t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err)
+			}
+
+			if !pathsEqual(got, test.want) {
+				t.Errorf("For paths %v got %v want %v", test.paths, got, test.want)
+			}
+		})
+	}
+}
+
+// TestRootPath mounts the host file system with a root path and a single
+// whitelisted file, installs the whitelist, and checks that reading the root
+// directory yields only that file plus "." and "..".
+func TestRootPath(t *testing.T) {
+	// Create a temp dir, which will be the root of our mounted fs.
+	rootPath, err := ioutil.TempDir(os.TempDir(), "root")
+	if err != nil {
+		t.Fatalf("TempDir failed: %v", err)
+	}
+	defer os.RemoveAll(rootPath)
+
+	// Create two files inside the new root, one which will be whitelisted
+	// and one not. Both handles are closed when the test ends; only the
+	// files' existence and names matter here.
+	whitelisted, err := ioutil.TempFile(rootPath, "white")
+	if err != nil {
+		t.Fatalf("TempFile failed: %v", err)
+	}
+	defer whitelisted.Close()
+	blacklisted, err := ioutil.TempFile(rootPath, "black")
+	if err != nil {
+		t.Fatalf("TempFile failed: %v", err)
+	}
+	defer blacklisted.Close()
+
+	// Create a mount with a root path and single whitelisted file.
+	hostFS := &Filesystem{}
+	ctx := contexttest.Context(t)
+	data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name())
+	inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data)
+	if err != nil {
+		t.Fatalf("Mount failed: %v", err)
+	}
+	mm, err := fs.NewMountNamespace(ctx, inode)
+	if err != nil {
+		t.Fatalf("NewMountNamespace failed: %v", err)
+	}
+	if err := hostFS.InstallWhitelist(ctx, mm); err != nil {
+		t.Fatalf("InstallWhitelist failed: %v", err)
+	}
+
+	// Get the contents of the root directory.
+	rootDir := mm.Root()
+	defer rootDir.DecRef()
+	rctx := withRoot(ctx, rootDir)
+	f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{})
+	if err != nil {
+		t.Fatalf("GetFile failed: %v", err)
+	}
+	defer f.DecRef()
+	c := &fs.CollectEntriesSerializer{}
+	if err := f.Readdir(rctx, c); err != nil {
+		t.Fatalf("Readdir failed: %v", err)
+	}
+
+	// We should have only our whitelisted file, plus the dots.
+	want := []string{path.Base(whitelisted.Name()), ".", ".."}
+	got := c.Order
+	sort.Strings(want)
+	sort.Strings(got)
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Readdir got %v, wanted %v", got, want)
+	}
+}
+
+// rootContext wraps a context.Context and additionally answers lookups of
+// fs.CtxRoot with a fixed root Dirent.
+type rootContext struct {
+	context.Context
+	// root is the Dirent returned for the fs.CtxRoot key.
+	root *fs.Dirent
+}
+
+// withRoot wraps ctx in a context that reports root for the fs.CtxRoot key
+// and defers every other lookup to ctx.
+func withRoot(ctx context.Context, root *fs.Dirent) context.Context {
+	wrapped := rootContext{
+		Context: ctx,
+		root:    root,
+	}
+	return &wrapped
+}
+
+// Value implements Context.Value.
+//
+// For fs.CtxRoot it takes an additional reference on the stored root and
+// returns it; all other keys are forwarded to the wrapped Context.
+func (rc rootContext) Value(key interface{}) interface{} {
+	if key != fs.CtxRoot {
+		return rc.Context.Value(key)
+	}
+	rc.root.IncRef()
+	return rc.root
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
new file mode 100644
index 000000000..226bc5164
--- /dev/null
+++ b/pkg/sentry/fs/host/inode.go
@@ -0,0 +1,506 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/secio"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// inodeOperations implements fs.InodeOperations for an fs.Inode backed
+// by a host file descriptor.
+type inodeOperations struct {
+	fsutil.InodeNotVirtual           `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+	fsutil.DeprecatedFileOperations  `state:"nosave"`
+
+	// fileState implements fs.CachedFileObject. It exists
+	// to break a circular load dependency between inodeOperations
+	// and cachingInodeOps (below).
+	fileState *inodeFileState `state:"wait"`
+
+	// cachingInodeOps implements memmap.Mappable.
+	cachingInodeOps *fsutil.CachingInodeOperations
+
+	// readdirMu protects the file offset on the host FD. This is needed
+	// for readdir because getdents must use the kernel offset, so
+	// concurrent readdirs must be exclusive.
+	//
+	// All read/write functions pass the offset directly to the kernel and
+	// thus don't need a lock.
+	readdirMu sync.Mutex `state:"nosave"`
+}
+
+// inodeFileState implements fs.CachedFileObject and otherwise fully
+// encapsulates state that needs to be manually loaded on restore for
+// this file object.
+//
+// This unfortunate structure exists because fs.CachingInodeOperations
+// defines afterLoad and therefore cannot be lazily loaded (to break a
+// circular load dependency between it and inodeOperations). Even with
+// lazy loading, this approach defines the dependencies between objects
+// and the expected load behavior more concretely.
+type inodeFileState struct {
+	// Common file system state.
+	mops *superOperations `state:"wait"`
+
+	// descriptor is the backing host fd.
+	descriptor *descriptor `state:"wait"`
+
+	// Event queue for blocking operations. It is not saved and must be
+	// empty when the state is saved (beforeSave panics otherwise).
+	queue waiter.Queue `state:"nosave"`
+
+	// sattr is used to restore the inodeOperations.
+	sattr fs.StableAttr `state:"wait"`
+
+	// savedUAttr is only allocated during S/R. It points to the save-time
+	// unstable attributes and is used to validate restore-time ones.
+	//
+	// Note that these unstable attributes are only used to detect cross-S/R
+	// external file system metadata changes. They may differ from the
+	// cached unstable attributes in cachingInodeOps, as that might differ
+	// from the external file system attributes if there had been WriteOut
+	// failures. S/R is transparent to Sentry and the latter will continue
+	// using its cached values after restore.
+	savedUAttr *fs.UnstableAttr
+}
+
+// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
+//
+// It reads from the host FD starting at offset into dsts and returns the
+// result of ReadToBlocks. ctx is unused.
+func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	// TODO: Using safemem.FromIOReader here is wasteful for two
+	// reasons:
+	//
+	// - Using preadv instead of iterated preads saves on host system calls.
+	//
+	// - Host system calls can handle destination memory that would fault in
+	// gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true),
+	// so the buffering performed by FromIOReader is unnecessary.
+	//
+	// This also applies to the write path below.
+	hostRW := fd.NewReadWriter(i.FD())
+	src := secio.NewOffsetReader(hostRW, int64(offset))
+	return safemem.FromIOReader{src}.ReadToBlocks(dsts)
+}
+
+// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
+//
+// It writes srcs to the host FD starting at offset and returns the result
+// of WriteFromBlocks. ctx is unused.
+func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	hostRW := fd.NewReadWriter(i.FD())
+	dst := secio.NewOffsetWriter(hostRW, int64(offset))
+	return safemem.FromIOWriter{dst}.WriteFromBlocks(srcs)
+}
+
+// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+//
+// The attributes selected by mask are applied to the host FD in a fixed
+// order: permissions, then size, then timestamps. The first failure aborts
+// and is returned, so earlier changes may already have been applied.
+// Changing the UID or GID is rejected with EPERM before anything is
+// modified. An empty mask is a no-op.
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
+	switch {
+	case mask.Empty():
+		return nil
+	case mask.UID || mask.GID:
+		return syserror.EPERM
+	}
+
+	if mask.Perms {
+		if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil {
+			return err
+		}
+	}
+	if mask.Size {
+		if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil {
+			return err
+		}
+	}
+
+	if !mask.AccessTime && !mask.ModificationTime {
+		return nil
+	}
+	// Only the timestamps named in the mask are set; the other is omitted.
+	times := fs.TimeSpec{
+		ATime:     attr.AccessTime,
+		ATimeOmit: !mask.AccessTime,
+		MTime:     attr.ModificationTime,
+		MTimeOmit: !mask.ModificationTime,
+	}
+	return setTimestamps(i.FD(), times)
+}
+
+// Sync implements fsutil.CachedFileObject.Sync.
+//
+// It calls fsync on the host FD and returns its error. ctx is unused.
+func (i *inodeFileState) Sync(ctx context.Context) error {
+	return syscall.Fsync(i.FD())
+}
+
+// FD implements fsutil.CachedFileObject.FD.
+//
+// It returns the value held by the backing descriptor.
+func (i *inodeFileState) FD() int {
+	return i.descriptor.value
+}
+
+// unstableAttr stats the host FD and converts the result into an
+// fs.UnstableAttr using the mount's superOperations. On fstat failure it
+// returns a zero fs.UnstableAttr and the error. ctx is unused.
+func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
+	var st syscall.Stat_t
+	err := syscall.Fstat(i.FD(), &st)
+	if err != nil {
+		return fs.UnstableAttr{}, err
+	}
+	return unstableAttr(i.mops, &st), nil
+}
+
+// Compile-time check that *inodeOperations implements fs.InodeOperations.
+var _ fs.InodeOperations = (*inodeOperations)(nil)
+
+// newInode returns a new fs.Inode backed by the host fd.
+//
+// The fd is stat'ed once; the result supplies the stable attributes, the
+// initial unstable attributes and the would-block property of the
+// descriptor. msrc must be a host mount source: its operations are asserted
+// to be *superOperations. Errors from fstat or from wrapping the descriptor
+// are returned as-is.
+//
+// NOTE(review): fd is not closed here on failure — confirm who owns it in
+// that case.
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+	// Retrieve metadata.
+	var st syscall.Stat_t
+	if err := syscall.Fstat(fd, &st); err != nil {
+		return nil, err
+	}
+
+	mops := msrc.MountSourceOperations.(*superOperations)
+	state := &inodeFileState{
+		mops:  mops,
+		sattr: stableAttr(&st),
+	}
+
+	// Initialize the wrapped host file descriptor. It is given the file
+	// state's own event queue.
+	desc, err := newDescriptor(
+		fd,
+		donated,
+		saveable,
+		wouldBlock(&st),
+		&state.queue,
+	)
+	state.descriptor = desc
+	if err != nil {
+		return nil, err
+	}
+
+	// Build the fs.InodeOperations.
+	initialAttr := unstableAttr(mops, &st)
+	caching := fsutil.NewCachingInodeOperations(ctx, state, initialAttr, msrc.Flags.ForcePageCache)
+	iops := &inodeOperations{
+		fileState:       state,
+		cachingInodeOps: caching,
+	}
+
+	// Return the fs.Inode.
+	return fs.NewInode(iops, msrc, state.sattr), nil
+}
+
+// Mappable implements fs.InodeOperations.Mappable.
+//
+// It returns the caching inode operations when canMap accepts inode, and
+// nil otherwise.
+func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable {
+	if canMap(inode) {
+		return i.cachingInodeOps
+	}
+	return nil
+}
+
+// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK
+// for operations that would block. The answer is the wouldBlock flag stored
+// on the backing descriptor.
+func (i *inodeOperations) ReturnsWouldBlock() bool {
+	return i.fileState.descriptor.wouldBlock
+}
+
+// Release implements fs.InodeOperations.Release.
+//
+// It releases the backing descriptor first and the caching inode operations
+// second.
+func (i *inodeOperations) Release(context.Context) {
+	i.fileState.descriptor.Release()
+	i.cachingInodeOps.Release()
+}
+
+// Lookup implements fs.InodeOperations.Lookup.
+//
+// It opens name relative to i on the host, wraps the resulting fd in a new
+// non-saveable, non-donated inode on dir's mount source, and returns a
+// Dirent for it. Errors from the open (including ENOENT) and from inode
+// construction are returned unchanged.
+//
+// NOTE(review): the opened fd is not closed here if newInode fails —
+// confirm whether that leaks the fd.
+func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
+	// Get a new fd relative to i at name.
+	hostFD, err := open(i, name)
+	if err != nil {
+		return nil, err
+	}
+
+	child, err := newInode(ctx, dir.MountSource, hostFD, false /* saveable */, false /* donated */)
+	if err != nil {
+		return nil, err
+	}
+
+	// Return the fs.Dirent.
+	return fs.NewDirent(child, name), nil
+}
+
+// Create implements fs.InodeOperations.Create.
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
+	// Create a file relative to i at name.
+	//
+	// N.B. We always open this file O_RDWR regardless of flags because a
+	// future GetFile might want more access. Open allows this regardless
+	// of perm.
+	fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
+	if err != nil {
+		return nil, err
+	}
+
+	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+	if err != nil {
+		return nil, err
+	}
+
+	d := fs.NewDirent(inode, name)
+	defer d.DecRef()
+	return inode.GetFile(ctx, d, flags)
+}
+
+// CreateDirectory implements fs.InodeOperations.CreateDirectory.
+//
+// It creates name relative to this inode's host FD via mkdirat with the
+// Linux mode of perm. ctx and dir are unused.
+func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+	return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+}
+
+// CreateLink implements fs.InodeOperations.CreateLink.
+//
+// It delegates to createLink with this inode's host FD, oldname and
+// newname. ctx and dir are unused.
+func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
+	return createLink(i.fileState.FD(), oldname, newname)
+}
+
+// CreateHardLink implements fs.InodeOperations.CreateHardLink.
+//
+// Hard links are not supported on host inodes; it always returns EPERM.
+func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
+	return syserror.EPERM
+}
+
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+//
+// Creating fifos is not supported on host inodes; it always returns
+// EOPNOTSUPP.
+func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+	return syserror.EOPNOTSUPP
+}
+
+// Remove implements fs.InodeOperations.Remove.
+//
+// It unlinks the non-directory entry name relative to this inode's host FD.
+// ctx and dir are unused.
+func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
+	return unlinkAt(i.fileState.FD(), name, false /* dir */)
+}
+
+// RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
+//
+// It unlinks the directory entry name relative to this inode's host FD.
+// ctx and dir are unused.
+func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
+	return unlinkAt(i.fileState.FD(), name, true /* dir */)
+}
+
+// Rename implements fs.InodeOperations.Rename.
+//
+// Both parents must be host inodes; if either is backed by a different
+// InodeOperations implementation the rename is refused with EXDEV.
+// Otherwise the rename is performed on the host with renameat, relative to
+// the two parents' FDs.
+func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error {
+	src, srcIsHost := oldParent.InodeOperations.(*inodeOperations)
+	if !srcIsHost {
+		return syscall.EXDEV
+	}
+	dst, dstIsHost := newParent.InodeOperations.(*inodeOperations)
+	if !dstIsHost {
+		return syscall.EXDEV
+	}
+	srcFD := src.fileState.FD()
+	dstFD := dst.fileState.FD()
+	return syscall.Renameat(srcFD, oldName, dstFD, newName)
+}
+
+// Bind implements fs.InodeOperations.Bind.
+//
+// Binding endpoints is not supported on host inodes; it always returns
+// EOPNOTSUPP.
+func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error {
+	return syserror.EOPNOTSUPP
+}
+
+// BoundEndpoint implements fs.InodeOperations.BoundEndpoint.
+//
+// Host inodes never have a bound endpoint; it always returns nil.
+func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+//
+// It wraps the result of newFile for d and flags and never returns an
+// error.
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return newFile(ctx, d, flags, i, false), nil
+}
+
+// canMap returns true if this fs.Inode can be memory mapped. Currently that
+// is exactly the inodes whose stable attributes satisfy fs.IsFile.
+func canMap(inode *fs.Inode) bool {
+	// FIXME: Some obscure character devices can be mapped.
+	return fs.IsFile(inode.StableAttr)
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+//
+// When the mount forces the sentry page cache and the inode is mappable,
+// metadata consistency is maintained by the sentry, so the cached
+// attributes are returned. In every other case the host manages consistency
+// of its own inode structures and the attributes are read straight from
+// the host FD.
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// We're maintaining consistency of metadata ourselves.
+		return i.cachingInodeOps.UnstableAttr(ctx, inode)
+	}
+	// The host kernel metadata caches are authoritative: just obtain
+	// the attributes.
+	return i.fileState.unstableAttr(ctx)
+}
+
+// Check implements fs.InodeOperations.Check.
+//
+// It defers entirely to fs.ContextCanAccessFile.
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+	return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+//
+// Changing ownership is not permitted on host inodes; it always returns
+// EPERM.
+func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+	return syserror.EPERM
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+//
+// It reports whether the permissions were changed successfully.
+func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool {
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// Metadata is cached by the sentry: update our cached copy.
+		return i.cachingInodeOps.SetPermissions(ctx, inode, f)
+	}
+	// The host kernel metadata caches are in use, so just change the
+	// permissions on the fd; the host will synchronize the metadata
+	// update with any host inode and page cache.
+	err := syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode()))
+	return err == nil
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// Metadata is cached by the sentry: update our cached copy.
+		return i.cachingInodeOps.SetTimestamps(ctx, inode, ts)
+	}
+	// The host kernel metadata caches are in use, so just change the
+	// timestamps on the fd; the host will synchronize the metadata
+	// update with any host inode and page cache.
+	return setTimestamps(i.fileState.FD(), ts)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
+	if canMap(inode) {
+		// Memory-mappable files must go through cachingInodeOps, even
+		// if the host page cache is in use, to invalidate private
+		// copies of truncated pages.
+		return i.cachingInodeOps.Truncate(ctx, inode, size)
+	}
+	// Not mappable: just change the file size on the fd; the host will
+	// synchronize the metadata update with any host inode and page cache.
+	return syscall.Ftruncate(i.fileState.FD(), size)
+}
+
+// WriteOut implements fs.InodeOperations.WriteOut.
+func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// The sentry has been caching pages and attributes; write out
+		// whatever is dirty.
+		return i.cachingInodeOps.WriteOut(ctx, inode)
+	}
+	// We have been using host kernel metadata caches, so the metadata
+	// is already up to date on the host.
+	return nil
+}
+
+// Readlink implements fs.InodeOperations.Readlink.
+//
+// It delegates to readLink on this inode's host FD. ctx and inode are
+// unused.
+func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	return readLink(i.fileState.FD())
+}
+
+// Getlink implements fs.InodeOperations.Getlink.
+//
+// It never returns a Dirent: symlinks answer fs.ErrResolveViaReadlink, and
+// every other inode type answers ENOLINK.
+func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+	if fs.IsSymlink(i.fileState.sattr) {
+		return nil, fs.ErrResolveViaReadlink
+	}
+	return nil, syserror.ENOLINK
+}
+
+// StatFS implements fs.InodeOperations.StatFS.
+//
+// It is not implemented for host inodes: it always returns a zero fs.Info
+// and ENOSYS.
+func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
+	return fs.Info{}, syserror.ENOSYS
+}
+
+// AddLink implements fs.InodeOperations.AddLink. It is a no-op.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink. It is a no-op.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. It
+// is a no-op.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
+
+// readdirAll returns all of the directory entries in i, keyed by name.
+//
+// The host FD is rewound to the start and read with getdents under
+// readdirMu, using d's buffer as scratch space. Each name is then stat'ed
+// relative to the directory without following symlinks to obtain its type
+// and a mapped inode ID. Entries that disappear between the read and the
+// stat are silently skipped; any other stat error aborts the listing. A
+// non-seekable FD is reported as ENOTDIR.
+func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
+	i.readdirMu.Lock()
+	defer i.readdirMu.Unlock()
+
+	fd := i.fileState.FD()
+
+	// syscall.ReadDirent will use getdents, which will seek the file past
+	// the last directory entry. To read the directory entries a second
+	// time, we need to seek back to the beginning.
+	if _, err := syscall.Seek(fd, 0, 0); err != nil {
+		if err == syscall.ESPIPE {
+			// All directories should be seekable. If this file
+			// isn't seekable, it is not a directory and we should
+			// return that more sane error.
+			err = syscall.ENOTDIR
+		}
+		return nil, err
+	}
+
+	// Discard whatever a previous call left in the buffer. At EOF the loop
+	// below resets bufp to 0 but leaves nbuf at the size of the last chunk
+	// read, so without this reset a later call would re-parse that stale
+	// chunk before reading the directory from the start.
+	d.bufp = 0
+	d.nbuf = 0
+
+	names := make([]string, 0, 100)
+	for {
+		// Refill the buffer if necessary
+		if d.bufp >= d.nbuf {
+			d.bufp = 0
+			// ReadDirent will just do a sys_getdents64 to the kernel.
+			n, err := syscall.ReadDirent(fd, d.buf)
+			if err != nil {
+				return nil, err
+			}
+			if n == 0 {
+				break // EOF
+			}
+			d.nbuf = n
+		}
+
+		var nb int
+		// Parse the dirent buffer we just got and collect the directory
+		// names along with the number of bytes consumed in the buffer.
+		nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
+		d.bufp += nb
+	}
+
+	entries := make(map[string]fs.DentAttr)
+	for _, filename := range names {
+		// Lookup the type and host device and inode.
+		stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
+		if lerr == syscall.ENOENT {
+			// File disappeared between readdir and lstat.
+			// Just treat it as if it didn't exist.
+			continue
+		}
+
+		// There was a serious problem, we should probably report it.
+		if lerr != nil {
+			return nil, lerr
+		}
+
+		entries[filename] = fs.DentAttr{
+			Type: nodeType(&stat),
+			InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+				Device: stat.Dev,
+				Inode:  stat.Ino,
+			}),
+		}
+	}
+	return entries, nil
+}
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
new file mode 100644
index 000000000..80066512a
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// beforeSave is invoked by stateify.
+//
+// It panics if the event queue is not empty. For a regular file whose
+// descriptor was not donated, it stores the current unstable attributes in
+// savedUAttr, and panics if they cannot be read.
+func (i *inodeFileState) beforeSave() {
+	if !i.queue.IsEmpty() {
+		panic("event queue must be empty")
+	}
+	if i.descriptor.donated || i.sattr.Type != fs.RegularFile {
+		return
+	}
+	attrs, err := i.unstableAttr(context.Background())
+	if err != nil {
+		panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err))
+	}
+	i.savedUAttr = &attrs
+}
+
+// afterLoad is invoked by stateify.
+//
+// It re-initializes the descriptor, then registers the saved inode ID under
+// the (device, inode) pair that fstat reports for the restored fd. For regular
+// files whose descriptor was not donated it also compares size and
+// modification time with the values stored in savedUAttr, when the restore
+// environment requests those checks. Every failure panics.
+func (i *inodeFileState) afterLoad() {
+	// Initialize the descriptor value.
+	if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+		panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
+	}
+
+	// Remap the inode number.
+	var hostStat syscall.Stat_t
+	if err := syscall.Fstat(i.FD(), &hostStat); err != nil {
+		panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err))
+	}
+	key := device.MultiDeviceKey{Device: hostStat.Dev, Inode: hostStat.Ino}
+	if loaded := hostFileDevice.Load(key, i.sattr.InodeID); !loaded {
+		// This means there was a conflict at s.Dev and s.Ino with
+		// another inode mapping: two files that were unique on the
+		// saved filesystem are no longer unique on this filesystem.
+		// Since this violates the contract that filesystems cannot
+		// change across save and restore, error out.
+		panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice))
+	}
+
+	// Only regular files that were not donated have saved attributes to
+	// validate.
+	if i.descriptor.donated || i.sattr.Type != fs.RegularFile {
+		return
+	}
+	env, ok := fs.CurrentRestoreEnvironment()
+	if !ok {
+		panic("missing restore environment")
+	}
+	current := unstableAttr(i.mops, &hostStat)
+	if env.ValidateFileSize && current.Size != i.savedUAttr.Size {
+		panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, current.Size))
+	}
+	if env.ValidateFileTimestamp && current.ModificationTime != i.savedUAttr.ModificationTime {
+		panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, current.ModificationTime))
+	}
+	i.savedUAttr = nil
+}
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
new file mode 100644
index 000000000..0ff87c418
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"io/ioutil"
+	"os"
+	"path"
+	"syscall"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// TestMultipleReaddir verifies that multiple Readdir calls return the same
+// thing if they use different dir contexts.
+func TestMultipleReaddir(t *testing.T) {
+	// Build a scratch directory containing a.txt and b.txt.
+	p, err := ioutil.TempDir("", "readdir")
+	if err != nil {
+		t.Fatalf("Failed to create test dir: %v", err)
+	}
+	defer os.RemoveAll(p)
+
+	f, err := os.Create(path.Join(p, "a.txt"))
+	if err != nil {
+		t.Fatalf("Failed to create a.txt: %v", err)
+	}
+	f.Close()
+
+	f, err = os.Create(path.Join(p, "b.txt"))
+	if err != nil {
+		t.Fatalf("Failed to create b.txt: %v", err)
+	}
+	f.Close()
+
+	// Wrap the directory in a host inode and open a readable file on it.
+	fd, err := open(nil, p)
+	if err != nil {
+		t.Fatalf("Failed to open %q: %v", p, err)
+	}
+	ctx := contexttest.Context(t)
+	n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
+	if err != nil {
+		t.Fatalf("Failed to create inode: %v", err)
+	}
+
+	dirent := fs.NewDirent(n, "readdir")
+	openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true})
+	if err != nil {
+		t.Fatalf("Failed to get file: %v", err)
+	}
+	defer openFile.DecRef()
+
+	// Iterate the same open file twice from offset 0, each time with its
+	// own DirCtx.
+	c1 := &fs.DirCtx{DirCursor: new(string)}
+	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil {
+		t.Fatalf("First Readdir failed: %v", err)
+	}
+
+	c2 := &fs.DirCtx{DirCursor: new(string)}
+	if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil {
+		t.Errorf("Second Readdir failed: %v", err)
+	}
+
+	// Both contexts must have seen both files.
+	if _, ok := c1.DentAttrs()["a.txt"]; !ok {
+		t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs())
+	}
+	if _, ok := c1.DentAttrs()["b.txt"]; !ok {
+		t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs())
+	}
+
+	if _, ok := c2.DentAttrs()["a.txt"]; !ok {
+		t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs())
+	}
+	if _, ok := c2.DentAttrs()["b.txt"]; !ok {
+		t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs())
+	}
+}
+
+// TestCloseFD verifies fds will be closed.
+//
+// It wraps the write end of a pipe in a File, drops the File's reference, and
+// then expects a read on the other end to report EOF (0 bytes, nil error).
+func TestCloseFD(t *testing.T) {
+	var p [2]int
+	if err := syscall.Pipe(p[0:]); err != nil {
+		t.Fatalf("Failed to create pipe %v", err)
+	}
+	defer syscall.Close(p[0])
+	// NOTE(review): when the test passes, p[1] has already been closed by
+	// DecRef below, so this deferred Close acts on a stale fd number.
+	defer syscall.Close(p[1])
+
+	// Use the write-end because we will detect if it's closed on the read end.
+	ctx := contexttest.Context(t)
+	file, err := NewFile(ctx, p[1], fs.RootOwner)
+	if err != nil {
+		t.Fatalf("Failed to create File: %v", err)
+	}
+	file.DecRef()
+
+	s := make([]byte, 10)
+	if c, err := syscall.Read(p[0], s); c != 0 || err != nil {
+		t.Errorf("want 0, nil (EOF) from read end, got %v, %v", c, err)
+	}
+}
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
new file mode 100644
index 000000000..3c07c3850
--- /dev/null
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+// ioctlGetTermios issues the TCGETS ioctl on fd and returns the termios
+// structure it fills in, or the errno reported by the ioctl.
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	termios := new(linux.Termios)
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(termios))); errno != 0 {
+		return nil, errno
+	}
+	return termios, nil
+}
+
+// ioctlSetTermios issues ioctl request req on fd with t as its argument and
+// returns the errno reported by the ioctl, or nil on success.
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))); errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
new file mode 100644
index 000000000..8e36ed7ee
--- /dev/null
+++ b/pkg/sentry/fs/host/socket.go
@@ -0,0 +1,471 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/refs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+	unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+	"gvisor.googlesource.com/gvisor/pkg/unet"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+	"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// endpoint encapsulates the state needed to represent a host Unix socket.
+type endpoint struct {
+	// queue holds the waiters added through EventRegister. init registers
+	// it with fdnotifier together with fd.
+	queue waiter.Queue `state:"nosave"`
+
+	// stype is the type of Unix socket. (Ex: unix.SockStream,
+	// unix.SockSeqpacket, unix.SockDgram)
+	// init fills it in from the host socket's SO_TYPE.
+	stype unix.SockType `state:"nosave"`
+
+	// fd is the host fd backing this file. Close sets it to -1.
+	fd int `state:"nosave"`
+
+	// If srfd >= 0, it is the host fd that fd was imported from.
+	srfd int `state:"wait"`
+}
+
+// init validates e.fd and prepares it for use as a host Unix socket.
+//
+// It returns syserror.EINVAL for any fd whose SO_DOMAIN is not AF_UNIX. For a
+// Unix socket it makes the fd non-blocking, records its SO_TYPE in e.stype and
+// registers the fd with fdnotifier so that events reach e.queue. Any host
+// error along the way is returned unchanged.
+func (e *endpoint) init() error {
+	domain, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+	switch {
+	case err != nil:
+		return err
+	case domain != syscall.AF_UNIX:
+		// We only allow Unix sockets.
+		return syserror.EINVAL
+	}
+
+	sockType, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE)
+	if err != nil {
+		return err
+	}
+
+	err = syscall.SetNonblock(e.fd, true)
+	if err != nil {
+		return err
+	}
+
+	e.stype = unix.SockType(sockType)
+	if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil {
+		return err
+	}
+	return nil
+}
+
+// newEndpoint creates a new host endpoint for fd and runs init on it. srfd is
+// stored as the fd that fd was imported from. If init fails, its error is
+// returned and no endpoint is produced.
+func newEndpoint(fd int, srfd int) (*endpoint, error) {
+	e := new(endpoint)
+	e.fd, e.srfd = fd, srfd
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+	return e, nil
+}
+
+// newSocket allocates a new unix socket with host endpoint.
+//
+// When saveable is true the endpoint is backed by a dup of fd, and fd itself
+// is recorded as the fd it was imported from; the dup is closed again if the
+// endpoint cannot be created. When saveable is false the endpoint uses fd
+// directly, and fd is left open on failure.
+func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) {
+	hostFD, importedFrom := fd, -1
+	if saveable {
+		dupFD, err := syscall.Dup(fd)
+		if err != nil {
+			return nil, err
+		}
+		hostFD, importedFrom = dupFD, fd
+	}
+
+	ep, err := newEndpoint(hostFD, importedFrom)
+	if err == nil {
+		return unixsocket.New(ctx, ep), nil
+	}
+	if saveable {
+		syscall.Close(hostFD)
+	}
+	return nil, err
+}
+
+// NewSocketWithDirent allocates a new unix socket with host endpoint.
+//
+// This is currently only used by unsaveable Gofer nodes.
+//
+// NewSocketWithDirent takes ownership of f on success. On failure f is left
+// untouched and the error from creating the endpoint is returned.
+func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) {
+	ep, err := newEndpoint(f.FD(), -1)
+	if err == nil {
+		// Take ownership of the FD.
+		f.Release()
+		return unixsocket.NewWithDirent(ctx, d, ep, flags), nil
+	}
+	return nil, err
+}
+
+// Close implements unix.Endpoint.Close.
+//
+// It removes the fd from fdnotifier, closes it on the host and marks the
+// endpoint as closed by setting fd to -1. The result of the host close is
+// not checked.
+func (e *endpoint) Close() {
+	fdnotifier.RemoveFD(int32(e.fd))
+	syscall.Close(e.fd)
+	e.fd = -1
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// The entry is added to the endpoint's queue, then fdnotifier is asked to
+// update its state for the fd.
+func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) {
+	e.queue.EventRegister(we, mask)
+	fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// The entry is removed from the endpoint's queue, then fdnotifier is asked to
+// update its state for the fd.
+func (e *endpoint) EventUnregister(we *waiter.Entry) {
+	e.queue.EventUnregister(we)
+	fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// Readiness implements unix.Endpoint.Readiness by polling the host fd for
+// mask through fdnotifier.NonBlockingPoll.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fdnotifier.NonBlockingPoll(int32(e.fd), mask)
+}
+
+// Type implements unix.Endpoint.Type. It returns the socket type that init
+// read from the host socket.
+func (e *endpoint) Type() unix.SockType {
+	return e.stype
+}
+
+// Connect implements unix.Endpoint.Connect. It always returns
+// tcpip.ErrInvalidEndpointState and does not touch the host fd.
+func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Bind implements unix.Endpoint.Bind. It always returns
+// tcpip.ErrInvalidEndpointState; commit is never called.
+func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Listen implements unix.Endpoint.Listen. It always returns
+// tcpip.ErrInvalidEndpointState and ignores backlog.
+func (e *endpoint) Listen(backlog int) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Accept implements unix.Endpoint.Accept. It always returns a nil endpoint
+// and tcpip.ErrInvalidEndpointState.
+func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) {
+	return nil, tcpip.ErrInvalidEndpointState
+}
+
+// Shutdown implements unix.Endpoint.Shutdown. It always returns
+// tcpip.ErrInvalidEndpointState and does not shut down the host socket.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// GetSockOpt implements unix.Endpoint.GetSockOpt.
+//
+// Send/receive buffer size and reuse-address queries are answered by the host
+// socket. *tcpip.PasscredOption always reads as 0,
+// *tcpip.ReceiveQueueSizeOption yields tcpip.ErrQueueSizeNotSupported, and any
+// other option type yields tcpip.ErrInvalidEndpointState.
+//
+// The caller's option is written only when the host query succeeds; on failure
+// it is left untouched and the translated host error is returned.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	// hostInt reads one integer SOL_SOCKET option from the host fd.
+	hostInt := func(name int) (int, *tcpip.Error) {
+		v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, name)
+		return v, translateError(err)
+	}
+
+	switch o := opt.(type) {
+	case tcpip.ErrorOption:
+		// NOTE(review): only the outcome of the query is reported; the
+		// SO_ERROR value itself is discarded, as it was before.
+		_, err := hostInt(syscall.SO_ERROR)
+		return err
+	case *tcpip.PasscredOption:
+		// We don't support passcred on host sockets.
+		*o = 0
+		return nil
+	case *tcpip.SendBufferSizeOption:
+		v, err := hostInt(syscall.SO_SNDBUF)
+		if err != nil {
+			return err
+		}
+		*o = tcpip.SendBufferSizeOption(v)
+		return nil
+	case *tcpip.ReceiveBufferSizeOption:
+		v, err := hostInt(syscall.SO_RCVBUF)
+		if err != nil {
+			return err
+		}
+		*o = tcpip.ReceiveBufferSizeOption(v)
+		return nil
+	case *tcpip.ReuseAddressOption:
+		v, err := hostInt(syscall.SO_REUSEADDR)
+		if err != nil {
+			return err
+		}
+		*o = tcpip.ReuseAddressOption(v)
+		return nil
+	case *tcpip.ReceiveQueueSizeOption:
+		return tcpip.ErrQueueSizeNotSupported
+	}
+	return tcpip.ErrInvalidEndpointState
+}
+
+// SetSockOpt implements unix.Endpoint.SetSockOpt. Every option is accepted
+// and ignored: nothing is applied to the host socket and nil is returned.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	return nil
+}
+
+// GetLocalAddress implements unix.Endpoint.GetLocalAddress. It always
+// returns an empty address and a nil error.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, nil
+}
+
+// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress. It always
+// returns an empty address and a nil error.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, nil
+}
+
+// Passcred returns whether or not the SO_PASSCRED socket option is
+// enabled on this end. For host endpoints the answer is always false.
+func (e *endpoint) Passcred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
+// is enabled on the connected end. For host endpoints the answer is always
+// false.
+func (e *endpoint) ConnectedPasscred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// SendMsg implements unix.Endpoint.SendMsg.
+//
+// A non-nil destination is rejected with tcpip.ErrInvalidEndpointState;
+// otherwise the data is written to the host fd via sendMsg.
+func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) {
+	if to == nil {
+		return sendMsg(e.fd, data, controlMessages)
+	}
+	return 0, tcpip.ErrInvalidEndpointState
+}
+
+// sendMsg writes data to the host fd with fdWriteVec and returns the byte
+// count together with the translated host error. Non-empty control messages
+// are rejected with tcpip.ErrInvalidEndpointState before anything is written.
+func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) {
+	if controlMessages.Empty() {
+		written, err := fdWriteVec(fd, data)
+		return written, translateError(err)
+	}
+	return 0, tcpip.ErrInvalidEndpointState
+}
+
+// RecvMsg implements unix.Endpoint.RecvMsg by delegating to recvMsg on the
+// host fd. The creds argument is not used.
+func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
+	return recvMsg(e.fd, data, numRights, peek, addr)
+}
+
+// recvMsg reads from the host fd into data with fdReadVec and converts any
+// file descriptors received alongside the data into control messages.
+//
+// numRights is the number of FDs to make room for in the control buffer; with
+// numRights == 0 no room for FDs is requested. peek is forwarded to
+// fdReadVec. addr is accepted but never written by this function.
+//
+// The first two results are passed through from fdReadVec. EAGAIN from the
+// host is reported as tcpip.ErrWouldBlock; other host errors are translated
+// with translateError.
+func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
+	var cm unet.ControlMessage
+	if numRights > 0 {
+		cm.EnableFDs(int(numRights))
+	}
+	rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek)
+	if err == syscall.EAGAIN {
+		return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock
+	}
+	if err != nil {
+		return 0, 0, unix.ControlMessages{}, translateError(err)
+	}
+
+	// Trim the control data if we received less than the full amount.
+	if cl < uint64(len(cm)) {
+		cm = cm[:cl]
+	}
+
+	// Avoid extra allocations in the case where there isn't any control data.
+	if len(cm) == 0 {
+		return rl, ml, unix.ControlMessages{}, nil
+	}
+
+	// NOTE(review): if extraction fails here the data has already been read
+	// from the host socket, yet zero lengths are reported to the caller.
+	fds, err := cm.ExtractFDs()
+	if err != nil {
+		return 0, 0, unix.ControlMessages{}, translateError(err)
+	}
+
+	if len(fds) == 0 {
+		return rl, ml, unix.ControlMessages{}, nil
+	}
+	return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil
+}
+
+// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint
+// backed by a host FD that will pretend to be bound at a given sentry path.
+//
+// The FD is registered with fdnotifier against queue first; if that fails the
+// translated error is returned and no endpoint is created. Both returned
+// values are the same object, which is why it carries two references.
+func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) {
+	if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil {
+		return nil, nil, translateError(err)
+	}
+
+	e := &connectedEndpoint{path: path, queue: queue, file: file}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+
+	return e, e, nil
+}
+
+// connectedEndpoint is a host FD backed implementation of
+// unix.ConnectedEndpoint and unix.Receiver.
+//
+// connectedEndpoint does not support save/restore for now.
+type connectedEndpoint struct {
+	// queue is the waiter queue that was registered with fdnotifier for
+	// file's FD in NewConnectedEndpoint.
+	queue *waiter.Queue
+	// path is the address reported as this endpoint's local address and
+	// as the sender address on Recv.
+	path string
+
+	// ref keeps track of references to a connectedEndpoint.
+	ref refs.AtomicRefCount
+
+	// mu protects file, readClosed and writeClosed.
+	mu sync.RWMutex
+
+	// file is an *fd.FD containing the FD backing this endpoint. It must be
+	// set to nil if it has been closed.
+	file *fd.FD
+
+	// readClosed is true if the FD has read shutdown or if it has been closed.
+	readClosed bool
+
+	// writeClosed is true if the FD has write shutdown or if it has been
+	// closed.
+	writeClosed bool
+}
+
+// Send implements unix.ConnectedEndpoint.Send.
+//
+// It returns tcpip.ErrClosedForSend once CloseSend has been called; otherwise
+// the data is written to the host FD via sendMsg. The from address is not
+// used, and the boolean result is always false.
+func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.writeClosed {
+		return 0, false, tcpip.ErrClosedForSend
+	}
+	n, err := sendMsg(c.file.FD(), data, controlMessages)
+	// There is no need for the caller to call SendNotify because sendMsg uses
+	// the host's sendmsg(2) and the host kernel's queue.
+	return n, false, err
+}
+
+// SendNotify implements unix.ConnectedEndpoint.SendNotify. It is a no-op.
+func (c *connectedEndpoint) SendNotify() {}
+
+// CloseSend implements unix.ConnectedEndpoint.CloseSend.
+//
+// It only marks the endpoint as closed for writing; the host FD itself is
+// left as it is.
+func (c *connectedEndpoint) CloseSend() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.writeClosed = true
+}
+
+// CloseNotify implements unix.ConnectedEndpoint.CloseNotify. It is a no-op.
+func (c *connectedEndpoint) CloseNotify() {}
+
+// Writable implements unix.ConnectedEndpoint.Writable.
+//
+// An endpoint that has been closed for writing always reports true; otherwise
+// the host FD is polled for EventOut.
+func (c *connectedEndpoint) Writable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.writeClosed || fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements unix.ConnectedEndpoint.Passcred. It always returns
+// false.
+func (c *connectedEndpoint) Passcred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. The
+// address is the path given to NewConnectedEndpoint; the error is always nil.
+func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil
+}
+
+// EventUpdate implements unix.ConnectedEndpoint.EventUpdate.
+//
+// It asks fdnotifier to refresh its state for the backing FD. Nothing happens
+// once the endpoint has been closed: close sets file to nil, so the nil check
+// must come before file is dereferenced.
+func (c *connectedEndpoint) EventUpdate() {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.file != nil && c.file.FD() != -1 {
+		fdnotifier.UpdateFD(int32(c.file.FD()))
+	}
+}
+
+// Recv implements unix.Receiver.Recv.
+//
+// It returns tcpip.ErrClosedForReceive once CloseRecv has been called;
+// otherwise it reads from the host FD via recvMsg. The sender address is
+// always reported as this endpoint's path, the creds argument is not used,
+// and the boolean result is always false.
+func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.readClosed {
+		return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive
+	}
+	rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil)
+	// There is no need for the caller to call RecvNotify because recvMsg uses
+	// the host's recvmsg(2) and the host kernel's queue.
+	return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err
+}
+
+// close releases all resources related to the endpoint.
+//
+// It removes the FD from fdnotifier, closes the file and sets file to nil.
+// It is passed to DecRefWithDestructor in Release. c.mu is not taken here.
+func (c *connectedEndpoint) close() {
+	fdnotifier.RemoveFD(int32(c.file.FD()))
+	c.file.Close()
+	c.file = nil
+}
+
+// RecvNotify implements unix.Receiver.RecvNotify. It is a no-op.
+func (c *connectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements unix.Receiver.CloseRecv.
+//
+// It only marks the endpoint as closed for reading; the host FD itself is
+// left as it is.
+func (c *connectedEndpoint) CloseRecv() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.readClosed = true
+}
+
+// Readable implements unix.Receiver.Readable.
+//
+// An endpoint that has been closed for reading always reports true; otherwise
+// the host FD is polled for EventIn.
+func (c *connectedEndpoint) Readable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.readClosed || fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements unix.Receiver.SendQueuedSize. It always returns
+// -1.
+func (c *connectedEndpoint) SendQueuedSize() int64 {
+	// SendQueuedSize isn't supported for host sockets because we don't allow the
+	// sentry to call ioctl(2).
+	return -1
+}
+
+// RecvQueuedSize implements unix.Receiver.RecvQueuedSize. It always returns
+// -1.
+func (c *connectedEndpoint) RecvQueuedSize() int64 {
+	// RecvQueuedSize isn't supported for host sockets because we don't allow the
+	// sentry to call ioctl(2).
+	return -1
+}
+
+// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize.
+//
+// It reports the host socket's SO_SNDBUF value, or -1 if the host query
+// fails.
+func (c *connectedEndpoint) SendMaxQueueSize() int64 {
+	if size, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF); err == nil {
+		return int64(size)
+	}
+	return -1
+}
+
+// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize.
+//
+// It reports the host socket's SO_RCVBUF value, or -1 if the host query
+// fails.
+func (c *connectedEndpoint) RecvMaxQueueSize() int64 {
+	if size, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF); err == nil {
+		return int64(size)
+	}
+	return -1
+}
+
+// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release.
+//
+// It drops one reference, passing close as the destructor.
+func (c *connectedEndpoint) Release() {
+	c.ref.DecRefWithDestructor(c.close)
+}
+
+// translateError converts an error from a host call into a *tcpip.Error.
+//
+// nil maps to nil and a syscall.Errno is translated by
+// rawfile.TranslateErrno. Any other error type is reported as
+// tcpip.ErrInvalidEndpointState rather than panicking on a failed type
+// assertion.
+func translateError(err error) *tcpip.Error {
+	switch e := err.(type) {
+	case nil:
+		return nil
+	case syscall.Errno:
+		return rawfile.TranslateErrno(e)
+	default:
+		return tcpip.ErrInvalidEndpointState
+	}
+}
diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go
new file mode 100644
index 000000000..6acabd55a
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_state.go
@@ -0,0 +1,39 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+)
+
+// beforeSave is invoked by stateify.
+//
+// It panics for an endpoint that has no recorded source fd (srfd < 0).
+func (ep *endpoint) beforeSave() {
+	if ep.srfd >= 0 {
+		return
+	}
+	panic("only host file descriptors provided at sentry startup can be saved")
+}
+
+// afterLoad is invoked by stateify.
+//
+// It dups the recorded source fd, installs the dup as the endpoint's fd and
+// runs init on it. Either step failing panics.
+func (ep *endpoint) afterLoad() {
+	dupFD, err := syscall.Dup(ep.srfd)
+	if err != nil {
+		panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err))
+	}
+	ep.fd = dupFD
+	err = ep.init()
+	if err != nil {
+		panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err))
+	}
+}
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
new file mode 100644
index 000000000..80c46dcfa
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -0,0 +1,401 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"reflect"
+	"syscall"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserr"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+	"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// Compile-time interface checks; the values themselves are discarded.
+var (
+	// Make sure that connectedEndpoint implements unix.ConnectedEndpoint.
+	_ = unix.ConnectedEndpoint(new(connectedEndpoint))
+
+	// Make sure that connectedEndpoint implements unix.Receiver.
+	_ = unix.Receiver(new(connectedEndpoint))
+)
+
+// getFl returns the file status flags of fd as reported by fcntl(F_GETFL),
+// or the errno from the call.
+func getFl(fd int) (uint32, error) {
+	flags, _, errno := syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+	if errno != 0 {
+		return 0, errno
+	}
+	return uint32(flags), nil
+}
+
+// TestSocketIsBlocking checks that newSocket switches the fd it is given to
+// non-blocking mode and leaves the peer fd of the socketpair blocking.
+func TestSocketIsBlocking(t *testing.T) {
+	// Using socketpair here because it's already connected.
+	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		t.Fatalf("host socket creation failed: %v", err)
+	}
+
+	// Both ends must start out blocking.
+	fl, err := getFl(pair[0])
+	if err != nil {
+		t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
+	}
+	if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+		t.Fatalf("Expected socket %v to be blocking", pair[0])
+	}
+	if fl, err = getFl(pair[1]); err != nil {
+		t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err)
+	}
+	if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+		t.Fatalf("Expected socket %v to be blocking", pair[1])
+	}
+	sock, err := newSocket(contexttest.Context(t), pair[0], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) failed => %v", pair[0], err)
+	}
+	defer sock.DecRef()
+	// Test that the socket now is non blocking.
+	if fl, err = getFl(pair[0]); err != nil {
+		t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
+	}
+	if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK {
+		t.Errorf("Expected socket %v to have becoming non blocking", pair[0])
+	}
+	// The other end was never handed to newSocket and must be unchanged.
+	if fl, err = getFl(pair[1]); err != nil {
+		t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err)
+	}
+	if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+		t.Errorf("Did not expect socket %v to become non blocking", pair[1])
+	}
+}
+
+// TestSocketWritev checks that Writev on a host socket succeeds and reports
+// the full length of the buffer as written.
+func TestSocketWritev(t *testing.T) {
+	// Using socketpair here because it's already connected.
+	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		t.Fatalf("host socket creation failed: %v", err)
+	}
+	socket, err := newSocket(contexttest.Context(t), pair[0], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", pair[0], err)
+	}
+	defer socket.DecRef()
+	buf := []byte("hello world\n")
+	n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf))
+	if err != nil {
+		t.Fatalf("socket writev failed: %v", err)
+	}
+
+	if n != int64(len(buf)) {
+		t.Fatalf("socket writev wrote incorrect bytes: %d", n)
+	}
+}
+
+// TestSocketWritevLen0 checks that Writev with an empty sequence succeeds and
+// reports zero bytes written.
+func TestSocketWritevLen0(t *testing.T) {
+	// Using socketpair here because it's already connected.
+	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		t.Fatalf("host socket creation failed: %v", err)
+	}
+	socket, err := newSocket(contexttest.Context(t), pair[0], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", pair[0], err)
+	}
+	defer socket.DecRef()
+	n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil))
+	if err != nil {
+		t.Fatalf("socket writev failed: %v", err)
+	}
+
+	if n != 0 {
+		t.Fatalf("socket writev wrote incorrect bytes: %d", n)
+	}
+}
+
+// TestSocketSendMsgLen0 checks that SendMsg with an empty payload and no
+// control messages reports zero bytes written and no error.
+func TestSocketSendMsgLen0(t *testing.T) {
+	// Using socketpair here because it's already connected.
+	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		t.Fatalf("host socket creation failed: %v", err)
+	}
+	sfile, err := newSocket(contexttest.Context(t), pair[0], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", pair[0], err)
+	}
+	defer sfile.DecRef()
+
+	s := sfile.FileOperations.(socket.Socket)
+	n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{})
+	if n != 0 {
+		t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n)
+	}
+
+	if terr != nil {
+		t.Fatalf("socket sendmsg() failed: %v", terr)
+	}
+}
+
+// TestListen checks that Listen is rejected with
+// syserr.ErrInvalidEndpointState on host sockets: on both ends of a
+// socketpair and on a freshly created, unbound Unix socket.
+func TestListen(t *testing.T) {
+	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		// The failing call is Socketpair, so name it in the message.
+		t.Fatalf("syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
+	}
+	sfile1, err := newSocket(contexttest.Context(t), pair[0], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", pair[0], err)
+	}
+	defer sfile1.DecRef()
+	socket1 := sfile1.FileOperations.(socket.Socket)
+
+	sfile2, err := newSocket(contexttest.Context(t), pair[1], false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", pair[1], err)
+	}
+	defer sfile2.DecRef()
+	socket2 := sfile2.FileOperations.(socket.Socket)
+
+	// Socketpairs can not be listened to.
+	if err := socket1.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+		t.Fatalf("socket1.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+	}
+	if err := socket2.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+		t.Fatalf("socket2.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+	}
+
+	// Create a Unix socket, do not bind it.
+	sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+	if err != nil {
+		t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
+	}
+	sfile3, err := newSocket(contexttest.Context(t), sock, false)
+	if err != nil {
+		t.Fatalf("newSocket(%v) => %v", sock, err)
+	}
+	defer sfile3.DecRef()
+	socket3 := sfile3.FileOperations.(socket.Socket)
+
+	// This socket is not bound so we can't listen on it.
+	if err := socket3.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+		t.Fatalf("socket3.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+	}
+}
+
+// TestSend checks that Send on a write-closed endpoint returns
+// tcpip.ErrClosedForSend. The endpoint has no file, so the check must happen
+// before the FD is used.
+func TestSend(t *testing.T) {
+	e := connectedEndpoint{writeClosed: true}
+	if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend {
+		t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend)
+	}
+}
+
+// TestRecv checks that Recv on a read-closed endpoint returns
+// tcpip.ErrClosedForReceive. The endpoint has no file, so the check must
+// happen before the FD is used.
+func TestRecv(t *testing.T) {
+	e := connectedEndpoint{readClosed: true}
+	if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive {
+		t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive)
+	}
+}
+
+// TestPasscred checks that Passcred reports false on a zero-value endpoint.
+func TestPasscred(t *testing.T) {
+	e := connectedEndpoint{}
+	if got, want := e.Passcred(), false; got != want {
+		t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want)
+	}
+}
+
+// TestGetLocalAddress checks that GetLocalAddress reports the endpoint's path
+// as the address, with a nil error.
+func TestGetLocalAddress(t *testing.T) {
+	e := connectedEndpoint{path: "foo"}
+	want := tcpip.FullAddress{Addr: tcpip.Address("foo")}
+	if got, err := e.GetLocalAddress(); err != nil || got != want {
+		t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil)
+	}
+}
+
+func TestQueuedSize(t *testing.T) {
+	ep := connectedEndpoint{}
+	const want = int64(-1) // Both queue-size queries are expected to report -1.
+	cases := []struct {
+		name string
+		f    func() int64
+	}{
+		{name: "SendQueuedSize", f: ep.SendQueuedSize},
+		{name: "RecvQueuedSize", f: ep.RecvQueuedSize},
+	}
+	for i := range cases {
+		if got := cases[i].f(); got != want {
+			t.Errorf("Got %#v.%s() = %d, want = %d", ep, cases[i].name, got, want)
+		}
+	}
+}
+
+func TestReadable(t *testing.T) {
+	ep := connectedEndpoint{readClosed: true} // The test expects a read-closed endpoint to report readable.
+	if got := ep.Readable(); !got {
+		t.Errorf("Got %#v.Readable() = %t, want = %t", ep, got, true)
+	}
+}
+
+func TestWritable(t *testing.T) {
+	ep := connectedEndpoint{writeClosed: true} // The test expects a write-closed endpoint to report writable.
+	if got := ep.Writable(); !got {
+		t.Errorf("Got %#v.Writable() = %t, want = %t", ep, got, true)
+	}
+}
+
+func TestRelease(t *testing.T) {
+	sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	ep := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(sock)}
+	fdnotifier.AddFD(int32(ep.file.FD()), nil)
+	expected := &connectedEndpoint{queue: ep.queue} // Expected state after Release: same queue, no file.
+	expected.ref.DecRef()
+	ep.Release()
+	if !reflect.DeepEqual(ep, expected) {
+		t.Errorf("got = %#v, want = %#v", ep, expected)
+	}
+}
+
+func TestClose(t *testing.T) {
+	type testCase struct {
+		name  string             // Label used in failure messages.
+		cep   *connectedEndpoint // Endpoint under test.
+		addFD bool               // Whether to register cep's FD with fdnotifier before running f.
+		f     func()             // Operation to invoke; a method value bound to cep.
+		want  *connectedEndpoint // Expected state of cep after f returns.
+	}
+
+	var tests []testCase
+
+	// nil is the value used by connectedEndpoint to indicate a closed file.
+	// Non-nil files are used to check if the file gets closed. Each case gets
+	// its own socket; addFD is set where f closes the last open direction.
+	f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
+	tests = append(tests, testCase{
+		name:  "First CloseRecv",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseRecv,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
+	tests = append(tests, testCase{
+		name:  "Second CloseRecv",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseRecv,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
+	tests = append(tests, testCase{
+		name:  "First CloseSend",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseSend,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
+	tests = append(tests, testCase{
+		name:  "Second CloseSend",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseSend,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
+	tests = append(tests, testCase{
+		name:  "CloseSend then CloseRecv",
+		cep:   c,
+		addFD: true,
+		f:     c.CloseRecv,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
+	tests = append(tests, testCase{
+		name:  "CloseRecv then CloseSend",
+		cep:   c,
+		addFD: true,
+		f:     c.CloseSend,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
+	tests = append(tests, testCase{
+		name:  "Full close then CloseRecv",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseRecv,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+	})
+
+	f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		t.Fatal("Creating socket:", err)
+	}
+	c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
+	tests = append(tests, testCase{
+		name:  "Full close then CloseSend",
+		cep:   c,
+		addFD: false,
+		f:     c.CloseSend,
+		want:  &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+	})
+
+	for _, test := range tests {
+		if test.addFD {
+			fdnotifier.AddFD(int32(test.cep.file.FD()), nil)
+		}
+		if test.f(); !reflect.DeepEqual(test.cep, test.want) {
+			t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want)
+		}
+	}
+}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
new file mode 100644
index 000000000..bf8da6867
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// buildIovec converts bufs into iovecs, skipping empty buffers. It returns the
+// combined length of the buffers together with the iovec slice.
+func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) {
+	var total uintptr
+	vecs := make([]syscall.Iovec, 0, 10)
+	for _, buf := range bufs {
+		// An empty buffer has no first element to take the address of.
+		if len(buf) == 0 {
+			continue
+		}
+		total += uintptr(len(buf))
+		vecs = append(vecs, syscall.Iovec{Base: &buf[0], Len: uint64(len(buf))})
+	}
+	return total, vecs
+}
+
+// fdReadVec performs a non-blocking recvmsg(2) on fd, scattering data into
+// bufs and ancillary data into control; peek adds MSG_PEEK. It returns the
+// number of bytes stored in bufs, the raw recvmsg return value (MSG_TRUNC is
+// requested, so it can exceed the space in bufs), and the message header's
+// control length as it stands after the call.
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) {
+	capacity, vecs := buildIovec(bufs)
+	var hdr syscall.Msghdr
+	if len(vecs) != 0 {
+		hdr.Iov, hdr.Iovlen = &vecs[0], uint64(len(vecs))
+	}
+	if len(control) != 0 {
+		hdr.Control, hdr.Controllen = &control[0], uint64(len(control))
+	}
+
+	recvFlags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+	if peek {
+		recvFlags |= syscall.MSG_PEEK
+	}
+
+	got, _, errno := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&hdr)), recvFlags)
+	switch {
+	case errno != 0:
+		return 0, 0, 0, errno
+	case got > capacity:
+		return capacity, got, hdr.Controllen, nil // More was reported than bufs can hold.
+	}
+	return got, got, hdr.Controllen, nil
+}
+
+// fdWriteVec performs a non-blocking sendmsg(2) on fd that gathers its payload
+// from bufs, passing MSG_NOSIGNAL. It returns the raw sendmsg return value.
+func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) {
+	var hdr syscall.Msghdr
+	if _, vecs := buildIovec(bufs); len(vecs) != 0 {
+		hdr.Iov, hdr.Iovlen = &vecs[0], uint64(len(vecs))
+	}
+
+	sent, _, errno := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&hdr)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+	if errno != 0 {
+		return 0, errno
+	}
+
+	return sent, nil
+}
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
new file mode 100644
index 000000000..74c703eb7
--- /dev/null
+++ b/pkg/sentry/fs/host/util.go
@@ -0,0 +1,197 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"os"
+	"path"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// open opens name relative to parent, or as an absolute path when parent is
+// nil (a relative name is then rejected with EINVAL), without following
+// symlinks. It tries read-write, read-only, write-only and finally O_PATH
+// access, returning the first FD that opens, else the last attempt's error.
+func open(parent *inodeOperations, name string) (int, error) {
+	if parent == nil && !path.IsAbs(name) {
+		return -1, syserror.EINVAL
+	}
+	name = path.Clean(name)
+
+	// Don't follow through symlinks.
+	const flags = syscall.O_NOFOLLOW
+
+	// Access modes to attempt, in order.
+	modes := []int{
+		syscall.O_RDWR,
+		syscall.O_RDONLY,
+		syscall.O_WRONLY,
+		linux.O_PATH, // Retry as a symlink, by including O_PATH as an option.
+	}
+	var err error
+	for _, mode := range modes {
+		var fd int
+		if fd, err = openAt(parent, name, flags|mode, 0); err == nil {
+			return fd, nil
+		}
+	}
+
+	// Everything failed.
+	return -1, err
+}
+
+func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
+	if parent != nil { // Resolve name relative to the parent's host FD.
+		return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm))
+	}
+	return syscall.Open(name, flags, uint32(perm))
+}
+
+// nodeType maps the file-type bits of a host stat mode to an fs.InodeType.
+func nodeType(s *syscall.Stat_t) fs.InodeType {
+	switch format := s.Mode & syscall.S_IFMT; format {
+	case syscall.S_IFREG:
+		return fs.RegularFile
+	case syscall.S_IFDIR:
+		return fs.Directory
+	case syscall.S_IFLNK:
+		return fs.Symlink
+	case syscall.S_IFSOCK:
+		return fs.Socket
+	case syscall.S_IFIFO:
+		return fs.Pipe
+	case syscall.S_IFCHR:
+		return fs.CharacterDevice
+	case syscall.S_IFBLK:
+		return fs.BlockDevice
+	default: // This shouldn't happen, but just in case...
+		log.Warningf("unknown host file type %d: assuming regular", format)
+		return fs.RegularFile
+	}
+}
+
+func wouldBlock(s *syscall.Stat_t) bool {
+	kind := nodeType(s) // Only pipes, sockets and character devices yield true.
+	return kind == fs.CharacterDevice || kind == fs.Socket || kind == fs.Pipe
+}
+
+// stableAttr builds the fs.StableAttr of a host file from its stat result.
+func stableAttr(s *syscall.Stat_t) fs.StableAttr {
+	// The inode ID is mapped from the host (device, inode) pair.
+	key := device.MultiDeviceKey{Device: s.Dev, Inode: s.Ino}
+	return fs.StableAttr{
+		Type:      nodeType(s),
+		DeviceID:  hostFileDevice.DeviceID(),
+		InodeID:   hostFileDevice.Map(key),
+		BlockSize: int64(s.Blksize),
+	}
+}
+
+// owner returns the file ownership to report for a host file with stat s.
+func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
+	// User requested no translation, just return actual owner.
+	if mo.dontTranslateOwnership {
+		return fs.FileOwner{UID: auth.KUID(s.Uid), GID: auth.KGID(s.Gid)}
+	}
+
+	// Show only IDs relevant to the sandboxed task. I.e. if we do not own the
+	// file, no sandboxed task can own the file. In that case, we
+	// use OverflowID for UID, implying that the IDs are not mapped in the
+	// "root" user namespace.
+	//
+	// E.g.
+	// sandbox's host EUID/EGID is 1/1.
+	// some_dir's host UID/GID is 2/1.
+	// Task that mounted this fs has virtualized EUID/EGID 5/5.
+	//
+	// If you executed `ls -n` in the sandboxed task, it would show:
+	// drwxrwxrwx [...] 65534 5 [...] some_dir
+
+	// Files are owned by OverflowID by default.
+	uid, gid := auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)
+
+	// If we own file on host, let mounting task's initial EUID own the file.
+	if s.Uid == hostUID {
+		uid = mo.mounter.UID
+	}
+
+	// If our group matches file's group, make file's group match
+	// the mounting task's initial EGID.
+	for i := range hostGIDs {
+		if hostGIDs[i] == s.Gid {
+			gid = mo.mounter.GID
+			break
+		}
+	}
+	return fs.FileOwner{UID: uid, GID: gid}
+}
+
+func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+	return fs.UnstableAttr{ // Every field is derived from the host stat result s.
+		Size:             s.Size,
+		Usage:            s.Blocks * 512, // st_blocks is counted in 512-byte units.
+		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
+		Owner:            owner(mo, s), // Translated by owner unless mo disables translation.
+		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		Links:            s.Nlink,
+	}
+}
+
+type dirInfo struct { // Buffer and read position for directory I/O.
+	buf  []byte // buffer for directory I/O.
+	nbuf int    // length of buf; return value from ReadDirent.
+	bufp int    // location of next record in buf.
+}
+
+// isBlockError reports whether err, unwrapped from any *os.PathError, is EAGAIN or EWOULDBLOCK.
+func isBlockError(err error) bool {
+	for err != syserror.EAGAIN && err != syserror.EWOULDBLOCK {
+		pe, ok := err.(*os.PathError)
+		if !ok {
+			return false
+		}
+		err = pe.Err
+	}
+	return true // Such errors can be transformed into syserror.ErrWouldBlock.
+}
+
+func hostEffectiveKIDs() (uint32, []uint32, error) {
+	groups, err := os.Getgroups() // Groups the calling process belongs to.
+	if err != nil {
+		return 0, nil, err
+	}
+	ids := make([]uint32, 0, len(groups)+1) // Room for the effective GID, appended last.
+	for _, g := range groups {
+		ids = append(ids, uint32(g))
+	}
+	return uint32(os.Geteuid()), append(ids, uint32(os.Getegid())), nil
+}
+
+var hostUID uint32    // Effective UID of the host process, set once by init.
+var hostGIDs []uint32 // Group IDs of the host process (effective GID last), set once by init.
+
+func init() {
+	hostUID, hostGIDs, _ = hostEffectiveKIDs() // NOTE(review): the error is discarded; on failure these are 0 and nil.
+}
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
new file mode 100644
index 000000000..c38d2392d
--- /dev/null
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -0,0 +1,137 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+)
+
+// createLink creates the symbolic link linkName, resolved relative to the
+// directory fd, with name as its target. It wraps symlinkat(2).
+func createLink(fd int, name string, linkName string) error {
+	target, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return err
+	}
+	link, err := syscall.BytePtrFromString(linkName)
+	if err != nil {
+		return err
+	}
+
+	// Argument order is symlinkat(target, newdirfd, linkpath).
+	_, _, errno := syscall.Syscall(syscall.SYS_SYMLINKAT, uintptr(unsafe.Pointer(target)), uintptr(fd), uintptr(unsafe.Pointer(link)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// readLink returns the target of the symlink that fd refers to. It calls
+// readlinkat(2) with an empty path, which makes the call operate on fd itself.
+func readLink(fd int) (string, error) {
+	var emptyPath [1]byte // A lone NUL byte: the C encoding of "".
+	// Buffer sizing copied from os.Readlink.
+	for l := 128; ; l *= 2 {
+		b := make([]byte, l)
+		n, _, errno := syscall.Syscall6(
+			syscall.SYS_READLINKAT, uintptr(fd),
+			uintptr(unsafe.Pointer(&emptyPath[0])),
+			uintptr(unsafe.Pointer(&b[0])),
+			uintptr(l),
+			0, 0)
+		if errno != 0 {
+			return "", errno
+		}
+		// A full buffer may hold a truncated target; retry with a larger one.
+		if n < uintptr(l) {
+			return string(b[:n]), nil
+		}
+	}
+}
+
+// unlinkAt removes the entry name from the directory fd via unlinkat(2). When
+// dir is set, AT_REMOVEDIR is passed so that a directory is removed instead.
+func unlinkAt(fd int, name string, dir bool) error {
+	p, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return err
+	}
+
+	// Only directory removal needs a flag.
+	var flags uintptr
+	if dir {
+		flags = linux.AT_REMOVEDIR
+	}
+
+	_, _, errno := syscall.Syscall(syscall.SYS_UNLINKAT, uintptr(fd), uintptr(unsafe.Pointer(p)), flags)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
+
+func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
+	switch {
+	case omit: // Checked first, so omit wins over setSysTime.
+		return syscall.Timespec{Nsec: linux.UTIME_OMIT}
+	case setSysTime:
+		return syscall.Timespec{Nsec: linux.UTIME_NOW}
+	}
+	return syscall.NsecToTimespec(t.Nanoseconds())
+}
+
+// setTimestamps applies ts to the file fd via utimensat(2) with a NULL path.
+// It does nothing when both the access and modification times are omitted.
+func setTimestamps(fd int, ts fs.TimeSpec) error {
+	if ts.ATimeOmit && ts.MTimeOmit {
+		return nil
+	}
+
+	// The array holds the access time first, then the modification time.
+	times := [2]syscall.Timespec{
+		timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime),
+		timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime),
+	}
+
+	_, _, errno := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(fd), 0 /* path */, uintptr(unsafe.Pointer(&times)), 0 /* flags */, 0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// fstatat stats name relative to the directory fd via newfstatat(2), passing
+// flags through unchanged. On failure the returned Stat_t is not meaningful.
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+	var st syscall.Stat_t
+	namep, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return st, err
+	}
+
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT, uintptr(fd),
+		uintptr(unsafe.Pointer(namep)), uintptr(unsafe.Pointer(&st)),
+		uintptr(flags), 0, 0)
+	if errno != 0 {
+		return st, errno
+	}
+	return st, nil
+}
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
new file mode 100644
index 000000000..c5f5c9c0d
--- /dev/null
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -0,0 +1,70 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"testing"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TestWait checks that a host file wrapping a pipe's read end signals EventIn once data is written.
+func TestWait(t *testing.T) {
+	var fds [2]int
+	if err := syscall.Pipe(fds[:]); err != nil {
+		t.Fatalf("Unable to create pipe: %v", err)
+	}
+	defer syscall.Close(fds[1])
+
+	ctx := contexttest.Context(t)
+	file, err := NewFile(ctx, fds[0], fs.RootOwner)
+	if err != nil {
+		syscall.Close(fds[0])
+		t.Fatalf("NewFile failed: %v", err)
+	}
+	defer file.DecRef()
+
+	r := file.Readiness(waiter.EventIn)
+	if r != 0 {
+		t.Fatalf("File is ready for read when it shouldn't be.")
+	}
+
+	e, ch := waiter.NewChannelEntry(nil)
+	file.EventRegister(&e, waiter.EventIn)
+	defer file.EventUnregister(&e)
+
+	// Check that there are no notifications yet.
+	if len(ch) != 0 {
+		t.Fatalf("Channel is non-empty")
+	}
+
+	// Write to the pipe, so it should be readable now.
+	if _, err := syscall.Write(fds[1], []byte{1}); err != nil {
+		t.Fatalf("Unable to write to pipe: %v", err)
+	}
+
+	// Check that we get a notification. We need to yield the current thread
+	// so that the fdnotifier can deliver notifications, so we use a
+	// 1-second timeout instead of just checking the length of the channel.
+	select {
+	case <-ch:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Channel not notified")
+	}
+}
author	Googler <noreply@google.com>	2018-04-27 10:37:02 -0700
committer	Adin Scannell <ascannell@google.com>	2018-04-28 01:44:26 -0400
commit	d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree	54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/fs/host
parent	f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)