Merge 216da0b7 (automated)

author: gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
committer: gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
commit: ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree: 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/fs/host
parent: deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent: 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
17 files changed, 3026 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
new file mode 100644
index 000000000..9ebb9bbb3
--- /dev/null
+++ b/pkg/sentry/fs/host/control.go
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+type scmRights struct {
+	fds []int // Host FDs not yet converted by Files; Release closes whatever is left.
+}
+
+func newSCMRights(fds []int) control.SCMRights {
+	return &scmRights{fds: fds}
+}
+
+// Files implements control.SCMRights.Files, converting up to max pending FDs.
+func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) {
+	// The result is truncated when more FDs are pending than max allows.
+	pending := len(c.fds)
+	truncated := max < pending
+	limit := pending
+	if truncated {
+		limit = max
+	}
+
+	converted := control.RightsFiles(fdsToFiles(ctx, c.fds[:limit]))
+
+	// fdsToFiles may stop early, so drop only the FDs that became files.
+	c.fds = c.fds[len(converted):]
+	return converted, truncated
+}
+
+// Clone implements transport.RightsControlMessage.Clone. It always returns nil.
+func (c *scmRights) Clone() transport.RightsControlMessage {
+	// Host rights never need to be cloned.
+	return nil
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (c *scmRights) Release() {
+	for i := range c.fds {
+		syscall.Close(c.fds[i]) // Best effort: the close error is ignored.
+	}
+	c.fds = nil
+}
+
+// fdsToFiles wraps each host FD in an fs.File. If an error is encountered, only
+// files created before the error will be returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
+	files := make([]*fs.File, 0, len(fds))
+	for i := range fds {
+		hostFD := fds[i]
+
+		// Read the host flags up front, because the calls below may
+		// modify them.
+		hostFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(hostFD), syscall.F_GETFL, 0)
+		if errno != 0 {
+			ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+			return files
+		}
+
+		// Create the file backed by hostFD.
+		file, err := NewFile(ctx, hostFD, fs.FileOwnerFromContext(ctx))
+		if err != nil {
+			ctx.Warningf("Error creating file from host FD: %v", err)
+			return files
+		}
+
+		// Carry over the flags we know how to set.
+		nonBlocking := hostFlags&syscall.O_NONBLOCK != 0
+		file.SetFlags(fs.SettableFileFlags{NonBlocking: nonBlocking})
+		files = append(files, file)
+	}
+	return files
+}
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
new file mode 100644
index 000000000..ffcd57a94
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"path"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// descriptor wraps a host fd.
+//
+// +stateify savable
+type descriptor struct {
+	// donated is true if the host fd was donated by another process.
+	donated bool
+
+	// origFD is the host fd that this file was originally created from, or -1 if
+	// none was recorded. If origFD >= 0, it must be available at time of restore.
+	// The FD can be closed after descriptor is created. Only set if donated is true.
+	origFD int
+
+	// wouldBlock is true if value (below) points to a file that can
+	// return EWOULDBLOCK for operations that would block.
+	wouldBlock bool
+
+	// value is the wrapped host fd. It is never saved or restored
+	// directly: afterLoad resets it to -1 and initAfterLoad re-creates it,
+	// either by dup'ing origFD (donated) or by reopening the path recorded
+	// for the inode under its mount root.
+	value int `state:"nosave"`
+}
+
+// newDescriptor returns a wrapped host file descriptor. If saveable, the
+// descriptor owns a dup of fd and remembers fd as origFD; otherwise it wraps
+// fd itself. If wouldBlock, the owned fd is made non-blocking and registered
+// for event notifications with queue. On failure a dup made here is closed
+// again; fd itself is never closed by newDescriptor.
+func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (_ *descriptor, err error) {
+	ownedFD, origFD := fd, -1
+	if saveable {
+		if ownedFD, err = syscall.Dup(fd); err != nil {
+			return nil, err
+		}
+		origFD = fd
+		defer func() {
+			if err != nil {
+				syscall.Close(ownedFD) // The dup is ours; don't leak it on failure.
+			}
+		}()
+	}
+	if wouldBlock {
+		if err = syscall.SetNonblock(ownedFD, true); err != nil {
+			return nil, err
+		}
+		if err = fdnotifier.AddFD(int32(ownedFD), queue); err != nil {
+			return nil, err
+		}
+	}
+	return &descriptor{donated: donated, origFD: origFD, wouldBlock: wouldBlock, value: ownedFD}, nil
+}
+
+// initAfterLoad re-creates d.value after Load: donated descriptors dup origFD,
+// all others reopen the path recorded for inode id under mo.root.
+func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
+	var err error
+	if d.donated {
+		// Restore from the FD the descriptor was originally created from.
+		if d.value, err = syscall.Dup(d.origFD); err != nil {
+			return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
+		}
+	} else {
+		// Reopen the host path that was recorded for this inode.
+		relPath, found := mo.inodeMappings[id]
+		if !found {
+			return fmt.Errorf("failed to find path for inode number %d", id)
+		}
+		hostPath := path.Join(mo.root, relPath)
+		if d.value, err = open(nil, hostPath); err != nil {
+			return fmt.Errorf("failed to open %q: %v", hostPath, err)
+		}
+	}
+
+	if !d.wouldBlock {
+		return nil
+	}
+
+	// Same setup as newDescriptor: non-blocking mode plus notifications.
+	if err = syscall.SetNonblock(d.value, true); err != nil {
+		return err
+	}
+	return fdnotifier.AddFD(int32(d.value), queue)
+}
+
+// Release unregisters value from fdnotifier (if wouldBlock), closes it and resets it to -1.
+func (d *descriptor) Release() {
+	if d.wouldBlock {
+		fdnotifier.RemoveFD(int32(d.value))
+	}
+	if err := syscall.Close(d.value); err != nil {
+		log.Warningf("error closing fd %d: %v", d.value, err) // A failed close is only logged.
+	}
+	d.value = -1
+}
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
new file mode 100644
index 000000000..8167390a9
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+// beforeSave is invoked by stateify. It panics for a donated fd that has no origFD.
+func (d *descriptor) beforeSave() {
+	if d.donated && d.origFD < 0 {
+		panic("donated file descriptor cannot be saved")
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (d *descriptor) afterLoad() {
+	// value is never saved, so mark it invalid here. It must be manually
+	// restored by the descriptor's parent using initAfterLoad.
+	d.value = -1
+}
diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go
new file mode 100644
index 000000000..055024c44
--- /dev/null
+++ b/pkg/sentry/fs/host/device.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+)
+
+// hostFileDevice is the host file virtual device, created by device.NewAnonMultiDevice.
+var hostFileDevice = device.NewAnonMultiDevice()
+
+// hostPipeDevice is the host pipe virtual device, created by device.NewAnonDevice.
+var hostPipeDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
new file mode 100644
index 000000000..ad0a3ec85
--- /dev/null
+++ b/pkg/sentry/fs/host/file.go
@@ -0,0 +1,286 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/secio"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// fileOperations implements fs.FileOperations for a host file descriptor.
+//
+// +stateify savable
+type fileOperations struct {
+	fsutil.FileNoIoctl              `state:"nosave"`
+	fsutil.FileNoSplice             `state:"nosave"`
+	fsutil.FileNoopRelease          `state:"nosave"`
+	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+	// iops are the Inode operations for this file.
+	iops *inodeOperations `state:"wait"`
+
+	// dirinfo is a scratch buffer for reading directory entries, allocated lazily by IterateDir.
+	dirinfo *dirInfo `state:"nosave"`
+
+	// dirCursor is the directory cursor.
+	dirCursor string
+}
+
+// Compile-time check that fileOperations implements fs.FileOperations.
+var _ fs.FileOperations = (*fileOperations)(nil)
+
+// NewFile creates a new File backed by the provided host file descriptor. If
+// NewFile succeeds, ownership of the FD is transferred to the returned File.
+//
+// The returned File cannot be saved, since there is no guarantee that the same
+// FD will exist or represent the same file at time of restore. If such a
+// guarantee does exist, use ImportFile instead.
+func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, mounter, false /* saveable */, false /* isTTY */)
+}
+
+// ImportFile creates a new File backed by the provided host file descriptor.
+// Unlike NewFile, the file descriptor used by the File is duped from FD to
+// ensure that later changes to FD are not reflected by the fs.File.
+//
+// If the returned file is saved, it will be restored by re-importing the FD
+// originally passed to ImportFile. It is the restorer's responsibility to
+// ensure that the FD represents the same file.
+func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) {
+	return newFileFromDonatedFD(ctx, fd, mounter, true /* saveable */, isTTY)
+}
+
+// newFileFromDonatedFD returns an fs.File from a donated FD. saveable is true
+// if the FD is saveable; isTTY asks for a TTY file and is rejected for sockets.
+func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) {
+	var stat syscall.Stat_t
+	if err := syscall.Fstat(donated, &stat); err != nil {
+		return nil, err
+	}
+	flags, err := fileFlagsFromDonatedFD(donated)
+	if err != nil {
+		return nil, err
+	}
+
+	if stat.Mode&syscall.S_IFMT == syscall.S_IFSOCK {
+		if isTTY {
+			return nil, fmt.Errorf("cannot import host socket as TTY")
+		}
+
+		sock, err := newSocket(ctx, donated, saveable)
+		if err != nil {
+			return nil, err
+		}
+		sock.SetFlags(fs.SettableFileFlags{
+			NonBlocking: flags.NonBlocking,
+		})
+		return sock, nil
+	}
+
+	// Anything that is not a socket gets its own mount source rooted at "/".
+	msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
+	inode, inodeErr := newInode(ctx, msrc, donated, saveable, true /* donated */)
+	if inodeErr != nil {
+		return nil, inodeErr
+	}
+	iops := inode.InodeOperations.(*inodeOperations)
+
+	// Name the dirent after the inode number; our reference is dropped on return.
+	dirent := fs.NewDirent(inode, fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID))
+	defer dirent.DecRef()
+
+	if isTTY {
+		return newTTYFile(ctx, dirent, flags, iops), nil
+	}
+	return newFile(ctx, dirent, flags, iops), nil
+}
+
+// fileFlagsFromDonatedFD maps the host F_GETFL flags of donated to fs.FileFlags; a failed fcntl is logged and returned as EIO.
+func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) {
+	raw, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0)
+	if errno != 0 {
+		log.Warningf("Failed to get file flags for donated FD %d (errno=%d)", donated, errno)
+		return fs.FileFlags{}, syscall.EIO
+	}
+	isSet := func(bits uintptr) bool { return raw&bits != 0 }
+	mode := raw & syscall.O_ACCMODE
+	return fs.FileFlags{
+		Direct: isSet(syscall.O_DIRECT), NonBlocking: isSet(syscall.O_NONBLOCK),
+		Sync: isSet(syscall.O_SYNC), Append: isSet(syscall.O_APPEND),
+		Read:  mode == syscall.O_RDONLY || mode == syscall.O_RDWR,
+		Write: mode == syscall.O_WRONLY || mode == syscall.O_RDWR,
+	}, nil
+}
+
+// newFile returns a new fs.File backed by iops. Files that never return
+// EWOULDBLOCK additionally allow reads and writes at an arbitrary offset.
+func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
+	offsetIO := !iops.ReturnsWouldBlock()
+	if offsetIO {
+		flags.Pread, flags.Pwrite = true, true
+	}
+	fops := &fileOperations{iops: iops}
+	return fs.NewFile(ctx, dirent, flags, fops)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	f.iops.fileState.queue.EventRegister(e, mask)
+	fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) // Presumably re-syncs the notifier with the queue's waiters — confirm.
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (f *fileOperations) EventUnregister(e *waiter.Entry) {
+	f.iops.fileState.queue.EventUnregister(e)
+	fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) // Presumably re-syncs the notifier with the queue's waiters — confirm.
+}
+
+// Readiness uses the poll() syscall (via fdnotifier.NonBlockingPoll) to check the status of the underlying FD.
+func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask)
+}
+
+// Readdir implements fs.FileOperations.Readdir by delegating to
+// fs.DirentReaddir with f as the directory iterator.
+func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+	dirCtx := &fs.DirCtx{Serializer: serializer, DirCursor: &f.dirCursor}
+
+	// The root may be nil; only drop the reference if we actually got one.
+	root := fs.RootFromContext(ctx)
+	if root != nil {
+		defer root.DecRef()
+	}
+	return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset())
+}
+
+// IterateDir implements fs.DirIterator.IterateDir. It reads all entries of the
+// host directory and hands them to fs.GenericReaddir as a sorted dentry map.
+func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+	if f.dirinfo == nil {
+		f.dirinfo = &dirInfo{buf: make([]byte, usermem.PageSize)}
+	}
+	entries, err := f.iops.readdirAll(f.dirinfo)
+	if err != nil {
+		return offset, err
+	}
+	n, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries))
+	return offset + n, err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	switch {
+	case f.iops.ReturnsWouldBlock():
+		// Files that may block can't be memory mapped, so writes need not
+		// synchronize with mappings or with metadata cached by the fs.Inode.
+		if canMap(file.Dirent.Inode) {
+			panic("files that can return EWOULDBLOCK cannot be memory mapped")
+		}
+		// The offset is ignored: these files don't support writing at
+		// an arbitrary offset.
+		hostFile := fd.NewReadWriter(f.iops.fileState.FD())
+		n, err := src.CopyInTo(ctx, safemem.FromIOWriter{hostFile})
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	case !file.Dirent.Inode.MountSource.Flags.ForcePageCache:
+		// No forced page cache: write straight to the host FD at offset.
+		hostFile := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+		return src.CopyInTo(ctx, safemem.FromIOWriter{hostFile})
+	default:
+		return f.iops.cachingInodeOps.Write(ctx, src, offset)
+	}
+}
+
+// Read implements fs.FileOperations.Read. A would-block error that follows a
+// partial read is dropped so that the data already read is returned.
+func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	switch {
+	case f.iops.ReturnsWouldBlock():
+		// Files that may block can't be memory mapped, so reads need not
+		// synchronize with mappings or with metadata cached by the fs.Inode.
+		if canMap(file.Dirent.Inode) {
+			panic("files that can return EWOULDBLOCK cannot be memory mapped")
+		}
+		// The offset is ignored: these files don't support reading at
+		// an arbitrary offset.
+		hostFile := fd.NewReadWriter(f.iops.fileState.FD())
+		n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{hostFile})
+		if !isBlockError(err) {
+			return n, err
+		}
+		// If we got any data at all, return it as a "completed" partial
+		// read rather than retrying until complete.
+		if n != 0 {
+			return n, nil
+		}
+		return n, syserror.ErrWouldBlock
+	case !file.Dirent.Inode.MountSource.Flags.ForcePageCache:
+		// No forced page cache: read straight from the host FD at offset.
+		hostFile := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+		return dst.CopyOutFrom(ctx, safemem.FromIOReader{hostFile})
+	default:
+		return f.iops.cachingInodeOps.Read(ctx, file, dst, offset)
+	}
+}
+
+// Fsync implements fs.FileOperations.Fsync. SyncAll and SyncData write out the inode first; all valid types fsync the host FD.
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+	switch syncType {
+	case fs.SyncAll, fs.SyncData:
+		if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
+			return err
+		}
+	case fs.SyncBackingStorage: // Nothing to write out before the host fsync.
+	default:
+		panic("invalid sync type")
+	}
+	return syscall.Fsync(f.iops.fileState.FD())
+}
+
+// Flush implements fs.FileOperations.Flush. It always returns nil.
+func (f *fileOperations) Flush(context.Context, *fs.File) error {
+	// This is a no-op because flushing the resource backing this
+	// file would mean closing it. We can't do that because other
+	// open files may depend on the backing host FD.
+	return nil
+}
+
+// ConfigureMMap implements fs.FileOperations.ConfigureMMap. Unmappable inodes yield ENODEV.
+func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
+	if canMap(file.Dirent.Inode) {
+		return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts)
+	}
+	return syserror.ENODEV
+}
+
+// Seek implements fs.FileOperations.Seek, passing along the dirCursor that Readdir also uses.
+func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+	return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor)
+}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
new file mode 100644
index 000000000..b1b8dc0b6
--- /dev/null
+++ b/pkg/sentry/fs/host/fs.go
@@ -0,0 +1,339 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host implements an fs.Filesystem for files backed by host
+// file descriptors.
+package host
+
+import (
+	"fmt"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FilesystemName is the name under which Filesystem is registered.
+const FilesystemName = "whitelistfs"
+
+const (
+	// whitelistKey is the mount option containing a '|'-separated list
+	// of host paths to whitelist.
+	whitelistKey = "whitelist"
+
+	// rootPathKey is the mount option containing the root path of the
+	// mount.
+	rootPathKey = "root"
+
+	// dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
+	dontTranslateOwnershipKey = "dont_translate_ownership"
+)
+
+// maxTraversals is the link traversal budget given to each lookup made while building the whitelist.
+const maxTraversals = 10
+
+// Filesystem is a pseudo file system that is only available during the setup
+// to lock down the configurations. This filesystem should only be mounted at root.
+//
+// Think twice before exposing this to applications.
+//
+// +stateify savable
+type Filesystem struct {
+	// paths is the set of host paths to whitelist, as parsed by Mount.
+	paths []string
+}
+
+var _ fs.Filesystem = (*Filesystem)(nil)
+
+// Name returns FilesystemName, the identifier of this file system.
+func (*Filesystem) Name() string {
+	return FilesystemName
+}
+
+// AllowUserMount returns false: users may not use mount(2) with this file system.
+func (*Filesystem) AllowUserMount() bool {
+	return false
+}
+
+// AllowUserList returns true: this filesystem may be listed in /proc/filesystems.
+func (*Filesystem) AllowUserList() bool {
+	return true
+}
+
+// Flags returns 0: there is nothing special about this file system.
+func (*Filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns an fs.Inode exposing the host file system. It is intended to be
+// locked down afterwards with InstallWhitelist, using the paths parsed here.
+func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
+	// Parse generic comma-separated key=value options.
+	options := fs.GenericMountSourceOptions(data)
+
+	// Grab the whitelist ('|'-separated host paths) if one was specified.
+	// TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow
+	// no whitelist.
+	if wl, ok := options[whitelistKey]; ok {
+		f.paths = strings.Split(wl, "|")
+		delete(options, whitelistKey)
+	}
+
+	// If the rootPath was set, use it. Otherwise default to the root of the
+	// host fs.
+	rootPath := "/"
+	if rp, ok := options[rootPathKey]; ok {
+		rootPath = rp
+		delete(options, rootPathKey)
+		// We must relativize the whitelisted paths to the new root.
+		// NOTE(review): filepath.Rel also succeeds for paths outside rootPath ("../x"), so those are not rejected here.
+		for i, p := range f.paths {
+			rel, err := filepath.Rel(rootPath, p)
+			if err != nil {
+				return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
+			}
+			f.paths[i] = path.Join("/", rel)
+		}
+	}
+
+	var dontTranslateOwnership bool
+	if v, ok := options[dontTranslateOwnershipKey]; ok {
+		b, err := strconv.ParseBool(v)
+		if err != nil {
+			return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
+		}
+		dontTranslateOwnership = b
+		delete(options, dontTranslateOwnershipKey)
+	}
+
+	// Fail if the caller passed us more options than we know about.
+	if len(options) > 0 {
+		return nil, fmt.Errorf("unsupported mount options: %v", options)
+	}
+
+	// Open the root last so that no validation error above can leak the host FD.
+	fd, err := open(nil, rootPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to find root: %v", err)
+	}
+
+	// The mounting EUID/EGID will be cached by this file system. This will
+	// be used to assign ownership to files that we own.
+	owner := fs.FileOwnerFromContext(ctx)
+	msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
+	return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
+}
+
+// InstallWhitelist locks down the MountNamespace to only the currently installed
+// Dirents and the whitelisted paths held in f.paths (set by Mount).
+func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
+	return installWhitelist(ctx, m, f.paths)
+}
+
+// installWhitelist looks up every path in paths, including its intermediate
+// components, directory children and symlink targets, and then freezes m.
+func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
+	if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
+		// Warning will be logged during filter installation if the empty
+		// whitelist matters (allows for host file access).
+		return nil
+	}
+
+	// done tracks entries already added.
+	done := make(map[string]bool)
+	root := m.Root()
+	defer root.DecRef()
+
+	for i := 0; i < len(paths); i++ {
+		// Make sure the path is absolute. This is a sanity check.
+		if !path.IsAbs(paths[i]) {
+			return fmt.Errorf("path %q is not absolute", paths[i])
+		}
+
+		// We need to add all the intermediate paths, in case one of
+		// them is a symlink that needs to be resolved.
+		for j := 1; j <= len(paths[i]); j++ {
+			if j < len(paths[i]) && paths[i][j] != '/' {
+				continue
+			}
+			current := paths[i][:j]
+
+			// Lookup the given component in the tree.
+			remainingTraversals := uint(maxTraversals)
+			d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals)
+			if err != nil {
+				log.Warningf("populate failed for %q: %v", current, err)
+				continue
+			}
+
+			// It's critical that this DecRef happens after the
+			// freeze below. This ensures that the dentry is in
+			// place to be frozen. Otherwise, we freeze without
+			// these entries.
+			defer d.DecRef()
+
+			// Expand the last component if necessary.
+			if current == paths[i] {
+				// Is it a directory or symlink?
+				sattr := d.Inode.StableAttr
+				if fs.IsDir(sattr) {
+					for name := range childDentAttrs(ctx, d) {
+						paths = append(paths, path.Join(current, name))
+					}
+				}
+				if fs.IsSymlink(sattr) {
+					// Only expand symlinks once. The folder
+					// structure may contain recursive symlinks
+					// and we don't want to expand this symlink
+					// forever. This is safe because this is the
+					// last component. If a later path wants to
+					// symlink something beneath this symlink,
+					// that is still handled by the FindLink above.
+					if done[current] {
+						continue
+					}
+
+					s, err := d.Inode.Readlink(ctx)
+					if err != nil {
+						log.Warningf("readlink failed for %q: %v", current, err)
+						continue
+					}
+					if path.IsAbs(s) {
+						paths = append(paths, s)
+					} else {
+						target := path.Join(path.Dir(current), s)
+						paths = append(paths, target)
+					}
+				}
+			}
+
+			// Only report this one once even though we may look
+			// it up more than once. If we whitelist /a/b,/a then
+			// /a will be "done" when it is looked up for /a/b,
+			// however we still need to expand all of its contents
+			// when whitelisting /a.
+			if !done[current] {
+				log.Debugf("whitelisted: %s", current)
+			}
+			done[current] = true
+		}
+	}
+
+	// Freeze the mount tree in place. This prevents any new paths from
+	// being opened and any old ones from being removed. If we do provide
+	// tmpfs mounts, we'll want to freeze/thaw those separately.
+	m.Freeze()
+	return nil
+}
+
+func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
+	dirname, _ := d.FullName(nil /* root */) // Only used in the log messages below.
+	dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+	if err != nil {
+		log.Warningf("failed to open directory %q: %v", dirname, err)
+		return nil
+	}
+	defer dir.DecRef() // Keep dir referenced until Readdir below has finished with it.
+	var stubSerializer fs.CollectEntriesSerializer
+	if err := dir.Readdir(ctx, &stubSerializer); err != nil {
+		log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
+		return nil
+	}
+	delete(stubSerializer.Entries, ".") // Return only real children of d.
+	delete(stubSerializer.Entries, "..")
+	return stubSerializer.Entries
+}
+
+// newMountSource constructs a new host fs.MountSource relative to a root path. The root should match the mount point.
+func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
+	ops := &superOperations{
+		root:                   root,
+		inodeMappings:          make(map[uint64]string),
+		mounter:                mounter,
+		dontTranslateOwnership: dontTranslateOwnership,
+	}
+	return fs.NewMountSource(ops, filesystem, flags)
+}
+
+// superOperations implements fs.MountSourceOperations.
+//
+// +stateify savable
+type superOperations struct {
+	fs.SimpleMountSourceOperations
+
+	// root is the path of the mount point. All inode mappings
+	// are relative to this root.
+	root string
+
+	// inodeMappings maps the inode IDs of fs.Inodes associated
+	// with this MountSource to their paths under root.
+	inodeMappings map[uint64]string
+
+	// mounter is the cached EUID/EGID that mounted this file system.
+	mounter fs.FileOwner
+
+	// dontTranslateOwnership indicates whether to not translate file
+	// ownership.
+	//
+	// By default, files/directories owned by the sandbox use the UID/GID
+	// of the mounter. For files/directories that are not owned by the
+	// sandbox, file UID/GID is translated to a UID/GID which cannot
+	// be mapped in the sandboxed application's user namespace. The
+	// UID/GID will look like the nobody UID/GID (65534) but is not
+	// strictly owned by the user "nobody".
+	//
+	// If whitelistfs is a lower filesystem in an overlay, set
+	// dont_translate_ownership=true in mount options.
+	dontTranslateOwnership bool
+}
+
+var _ fs.MountSourceOperations = (*superOperations)(nil)
+
+// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings by installing a fresh, empty map.
+func (m *superOperations) ResetInodeMappings() {
+	m.inodeMappings = make(map[uint64]string)
+}
+
+// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
+func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
+	// Deliberately read the InodeID from our own fileState instead of the
+	// inode's StableAttr: the latter *CANNOT* be trusted, because overlay
+	// copyUp may have changed it out from under us.
+	iops := inode.InodeOperations.(*inodeOperations)
+	m.inodeMappings[iops.fileState.sattr.InodeID] = path
+}
+
+// Keep implements fs.MountSourceOperations.Keep. It always returns false.
+//
+// TODO(b/72455313,b/77596690): It is possible to change the permissions on a
+// host file while it is in the dirent cache (say from RO to RW), but it is not
+// possible to re-open the file with more relaxed permissions, since the host
+// FD is already open and stored in the inode.
+//
+// Using the dirent LRU cache increases the odds that this bug is encountered.
+// Since host file access is relatively fast anyway, we disable the LRU cache
+// for host fs files. Once we can properly deal with permissions changes and
+// re-opening host files, we should revisit whether or not to make use of the
+// LRU cache.
+func (*superOperations) Keep(*fs.Dirent) bool {
+	return false
+}
+
+func init() {
+	fs.RegisterFilesystem(&Filesystem{}) // Registers a zero-value Filesystem with the fs package.
+}
diff --git a/pkg/sentry/fs/host/host_state_autogen.go b/pkg/sentry/fs/host/host_state_autogen.go
new file mode 100755
index 000000000..22cfa1222
--- /dev/null
+++ b/pkg/sentry/fs/host/host_state_autogen.go
@@ -0,0 +1,142 @@
+// automatically generated by stateify.
+
+package host
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *descriptor) save(m state.Map) {
+	x.beforeSave()
+	m.Save("donated", &x.donated)
+	m.Save("origFD", &x.origFD)
+	m.Save("wouldBlock", &x.wouldBlock)
+}
+
+func (x *descriptor) load(m state.Map) {
+	m.Load("donated", &x.donated)
+	m.Load("origFD", &x.origFD)
+	m.Load("wouldBlock", &x.wouldBlock)
+	m.AfterLoad(x.afterLoad)
+}
+
+func (x *fileOperations) beforeSave() {}
+func (x *fileOperations) save(m state.Map) {
+	x.beforeSave()
+	m.Save("iops", &x.iops)
+	m.Save("dirCursor", &x.dirCursor)
+}
+
+func (x *fileOperations) afterLoad() {}
+func (x *fileOperations) load(m state.Map) {
+	m.LoadWait("iops", &x.iops)
+	m.Load("dirCursor", &x.dirCursor)
+}
+
+func (x *Filesystem) beforeSave() {}
+func (x *Filesystem) save(m state.Map) {
+	x.beforeSave()
+	m.Save("paths", &x.paths)
+}
+
+func (x *Filesystem) afterLoad() {}
+func (x *Filesystem) load(m state.Map) {
+	m.Load("paths", &x.paths)
+}
+
+func (x *superOperations) beforeSave() {}
+func (x *superOperations) save(m state.Map) {
+	x.beforeSave()
+	m.Save("SimpleMountSourceOperations", &x.SimpleMountSourceOperations)
+	m.Save("root", &x.root)
+	m.Save("inodeMappings", &x.inodeMappings)
+	m.Save("mounter", &x.mounter)
+	m.Save("dontTranslateOwnership", &x.dontTranslateOwnership)
+}
+
+func (x *superOperations) afterLoad() {}
+func (x *superOperations) load(m state.Map) {
+	m.Load("SimpleMountSourceOperations", &x.SimpleMountSourceOperations)
+	m.Load("root", &x.root)
+	m.Load("inodeMappings", &x.inodeMappings)
+	m.Load("mounter", &x.mounter)
+	m.Load("dontTranslateOwnership", &x.dontTranslateOwnership)
+}
+
+func (x *inodeOperations) beforeSave() {}
+func (x *inodeOperations) save(m state.Map) {
+	x.beforeSave()
+	m.Save("fileState", &x.fileState)
+	m.Save("cachingInodeOps", &x.cachingInodeOps)
+}
+
+func (x *inodeOperations) afterLoad() {}
+func (x *inodeOperations) load(m state.Map) {
+	m.LoadWait("fileState", &x.fileState)
+	m.Load("cachingInodeOps", &x.cachingInodeOps)
+}
+
+func (x *inodeFileState) save(m state.Map) {
+	x.beforeSave()
+	if !state.IsZeroValue(x.queue) { m.Failf("queue is %v, expected zero", x.queue) }
+	m.Save("mops", &x.mops)
+	m.Save("descriptor", &x.descriptor)
+	m.Save("sattr", &x.sattr)
+	m.Save("savedUAttr", &x.savedUAttr)
+}
+
+func (x *inodeFileState) load(m state.Map) {
+	m.LoadWait("mops", &x.mops)
+	m.LoadWait("descriptor", &x.descriptor)
+	m.LoadWait("sattr", &x.sattr)
+	m.Load("savedUAttr", &x.savedUAttr)
+	m.AfterLoad(x.afterLoad)
+}
+
+func (x *ConnectedEndpoint) save(m state.Map) {
+	x.beforeSave()
+	m.Save("queue", &x.queue)
+	m.Save("path", &x.path)
+	m.Save("ref", &x.ref)
+	m.Save("readClosed", &x.readClosed)
+	m.Save("writeClosed", &x.writeClosed)
+	m.Save("srfd", &x.srfd)
+	m.Save("stype", &x.stype)
+}
+
+func (x *ConnectedEndpoint) load(m state.Map) {
+	m.Load("queue", &x.queue)
+	m.Load("path", &x.path)
+	m.Load("ref", &x.ref)
+	m.Load("readClosed", &x.readClosed)
+	m.Load("writeClosed", &x.writeClosed)
+	m.LoadWait("srfd", &x.srfd)
+	m.Load("stype", &x.stype)
+	m.AfterLoad(x.afterLoad)
+}
+
+func (x *TTYFileOperations) beforeSave() {}
+func (x *TTYFileOperations) save(m state.Map) {
+	x.beforeSave()
+	m.Save("fileOperations", &x.fileOperations)
+	m.Save("session", &x.session)
+	m.Save("fgProcessGroup", &x.fgProcessGroup)
+}
+
+func (x *TTYFileOperations) afterLoad() {}
+func (x *TTYFileOperations) load(m state.Map) {
+	m.Load("fileOperations", &x.fileOperations)
+	m.Load("session", &x.session)
+	m.Load("fgProcessGroup", &x.fgProcessGroup)
+}
+
+func init() {
+	state.Register("host.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load})
+	state.Register("host.fileOperations", (*fileOperations)(nil), state.Fns{Save: (*fileOperations).save, Load: (*fileOperations).load})
+	state.Register("host.Filesystem", (*Filesystem)(nil), state.Fns{Save: (*Filesystem).save, Load: (*Filesystem).load})
+	state.Register("host.superOperations", (*superOperations)(nil), state.Fns{Save: (*superOperations).save, Load: (*superOperations).load})
+	state.Register("host.inodeOperations", (*inodeOperations)(nil), state.Fns{Save: (*inodeOperations).save, Load: (*inodeOperations).load})
+	state.Register("host.inodeFileState", (*inodeFileState)(nil), state.Fns{Save: (*inodeFileState).save, Load: (*inodeFileState).load})
+	state.Register("host.ConnectedEndpoint", (*ConnectedEndpoint)(nil), state.Fns{Save: (*ConnectedEndpoint).save, Load: (*ConnectedEndpoint).load})
+	state.Register("host.TTYFileOperations", (*TTYFileOperations)(nil), state.Fns{Save: (*TTYFileOperations).save, Load: (*TTYFileOperations).load})
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
new file mode 100644
index 000000000..7a230e426
--- /dev/null
+++ b/pkg/sentry/fs/host/inode.go
@@ -0,0 +1,527 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/secio"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// inodeOperations implements fs.InodeOperations for fs.Inodes backed
+// by a host file descriptor.
+//
+// +stateify savable
+type inodeOperations struct {
+	fsutil.InodeNotVirtual           `state:"nosave"`
+	fsutil.InodeNoExtendedAttributes `state:"nosave"`
+
+	// fileState implements fsutil.CachedFileObject. It exists
+	// to break a circular load dependency between inodeOperations
+	// and cachingInodeOps (below).
+	fileState *inodeFileState `state:"wait"`
+
+	// cachingInodeOps implements memmap.Mappable.
+	cachingInodeOps *fsutil.CachingInodeOperations
+
+	// readdirMu protects the file offset on the host FD. This is needed
+	// for readdir because getdents must use the kernel offset, so
+	// concurrent readdirs must be exclusive.
+	//
+	// All read/write functions pass the offset directly to the kernel and
+	// thus don't need a lock.
+	readdirMu sync.Mutex `state:"nosave"`
+}
+
+// inodeFileState implements fsutil.CachedFileObject and otherwise fully
+// encapsulates state that needs to be manually loaded on restore for
+// this file object.
+//
+// This unfortunate structure exists because fsutil.CachingInodeOperations
+// defines afterLoad and therefore cannot be lazily loaded (to break a
+// circular load dependency between it and inodeOperations). Even with
+// lazy loading, this approach defines the dependencies between objects
+// and the expected load behavior more concretely.
+//
+// +stateify savable
+type inodeFileState struct {
+	// Common file system state.
+	mops *superOperations `state:"wait"`
+
+	// descriptor is the backing host FD.
+	descriptor *descriptor `state:"wait"`
+
+	// Event queue for blocking operations.
+	queue waiter.Queue `state:"zerovalue"`
+
+	// sattr is used to restore the inodeOperations.
+	sattr fs.StableAttr `state:"wait"`
+
+	// savedUAttr is only allocated during S/R. It points to the save-time
+	// unstable attributes and is used to validate restore-time ones.
+	//
+	// Note that these unstable attributes are only used to detect cross-S/R
+	// external file system metadata changes. They may differ from the
+	// cached unstable attributes in cachingInodeOps, as that might differ
+	// from the external file system attributes if there had been WriteOut
+	// failures. S/R is transparent to Sentry and the latter will continue
+	// using its cached values after restore.
+	savedUAttr *fs.UnstableAttr
+}
+
+// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. It reads from the host FD, starting at offset, into dsts.
+func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	// TODO(jamieliu): Using safemem.FromIOReader here is wasteful for two
+	// reasons:
+	//
+	// - Using preadv instead of iterated preads saves on host system calls.
+	//
+	// - Host system calls can handle destination memory that would fault in
+	// gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true),
+	// so the buffering performed by FromIOReader is unnecessary.
+	//
+	// This also applies to the write path below.
+	return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts)
+}
+
+// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. It writes srcs to the host FD, starting at offset.
+func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs)
+}
+
+// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+// Ownership changes are rejected with EPERM; permissions, size and timestamps
+// are applied to the host FD in that order, stopping at the first error.
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
+	switch {
+	case mask.Empty():
+		return nil
+	case mask.UID || mask.GID:
+		return syserror.EPERM
+	}
+	if mask.Perms {
+		if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil {
+			return err
+		}
+	}
+	if mask.Size {
+		if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil {
+			return err
+		}
+	}
+	if !mask.AccessTime && !mask.ModificationTime {
+		return nil
+	}
+	// Only the timestamps selected by the mask are set; the others are omitted.
+	return setTimestamps(i.FD(), fs.TimeSpec{
+		ATime:     attr.AccessTime,
+		ATimeOmit: !mask.AccessTime,
+		MTime:     attr.ModificationTime,
+		MTimeOmit: !mask.ModificationTime,
+	})
+}
+
+// Sync implements fsutil.CachedFileObject.Sync by calling fsync on the host FD.
+func (i *inodeFileState) Sync(ctx context.Context) error {
+	return syscall.Fsync(i.FD())
+}
+
+// FD implements fsutil.CachedFileObject.FD. It returns the value held by i.descriptor.
+func (i *inodeFileState) FD() int {
+	return i.descriptor.value
+}
+
+func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.FD(), &s); err != nil { // Stat the backing host FD.
+		return fs.UnstableAttr{}, err
+	}
+	return unstableAttr(i.mops, &s), nil // Convert the host stat with the package-level unstableAttr helper.
+}
+
+// Allocate implements fsutil.CachedFileObject.Allocate by calling fallocate with mode 0 on the host FD.
+func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error {
+	return syscall.Fallocate(i.FD(), 0, offset, length)
+}
+
+// Compile-time assertion that *inodeOperations implements fs.InodeOperations.
+var _ fs.InodeOperations = (*inodeOperations)(nil)
+
+// newInode returns a new fs.Inode backed by the host FD fd. This function does not close fd when it returns an error.
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+	// Retrieve host metadata for fd.
+	var s syscall.Stat_t
+	err := syscall.Fstat(fd, &s)
+	if err != nil {
+		return nil, err
+	}
+
+	fileState := &inodeFileState{
+		mops:  msrc.MountSourceOperations.(*superOperations),
+		sattr: stableAttr(&s),
+	}
+
+	// Initialize the wrapped host file descriptor.
+	fileState.descriptor, err = newDescriptor(
+		fd,
+		donated,
+		saveable,
+		wouldBlock(&s),
+		&fileState.queue,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	// Build the fs.InodeOperations from the same stat result.
+	uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
+	iops := &inodeOperations{
+		fileState:       fileState,
+		cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+	}
+
+	// Return the fs.Inode.
+	return fs.NewInode(iops, msrc, fileState.sattr), nil
+}
+
+// Mappable implements fs.InodeOperations.Mappable. Inodes rejected by canMap yield a nil Mappable.
+func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable {
+	if canMap(inode) {
+		return i.cachingInodeOps
+	}
+	return nil
+}
+
+// ReturnsWouldBlock reports whether this host FD can return EWOULDBLOCK for
+// operations that would block, as recorded on the descriptor.
+func (i *inodeOperations) ReturnsWouldBlock() bool {
+	return i.fileState.descriptor.wouldBlock
+}
+
+// Release implements fs.InodeOperations.Release. It releases the host descriptor, then cachingInodeOps.
+func (i *inodeOperations) Release(context.Context) {
+	i.fileState.descriptor.Release()
+	i.cachingInodeOps.Release()
+}
+
+// Lookup implements fs.InodeOperations.Lookup.
+func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
+	// Get a new FD relative to i at name. Every failure, ENOENT included, is
+	// handed back to the caller unchanged.
+	hostFD, err := open(i, name)
+	if err != nil {
+		return nil, err
+	}
+
+	// Wrap the descriptor in a non-saveable, non-donated host inode.
+	// NOTE(review): hostFD is not closed in this function if newInode fails;
+	// confirm whether that leaks the descriptor.
+	child, err := newInode(ctx, dir.MountSource, hostFD, false /* saveable */, false /* donated */)
+	if err != nil {
+		return nil, err
+	}
+
+	return fs.NewDirent(child, name), nil
+}
+
+// Create implements fs.InodeOperations.Create.
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
+	// Create a file relative to i at name.
+	//
+	// N.B. We always open this file O_RDWR regardless of flags because a
+	// future GetFile might want more access. Open allows this regardless
+	// of perm.
+	fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
+	if err != nil {
+		return nil, err
+	}
+
+	inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+	if err != nil {
+		return nil, err
+	}
+
+	d := fs.NewDirent(inode, name)
+	defer d.DecRef()
+	return inode.GetFile(ctx, d, flags)
+}
+
+// CreateDirectory implements fs.InodeOperations.CreateDirectory via mkdirat relative to i's host FD.
+func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+	return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+}
+
+// CreateLink implements fs.InodeOperations.CreateLink by delegating to createLink with i's host FD.
+func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
+	return createLink(i.fileState.FD(), oldname, newname)
+}
+
+// CreateHardLink implements fs.InodeOperations.CreateHardLink. It always returns EPERM.
+func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
+	return syserror.EPERM
+}
+
+// CreateFifo implements fs.InodeOperations.CreateFifo. It always returns EPERM.
+func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+	return syserror.EPERM
+}
+
+// Remove implements fs.InodeOperations.Remove by calling unlinkAt on i's host FD with dir set to false.
+func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
+	return unlinkAt(i.fileState.FD(), name, false /* dir */)
+}
+
+// RemoveDirectory implements fs.InodeOperations.RemoveDirectory by calling unlinkAt on i's host FD with dir set to true.
+func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
+	return unlinkAt(i.fileState.FD(), name, true /* dir */)
+}
+
+// Rename implements fs.InodeOperations.Rename. It returns EXDEV unless both parents are backed by *inodeOperations.
+func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
+	srcDir, isHost := oldParent.InodeOperations.(*inodeOperations)
+	if !isHost {
+		return syscall.EXDEV
+	}
+	dstDir, isHost := newParent.InodeOperations.(*inodeOperations)
+	if !isHost {
+		return syscall.EXDEV
+	}
+	return syscall.Renameat(srcDir.fileState.FD(), oldName, dstDir.fileState.FD(), newName)
+}
+
+// Bind implements fs.InodeOperations.Bind. It always returns EOPNOTSUPP.
+func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) {
+	return nil, syserror.EOPNOTSUPP
+}
+
+// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. It always returns nil.
+func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.BoundEndpoint {
+	return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile by wrapping i in a file built by newFile; the error is always nil.
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	return newFile(ctx, d, flags, i), nil
+}
+
+// canMap reports whether inode can be memory mapped, i.e. whether fs.IsFile holds for its StableAttr.
+func canMap(inode *fs.Inode) bool {
+	// FIXME(b/38213152): Some obscure character devices can be mapped.
+	return fs.IsFile(inode.StableAttr)
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+//
+// The result is read either from cachingInodeOps or from the host FD.
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+	// If ForcePageCache is set on the mount and this inode is mappable, we
+	// are maintaining consistency of metadata ourselves, so answer from
+	// our cached attributes.
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		return i.cachingInodeOps.UnstableAttr(ctx, inode)
+	}
+
+	// Otherwise we can use host kernel metadata caches: the host manages
+	// consistency of its own inode structures, and fs.Inodes that can
+	// never be mapped synchronize metadata updates through host caches.
+	// So just obtain the attributes from the host FD.
+	return i.fileState.unstableAttr(ctx)
+}
+
+// Check implements fs.InodeOperations.Check by deferring to fs.ContextCanAccessFile.
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+	return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner. It always returns EPERM.
+func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+	return syserror.EPERM
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool {
+	// With ForcePageCache set on a mappable inode we cannot use host kernel
+	// metadata caches, so update our cached metadata instead.
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		return i.cachingInodeOps.SetPermissions(ctx, inode, f)
+	}
+	// Otherwise just change the permissions on the FD; the host will
+	// synchronize the metadata update with any host inode and page cache.
+	err := syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode()))
+	return err == nil
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
+	// Are we maintaining metadata ourselves rather than relying on the
+	// host kernel metadata caches?
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// Then update our cached metadata.
+		return i.cachingInodeOps.SetTimestamps(ctx, inode, ts)
+	}
+	// Otherwise just change the timestamps on the FD; the host will
+	// synchronize the metadata update with any host inode and page cache.
+	return setTimestamps(i.fileState.FD(), ts)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
+	if canMap(inode) {
+		// Memory-mappable files need to go through cachingInodeOps,
+		// even if the host page cache is in use, to invalidate private
+		// copies of truncated pages.
+		return i.cachingInodeOps.Truncate(ctx, inode, size)
+	}
+	// For everything else just change the file size on the FD; the host
+	// will synchronize the metadata update with any host inode and page
+	// cache.
+	return syscall.Ftruncate(i.fileState.FD(), size)
+}
+
+// Allocate implements fs.InodeOperations.Allocate.
+func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error {
+	// Is the file not memory-mappable?
+	if !canMap(inode) {
+		// Then just send the call to the FD, the host will synchronize the metadata
+		// update with any host inode and page cache.
+		return i.fileState.Allocate(ctx, offset, length)
+	}
+	// Otherwise we need to go through cachingInodeOps, even if the host page
+	// cache is in use. NOTE(review): confirm what cachingInodeOps must update here.
+	return i.cachingInodeOps.Allocate(ctx, offset, length)
+}
+
+// WriteOut implements fs.InodeOperations.WriteOut.
+func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+	// Have we been caching pages and attributes ourselves?
+	if inode.MountSource.Flags.ForcePageCache && canMap(inode) {
+		// Then we need to write out cached pages and attributes
+		// that are dirty.
+		return i.cachingInodeOps.WriteOut(ctx, inode)
+	}
+	// Otherwise the metadata is already up to date on the host.
+	return nil
+}
+
+// Readlink implements fs.InodeOperations.Readlink by delegating to readLink on the host FD.
+func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	return readLink(i.fileState.FD())
+}
+
+// Getlink implements fs.InodeOperations.Getlink. It returns fs.ErrResolveViaReadlink for symlinks and ENOLINK otherwise.
+func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+	if fs.IsSymlink(i.fileState.sattr) {
+		return nil, fs.ErrResolveViaReadlink
+	}
+	return nil, syserror.ENOLINK
+}
+
+// StatFS implements fs.InodeOperations.StatFS. It always returns ENOSYS.
+func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
+	return fs.Info{}, syserror.ENOSYS
+}
+
+// AddLink implements fs.InodeOperations.AddLink. It is a no-op.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink. It is a no-op.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. It is a no-op.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
+
+// readdirAll returns all of the directory entries in i, keyed by name; d supplies the getdents scratch buffer.
+func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
+	i.readdirMu.Lock()
+	defer i.readdirMu.Unlock()
+
+	fd := i.fileState.FD()
+
+	// syscall.ReadDirent will use getdents, which will seek the file past
+	// the last directory entry. To read the directory entries a second
+	// time, we need to seek back to the beginning.
+	if _, err := syscall.Seek(fd, 0, 0); err != nil {
+		if err == syscall.ESPIPE {
+			// All directories should be seekable. If this file
+			// isn't seekable, it is not a directory and we should
+			// return that more sane error.
+			err = syscall.ENOTDIR
+		}
+		return nil, err
+	}
+	// Drop bytes a previous pass left buffered in d; they predate the rewind.
+	d.bufp, d.nbuf = 0, 0
+
+	names := make([]string, 0, 100)
+	for {
+		// Refill the buffer if necessary.
+		if d.bufp >= d.nbuf {
+			d.bufp = 0
+			// ReadDirent will just do a sys_getdents64 to the kernel.
+			n, err := syscall.ReadDirent(fd, d.buf)
+			if err != nil {
+				return nil, err
+			}
+			if n == 0 {
+				break // EOF
+			}
+			d.nbuf = n
+		}
+
+		var nb int
+		// Parse the dirent buffer we just got and return the directory names along
+		// with the number of bytes consumed in the buffer.
+		nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
+		d.bufp += nb
+	}
+
+	entries := make(map[string]fs.DentAttr)
+	for _, filename := range names {
+		// Lookup the type and host device and inode.
+		stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
+		if lerr == syscall.ENOENT {
+			// File disappeared between readdir and lstat.
+			// Just treat it as if it didn't exist.
+			continue
+		}
+		if lerr != nil {
+			return nil, lerr // Any other failure is reported to the caller.
+		}
+
+		entries[filename] = fs.DentAttr{
+			Type: nodeType(&stat),
+			InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+				Device: stat.Dev,
+				Inode:  stat.Ino,
+			}),
+		}
+	}
+	return entries, nil
+}
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
new file mode 100644
index 000000000..26cc755bc
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -0,0 +1,79 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// beforeSave is invoked by stateify. It panics if the event queue is not empty or the attributes cannot be read.
+func (i *inodeFileState) beforeSave() {
+	if !i.queue.IsEmpty() {
+		panic("event queue must be empty")
+	}
+	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+		uattr, err := i.unstableAttr(context.Background())
+		if err != nil {
+			panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)})
+		}
+		i.savedUAttr = &uattr
+	}
+}
+
+// afterLoad is invoked by stateify. It panics if the descriptor, inode mapping or saved attributes cannot be restored.
+func (i *inodeFileState) afterLoad() {
+	// Initialize the descriptor value.
+	if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+		panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
+	}
+
+	// Remap the inode number.
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.FD(), &s); err != nil {
+		panic(fs.ErrCorruption{fmt.Errorf("failed to get metadata for fd %d: %v", i.FD(), err)})
+	}
+	key := device.MultiDeviceKey{
+		Device: s.Dev,
+		Inode:  s.Ino,
+	}
+	if !hostFileDevice.Load(key, i.sattr.InodeID) {
+		// This means there was a conflict at s.Dev and s.Ino with
+		// another inode mapping: two files that were unique on the
+		// saved filesystem are no longer unique on this filesystem.
+		// Since this violates the contract that filesystems cannot
+		// change across save and restore, error out.
+		panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)})
+	}
+
+	if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+		env, ok := fs.CurrentRestoreEnvironment()
+		if !ok {
+			panic("missing restore environment")
+		}
+		uattr := unstableAttr(i.mops, &s)
+		if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size {
+			panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)})
+		}
+		if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime {
+			panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)})
+		}
+		i.savedUAttr = nil
+	}
+}
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
new file mode 100644
index 000000000..b5a85c4d9
--- /dev/null
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	var t linux.Termios
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &t, nil
+}
+
+// ioctlSetTermios issues the ioctl req on fd with t as its argument.
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))); errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+	var w linux.Winsize
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &w, nil
+}
+
+// ioctlSetWinsize issues the TIOCSWINSZ ioctl on fd with w as its argument.
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))); errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
new file mode 100644
index 000000000..3ed137006
--- /dev/null
+++ b/pkg/sentry/fs/host/socket.go
@@ -0,0 +1,390 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/refs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+	unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+	"gvisor.googlesource.com/gvisor/pkg/syserr"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/unet"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
+//
+// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
+const maxSendBufferSize = 8 << 20
+
+// ConnectedEndpoint is a host FD backed implementation of
+// transport.ConnectedEndpoint and transport.Receiver.
+//
+// +stateify savable
+type ConnectedEndpoint struct {
+	queue *waiter.Queue
+	path  string
+
+	// ref keeps track of references to a ConnectedEndpoint.
+	ref refs.AtomicRefCount
+
+	// mu protects file, readClosed and writeClosed.
+	mu sync.RWMutex `state:"nosave"`
+
+	// file is an *fd.FD containing the FD backing this endpoint. It must be
+	// set to nil if it has been closed.
+	file *fd.FD `state:"nosave"`
+
+	// readClosed is true if the FD has read shutdown or if it has been closed.
+	readClosed bool
+
+	// writeClosed is true if the FD has write shutdown or if it has been
+	// closed.
+	writeClosed bool
+
+	// If srfd >= 0, it is the host FD that file was imported from.
+	srfd int `state:"wait"`
+
+	// stype is the type of Unix socket.
+	stype transport.SockType
+
+	// sndbuf is the size of the send buffer.
+	//
+	// N.B. When this is smaller than the host size, we present it via
+	// GetSockOpt and message splitting/rejection in SendMsg, but do not
+	// prevent lots of small messages from filling the real send buffer
+	// size on the host.
+	sndbuf int `state:"nosave"`
+}
+
+// init performs initialization required for creating new ConnectedEndpoints and
+// for restoring them.
+func (c *ConnectedEndpoint) init() *syserr.Error {
+	family, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if family != syscall.AF_UNIX {
+		// We only allow Unix sockets.
+		return syserr.ErrInvalidEndpointState
+	}
+
+	stype, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if err := syscall.SetNonblock(c.file.FD(), true); err != nil {
+		return syserr.FromError(err)
+	}
+
+	sndbuf, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+	if sndbuf > maxSendBufferSize {
+		log.Warningf("Socket send buffer too large: %d", sndbuf)
+		return syserr.ErrInvalidEndpointState
+	}
+
+	c.stype = transport.SockType(stype)
+	c.sndbuf = sndbuf
+
+	return nil
+}
+
+// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD
+// that will pretend to be bound at a given sentry path.
+//
+// The caller is responsible for calling Init(). Additionally, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *syserr.Error) {
+	e := ConnectedEndpoint{
+		path:  path,
+		queue: queue,
+		file:  file,
+		srfd:  -1,
+	}
+
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+
+	return &e, nil
+}
+
+// Init will do initialization required without holding other locks.
+func (c *ConnectedEndpoint) Init() {
+	if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil {
+		panic(err)
+	}
+}
+
+// NewSocketWithDirent allocates a new unix socket with host endpoint.
+//
+// This is currently only used by unsaveable Gofer nodes.
+//
+// NewSocketWithDirent takes ownership of f on success.
+func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) {
+	f2 := fd.New(f.FD())
+	var q waiter.Queue
+	e, err := NewConnectedEndpoint(f2, &q, "" /* path */)
+	if err != nil {
+		f2.Release()
+		return nil, err.ToError()
+	}
+
+	// Take ownership of the FD.
+	f.Release()
+
+	e.Init()
+
+	ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
+
+	return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil
+}
+
+// newSocket allocates a new unix socket with host endpoint.
+func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) {
+	ownedfd := orgfd
+	srfd := -1
+	if saveable {
+		var err error
+		ownedfd, err = syscall.Dup(orgfd)
+		if err != nil {
+			return nil, err
+		}
+		srfd = orgfd
+	}
+	f := fd.New(ownedfd)
+	var q waiter.Queue
+	e, err := NewConnectedEndpoint(f, &q, "" /* path */)
+	if err != nil {
+		if saveable {
+			f.Close()
+		} else {
+			f.Release()
+		}
+		return nil, err.ToError()
+	}
+
+	e.srfd = srfd
+	e.Init()
+
+	ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
+
+	return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil
+}
+
+// Send implements transport.ConnectedEndpoint.Send.
+func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.writeClosed {
+		return 0, false, syserr.ErrClosedForSend
+	}
+
+	if !controlMessages.Empty() {
+		return 0, false, syserr.ErrInvalidEndpointState
+	}
+
+	// Since stream sockets don't preserve message boundaries, we can write
+	// only as much of the message as fits in the send buffer.
+	truncate := c.stype == transport.SockStream
+
+	n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate)
+	if n < totalLen && err == nil {
+		// The host only returns a short write if it would otherwise
+		// block (and only for stream sockets).
+		err = syserror.EAGAIN
+	}
+	if n > 0 && err != syserror.EAGAIN {
+		// The caller may need to block to send more data, but
+		// otherwise there isn't anything that can be done about an
+		// error with a partial write.
+		err = nil
+	}
+
+	// There is no need for the caller to call SendNotify because fdWriteVec
+	// uses the host's sendmsg(2) and the host kernel's queue.
+	return n, false, syserr.FromError(err)
+}
+
+// SendNotify implements transport.ConnectedEndpoint.SendNotify.
+func (c *ConnectedEndpoint) SendNotify() {}
+
+// CloseSend implements transport.ConnectedEndpoint.CloseSend.
+func (c *ConnectedEndpoint) CloseSend() {
+	c.mu.Lock()
+	c.writeClosed = true
+	c.mu.Unlock()
+}
+
+// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
+func (c *ConnectedEndpoint) CloseNotify() {}
+
+// Writable implements transport.ConnectedEndpoint.Writable.
+func (c *ConnectedEndpoint) Writable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.writeClosed {
+		return true
+	}
+	return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements transport.ConnectedEndpoint.Passcred.
+func (c *ConnectedEndpoint) Passcred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress.
+func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil
+}
+
+// EventUpdate implements transport.ConnectedEndpoint.EventUpdate.
+func (c *ConnectedEndpoint) EventUpdate() {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.file.FD() != -1 {
+		fdnotifier.UpdateFD(int32(c.file.FD()))
+	}
+}
+
+// Recv implements transport.Receiver.Recv.
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.readClosed {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive
+	}
+
+	var cm unet.ControlMessage
+	if numRights > 0 {
+		cm.EnableFDs(int(numRights))
+	}
+
+	// N.B. Unix sockets don't have a receive buffer, the send buffer
+	// serves both purposes.
+	rl, ml, cl, cTrunc, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf)
+	if rl > 0 && err != nil {
+		// We got some data, so all we need to do on error is return
+		// the data that we got. Short reads are fine, no need to
+		// block.
+		err = nil
+	}
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	// There is no need for the caller to call RecvNotify because fdReadVec uses
+	// the host's recvmsg(2) and the host kernel's queue.
+
+	// Trim the control data if we received less than the full amount.
+	if cl < uint64(len(cm)) {
+		cm = cm[:cl]
+	}
+
+	// Avoid extra allocations in the case where there isn't any control data.
+	if len(cm) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+	}
+
+	fds, err := cm.ExtractFDs()
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	if len(fds) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+	}
+	return rl, ml, control.New(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+}
+
+// close releases all resources related to the endpoint.
+func (c *ConnectedEndpoint) close() {
+	fdnotifier.RemoveFD(int32(c.file.FD()))
+	c.file.Close()
+	c.file = nil
+}
+
+// RecvNotify implements transport.Receiver.RecvNotify.
+func (c *ConnectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements transport.Receiver.CloseRecv.
+func (c *ConnectedEndpoint) CloseRecv() {
+	c.mu.Lock()
+	c.readClosed = true
+	c.mu.Unlock()
+}
+
+// Readable implements transport.Receiver.Readable.
+func (c *ConnectedEndpoint) Readable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.readClosed {
+		return true
+	}
+	return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements transport.Receiver.SendQueuedSize.
+func (c *ConnectedEndpoint) SendQueuedSize() int64 {
+	// SendQueuedSize isn't supported for host sockets because we don't allow the
+	// sentry to call ioctl(2).
+	return -1
+}
+
+// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
+func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
+	// RecvQueuedSize isn't supported for host sockets because we don't allow the
+	// sentry to call ioctl(2).
+	return -1
+}
+
+// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize.
+func (c *ConnectedEndpoint) SendMaxQueueSize() int64 {
+	return int64(c.sndbuf)
+}
+
+// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize.
+func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
+	// N.B. Unix sockets don't use the receive buffer. We'll claim it is
+	// the same size as the send buffer.
+	return int64(c.sndbuf)
+}
+
+// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release.
+func (c *ConnectedEndpoint) Release() {
+	c.ref.DecRefWithDestructor(c.close)
+}
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
new file mode 100644
index 000000000..5efbb3ae8
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -0,0 +1,113 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// maxIovs is the maximum number of iovecs to pass to the host.
+var maxIovs = linux.UIO_MAXIOV
+
+// copyToMulti copies as many bytes from src to dst as possible.
+func copyToMulti(dst [][]byte, src []byte) {
+	for _, d := range dst {
+		done := copy(d, src)
+		src = src[done:]
+		if len(src) == 0 {
+			break
+		}
+	}
+}
+
+// copyFromMulti copies as many bytes from src to dst as possible.
+func copyFromMulti(dst []byte, src [][]byte) {
+	for _, s := range src {
+		done := copy(dst, s)
+		dst = dst[done:]
+		if len(dst) == 0 {
+			break
+		}
+	}
+}
+
+// buildIovec builds an iovec slice from the given [][]byte slice.
+//
+// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error.
+//
+// If length < the total length of bufs, err indicates why, even when returning
+// a truncated iovec.
+//
+// If intermediate != nil, iovecs references intermediate rather than bufs and
+// the caller must copy to/from bufs as necessary.
+func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) {
+	var iovsRequired int
+	for _, b := range bufs {
+		length += uintptr(len(b))
+		if len(b) > 0 {
+			iovsRequired++
+		}
+	}
+
+	stopLen := length
+	if length > uintptr(maxlen) {
+		if truncate {
+			stopLen = uintptr(maxlen)
+			err = syserror.EAGAIN
+		} else {
+			return 0, nil, nil, syserror.EMSGSIZE
+		}
+	}
+
+	if iovsRequired > maxIovs {
+		// The kernel will reject our call if we pass this many iovs.
+		// Use a single intermediate buffer instead.
+		b := make([]byte, stopLen)
+
+		return stopLen, []syscall.Iovec{{
+			Base: &b[0],
+			Len:  uint64(stopLen),
+		}}, b, err
+	}
+
+	var total uintptr
+	iovecs = make([]syscall.Iovec, 0, iovsRequired)
+	for i := range bufs {
+		l := len(bufs[i])
+		if l == 0 {
+			continue
+		}
+
+		stop := l
+		if total+uintptr(stop) > stopLen {
+			stop = int(stopLen - total)
+		}
+
+		iovecs = append(iovecs, syscall.Iovec{
+			Base: &bufs[i][0],
+			Len:  uint64(stop),
+		})
+
+		total += uintptr(stop)
+		if total >= stopLen {
+			break
+		}
+	}
+
+	return total, iovecs, nil, err
+}
diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go
new file mode 100644
index 000000000..5676c451a
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_state.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+)
+
+// beforeSave is invoked by stateify.
+func (c *ConnectedEndpoint) beforeSave() {
+	if c.srfd < 0 {
+		panic("only host file descriptors provided at sentry startup can be saved")
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (c *ConnectedEndpoint) afterLoad() {
+	f, err := syscall.Dup(c.srfd)
+	if err != nil {
+		panic(fmt.Sprintf("failed to dup restored FD %d: %v", c.srfd, err))
+	}
+	c.file = fd.New(f)
+	if err := c.init(); err != nil {
+		panic(fmt.Sprintf("Could not restore host socket FD %d: %v", c.srfd, err))
+	}
+	c.Init()
+}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
new file mode 100644
index 000000000..e57be0506
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// fdReadVec receives from fd to bufs.
+//
+// If the total length of bufs is > maxlen, fdReadVec will do a partial read
+// and err will indicate why the message was truncated.
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) {
+	flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+	if peek {
+		flags |= syscall.MSG_PEEK
+	}
+
+	// Always truncate the receive buffer. All socket types will truncate
+	// received messages.
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true)
+	if err != nil && len(iovecs) == 0 {
+		// No partial read to do, return error immediately.
+		return 0, 0, 0, false, err
+	}
+
+	var msg syscall.Msghdr
+	if len(control) != 0 {
+		msg.Control = &control[0]
+		msg.Controllen = uint64(len(control))
+	}
+
+	if len(iovecs) != 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, 0, 0, false, e
+	}
+
+	// Copy data back to bufs.
+	if intermediate != nil {
+		copyToMulti(bufs, intermediate)
+	}
+
+	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
+
+	if n > length {
+		return length, n, msg.Controllen, controlTrunc, err
+	}
+
+	return n, n, msg.Controllen, controlTrunc, err
+}
+
+// fdWriteVec sends from bufs to fd.
+//
+// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
+// partial write and err will indicate why the message was truncated.
+func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) {
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
+	if err != nil && len(iovecs) == 0 {
+		// No partial write to do, return error immediately.
+		return 0, length, err
+	}
+
+	// Copy data to intermediate buf.
+	if intermediate != nil {
+		copyFromMulti(intermediate, bufs)
+	}
+
+	var msg syscall.Msghdr
+	if len(iovecs) > 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, length, e
+	}
+
+	return n, length, err
+}
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
new file mode 100644
index 000000000..e45b339f5
--- /dev/null
+++ b/pkg/sentry/fs/host/tty.go
@@ -0,0 +1,351 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TTYFileOperations implements fs.FileOperations for a host file descriptor
+// that wraps a TTY FD.
+//
+// +stateify savable
+type TTYFileOperations struct {
+	fileOperations
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// session is the session attached to this TTYFileOperations.
+	session *kernel.Session
+
+	// fgProcessGroup is the foreground process group that is currently
+	// connected to this TTY.
+	fgProcessGroup *kernel.ProcessGroup
+}
+
+// newTTYFile returns a new fs.File that wraps a TTY FD.
+func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
+	return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
+		fileOperations: fileOperations{iops: iops},
+	})
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *TTYFileOperations) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.fgProcessGroup != nil {
+		panic("foreground process group is already set")
+	}
+	t.fgProcessGroup = pg
+	t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process group for the TTY.
+func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.fgProcessGroup
+}
+
+// Read implements fs.FileOperations.Read.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+//
+// See drivers/tty/n_tty.c:n_tty_read()=>job_control().
+func (t *TTYFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileOperations.Read(ctx, file, dst, offset)
+}
+
+// Write implements fs.FileOperations.Write.
+func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the write?
+	if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+		return 0, err
+	}
+	return t.fileOperations.Write(ctx, file, src, offset)
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *TTYFileOperations) Release() {
+	t.mu.Lock()
+	t.fgProcessGroup = nil
+	t.mu.Unlock()
+
+	t.fileOperations.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Ignore args[0]. This is the real FD:
+	fd := t.fileOperations.iops.fileState.FD()
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case linux.TCGETS:
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		return 0, err
+
+	case linux.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on
+		// this terminal.
+
+		pidns := kernel.PIDNamespaceFromContext(ctx)
+		if pidns == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Map the ProcessGroup into a ProcessGroupID in the task's PID
+		// namespace.
+		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		task := kernel.TaskFromContext(ctx)
+		if task == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Check that we are allowed to set the process group.
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			// drivers/tty/tty_io.c:tiocspgrp() converts -EIO from
+			// tty_check_change() to -ENOTTY.
+			if err == syserror.EIO {
+				return 0, syserror.ENOTTY
+			}
+			return 0, err
+		}
+
+		// Check that calling task's process group is in the TTY
+		// session.
+		if task.ThreadGroup().Session() != t.session {
+			return 0, syserror.ENOTTY
+		}
+
+		var pgID kernel.ProcessGroupID
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+
+		// pgID must be non-negative.
+		if pgID < 0 {
+			return 0, syserror.EINVAL
+		}
+
+		// Process group with pgID must exist in this PID namespace.
+		pidns := task.PIDNamespace()
+		pg := pidns.ProcessGroupWithID(pgID)
+		if pg == nil {
+			return 0, syserror.ESRCH
+		}
+
+		// Check that new process group is in the TTY session.
+		if pg.Session() != t.session {
+			return 0, syserror.EPERM
+		}
+
+		t.fgProcessGroup = pg
+		return 0, nil
+
+	case linux.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := ioctlGetWinsize(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+
+		// Unlike setting the termios, any process group (even
+		// background ones) can set the winsize.
+
+		var winsize linux.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetWinsize(fd, &winsize)
+		return 0, err
+
+	// Unimplemented commands.
+	case linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+		fallthrough
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		// No task? Linux does not have an analog for this case, but
+		// tty_check_change is more of a blacklist of cases than a
+		// whitelist, and is surprisingly permissive. Allowing the
+		// change seems most appropriate.
+		return nil
+	}
+
+	tg := task.ThreadGroup()
+	pg := tg.ProcessGroup()
+
+	// If the session for the task is different than the session for the
+	// controlling TTY, then the change is allowed. Seems like a bad idea,
+	// but that's exactly what linux does.
+	if tg.Session() != t.fgProcessGroup.Session() {
+		return nil
+	}
+
+	// If we are the foreground process group, then the change is allowed.
+	if pg == t.fgProcessGroup {
+		return nil
+	}
+
+	// We are not the foreground process group.
+
+	// Is the provided signal blocked or ignored?
+	if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+		// If the signal is SIGTTIN, then we are attempting to read
+		// from the TTY. Don't send the signal and return EIO.
+		if sig == linux.SIGTTIN {
+			return syserror.EIO
+		}
+
+		// Otherwise, we are writing or changing terminal state. This is allowed.
+		return nil
+	}
+
+	// If the process group is an orphan, return EIO.
+	if pg.IsOrphan() {
+		return syserror.EIO
+	}
+
+	// Otherwise, send the signal to the process group and return ERESTARTSYS.
+	//
+	// Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+	// but this isn't necessary in gVisor because the rationale given in
+	// 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+	// apply: the sentry will handle -ERESTARTSYS in
+	// kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+	//
+	// Linux ignores the result of kill_pgrp().
+	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+	return kernel.ERESTARTSYS
+}
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
new file mode 100644
index 000000000..94ff7708e
--- /dev/null
+++ b/pkg/sentry/fs/host/util.go
@@ -0,0 +1,197 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"os"
+	"path"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// open opens name, relative to parent when parent is non-nil, without
+// following a trailing symlink, and returns the resulting host FD.
+//
+// When parent is nil, name must be absolute; otherwise EINVAL is returned.
+// The access modes are attempted in order: read-write, read-only,
+// write-only, and finally O_PATH. The first attempt that succeeds wins. If
+// all of them fail, -1 is returned with the error from the O_PATH attempt.
+func open(parent *inodeOperations, name string) (int, error) {
+	if parent == nil && !path.IsAbs(name) {
+		return -1, syserror.EINVAL
+	}
+	cleaned := path.Clean(name)
+
+	// Never follow through symlinks.
+	noFollow := syscall.O_NOFOLLOW
+
+	// Try the regular access modes from most to least capable.
+	for _, mode := range [...]int{syscall.O_RDWR, syscall.O_RDONLY, syscall.O_WRONLY} {
+		if fd, err := openAt(parent, cleaned, noFollow|mode, 0); err == nil {
+			return fd, nil
+		}
+	}
+
+	// Last resort: retry as a symlink, by including O_PATH as an option.
+	fd, err := openAt(parent, cleaned, linux.O_PATH|noFollow, 0)
+	if err != nil {
+		// Everything failed.
+		return -1, err
+	}
+	return fd, nil
+}
+
+// openAt opens name with the given flags and permission bits. With a nil
+// parent the name is passed to syscall.Open as-is; otherwise it is opened
+// with syscall.Openat relative to the FD held in parent's fileState.
+func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
+	mode := uint32(perm)
+	if parent != nil {
+		return syscall.Openat(parent.fileState.FD(), name, flags, mode)
+	}
+	return syscall.Open(name, flags, mode)
+}
+
+// nodeType translates the file-type bits (S_IFMT) of a host stat mode into
+// the corresponding fs.InodeType. An unrecognized type is logged and
+// reported as a regular file.
+func nodeType(s *syscall.Stat_t) fs.InodeType {
+	format := s.Mode & syscall.S_IFMT
+	switch format {
+	case syscall.S_IFREG:
+		return fs.RegularFile
+	case syscall.S_IFDIR:
+		return fs.Directory
+	case syscall.S_IFLNK:
+		return fs.Symlink
+	case syscall.S_IFCHR:
+		return fs.CharacterDevice
+	case syscall.S_IFBLK:
+		return fs.BlockDevice
+	case syscall.S_IFIFO:
+		return fs.Pipe
+	case syscall.S_IFSOCK:
+		return fs.Socket
+	}
+
+	// This shouldn't happen, but just in case...
+	log.Warningf("unknown host file type %d: assuming regular", format)
+	return fs.RegularFile
+}
+
+// wouldBlock reports whether the host file described by s is a pipe, a
+// socket, or a character device.
+func wouldBlock(s *syscall.Stat_t) bool {
+	switch nodeType(s) {
+	case fs.Pipe, fs.Socket, fs.CharacterDevice:
+		return true
+	default:
+		return false
+	}
+}
+
+func stableAttr(s *syscall.Stat_t) fs.StableAttr {
+	return fs.StableAttr{
+		Type:     nodeType(s),
+		DeviceID: hostFileDevice.DeviceID(),
+		InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+			Device: s.Dev,
+			Inode:  s.Ino,
+		}),
+		BlockSize: int64(s.Blksize),
+	}
+}
+
+// owner computes the fs.FileOwner to report for the host file described by
+// s, given the mount options in mo.
+//
+// When mo.dontTranslateOwnership is set, the host UID/GID are returned
+// untranslated. Otherwise only IDs relevant to the sandboxed task are
+// shown: if we do not own the file on the host, no sandboxed task can own
+// it either, so OverflowID is used, implying that the IDs are not mapped in
+// the "root" user namespace.
+//
+// E.g.
+// sandbox's host EUID/EGID is 1/1.
+// some_dir's host UID/GID is 2/1.
+// Task that mounted this fs has virtualized EUID/EGID 5/5.
+//
+// If you executed `ls -n` in the sandboxed task, it would show:
+// drwxrwxrwx [...] 65534 5 [...] some_dir
+func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
+	if mo.dontTranslateOwnership {
+		// User requested no translation, just return actual owner.
+		return fs.FileOwner{UID: auth.KUID(s.Uid), GID: auth.KGID(s.Gid)}
+	}
+
+	// Start from OverflowID for both IDs; they are replaced below only
+	// when the host IDs match our own.
+	fo := fs.FileOwner{
+		UID: auth.KUID(auth.OverflowUID),
+		GID: auth.KGID(auth.OverflowGID),
+	}
+
+	// A file we own on the host is owned by the mounting task's initial
+	// EUID.
+	if hostUID == s.Uid {
+		fo.UID = mo.mounter.UID
+	}
+
+	// A file whose group is one of ours gets the mounting task's initial
+	// EGID.
+	for i := range hostGIDs {
+		if hostGIDs[i] == s.Gid {
+			fo.GID = mo.mounter.GID
+			break
+		}
+	}
+	return fo
+}
+
+func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+	return fs.UnstableAttr{
+		Size:             s.Size,
+		Usage:            s.Blocks * 512,
+		Perms:            fs.FilePermsFromMode(linux.FileMode(s.Mode)),
+		Owner:            owner(mo, s),
+		AccessTime:       ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+		ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+		StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+		Links:            s.Nlink,
+	}
+}
+
+// dirInfo holds a buffer of raw directory data together with the amount of
+// valid data in it and the position of the next unread record.
+type dirInfo struct {
+	buf  []byte // buffer for directory I/O.
+	nbuf int    // length of buf; return value from ReadDirent.
+	bufp int    // location of next record in buf.
+}
+
+// isBlockError reports whether err is EAGAIN or EWOULDBLOCK, looking
+// through any number of nested *os.PathError wrappers. Callers use this to
+// decide whether an error should become syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+	for {
+		if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK {
+			return true
+		}
+		pathErr, ok := err.(*os.PathError)
+		if !ok {
+			return false
+		}
+		// Peel off one layer of wrapping and check again.
+		err = pathErr.Err
+	}
+}
+
+// hostEffectiveKIDs returns the effective UID of this process and a list of
+// GIDs consisting of its supplementary groups followed by its effective
+// GID. If the supplementary groups cannot be read, it returns 0, nil, and
+// the error.
+func hostEffectiveKIDs() (uint32, []uint32, error) {
+	groups, err := os.Getgroups()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	// Reserve one extra slot for the effective GID appended at the end.
+	kgids := make([]uint32, 0, len(groups)+1)
+	for _, g := range groups {
+		kgids = append(kgids, uint32(g))
+	}
+
+	euid := uint32(os.Geteuid())
+	kgids = append(kgids, uint32(os.Getegid()))
+	return euid, kgids, nil
+}
+
+// hostUID and hostGIDs hold the effective UID and the group IDs of this
+// process on the host, captured once at package initialization. owner
+// compares host file ownership against them.
+var (
+	hostUID  uint32
+	hostGIDs []uint32
+)
+
+func init() {
+	var err error
+	hostUID, hostGIDs, err = hostEffectiveKIDs()
+	if err != nil {
+		// hostEffectiveKIDs reports a UID of 0 and no GIDs on failure.
+		// Keeping those values would make owner treat files owned by
+		// host UID 0 as if they were ours, so fall back to the
+		// effective IDs, which can always be read.
+		log.Warningf("failed to get host supplementary groups, using effective IDs only: %v", err)
+		hostUID = uint32(os.Geteuid())
+		hostGIDs = []uint32{uint32(os.Getegid())}
+	}
+}
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
new file mode 100644
index 000000000..b95a57c3f
--- /dev/null
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+)
+
+// NulByte is a single NUL byte. It is passed to readlinkat as an empty string.
+//
+// It is a variable rather than a constant because readLink takes its
+// address to use as the path argument.
+var NulByte byte = '\x00'
+
+// createLink issues symlinkat(2) to create a symbolic link at linkName,
+// resolved relative to the directory FD fd, with name as the link's
+// target. It returns EINVAL if either string contains a NUL byte, or the
+// errno reported by the syscall.
+func createLink(fd int, name string, linkName string) error {
+	target, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return err
+	}
+	linkPath, err := syscall.BytePtrFromString(linkName)
+	if err != nil {
+		return err
+	}
+
+	// Argument order is symlinkat(target, newdirfd, linkpath).
+	if _, _, errno := syscall.Syscall(
+		syscall.SYS_SYMLINKAT,
+		uintptr(unsafe.Pointer(target)),
+		uintptr(fd),
+		uintptr(unsafe.Pointer(linkPath))); errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// readLink returns the target of the symbolic link referred to by fd. It
+// calls readlinkat(2) with an empty path so that the call applies to fd
+// itself, starting with a 128-byte buffer and doubling it until the result
+// no longer fills the buffer completely.
+func readLink(fd int) (string, error) {
+	// Buffer sizing copied from os.Readlink.
+	size := 128
+	for {
+		buf := make([]byte, size)
+		n, _, errno := syscall.Syscall6(
+			syscall.SYS_READLINKAT,
+			uintptr(fd),
+			uintptr(unsafe.Pointer(&NulByte)), // ""
+			uintptr(unsafe.Pointer(&buf[0])),
+			uintptr(size),
+			0, 0)
+		if errno != 0 {
+			return "", errno
+		}
+		if n < uintptr(size) {
+			return string(buf[:n]), nil
+		}
+		// The buffer was filled, so the target may have been
+		// truncated. Try again with twice the space.
+		size *= 2
+	}
+}
+
+func unlinkAt(fd int, name string, dir bool) error {
+	namePtr, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return err
+	}
+	var flags uintptr
+	if dir {
+		flags = linux.AT_REMOVEDIR
+	}
+	_, _, errno := syscall.Syscall(
+		syscall.SYS_UNLINKAT,
+		uintptr(fd),
+		uintptr(unsafe.Pointer(namePtr)),
+		flags,
+	)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// timespecFromTimestamp converts t into a syscall.Timespec. If omit is
+// set, the result carries UTIME_OMIT in its nanoseconds field; otherwise,
+// if setSysTime is set, it carries UTIME_NOW. omit takes precedence over
+// setSysTime. With neither set, t is converted from its nanosecond value.
+func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
+	switch {
+	case omit:
+		return syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}
+	case setSysTime:
+		return syscall.Timespec{Sec: 0, Nsec: linux.UTIME_NOW}
+	default:
+		return syscall.NsecToTimespec(t.Nanoseconds())
+	}
+}
+
+func setTimestamps(fd int, ts fs.TimeSpec) error {
+	if ts.ATimeOmit && ts.MTimeOmit {
+		return nil
+	}
+	var sts [2]syscall.Timespec
+	sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime)
+	sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime)
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(fd),
+		0, /* path */
+		uintptr(unsafe.Pointer(&sts)),
+		0, /* flags */
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+// fstatat issues the newfstatat syscall for name relative to the directory
+// FD fd with the given flags and returns the resulting stat structure. It
+// returns EINVAL if name contains a NUL byte, or the errno reported by the
+// syscall; the returned stat is not meaningful when the error is non-nil.
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+	var st syscall.Stat_t
+
+	path, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return st, err
+	}
+
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(fd),
+		uintptr(unsafe.Pointer(path)),
+		uintptr(unsafe.Pointer(&st)),
+		uintptr(flags),
+		0, 0); errno != 0 {
+		return st, errno
+	}
+	return st, nil
+}
author	gVisor bot <gvisor-bot@google.com>	2019-06-02 06:44:55 +0000
committer	gVisor bot <gvisor-bot@google.com>	2019-06-02 06:44:55 +0000
commit	ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree	83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/fs/host
parent	deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent	216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)