summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/host
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs/host')
-rw-r--r--pkg/sentry/fs/host/BUILD104
-rw-r--r--pkg/sentry/fs/host/control.go90
-rw-r--r--pkg/sentry/fs/host/descriptor.go118
-rw-r--r--pkg/sentry/fs/host/descriptor_state.go29
-rw-r--r--pkg/sentry/fs/host/device.go25
-rw-r--r--pkg/sentry/fs/host/file.go371
-rw-r--r--pkg/sentry/fs/host/fs.go327
-rw-r--r--pkg/sentry/fs/host/fs_test.go383
-rw-r--r--pkg/sentry/fs/host/inode.go506
-rw-r--r--pkg/sentry/fs/host/inode_state.go79
-rw-r--r--pkg/sentry/fs/host/inode_test.go112
-rw-r--r--pkg/sentry/fs/host/ioctl_unsafe.go39
-rw-r--r--pkg/sentry/fs/host/socket.go471
-rw-r--r--pkg/sentry/fs/host/socket_state.go39
-rw-r--r--pkg/sentry/fs/host/socket_test.go401
-rw-r--r--pkg/sentry/fs/host/socket_unsafe.go82
-rw-r--r--pkg/sentry/fs/host/util.go197
-rw-r--r--pkg/sentry/fs/host/util_unsafe.go137
-rw-r--r--pkg/sentry/fs/host/wait_test.go70
19 files changed, 3580 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
new file mode 100644
index 000000000..97b64daed
--- /dev/null
+++ b/pkg/sentry/fs/host/BUILD
@@ -0,0 +1,104 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+ name = "host_state",
+ srcs = [
+ "control.go",
+ "descriptor.go",
+ "descriptor_state.go",
+ "file.go",
+ "fs.go",
+ "inode.go",
+ "inode_state.go",
+ "socket.go",
+ "socket_state.go",
+ ],
+ out = "host_state.go",
+ package = "host",
+)
+
+go_library(
+ name = "host",
+ srcs = [
+ "control.go",
+ "descriptor.go",
+ "descriptor_state.go",
+ "device.go",
+ "file.go",
+ "fs.go",
+ "host_state.go",
+ "inode.go",
+ "inode_state.go",
+ "ioctl_unsafe.go",
+ "socket.go",
+ "socket_state.go",
+ "socket_unsafe.go",
+ "util.go",
+ "util_unsafe.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/amutex",
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/secio",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/device",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/socket/control",
+ "//pkg/sentry/socket/unix",
+ "//pkg/sentry/uniqueid",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/link/rawfile",
+ "//pkg/tcpip/transport/unix",
+ "//pkg/unet",
+ "//pkg/waiter",
+ "//pkg/waiter/fdnotifier",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "host_test",
+ size = "small",
+ srcs = [
+ "fs_test.go",
+ "inode_test.go",
+ "socket_test.go",
+ "wait_test.go",
+ ],
+ embed = [":host"],
+ deps = [
+ "//pkg/fd",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/usermem",
+ "//pkg/syserr",
+ "//pkg/tcpip",
+ "//pkg/tcpip/transport/unix",
+ "//pkg/waiter",
+ "//pkg/waiter/fdnotifier",
+ ],
+)
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
new file mode 100644
index 000000000..d2b007ab2
--- /dev/null
+++ b/pkg/sentry/fs/host/control.go
@@ -0,0 +1,90 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+)
+
+type scmRights struct {
+ fds []int
+}
+
+func newSCMRights(fds []int) control.SCMRights {
+ return &scmRights{fds}
+}
+
+// Files implements control.SCMRights.Files.
+func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles {
+ n := max
+ if l := len(c.fds); n > l {
+ n = l
+ }
+
+ rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n]))
+
+ // Only consume converted FDs (fdsToFiles may convert fewer than n FDs).
+ c.fds = c.fds[len(rf):]
+ return rf
+}
+
+// Clone implements unix.RightsControlMessage.Clone.
+func (c *scmRights) Clone() unix.RightsControlMessage {
+ // Host rights never need to be cloned.
+ return nil
+}
+
+// Release implements unix.RightsControlMessage.Release.
+func (c *scmRights) Release() {
+ for _, fd := range c.fds {
+ syscall.Close(fd)
+ }
+ c.fds = nil
+}
+
+// If an error is encountered, only files created before the error will be
+// returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
+ files := make([]*fs.File, 0, len(fds))
+ for _, fd := range fds {
+ // Get flags. We do it here because they may be modified
+ // by subsequent functions.
+ fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+ if errno != 0 {
+ ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+ break
+ }
+
+ // Create the file backed by hostFD.
+ file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx))
+ if err != nil {
+ ctx.Warningf("Error creating file from host FD: %v", err)
+ break
+ }
+
+ // Set known flags.
+ file.SetFlags(fs.SettableFileFlags{
+ NonBlocking: fileFlags&syscall.O_NONBLOCK != 0,
+ })
+
+ files = append(files, file)
+ }
+ return files
+}
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
new file mode 100644
index 000000000..613bd06e8
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -0,0 +1,118 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+ "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// descriptor wraps a host fd.
+type descriptor struct {
+ // donated is true if the host fd was donated by another process.
+ donated bool
+
+ // If origFD >= 0, it is the host fd that this file was
+ // originally created from, which must be available at time
+ // of restore. Only valid if donated is true.
+ origFD int
+
+ // wouldBlock is true if value (below) points to a file that can
+ // return EWOULDBLOCK for operations that would block.
+ wouldBlock bool
+
+ // value is the wrapped host fd. It is never saved or restored
+ // directly. How it is restored depends on whether it was
+ // donated and the fs.MountSource it was originally
+ // opened/created from.
+ value int `state:"nosave"`
+}
+
+// newDescriptor returns a wrapped host file descriptor. On success,
+// the descriptor is registered for event notifications with queue.
+func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
+ ownedFD := fd
+ origFD := -1
+ if saveable {
+ var err error
+ ownedFD, err = syscall.Dup(fd)
+ if err != nil {
+ return nil, err
+ }
+ origFD = fd
+ }
+ if wouldBlock {
+ if err := syscall.SetNonblock(ownedFD, true); err != nil {
+ return nil, err
+ }
+ if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil {
+ return nil, err
+ }
+ }
+ return &descriptor{
+ donated: donated,
+ origFD: origFD,
+ wouldBlock: wouldBlock,
+ value: ownedFD,
+ }, nil
+}
+
+// initAfterLoad initializes the value of the descriptor after Load.
+func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
+ if d.donated {
+ var err error
+ d.value, err = syscall.Dup(d.origFD)
+ if err != nil {
+ return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
+ }
+ } else {
+ name, ok := mo.inodeMappings[id]
+ if !ok {
+ return fmt.Errorf("failed to find path for inode number %d", id)
+ }
+ fullpath := path.Join(mo.root, name)
+
+ var err error
+ d.value, err = open(nil, fullpath)
+ if err != nil {
+ return fmt.Errorf("failed to open %q: %v", fullpath, err)
+ }
+ }
+ if d.wouldBlock {
+ if err := syscall.SetNonblock(d.value, true); err != nil {
+ return err
+ }
+ if err := fdnotifier.AddFD(int32(d.value), queue); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Release releases all resources held by descriptor.
+func (d *descriptor) Release() {
+ if d.wouldBlock {
+ fdnotifier.RemoveFD(int32(d.value))
+ }
+ if err := syscall.Close(d.value); err != nil {
+ log.Warningf("error closing fd %d: %v", d.value, err)
+ }
+ d.value = -1
+}
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
new file mode 100644
index 000000000..7fb274451
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+// beforeSave is invoked by stateify.
+func (d *descriptor) beforeSave() {
+ if d.donated && d.origFD < 0 {
+ panic("donated file descriptor cannot be saved")
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (d *descriptor) afterLoad() {
+ // value must be manually restored by the descriptor's parent using
+ // initAfterLoad.
+ d.value = -1
+}
diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go
new file mode 100644
index 000000000..f2a0b6b15
--- /dev/null
+++ b/pkg/sentry/fs/host/device.go
@@ -0,0 +1,25 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+)
+
+// hostFileDevice is the host file virtual device.
+var hostFileDevice = device.NewAnonMultiDevice()
+
+// hostPipeDevice is the host pipe virtual device.
+var hostPipeDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
new file mode 100644
index 000000000..bdf844337
--- /dev/null
+++ b/pkg/sentry/fs/host/file.go
@@ -0,0 +1,371 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/secio"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+ "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// fileOperations implements fs.FileOperations for a host file descriptor.
+type fileOperations struct {
+ fsutil.NoopRelease `state:"nosave"`
+
+ // iops are the Inode operations for this file.
+ iops *inodeOperations `state:"wait"`
+
+ // a scratch buffer for reading directory entries.
+ dirinfo *dirInfo `state:"nosave"`
+
+ // dirCursor is the directory cursor.
+ dirCursor string
+
+ // allowIoctl determines whether ioctls should be passed through to the
+ // host.
+ allowIoctl bool
+}
+
+// fileOperations implements fs.FileOperations.
+var _ fs.FileOperations = (*fileOperations)(nil)
+
+// NewFile creates a new File backed by the provided host file descriptor. If
+// NewFile succeeds, ownership of the fd is transferred to the returned File.
+//
+// The returned File cannot be saved, since there is no guarantee that the same
+// fd will exist or represent the same file at time of restore. If such a
+// guarantee does exist, use ImportFile instead.
+func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
+ return newFileFromDonatedFD(ctx, fd, mounter, false, false)
+}
+
+// ImportFile creates a new File backed by the provided host file descriptor.
+// Unlike NewFile, the file descriptor used by the File is duped from fd to
+// ensure that later changes to fd are not reflected by the fs.File.
+//
+// If the returned file is saved, it will be restored by re-importing the fd
+// originally passed to ImportFile. It is the restorer's responsibility to
+// ensure that the fd represents the same file.
+func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) {
+ return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl)
+}
+
+// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is
+// saveable, then saveable is true.
+func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(donated, &s); err != nil {
+ return nil, err
+ }
+ switch s.Mode & syscall.S_IFMT {
+ case syscall.S_IFSOCK:
+ flags, err := fileFlagsFromDonatedFD(donated)
+ if err != nil {
+ return nil, err
+ }
+ s, err := newSocket(ctx, donated, saveable)
+ if err != nil {
+ return nil, err
+ }
+ s.SetFlags(fs.SettableFileFlags{
+ NonBlocking: flags.NonBlocking,
+ })
+ return s, nil
+ default:
+ flags, err := fileFlagsFromDonatedFD(donated)
+ if err != nil {
+ return nil, err
+ }
+ msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
+ inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */)
+ if err != nil {
+ return nil, err
+ }
+ iops := inode.InodeOperations.(*inodeOperations)
+
+ name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID)
+ dirent := fs.NewDirent(inode, name)
+ defer dirent.DecRef()
+
+ return newFile(ctx, dirent, flags, iops, allowIoctl), nil
+ }
+}
+
+func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) {
+ flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0)
+ if errno != 0 {
+ log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno)
+ return fs.FileFlags{}, syscall.EIO
+ }
+ accmode := flags & syscall.O_ACCMODE
+ return fs.FileFlags{
+ Direct: flags&syscall.O_DIRECT != 0,
+ NonBlocking: flags&syscall.O_NONBLOCK != 0,
+ Sync: flags&syscall.O_SYNC != 0,
+ Append: flags&syscall.O_APPEND != 0,
+ Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR,
+ Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR,
+ }, nil
+}
+
+// newFile returns a new fs.File.
+func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File {
+ if !iops.ReturnsWouldBlock() {
+ // Allow reading/writing at an arbitrary offset for files
+ // that support it.
+ flags.Pread = true
+ flags.Pwrite = true
+ }
+ return fs.NewFile(ctx, dirent, flags, &fileOperations{
+ iops: iops,
+ allowIoctl: allowIoctl,
+ })
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ f.iops.fileState.queue.EventRegister(e, mask)
+ fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (f *fileOperations) EventUnregister(e *waiter.Entry) {
+ f.iops.fileState.queue.EventUnregister(e)
+ fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// Readiness uses the poll() syscall to check the status of the underlying FD.
+func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask)
+}
+
+// Readdir implements fs.FileOperations.Readdir.
+func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+ root := fs.RootFromContext(ctx)
+ defer root.DecRef()
+ dirCtx := &fs.DirCtx{
+ Serializer: serializer,
+ DirCursor: &f.dirCursor,
+ }
+ return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset())
+}
+
+// IterateDir implements fs.DirIterator.IterateDir.
+func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+ if f.dirinfo == nil {
+ f.dirinfo = new(dirInfo)
+ f.dirinfo.buf = make([]byte, usermem.PageSize)
+ }
+ entries, err := f.iops.readdirAll(f.dirinfo)
+ if err != nil {
+ return offset, err
+ }
+ count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries))
+ return offset + count, err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ // Would this file block?
+ if f.iops.ReturnsWouldBlock() {
+ // These files can't be memory mapped, assert this. This also
+ // means that writes do not need to synchronize with memory
+ // mappings nor metadata cached by this file's fs.Inode.
+ if canMap(file.Dirent.Inode) {
+ panic("files that can return EWOULDBLOCK cannot be memory mapped")
+ }
+ // Ignore the offset, these files don't support writing at
+ // an arbitrary offset.
+ writer := fd.NewReadWriter(f.iops.fileState.FD())
+ n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer})
+ if isBlockError(err) {
+ err = syserror.ErrWouldBlock
+ }
+ return n, err
+ }
+ if !file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+ writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+ return src.CopyInTo(ctx, safemem.FromIOWriter{writer})
+ }
+ return f.iops.cachingInodeOps.Write(ctx, src, offset)
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ // Would this file block?
+ if f.iops.ReturnsWouldBlock() {
+ // These files can't be memory mapped, assert this. This also
+ // means that reads do not need to synchronize with memory
+ // mappings nor metadata cached by this file's fs.Inode.
+ if canMap(file.Dirent.Inode) {
+ panic("files that can return EWOULDBLOCK cannot be memory mapped")
+ }
+ // Ignore the offset, these files don't support reading at
+ // an arbitrary offset.
+ reader := fd.NewReadWriter(f.iops.fileState.FD())
+ n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader})
+ if isBlockError(err) {
+ // If we got any data at all, return it as a "completed" partial read
+ // rather than retrying until complete.
+ if n != 0 {
+ err = nil
+ } else {
+ err = syserror.ErrWouldBlock
+ }
+ }
+ return n, err
+ }
+ if !file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+ reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+ return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader})
+ }
+ return f.iops.cachingInodeOps.Read(ctx, file, dst, offset)
+}
+
+// Fsync implements fs.FileOperations.Fsync.
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+ switch syncType {
+ case fs.SyncAll, fs.SyncData:
+ if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
+ return err
+ }
+ fallthrough
+ case fs.SyncBackingStorage:
+ return syscall.Fsync(f.iops.fileState.FD())
+ }
+ panic("invalid sync type")
+}
+
+// Flush implements fs.FileOperations.Flush.
+func (f *fileOperations) Flush(context.Context, *fs.File) error {
+ // This is a no-op because flushing the resource backing this
+ // file would mean closing it. We can't do that because other
+ // open files may depend on the backing host fd.
+ return nil
+}
+
+// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
+func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
+ if !canMap(file.Dirent.Inode) {
+ return syserror.ENODEV
+ }
+ return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts)
+}
+
+// Seek implements fs.FileOperations.Seek.
+func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+ return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor)
+}
+
+// Ioctl implements fs.FileOperations.Iocotl.
+func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ if !f.allowIoctl {
+ return 0, syserror.ENOTTY
+ }
+ // Ignore arg[0]. This is the real FD:
+ fd := f.iops.fileState.FD()
+ ioctl := args[1].Uint64()
+ switch ioctl {
+ case unix.TCGETS:
+ termios, err := ioctlGetTermios(fd)
+ if err != nil {
+ return 0, err
+ }
+ _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case unix.TCSETS, unix.TCSETSW:
+ var termios linux.Termios
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+ err := ioctlSetTermios(fd, ioctl, &termios)
+ return 0, err
+
+ case unix.TIOCGPGRP:
+ // Args: pid_t *argp
+ // When successful, equivalent to *argp = tcgetpgrp(fd).
+ // Get the process group ID of the foreground process group on
+ // this terminal.
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ panic(fmt.Sprintf("cannot get thread group from context %v", ctx))
+ }
+ tid := t.ThreadID()
+ _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case unix.TIOCSPGRP:
+ // Args: const pid_t *argp
+ // Equivalent to tcsetpgrp(fd, *argp).
+ // Set the foreground process group ID of this terminal.
+
+ // Not much we can do with this one at the moment, so we just
+ // lie and pretend everything is great. Bash and Sh seem fine
+ // with this.
+ log.Warningf("Ignoring application ioctl(TIOCSPGRP) call")
+ return 0, nil
+
+ case unix.TIOCGWINSZ:
+ // Args: struct winsize *argp
+ // Get window size.
+ winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ)
+ if err != nil {
+ return 0, err
+ }
+ _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case unix.TIOCSWINSZ:
+ // Args: const struct winsize *argp
+ // Set window size.
+ var winsize unix.Winsize
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+ err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize)
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
new file mode 100644
index 000000000..ffd55a5ab
--- /dev/null
+++ b/pkg/sentry/fs/host/fs.go
@@ -0,0 +1,327 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host implements an fs.Filesystem for files backed by host
+// file descriptors.
+package host
+
+import (
+ "fmt"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FilesystemName is the name under which Filesystem is registered.
+const FilesystemName = "whitelistfs"
+
+const (
+ // whitelistKey is the mount option containing a comma-separated list
+ // of host paths to whitelist.
+ whitelistKey = "whitelist"
+
+ // rootPathKey is the mount option containing the root path of the
+ // mount.
+ rootPathKey = "root"
+
+ // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
+ dontTranslateOwnershipKey = "dont_translate_ownership"
+)
+
+// maxTraversals determines link traversals in building the whitelist.
+const maxTraversals = 10
+
+// Filesystem is a pseudo file system that is only available during the setup
+// to lock down the configurations. This filesystem should only be mounted at root.
+//
+// Think twice before exposing this to applications.
+type Filesystem struct {
+ // whitelist is a set of host paths to whitelist.
+ paths []string
+}
+
+// Name is the identifier of this file system.
+func (*Filesystem) Name() string {
+ return FilesystemName
+}
+
+// AllowUserMount prohibits users from using mount(2) with this file system.
+func (*Filesystem) AllowUserMount() bool {
+ return false
+}
+
+// Flags returns that there is nothing special about this file system.
+func (*Filesystem) Flags() fs.FilesystemFlags {
+ return 0
+}
+
+// Mount returns an fs.Inode exposing the host file system. It is intended to be locked
+// down in PreExec below.
+func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+ // Parse generic comma-separated key=value options.
+ options := fs.GenericMountSourceOptions(data)
+
+ // Grab the whitelist if one was specified.
+ // TODO: require another option "testonly" in order to allow
+ // no whitelist.
+ if wl, ok := options[whitelistKey]; ok {
+ f.paths = strings.Split(wl, "|")
+ delete(options, whitelistKey)
+ }
+
+ // If the rootPath was set, use it. Othewise default to the root of the
+ // host fs.
+ rootPath := "/"
+ if rp, ok := options[rootPathKey]; ok {
+ rootPath = rp
+ delete(options, rootPathKey)
+
+ // We must relativize the whitelisted paths to the new root.
+ for i, p := range f.paths {
+ rel, err := filepath.Rel(rootPath, p)
+ if err != nil {
+ return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
+ }
+ f.paths[i] = path.Join("/", rel)
+ }
+ }
+ fd, err := open(nil, rootPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to find root: %v", err)
+ }
+
+ var dontTranslateOwnership bool
+ if v, ok := options[dontTranslateOwnershipKey]; ok {
+ b, err := strconv.ParseBool(v)
+ if err != nil {
+ return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
+ }
+ dontTranslateOwnership = b
+ delete(options, dontTranslateOwnershipKey)
+ }
+
+ // Fail if the caller passed us more options than we know about.
+ if len(options) > 0 {
+ return nil, fmt.Errorf("unsupported mount options: %v", options)
+ }
+
+ // The mounting EUID/EGID will be cached by this file system. This will
+ // be used to assign ownership to files that we own.
+ owner := fs.FileOwnerFromContext(ctx)
+
+ // Construct the host file system mount and inode.
+ msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
+ return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
+}
+
+// InstallWhitelist locks down the MountNamespace to only the currently installed
+// Dirents and the given paths.
+func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
+ return installWhitelist(ctx, m, f.paths)
+}
+
+func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
+ if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
+ // Warning will be logged during filter installation if the empty
+ // whitelist matters (allows for host file access).
+ return nil
+ }
+
+ // Done tracks entries already added.
+ done := make(map[string]bool)
+ root := m.Root()
+ defer root.DecRef()
+
+ for i := 0; i < len(paths); i++ {
+ // Make sure the path is absolute. This is a sanity check.
+ if !path.IsAbs(paths[i]) {
+ return fmt.Errorf("path %q is not absolute", paths[i])
+ }
+
+ // We need to add all the intermediate paths, in case one of
+ // them is a symlink that needs to be resolved.
+ for j := 1; j <= len(paths[i]); j++ {
+ if j < len(paths[i]) && paths[i][j] != '/' {
+ continue
+ }
+ current := paths[i][:j]
+
+ // Lookup the given component in the tree.
+ d, err := m.FindLink(ctx, root, nil, current, maxTraversals)
+ if err != nil {
+ log.Warningf("populate failed for %q: %v", current, err)
+ continue
+ }
+
+ // It's critical that this DecRef happens after the
+ // freeze below. This ensures that the dentry is in
+ // place to be frozen. Otherwise, we freeze without
+ // these entries.
+ defer d.DecRef()
+
+ // Expand the last component if necessary.
+ if current == paths[i] {
+ // Is it a directory or symlink?
+ sattr := d.Inode.StableAttr
+ if fs.IsDir(sattr) {
+ for name := range childDentAttrs(ctx, d) {
+ paths = append(paths, path.Join(current, name))
+ }
+ }
+ if fs.IsSymlink(sattr) {
+ // Only expand symlinks once. The
+ // folder structure may contain
+ // recursive symlinks and we don't want
+ // to end up infinitely expanding this
+ // symlink. This is safe because this
+ // is the last component. If a later
+ // path wants to symlink something
+ // beneath this symlink that will still
+ // be handled by the FindLink above.
+ if done[current] {
+ continue
+ }
+
+ s, err := d.Inode.Readlink(ctx)
+ if err != nil {
+ log.Warningf("readlink failed for %q: %v", current, err)
+ continue
+ }
+ if path.IsAbs(s) {
+ paths = append(paths, s)
+ } else {
+ target := path.Join(path.Dir(current), s)
+ paths = append(paths, target)
+ }
+ }
+ }
+
+ // Only report this one once even though we may look
+ // it up more than once. If we whitelist /a/b,/a then
+ // /a will be "done" when it is looked up for /a/b,
+ // however we still need to expand all of its contents
+ // when whitelisting /a.
+ if !done[current] {
+ log.Debugf("whitelisted: %s", current)
+ }
+ done[current] = true
+ }
+ }
+
+ // Freeze the mount tree in place. This prevents any new paths from
+ // being opened and any old ones from being removed. If we do provide
+ // tmpfs mounts, we'll want to freeze/thaw those separately.
+ m.Freeze()
+ return nil
+}
+
+func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
+ dirname, _ := d.FullName(nil /* root */)
+ dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+ if err != nil {
+ log.Warningf("failed to open directory %q: %v", dirname, err)
+ return nil
+ }
+ dir.DecRef()
+ var stubSerializer fs.CollectEntriesSerializer
+ if err := dir.Readdir(ctx, &stubSerializer); err != nil {
+ log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
+ return nil
+ }
+ delete(stubSerializer.Entries, ".")
+ delete(stubSerializer.Entries, "..")
+ return stubSerializer.Entries
+}
+
+// newMountSource constructs a new host fs.MountSource
+// relative to a root path. The root should match the mount point.
+func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
+ return fs.NewMountSource(&superOperations{
+ root: root,
+ inodeMappings: make(map[uint64]string),
+ mounter: mounter,
+ dontTranslateOwnership: dontTranslateOwnership,
+ }, filesystem, flags)
+}
+
+// superOperations implements fs.MountSourceOperations.
+type superOperations struct {
+ fs.SimpleMountSourceOperations `state:"nosave"`
+
+ // root is the path of the mount point. All inode mappings
+ // are relative to this root.
+ root string
+
+ // inodeMappings contains mappings of fs.Inodes associated
+ // with this MountSource to paths under root.
+ inodeMappings map[uint64]string
+
+ // mounter is the cached EUID/EGID that mounted this file system.
+ mounter fs.FileOwner
+
+ // dontTranslateOwnership indicates whether to not translate file
+ // ownership.
+ //
+ // By default, files/directories owned by the sandbox uses UID/GID
+ // of the mounter. For files/directories that are not owned by the
+ // sandbox, file UID/GID is translated to a UID/GID which cannot
+ // be mapped in the sandboxed application's user namespace. The
+ // UID/GID will look like the nobody UID/GID (65534) but is not
+ // strictly owned by the user "nobody".
+ //
+ // If whitelistfs is a lower filesystem in an overlay, set
+ // dont_translate_ownership=true in mount options.
+ dontTranslateOwnership bool
+}
+
+var _ fs.MountSourceOperations = (*superOperations)(nil)
+
+// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
+func (m *superOperations) ResetInodeMappings() {
+ m.inodeMappings = make(map[uint64]string)
+}
+
+// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
+func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
+ // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
+ // because overlay copyUp may have changed them out from under us.
+ // So much for "immutable".
+ sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
+ m.inodeMappings[sattr.InodeID] = path
+}
+
+// Keep implements fs.MountSourceOperations.Keep.
+//
+// TODO: It is possible to change the permissions on a
+// host file while it is in the dirent cache (say from RO to RW), but it is not
+// possible to re-open the file with more relaxed permissions, since the host
+// FD is already open and stored in the inode.
+//
+// Using the dirent LRU cache increases the odds that this bug is encountered.
+// Since host file access is relatively fast anyways, we disable the LRU cache
+// for host fs files. Once we can properly deal with permissions changes and
+// re-opening host files, we should revisit whether or not to make use of the
+// LRU cache.
+func (*superOperations) Keep(*fs.Dirent) bool {
+ return false
+}
+
+func init() {
+ fs.RegisterFilesystem(&Filesystem{})
+}
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go
new file mode 100644
index 000000000..c000afc49
--- /dev/null
+++ b/pkg/sentry/fs/host/fs_test.go
@@ -0,0 +1,383 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path"
+ "reflect"
+ "sort"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// newTestMountNamespace creates a MountNamespace with a ramfs root.
+// It returns the host folder created, which should be removed when done.
+func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) {
+ p, err := ioutil.TempDir("", "root")
+ if err != nil {
+ return nil, "", err
+ }
+
+ fd, err := open(nil, p)
+ if err != nil {
+ os.RemoveAll(p)
+ return nil, "", err
+ }
+ ctx := contexttest.Context(t)
+ root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
+ if err != nil {
+ os.RemoveAll(p)
+ return nil, "", err
+ }
+ mm, err := fs.NewMountNamespace(ctx, root)
+ if err != nil {
+ os.RemoveAll(p)
+ return nil, "", err
+ }
+ return mm, p, nil
+}
+
+// createTestDirs populates the root with some test files and directories.
+// /a/a1.txt
+// /a/a2.txt
+// /b/b1.txt
+// /b/c/c1.txt
+// /symlinks/normal.txt
+// /symlinks/to_normal.txt -> /symlinks/normal.txt
+// /symlinks/recursive -> /symlinks
+func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error {
+ r := m.Root()
+ defer r.DecRef()
+
+ if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil {
+ return err
+ }
+
+ a, err := r.Walk(ctx, r, "a")
+ if err != nil {
+ return err
+ }
+ defer a.DecRef()
+
+ a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+ if err != nil {
+ return err
+ }
+ a1.DecRef()
+
+ a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+ if err != nil {
+ return err
+ }
+ a2.DecRef()
+
+ if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil {
+ return err
+ }
+
+ b, err := r.Walk(ctx, r, "b")
+ if err != nil {
+ return err
+ }
+ defer b.DecRef()
+
+ b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+ if err != nil {
+ return err
+ }
+ b1.DecRef()
+
+ if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil {
+ return err
+ }
+
+ c, err := b.Walk(ctx, r, "c")
+ if err != nil {
+ return err
+ }
+ defer c.DecRef()
+
+ c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+ if err != nil {
+ return err
+ }
+ c1.DecRef()
+
+ if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil {
+ return err
+ }
+
+ symlinks, err := r.Walk(ctx, r, "symlinks")
+ if err != nil {
+ return err
+ }
+ defer symlinks.DecRef()
+
+ normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666))
+ if err != nil {
+ return err
+ }
+ normal.DecRef()
+
+ if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil {
+ return err
+ }
+
+ if err := symlinks.CreateLink(ctx, r, "/symlinks", "recursive"); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// allPaths returns a slice of all paths of entries visible in the rootfs.
+func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) {
+ var paths []string
+ root := m.Root()
+ defer root.DecRef()
+
+ d, err := m.FindLink(ctx, root, nil, base, 1)
+ if err != nil {
+ t.Logf("FindLink failed for %q", base)
+ return paths, err
+ }
+ defer d.DecRef()
+
+ if fs.IsDir(d.Inode.StableAttr) {
+ dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+ if err != nil {
+ return nil, fmt.Errorf("failed to open directory %q: %v", base, err)
+ }
+ iter, ok := dir.FileOperations.(fs.DirIterator)
+ if !ok {
+ return nil, fmt.Errorf("cannot directly iterate on host directory %q", base)
+ }
+ dirCtx := &fs.DirCtx{
+ Serializer: noopDentrySerializer{},
+ }
+ if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil {
+ return nil, err
+ }
+ for name := range dirCtx.DentAttrs() {
+ if name == "." || name == ".." {
+ continue
+ }
+
+ fullName := path.Join(base, name)
+ paths = append(paths, fullName)
+
+ // Recurse.
+ subpaths, err := allPaths(ctx, t, m, fullName)
+ if err != nil {
+ return paths, err
+ }
+ paths = append(paths, subpaths...)
+ }
+ }
+
+ return paths, nil
+}
+
+type noopDentrySerializer struct{}
+
+func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error {
+ return nil
+}
+func (noopDentrySerializer) Written() int {
+ return 4096
+}
+
+// pathsEqual returns true if the two string slices contain the same entries.
+func pathsEqual(got, want []string) bool {
+ sort.Strings(got)
+ sort.Strings(want)
+
+ if len(got) != len(want) {
+ return false
+ }
+
+ for i := range got {
+ if got[i] != want[i] {
+ return false
+ }
+ }
+
+ return true
+}
+
+func TestWhitelist(t *testing.T) {
+ for _, test := range []struct {
+ // description of the test.
+ desc string
+ // paths are the paths to whitelist
+ paths []string
+ // want are all of the directory entries that should be
+ // visible (nothing beyond this set should be visible).
+ want []string
+ }{
+ {
+ desc: "root",
+ paths: []string{"/"},
+ want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"},
+ },
+ {
+ desc: "top-level directories",
+ paths: []string{"/a", "/b"},
+ want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+ },
+ {
+ desc: "nested directories (1/2)",
+ paths: []string{"/b", "/b/c"},
+ want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+ },
+ {
+ desc: "nested directories (2/2)",
+ paths: []string{"/b/c", "/b"},
+ want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"},
+ },
+ {
+ desc: "single file",
+ paths: []string{"/b/c/c1.txt"},
+ want: []string{"/b", "/b/c", "/b/c/c1.txt"},
+ },
+ {
+ desc: "single file and directory",
+ paths: []string{"/a/a1.txt", "/b/c"},
+ want: []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"},
+ },
+ {
+ desc: "symlink",
+ paths: []string{"/symlinks/to_normal.txt"},
+ want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"},
+ },
+ {
+ desc: "recursive symlink",
+ paths: []string{"/symlinks/recursive/normal.txt"},
+ want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"},
+ },
+ } {
+ t.Run(test.desc, func(t *testing.T) {
+ m, p, err := newTestMountNamespace(t)
+ if err != nil {
+ t.Errorf("Failed to create MountNamespace: %v", err)
+ }
+ defer os.RemoveAll(p)
+
+ ctx := withRoot(contexttest.RootContext(t), m.Root())
+ if err := createTestDirs(ctx, t, m); err != nil {
+ t.Errorf("Failed to create test dirs: %v", err)
+ }
+
+ if err := installWhitelist(ctx, m, test.paths); err != nil {
+ t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err)
+ }
+
+ got, err := allPaths(ctx, t, m, "/")
+ if err != nil {
+ t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err)
+ }
+
+ if !pathsEqual(got, test.want) {
+ t.Errorf("For paths %v got %v want %v", test.paths, got, test.want)
+ }
+ })
+ }
+}
+
+func TestRootPath(t *testing.T) {
+ // Create a temp dir, which will be the root of our mounted fs.
+ rootPath, err := ioutil.TempDir(os.TempDir(), "root")
+ if err != nil {
+ t.Fatalf("TempDir failed: %v", err)
+ }
+ defer os.RemoveAll(rootPath)
+
+ // Create two files inside the new root, one which will be whitelisted
+ // and one not.
+ whitelisted, err := ioutil.TempFile(rootPath, "white")
+ if err != nil {
+ t.Fatalf("TempFile failed: %v", err)
+ }
+ if _, err := ioutil.TempFile(rootPath, "black"); err != nil {
+ t.Fatalf("TempFile failed: %v", err)
+ }
+
+ // Create a mount with a root path and single whitelisted file.
+ hostFS := &Filesystem{}
+ ctx := contexttest.Context(t)
+ data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name())
+ inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data)
+ if err != nil {
+ t.Fatalf("Mount failed: %v", err)
+ }
+ mm, err := fs.NewMountNamespace(ctx, inode)
+ if err != nil {
+ t.Fatalf("NewMountNamespace failed: %v", err)
+ }
+ if err := hostFS.InstallWhitelist(ctx, mm); err != nil {
+ t.Fatalf("InstallWhitelist failed: %v", err)
+ }
+
+ // Get the contents of the root directory.
+ rootDir := mm.Root()
+ rctx := withRoot(ctx, rootDir)
+ f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{})
+ if err != nil {
+ t.Fatalf("GetFile failed: %v", err)
+ }
+ c := &fs.CollectEntriesSerializer{}
+ if err := f.Readdir(rctx, c); err != nil {
+ t.Fatalf("Readdir failed: %v", err)
+ }
+
+ // We should have only our whitelisted file, plus the dots.
+ want := []string{path.Base(whitelisted.Name()), ".", ".."}
+ got := c.Order
+ sort.Strings(want)
+ sort.Strings(got)
+ if !reflect.DeepEqual(got, want) {
+ t.Errorf("Readdir got %v, wanted %v", got, want)
+ }
+}
+
+type rootContext struct {
+ context.Context
+ root *fs.Dirent
+}
+
+// withRoot returns a copy of ctx with the given root.
+func withRoot(ctx context.Context, root *fs.Dirent) context.Context {
+ return &rootContext{
+ Context: ctx,
+ root: root,
+ }
+}
+
+// Value implements Context.Value.
+func (rc rootContext) Value(key interface{}) interface{} {
+ switch key {
+ case fs.CtxRoot:
+ rc.root.IncRef()
+ return rc.root
+ default:
+ return rc.Context.Value(key)
+ }
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
new file mode 100644
index 000000000..226bc5164
--- /dev/null
+++ b/pkg/sentry/fs/host/inode.go
@@ -0,0 +1,506 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/secio"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// inodeOperations implements fs.InodeOperations for an fs.Inodes backed
+// by a host file descriptor.
+type inodeOperations struct {
+ fsutil.InodeNotVirtual `state:"nosave"`
+ fsutil.InodeNoExtendedAttributes `state:"nosave"`
+ fsutil.DeprecatedFileOperations `state:"nosave"`
+
+ // fileState implements fs.CachedFileObject. It exists
+ // to break a circular load dependency between inodeOperations
+ // and cachingInodeOps (below).
+ fileState *inodeFileState `state:"wait"`
+
+ // cachedInodeOps implements memmap.Mappable.
+ cachingInodeOps *fsutil.CachingInodeOperations
+
+ // readdirMu protects the file offset on the host FD. This is needed
+ // for readdir because getdents must use the kernel offset, so
+ // concurrent readdirs must be exclusive.
+ //
+ // All read/write functions pass the offset directly to the kernel and
+ // thus don't need a lock.
+ readdirMu sync.Mutex `state:"nosave"`
+}
+
+// inodeFileState implements fs.CachedFileObject and otherwise fully
+// encapsulates state that needs to be manually loaded on restore for
+// this file object.
+//
+// This unfortunate structure exists because fs.CachingInodeOperations
+// defines afterLoad and therefore cannot be lazily loaded (to break a
+// circular load dependency between it and inodeOperations). Even with
+// lazy loading, this approach defines the dependencies between objects
+// and the expected load behavior more concretely.
+type inodeFileState struct {
+ // Common file system state.
+ mops *superOperations `state:"wait"`
+
+ // descriptor is the backing host fd.
+ descriptor *descriptor `state:"wait"`
+
+ // Event queue for blocking operations.
+ queue waiter.Queue `state:"nosave"`
+
+ // sattr is used to restore the inodeOperations.
+ sattr fs.StableAttr `state:"wait"`
+
+ // savedUAttr is only allocated during S/R. It points to the save-time
+ // unstable attributes and is used to validate restore-time ones.
+ //
+ // Note that these unstable attributes are only used to detect cross-S/R
+ // external file system metadata changes. They may differ from the
+ // cached unstable attributes in cachingInodeOps, as that might differ
+ // from the external file system attributes if there had been WriteOut
+ // failures. S/R is transparent to Sentry and the latter will continue
+ // using its cached values after restore.
+ savedUAttr *fs.UnstableAttr
+}
+
+// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
+func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+ // TODO: Using safemem.FromIOReader here is wasteful for two
+ // reasons:
+ //
+ // - Using preadv instead of iterated preads saves on host system calls.
+ //
+ // - Host system calls can handle destination memory that would fault in
+ // gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true),
+ // so the buffering performed by FromIOReader is unnecessary.
+ //
+ // This also applies to the write path below.
+ return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts)
+}
+
+// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
+func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+ return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs)
+}
+
+// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
+ if mask.Empty() {
+ return nil
+ }
+ if mask.UID || mask.GID {
+ return syserror.EPERM
+ }
+ if mask.Perms {
+ if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil {
+ return err
+ }
+ }
+ if mask.Size {
+ if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil {
+ return err
+ }
+ }
+ if mask.AccessTime || mask.ModificationTime {
+ ts := fs.TimeSpec{
+ ATime: attr.AccessTime,
+ ATimeOmit: !mask.AccessTime,
+ MTime: attr.ModificationTime,
+ MTimeOmit: !mask.ModificationTime,
+ }
+ if err := setTimestamps(i.FD(), ts); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Sync implements fsutil.CachedFileObject.Sync.
+func (i *inodeFileState) Sync(ctx context.Context) error {
+ return syscall.Fsync(i.FD())
+}
+
+// FD implements fsutil.CachedFileObject.FD.
+func (i *inodeFileState) FD() int {
+ return i.descriptor.value
+}
+
+func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.FD(), &s); err != nil {
+ return fs.UnstableAttr{}, err
+ }
+ return unstableAttr(i.mops, &s), nil
+}
+
+// inodeOperations implements fs.InodeOperations.
+var _ fs.InodeOperations = (*inodeOperations)(nil)
+
+// newInode returns a new fs.Inode backed by the host fd.
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+ // Retrieve metadata.
+ var s syscall.Stat_t
+ err := syscall.Fstat(fd, &s)
+ if err != nil {
+ return nil, err
+ }
+
+ fileState := &inodeFileState{
+ mops: msrc.MountSourceOperations.(*superOperations),
+ sattr: stableAttr(&s),
+ }
+
+ // Initialize the wrapped host file descriptor.
+ fileState.descriptor, err = newDescriptor(
+ fd,
+ donated,
+ saveable,
+ wouldBlock(&s),
+ &fileState.queue,
+ )
+ if err != nil {
+ return nil, err
+ }
+
+ // Build the fs.InodeOperations.
+ uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
+ iops := &inodeOperations{
+ fileState: fileState,
+ cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+ }
+
+ // Return the fs.Inode.
+ return fs.NewInode(iops, msrc, fileState.sattr), nil
+}
+
+// Mappable implements fs.InodeOperations.Mappable.
+func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable {
+ if !canMap(inode) {
+ return nil
+ }
+ return i.cachingInodeOps
+}
+
+// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK
+// for operations that would block.
+func (i *inodeOperations) ReturnsWouldBlock() bool {
+ return i.fileState.descriptor.wouldBlock
+}
+
+// Release implements fs.InodeOperations.Release.
+func (i *inodeOperations) Release(context.Context) {
+ i.fileState.descriptor.Release()
+ i.cachingInodeOps.Release()
+}
+
+// Lookup implements fs.InodeOperations.Lookup.
+func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
+ // Get a new fd relative to i at name.
+ fd, err := open(i, name)
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, syserror.ENOENT
+ }
+ return nil, err
+ }
+
+ inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+ if err != nil {
+ return nil, err
+ }
+
+ // Return the fs.Dirent.
+ return fs.NewDirent(inode, name), nil
+}
+
+// Create implements fs.InodeOperations.Create.
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
+ // Create a file relative to i at name.
+ //
+ // N.B. We always open this file O_RDWR regardless of flags because a
+ // future GetFile might want more access. Open allows this regardless
+ // of perm.
+ fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
+ if err != nil {
+ return nil, err
+ }
+
+ inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+ if err != nil {
+ return nil, err
+ }
+
+ d := fs.NewDirent(inode, name)
+ defer d.DecRef()
+ return inode.GetFile(ctx, d, flags)
+}
+
+// CreateDirectory implements fs.InodeOperations.CreateDirectory.
+func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+ return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+}
+
+// CreateLink implements fs.InodeOperations.CreateLink.
+func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
+ return createLink(i.fileState.FD(), oldname, newname)
+}
+
+// CreateHardLink implements fs.InodeOperations.CreateHardLink.
+func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
+ return syserror.EPERM
+}
+
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+ return syserror.EOPNOTSUPP
+}
+
+// Remove implements fs.InodeOperations.Remove.
+func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
+ return unlinkAt(i.fileState.FD(), name, false /* dir */)
+}
+
+// RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
+func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
+ return unlinkAt(i.fileState.FD(), name, true /* dir */)
+}
+
+// Rename implements fs.InodeOperations.Rename.
+func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error {
+ op, ok := oldParent.InodeOperations.(*inodeOperations)
+ if !ok {
+ return syscall.EXDEV
+ }
+ np, ok := newParent.InodeOperations.(*inodeOperations)
+ if !ok {
+ return syscall.EXDEV
+ }
+ return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName)
+}
+
+// Bind implements fs.InodeOperations.Bind.
+func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error {
+ return syserror.EOPNOTSUPP
+}
+
+// BoundEndpoint implements fs.InodeOperations.BoundEndpoint.
+func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ return newFile(ctx, d, flags, i, false), nil
+}
+
+// canMap returns true if this fs.Inode can be memory mapped.
+func canMap(inode *fs.Inode) bool {
+ // FIXME: Some obscure character devices can be mapped.
+ return fs.IsFile(inode.StableAttr)
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+ // When the kernel supports mapping host FDs, we do so to take
+ // advantage of the host page cache. We forego updating fs.Inodes
+ // because the host manages consistency of its own inode structures.
+ //
+ // For fs.Inodes that can never be mapped we take advantage of
+ // synchronizing metadata updates through host caches.
+ //
+ // So can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just obtain the attributes.
+ return i.fileState.unstableAttr(ctx)
+ }
+ // No, we're maintaining consistency of metadata ourselves.
+ return i.cachingInodeOps.UnstableAttr(ctx, inode)
+}
+
+// Check implements fs.InodeOperations.Check.
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+ return syserror.EPERM
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool {
+ // Can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just change the timestamps on the fd, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil
+ }
+ // Otherwise update our cached metadata.
+ return i.cachingInodeOps.SetPermissions(ctx, inode, f)
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
+ // Can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just change the timestamps on the fd, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return setTimestamps(i.fileState.FD(), ts)
+ }
+ // Otherwise update our cached metadata.
+ return i.cachingInodeOps.SetTimestamps(ctx, inode, ts)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
+ // Is the file not memory-mappable?
+ if !canMap(inode) {
+ // Then just change the file size on the fd, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return syscall.Ftruncate(i.fileState.FD(), size)
+ }
+ // Otherwise we need to go through cachingInodeOps, even if the host page
+ // cache is in use, to invalidate private copies of truncated pages.
+ return i.cachingInodeOps.Truncate(ctx, inode, size)
+}
+
+// WriteOut implements fs.InodeOperations.WriteOut.
+func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+ // Have we been using host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then the metadata is already up to date on the host.
+ return nil
+ }
+ // Otherwise we need to write out cached pages and attributes
+ // that are dirty.
+ return i.cachingInodeOps.WriteOut(ctx, inode)
+}
+
+// Readlink implements fs.InodeOperations.Readlink.
+func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ return readLink(i.fileState.FD())
+}
+
+// Getlink implements fs.InodeOperations.Getlink.
+func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+ if !fs.IsSymlink(i.fileState.sattr) {
+ return nil, syserror.ENOLINK
+ }
+ return nil, fs.ErrResolveViaReadlink
+}
+
+// StatFS implements fs.InodeOperations.StatFS.
+func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
+ return fs.Info{}, syserror.ENOSYS
+}
+
+// AddLink implements fs.InodeOperations.AddLink.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
+// FIXME: Remove this from InodeOperations altogether.
+func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
+
+// readdirAll returns all of the directory entries in i.
+func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
+ i.readdirMu.Lock()
+ defer i.readdirMu.Unlock()
+
+ fd := i.fileState.FD()
+
+ // syscall.ReadDirent will use getdents, which will seek the file past
+ // the last directory entry. To read the directory entries a second
+ // time, we need to seek back to the beginning.
+ if _, err := syscall.Seek(fd, 0, 0); err != nil {
+ if err == syscall.ESPIPE {
+ // All directories should be seekable. If this file
+ // isn't seekable, it is not a directory and we should
+ // return that more sane error.
+ err = syscall.ENOTDIR
+ }
+ return nil, err
+ }
+
+ names := make([]string, 0, 100)
+ for {
+ // Refill the buffer if necessary
+ if d.bufp >= d.nbuf {
+ d.bufp = 0
+ // ReadDirent will just do a sys_getdents64 to the kernel.
+ n, err := syscall.ReadDirent(fd, d.buf)
+ if err != nil {
+ return nil, err
+ }
+ if n == 0 {
+ break // EOF
+ }
+ d.nbuf = n
+ }
+
+ var nb int
+ // Parse the dirent buffer we just get and return the directory names along
+ // with the number of bytes consumed in the buffer.
+ nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
+ d.bufp += nb
+ }
+
+ entries := make(map[string]fs.DentAttr)
+ for _, filename := range names {
+ // Lookup the type and host device and inode.
+ stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
+ if lerr == syscall.ENOENT {
+ // File disappeared between readdir and lstat.
+ // Just treat it as if it didn't exist.
+ continue
+ }
+
+ // There was a serious problem, we should probably report it.
+ if lerr != nil {
+ return nil, lerr
+ }
+
+ entries[filename] = fs.DentAttr{
+ Type: nodeType(&stat),
+ InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+ Device: stat.Dev,
+ Inode: stat.Ino,
+ }),
+ }
+ }
+ return entries, nil
+}
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
new file mode 100644
index 000000000..80066512a
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inodeFileState) beforeSave() {
+ if !i.queue.IsEmpty() {
+ panic("event queue must be empty")
+ }
+ if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+ uattr, err := i.unstableAttr(context.Background())
+ if err != nil {
+ panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err))
+ }
+ i.savedUAttr = &uattr
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodeFileState) afterLoad() {
+ // Initialize the descriptor value.
+ if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+ panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
+ }
+
+ // Remap the inode number.
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.FD(), &s); err != nil {
+ panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err))
+ }
+ key := device.MultiDeviceKey{
+ Device: s.Dev,
+ Inode: s.Ino,
+ }
+ if !hostFileDevice.Load(key, i.sattr.InodeID) {
+ // This means there was a conflict at s.Dev and s.Ino with
+ // another inode mapping: two files that were unique on the
+ // saved filesystem are no longer unique on this filesystem.
+ // Since this violates the contract that filesystems cannot
+ // change across save and restore, error out.
+ panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice))
+ }
+
+ if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+ env, ok := fs.CurrentRestoreEnvironment()
+ if !ok {
+ panic("missing restore environment")
+ }
+ uattr := unstableAttr(i.mops, &s)
+ if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size {
+ panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size))
+ }
+ if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime {
+ panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime))
+ }
+ i.savedUAttr = nil
+ }
+}
diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go
new file mode 100644
index 000000000..0ff87c418
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_test.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "io/ioutil"
+ "os"
+ "path"
+ "syscall"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// TestMultipleReaddir verifies that multiple Readdir calls return the same
+// thing if they use different dir contexts.
+func TestMultipleReaddir(t *testing.T) {
+ p, err := ioutil.TempDir("", "readdir")
+ if err != nil {
+ t.Fatalf("Failed to create test dir: %v", err)
+ }
+ defer os.RemoveAll(p)
+
+ f, err := os.Create(path.Join(p, "a.txt"))
+ if err != nil {
+ t.Fatalf("Failed to create a.txt: %v", err)
+ }
+ f.Close()
+
+ f, err = os.Create(path.Join(p, "b.txt"))
+ if err != nil {
+ t.Fatalf("Failed to create b.txt: %v", err)
+ }
+ f.Close()
+
+ fd, err := open(nil, p)
+ if err != nil {
+ t.Fatalf("Failed to open %q: %v", p, err)
+ }
+ ctx := contexttest.Context(t)
+ n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false)
+ if err != nil {
+ t.Fatalf("Failed to create inode: %v", err)
+ }
+
+ dirent := fs.NewDirent(n, "readdir")
+ openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true})
+ if err != nil {
+ t.Fatalf("Failed to get file: %v", err)
+ }
+ defer openFile.DecRef()
+
+ c1 := &fs.DirCtx{DirCursor: new(string)}
+ if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil {
+ t.Fatalf("First Readdir failed: %v", err)
+ }
+
+ c2 := &fs.DirCtx{DirCursor: new(string)}
+ if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil {
+ t.Errorf("Second Readdir failed: %v", err)
+ }
+
+ if _, ok := c1.DentAttrs()["a.txt"]; !ok {
+ t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs())
+ }
+ if _, ok := c1.DentAttrs()["b.txt"]; !ok {
+ t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs())
+ }
+
+ if _, ok := c2.DentAttrs()["a.txt"]; !ok {
+ t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs())
+ }
+ if _, ok := c2.DentAttrs()["b.txt"]; !ok {
+ t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs())
+ }
+}
+
+// TestCloseFD verifies fds will be closed.
+func TestCloseFD(t *testing.T) {
+ var p [2]int
+ if err := syscall.Pipe(p[0:]); err != nil {
+ t.Fatalf("Failed to create pipe %v", err)
+ }
+ defer syscall.Close(p[0])
+ defer syscall.Close(p[1])
+
+ // Use the write-end because we will detect if it's closed on the read end.
+ ctx := contexttest.Context(t)
+ file, err := NewFile(ctx, p[1], fs.RootOwner)
+ if err != nil {
+ t.Fatalf("Failed to create File: %v", err)
+ }
+ file.DecRef()
+
+ s := make([]byte, 10)
+ if c, err := syscall.Read(p[0], s); c != 0 || err != nil {
+ t.Errorf("want 0, nil (EOF) from read end, got %v, %v", c, err)
+ }
+}
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
new file mode 100644
index 000000000..3c07c3850
--- /dev/null
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+ var t linux.Termios
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t)))
+ if errno != 0 {
+ return nil, errno
+ }
+ return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
new file mode 100644
index 000000000..8e36ed7ee
--- /dev/null
+++ b/pkg/sentry/fs/host/socket.go
@@ -0,0 +1,471 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+ unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+ "gvisor.googlesource.com/gvisor/pkg/unet"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+ "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+// endpoint encapsulates the state needed to represent a host Unix socket.
+type endpoint struct {
+ queue waiter.Queue `state:"nosave"`
+
+ // stype is the type of Unix socket. (Ex: unix.SockStream,
+ // unix.SockSeqpacket, unix.SockDgram)
+ stype unix.SockType `state:"nosave"`
+
+ // fd is the host fd backing this file.
+ fd int `state:"nosave"`
+
+ // If srfd >= 0, it is the host fd that fd was imported from.
+ srfd int `state:"wait"`
+}
+
+func (e *endpoint) init() error {
+ family, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+ if err != nil {
+ return err
+ }
+
+ if family != syscall.AF_UNIX {
+ // We only allow Unix sockets.
+ return syserror.EINVAL
+ }
+
+ stype, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE)
+ if err != nil {
+ return err
+ }
+
+ if err := syscall.SetNonblock(e.fd, true); err != nil {
+ return err
+ }
+
+ e.stype = unix.SockType(stype)
+ if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil {
+ return err
+ }
+ return nil
+}
+
+// newEndpoint creates a new host endpoint.
+func newEndpoint(fd int, srfd int) (*endpoint, error) {
+ ep := &endpoint{fd: fd, srfd: srfd}
+ if err := ep.init(); err != nil {
+ return nil, err
+ }
+ return ep, nil
+}
+
+// newSocket allocates a new unix socket with host endpoint.
+func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) {
+ ownedfd := fd
+ srfd := -1
+ if saveable {
+ var err error
+ ownedfd, err = syscall.Dup(fd)
+ if err != nil {
+ return nil, err
+ }
+ srfd = fd
+ }
+ ep, err := newEndpoint(ownedfd, srfd)
+ if err != nil {
+ if saveable {
+ syscall.Close(ownedfd)
+ }
+ return nil, err
+ }
+ return unixsocket.New(ctx, ep), nil
+}
+
+// NewSocketWithDirent allocates a new unix socket with host endpoint.
+//
+// This is currently only used by unsaveable Gofer nodes.
+//
+// NewSocketWithDirent takes ownership of f on success.
+func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) {
+ ep, err := newEndpoint(f.FD(), -1)
+ if err != nil {
+ return nil, err
+ }
+
+ // Take ownship of the FD.
+ f.Release()
+
+ return unixsocket.NewWithDirent(ctx, d, ep, flags), nil
+}
+
+// Close implements unix.Endpoint.Close.
+func (e *endpoint) Close() {
+ fdnotifier.RemoveFD(int32(e.fd))
+ syscall.Close(e.fd)
+ e.fd = -1
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) {
+ e.queue.EventRegister(we, mask)
+ fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (e *endpoint) EventUnregister(we *waiter.Entry) {
+ e.queue.EventUnregister(we)
+ fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// Readiness implements unix.Endpoint.Readiness.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fdnotifier.NonBlockingPoll(int32(e.fd), mask)
+}
+
+// Type implements unix.Endpoint.Type.
+func (e *endpoint) Type() unix.SockType {
+ return e.stype
+}
+
+// Connect implements unix.Endpoint.Connect.
+func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error {
+ return tcpip.ErrInvalidEndpointState
+}
+
+// Bind implements unix.Endpoint.Bind.
+func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error {
+ return tcpip.ErrInvalidEndpointState
+}
+
+// Listen implements unix.Endpoint.Listen.
+func (e *endpoint) Listen(backlog int) *tcpip.Error {
+ return tcpip.ErrInvalidEndpointState
+}
+
+// Accept implements unix.Endpoint.Accept.
+func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) {
+ return nil, tcpip.ErrInvalidEndpointState
+}
+
+// Shutdown implements unix.Endpoint.Shutdown.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+ return tcpip.ErrInvalidEndpointState
+}
+
+// GetSockOpt implements unix.Endpoint.GetSockOpt.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+ switch o := opt.(type) {
+ case tcpip.ErrorOption:
+ _, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_ERROR)
+ return translateError(err)
+ case *tcpip.PasscredOption:
+ // We don't support passcred on host sockets.
+ *o = 0
+ return nil
+ case *tcpip.SendBufferSizeOption:
+ v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+ *o = tcpip.SendBufferSizeOption(v)
+ return translateError(err)
+ case *tcpip.ReceiveBufferSizeOption:
+ v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+ *o = tcpip.ReceiveBufferSizeOption(v)
+ return translateError(err)
+ case *tcpip.ReuseAddressOption:
+ v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR)
+ *o = tcpip.ReuseAddressOption(v)
+ return translateError(err)
+ case *tcpip.ReceiveQueueSizeOption:
+ return tcpip.ErrQueueSizeNotSupported
+ }
+ return tcpip.ErrInvalidEndpointState
+}
+
+// SetSockOpt implements unix.Endpoint.SetSockOpt.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+ return nil
+}
+
+// GetLocalAddress implements unix.Endpoint.GetLocalAddress.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+ return tcpip.FullAddress{}, nil
+}
+
+// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+ return tcpip.FullAddress{}, nil
+}
+
+// Passcred returns whether or not the SO_PASSCRED socket option is
+// enabled on this end.
+func (e *endpoint) Passcred() bool {
+ // We don't support credential passing for host sockets.
+ return false
+}
+
+// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
+// is enabled on the connected end.
+func (e *endpoint) ConnectedPasscred() bool {
+ // We don't support credential passing for host sockets.
+ return false
+}
+
+// SendMsg implements unix.Endpoint.SendMsg.
+func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) {
+ if to != nil {
+ return 0, tcpip.ErrInvalidEndpointState
+ }
+ return sendMsg(e.fd, data, controlMessages)
+}
+
+func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) {
+ if !controlMessages.Empty() {
+ return 0, tcpip.ErrInvalidEndpointState
+ }
+ n, err := fdWriteVec(fd, data)
+ return n, translateError(err)
+}
+
+// RecvMsg implements unix.Endpoint.RecvMsg.
+func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
+ return recvMsg(e.fd, data, numRights, peek, addr)
+}
+
+func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
+ var cm unet.ControlMessage
+ if numRights > 0 {
+ cm.EnableFDs(int(numRights))
+ }
+ rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek)
+ if err == syscall.EAGAIN {
+ return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock
+ }
+ if err != nil {
+ return 0, 0, unix.ControlMessages{}, translateError(err)
+ }
+
+ // Trim the control data if we received less than the full amount.
+ if cl < uint64(len(cm)) {
+ cm = cm[:cl]
+ }
+
+ // Avoid extra allocations in the case where there isn't any control data.
+ if len(cm) == 0 {
+ return rl, ml, unix.ControlMessages{}, nil
+ }
+
+ fds, err := cm.ExtractFDs()
+ if err != nil {
+ return 0, 0, unix.ControlMessages{}, translateError(err)
+ }
+
+ if len(fds) == 0 {
+ return rl, ml, unix.ControlMessages{}, nil
+ }
+ return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil
+}
+
+// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint
+// backed by a host FD that will pretend to be bound at a given sentry path.
+func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) {
+ if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil {
+ return nil, nil, translateError(err)
+ }
+
+ e := &connectedEndpoint{path: path, queue: queue, file: file}
+
+ // AtomicRefCounters start off with a single reference. We need two.
+ e.ref.IncRef()
+
+ return e, e, nil
+}
+
+// connectedEndpoint is a host FD backed implementation of
+// unix.ConnectedEndpoint and unix.Receiver.
+//
+// connectedEndpoint does not support save/restore for now.
+type connectedEndpoint struct {
+ queue *waiter.Queue
+ path string
+
+ // ref keeps track of references to a connectedEndpoint.
+ ref refs.AtomicRefCount
+
+ // mu protects fd, readClosed and writeClosed.
+ mu sync.RWMutex
+
+ // file is an *fd.FD containing the FD backing this endpoint. It must be
+ // set to nil if it has been closed.
+ file *fd.FD
+
+ // readClosed is true if the FD has read shutdown or if it has been closed.
+ readClosed bool
+
+ // writeClosed is true if the FD has write shutdown or if it has been
+ // closed.
+ writeClosed bool
+}
+
+// Send implements unix.ConnectedEndpoint.Send.
+func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.writeClosed {
+ return 0, false, tcpip.ErrClosedForSend
+ }
+ n, err := sendMsg(c.file.FD(), data, controlMessages)
+ // There is no need for the callee to call SendNotify because sendMsg uses
+ // the host's sendmsg(2) and the host kernel's queue.
+ return n, false, err
+}
+
+// SendNotify implements unix.ConnectedEndpoint.SendNotify.
+func (c *connectedEndpoint) SendNotify() {}
+
+// CloseSend implements unix.ConnectedEndpoint.CloseSend.
+func (c *connectedEndpoint) CloseSend() {
+ c.mu.Lock()
+ c.writeClosed = true
+ c.mu.Unlock()
+}
+
+// CloseNotify implements unix.ConnectedEndpoint.CloseNotify.
+func (c *connectedEndpoint) CloseNotify() {}
+
+// Writable implements unix.ConnectedEndpoint.Writable.
+func (c *connectedEndpoint) Writable() bool {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.writeClosed {
+ return true
+ }
+ return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements unix.ConnectedEndpoint.Passcred.
+func (c *connectedEndpoint) Passcred() bool {
+ // We don't support credential passing for host sockets.
+ return false
+}
+
+// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress.
+func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+ return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil
+}
+
+// EventUpdate implements unix.ConnectedEndpoint.EventUpdate.
+func (c *connectedEndpoint) EventUpdate() {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.file.FD() != -1 {
+ fdnotifier.UpdateFD(int32(c.file.FD()))
+ }
+}
+
+// Recv implements unix.Receiver.Recv.
+func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.readClosed {
+ return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive
+ }
+ rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil)
+ // There is no need for the callee to call RecvNotify because recvMsg uses
+ // the host's recvmsg(2) and the host kernel's queue.
+ return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err
+}
+
+// close releases all resources related to the endpoint.
+func (c *connectedEndpoint) close() {
+ fdnotifier.RemoveFD(int32(c.file.FD()))
+ c.file.Close()
+ c.file = nil
+}
+
+// RecvNotify implements unix.Receiver.RecvNotify.
+func (c *connectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements unix.Receiver.CloseRecv.
+func (c *connectedEndpoint) CloseRecv() {
+ c.mu.Lock()
+ c.readClosed = true
+ c.mu.Unlock()
+}
+
+// Readable implements unix.Receiver.Readable.
+func (c *connectedEndpoint) Readable() bool {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.readClosed {
+ return true
+ }
+ return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements unix.Receiver.SendQueuedSize.
+func (c *connectedEndpoint) SendQueuedSize() int64 {
+ // SendQueuedSize isn't supported for host sockets because we don't allow the
+ // sentry to call ioctl(2).
+ return -1
+}
+
+// RecvQueuedSize implements unix.Receiver.RecvQueuedSize.
+func (c *connectedEndpoint) RecvQueuedSize() int64 {
+ // RecvQueuedSize isn't supported for host sockets because we don't allow the
+ // sentry to call ioctl(2).
+ return -1
+}
+
+// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize.
+func (c *connectedEndpoint) SendMaxQueueSize() int64 {
+ v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+ if err != nil {
+ return -1
+ }
+ return int64(v)
+}
+
+// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize.
+func (c *connectedEndpoint) RecvMaxQueueSize() int64 {
+ v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+ if err != nil {
+ return -1
+ }
+ return int64(v)
+}
+
+// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release.
+func (c *connectedEndpoint) Release() {
+ c.ref.DecRefWithDestructor(c.close)
+}
+
+func translateError(err error) *tcpip.Error {
+ if err == nil {
+ return nil
+ }
+ return rawfile.TranslateErrno(err.(syscall.Errno))
+}
diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go
new file mode 100644
index 000000000..6acabd55a
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_state.go
@@ -0,0 +1,39 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+)
+
+// beforeSave is invoked by stateify.
+func (ep *endpoint) beforeSave() {
+ if ep.srfd < 0 {
+ panic("only host file descriptors provided at sentry startup can be saved")
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (ep *endpoint) afterLoad() {
+ fd, err := syscall.Dup(ep.srfd)
+ if err != nil {
+ panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err))
+ }
+ ep.fd = fd
+ if err := ep.init(); err != nil {
+ panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err))
+ }
+}
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
new file mode 100644
index 000000000..80c46dcfa
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -0,0 +1,401 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "reflect"
+ "syscall"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserr"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+ "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
+)
+
+var (
+ // Make sure that connectedEndpoint implements unix.ConnectedEndpoint.
+ _ = unix.ConnectedEndpoint(new(connectedEndpoint))
+
+ // Make sure that connectedEndpoint implements unix.Receiver.
+ _ = unix.Receiver(new(connectedEndpoint))
+)
+
+func getFl(fd int) (uint32, error) {
+ fl, _, err := syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+ if err == 0 {
+ return uint32(fl), nil
+ }
+ return 0, err
+}
+
+func TestSocketIsBlocking(t *testing.T) {
+ // Using socketpair here because it's already connected.
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("host socket creation failed: %v", err)
+ }
+
+ fl, err := getFl(pair[0])
+ if err != nil {
+ t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
+ }
+ if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+ t.Fatalf("Expected socket %v to be blocking", pair[0])
+ }
+ if fl, err = getFl(pair[1]); err != nil {
+ t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err)
+ }
+ if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+ t.Fatalf("Expected socket %v to be blocking", pair[1])
+ }
+ sock, err := newSocket(contexttest.Context(t), pair[0], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) failed => %v", pair[0], err)
+ }
+ defer sock.DecRef()
+ // Test that the socket now is non blocking.
+ if fl, err = getFl(pair[0]); err != nil {
+ t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err)
+ }
+ if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK {
+ t.Errorf("Expected socket %v to have becoming non blocking", pair[0])
+ }
+ if fl, err = getFl(pair[1]); err != nil {
+ t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err)
+ }
+ if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK {
+ t.Errorf("Did not expect socket %v to become non blocking", pair[1])
+ }
+}
+
+func TestSocketWritev(t *testing.T) {
+ // Using socketpair here because it's already connected.
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("host socket creation failed: %v", err)
+ }
+ socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", pair[0], err)
+ }
+ defer socket.DecRef()
+ buf := []byte("hello world\n")
+ n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf))
+ if err != nil {
+ t.Fatalf("socket writev failed: %v", err)
+ }
+
+ if n != int64(len(buf)) {
+ t.Fatalf("socket writev wrote incorrect bytes: %d", n)
+ }
+}
+
+func TestSocketWritevLen0(t *testing.T) {
+ // Using socketpair here because it's already connected.
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("host socket creation failed: %v", err)
+ }
+ socket, err := newSocket(contexttest.Context(t), pair[0], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", pair[0], err)
+ }
+ defer socket.DecRef()
+ n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil))
+ if err != nil {
+ t.Fatalf("socket writev failed: %v", err)
+ }
+
+ if n != 0 {
+ t.Fatalf("socket writev wrote incorrect bytes: %d", n)
+ }
+}
+
+func TestSocketSendMsgLen0(t *testing.T) {
+ // Using socketpair here because it's already connected.
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("host socket creation failed: %v", err)
+ }
+ sfile, err := newSocket(contexttest.Context(t), pair[0], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", pair[0], err)
+ }
+ defer sfile.DecRef()
+
+ s := sfile.FileOperations.(socket.Socket)
+ n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{})
+ if n != 0 {
+ t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n)
+ }
+
+ if terr != nil {
+ t.Fatalf("socket sendmsg() failed: %v", terr)
+ }
+}
+
+func TestListen(t *testing.T) {
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
+ }
+ sfile1, err := newSocket(contexttest.Context(t), pair[0], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", pair[0], err)
+ }
+ defer sfile1.DecRef()
+ socket1 := sfile1.FileOperations.(socket.Socket)
+
+ sfile2, err := newSocket(contexttest.Context(t), pair[1], false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", pair[1], err)
+ }
+ defer sfile2.DecRef()
+ socket2 := sfile2.FileOperations.(socket.Socket)
+
+ // Socketpairs can not be listened to.
+ if err := socket1.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+ t.Fatalf("socket1.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+ }
+ if err := socket2.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+ t.Fatalf("socket2.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+ }
+
+ // Create a Unix socket, do not bind it.
+ sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+ if err != nil {
+ t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err)
+ }
+ sfile3, err := newSocket(contexttest.Context(t), sock, false)
+ if err != nil {
+ t.Fatalf("newSocket(%v) => %v", sock, err)
+ }
+ defer sfile3.DecRef()
+ socket3 := sfile3.FileOperations.(socket.Socket)
+
+ // This socket is not bound so we can't listen on it.
+ if err := socket3.Listen(nil, 64); err != syserr.ErrInvalidEndpointState {
+ t.Fatalf("socket3.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err)
+ }
+}
+
+func TestSend(t *testing.T) {
+ e := connectedEndpoint{writeClosed: true}
+ if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend {
+ t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend)
+ }
+}
+
+func TestRecv(t *testing.T) {
+ e := connectedEndpoint{readClosed: true}
+ if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive {
+ t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive)
+ }
+}
+
+func TestPasscred(t *testing.T) {
+ e := connectedEndpoint{}
+ if got, want := e.Passcred(), false; got != want {
+ t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want)
+ }
+}
+
+func TestGetLocalAddress(t *testing.T) {
+ e := connectedEndpoint{path: "foo"}
+ want := tcpip.FullAddress{Addr: tcpip.Address("foo")}
+ if got, err := e.GetLocalAddress(); err != nil || got != want {
+ t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil)
+ }
+}
+
+func TestQueuedSize(t *testing.T) {
+ e := connectedEndpoint{}
+ tests := []struct {
+ name string
+ f func() int64
+ }{
+ {"SendQueuedSize", e.SendQueuedSize},
+ {"RecvQueuedSize", e.RecvQueuedSize},
+ }
+
+ for _, test := range tests {
+ if got, want := test.f(), int64(-1); got != want {
+ t.Errorf("Got %#v.%s() = %d, want = %d", e, test.name, got, want)
+ }
+ }
+}
+
+func TestReadable(t *testing.T) {
+ e := connectedEndpoint{readClosed: true}
+ if got, want := e.Readable(), true; got != want {
+ t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want)
+ }
+}
+
+func TestWritable(t *testing.T) {
+ e := connectedEndpoint{writeClosed: true}
+ if got, want := e.Writable(), true; got != want {
+ t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want)
+ }
+}
+
+func TestRelease(t *testing.T) {
+ f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
+ want := &connectedEndpoint{queue: c.queue}
+ want.ref.DecRef()
+ fdnotifier.AddFD(int32(c.file.FD()), nil)
+ c.Release()
+ if !reflect.DeepEqual(c, want) {
+ t.Errorf("got = %#v, want = %#v", c, want)
+ }
+}
+
+func TestClose(t *testing.T) {
+ type testCase struct {
+ name string
+ cep *connectedEndpoint
+ addFD bool
+ f func()
+ want *connectedEndpoint
+ }
+
+ var tests []testCase
+
+ // nil is the value used by connectedEndpoint to indicate a closed file.
+ // Non-nil files are used to check if the file gets closed.
+
+ f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
+ tests = append(tests, testCase{
+ name: "First CloseRecv",
+ cep: c,
+ addFD: false,
+ f: c.CloseRecv,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
+ tests = append(tests, testCase{
+ name: "Second CloseRecv",
+ cep: c,
+ addFD: false,
+ f: c.CloseRecv,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
+ tests = append(tests, testCase{
+ name: "First CloseSend",
+ cep: c,
+ addFD: false,
+ f: c.CloseSend,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
+ tests = append(tests, testCase{
+ name: "Second CloseSend",
+ cep: c,
+ addFD: false,
+ f: c.CloseSend,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
+ tests = append(tests, testCase{
+ name: "CloseSend then CloseRecv",
+ cep: c,
+ addFD: true,
+ f: c.CloseRecv,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
+ tests = append(tests, testCase{
+ name: "CloseRecv then CloseSend",
+ cep: c,
+ addFD: true,
+ f: c.CloseSend,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
+ tests = append(tests, testCase{
+ name: "Full close then CloseRecv",
+ cep: c,
+ addFD: false,
+ f: c.CloseRecv,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+ })
+
+ f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ t.Fatal("Creating socket:", err)
+ }
+ c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
+ tests = append(tests, testCase{
+ name: "Full close then CloseSend",
+ cep: c,
+ addFD: false,
+ f: c.CloseSend,
+ want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
+ })
+
+ for _, test := range tests {
+ if test.addFD {
+ fdnotifier.AddFD(int32(test.cep.file.FD()), nil)
+ }
+ if test.f(); !reflect.DeepEqual(test.cep, test.want) {
+ t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want)
+ }
+ }
+}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
new file mode 100644
index 000000000..bf8da6867
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+)
+
+// buildIovec builds an iovec slice from the given []byte slice.
+func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) {
+ var length uintptr
+ iovecs := make([]syscall.Iovec, 0, 10)
+ for i := range bufs {
+ if l := len(bufs[i]); l > 0 {
+ length += uintptr(l)
+ iovecs = append(iovecs, syscall.Iovec{
+ Base: &bufs[i][0],
+ Len: uint64(l),
+ })
+ }
+ }
+ return length, iovecs
+}
+
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) {
+ flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+ if peek {
+ flags |= syscall.MSG_PEEK
+ }
+
+ length, iovecs := buildIovec(bufs)
+
+ var msg syscall.Msghdr
+ if len(control) != 0 {
+ msg.Control = &control[0]
+ msg.Controllen = uint64(len(control))
+ }
+
+ if len(iovecs) != 0 {
+ msg.Iov = &iovecs[0]
+ msg.Iovlen = uint64(len(iovecs))
+ }
+ n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+ if e != 0 {
+ return 0, 0, 0, e
+ }
+
+ if n > length {
+ return length, n, msg.Controllen, nil
+ }
+
+ return n, n, msg.Controllen, nil
+}
+
+func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) {
+ _, iovecs := buildIovec(bufs)
+
+ var msg syscall.Msghdr
+ if len(iovecs) > 0 {
+ msg.Iov = &iovecs[0]
+ msg.Iovlen = uint64(len(iovecs))
+ }
+ n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+ if e != 0 {
+ return 0, e
+ }
+
+ return n, nil
+}
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
new file mode 100644
index 000000000..74c703eb7
--- /dev/null
+++ b/pkg/sentry/fs/host/util.go
@@ -0,0 +1,197 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "os"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+func open(parent *inodeOperations, name string) (int, error) {
+ if parent == nil && !path.IsAbs(name) {
+ return -1, syserror.EINVAL
+ }
+ name = path.Clean(name)
+
+ // Don't follow through symlinks.
+ flags := syscall.O_NOFOLLOW
+
+ if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil {
+ return fd, nil
+ }
+ // Retry as read-only.
+ if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil {
+ return fd, nil
+ }
+
+ // Retry as write-only.
+ if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil {
+ return fd, nil
+ }
+
+ // Retry as a symlink, by including O_PATH as an option.
+ fd, err := openAt(parent, name, linux.O_PATH|flags, 0)
+ if err == nil {
+ return fd, nil
+ }
+
+ // Everything failed.
+ return -1, err
+}
+
+func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
+ if parent == nil {
+ return syscall.Open(name, flags, uint32(perm))
+ }
+ return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm))
+}
+
+func nodeType(s *syscall.Stat_t) fs.InodeType {
+ switch x := (s.Mode & syscall.S_IFMT); x {
+ case syscall.S_IFLNK:
+ return fs.Symlink
+ case syscall.S_IFIFO:
+ return fs.Pipe
+ case syscall.S_IFCHR:
+ return fs.CharacterDevice
+ case syscall.S_IFBLK:
+ return fs.BlockDevice
+ case syscall.S_IFSOCK:
+ return fs.Socket
+ case syscall.S_IFDIR:
+ return fs.Directory
+ case syscall.S_IFREG:
+ return fs.RegularFile
+ default:
+ // This shouldn't happen, but just in case...
+ log.Warningf("unknown host file type %d: assuming regular", x)
+ return fs.RegularFile
+ }
+}
+
+func wouldBlock(s *syscall.Stat_t) bool {
+ typ := nodeType(s)
+ return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice
+}
+
+func stableAttr(s *syscall.Stat_t) fs.StableAttr {
+ return fs.StableAttr{
+ Type: nodeType(s),
+ DeviceID: hostFileDevice.DeviceID(),
+ InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+ Device: s.Dev,
+ Inode: s.Ino,
+ }),
+ BlockSize: int64(s.Blksize),
+ }
+}
+
+func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
+ // User requested no translation, just return actual owner.
+ if mo.dontTranslateOwnership {
+ return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)}
+ }
+
+ // Show only IDs relevant to the sandboxed task. I.e. if we not own the
+ // file, no sandboxed task can own the file. In that case, we
+ // use OverflowID for UID, implying that the IDs are not mapped in the
+ // "root" user namespace.
+ //
+ // E.g.
+ // sandbox's host EUID/EGID is 1/1.
+ // some_dir's host UID/GID is 2/1.
+ // Task that mounted this fs has virtualized EUID/EGID 5/5.
+ //
+ // If you executed `ls -n` in the sandboxed task, it would show:
+ // drwxwrxwrx [...] 65534 5 [...] some_dir
+
+ // Files are owned by OverflowID by default.
+ owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)}
+
+ // If we own file on host, let mounting task's initial EUID own
+ // the file.
+ if s.Uid == hostUID {
+ owner.UID = mo.mounter.UID
+ }
+
+ // If our group matches file's group, make file's group match
+ // the mounting task's initial EGID.
+ for _, gid := range hostGIDs {
+ if s.Gid == gid {
+ owner.GID = mo.mounter.GID
+ break
+ }
+ }
+ return owner
+}
+
+func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+ return fs.UnstableAttr{
+ Size: s.Size,
+ Usage: s.Blocks * 512,
+ Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)),
+ Owner: owner(mo, s),
+ AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+ ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+ StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+ Links: s.Nlink,
+ }
+}
+
+type dirInfo struct {
+ buf []byte // buffer for directory I/O.
+ nbuf int // length of buf; return value from ReadDirent.
+ bufp int // location of next record in buf.
+}
+
+// isBlockError unwraps os errors and checks if they are caused by EAGAIN or
+// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+ if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK {
+ return true
+ }
+ if pe, ok := err.(*os.PathError); ok {
+ return isBlockError(pe.Err)
+ }
+ return false
+}
+
+func hostEffectiveKIDs() (uint32, []uint32, error) {
+ gids, err := os.Getgroups()
+ if err != nil {
+ return 0, nil, err
+ }
+ egids := make([]uint32, len(gids))
+ for i, gid := range gids {
+ egids[i] = uint32(gid)
+ }
+ return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil
+}
+
+var hostUID uint32
+var hostGIDs []uint32
+
+func init() {
+ hostUID, hostGIDs, _ = hostEffectiveKIDs()
+}
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
new file mode 100644
index 000000000..c38d2392d
--- /dev/null
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -0,0 +1,137 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+)
+
+func createLink(fd int, name string, linkName string) error {
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ linkNamePtr, err := syscall.BytePtrFromString(linkName)
+ if err != nil {
+ return err
+ }
+ _, _, errno := syscall.Syscall(
+ syscall.SYS_SYMLINKAT,
+ uintptr(unsafe.Pointer(namePtr)),
+ uintptr(fd),
+ uintptr(unsafe.Pointer(linkNamePtr)))
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func readLink(fd int) (string, error) {
+ // Buffer sizing copied from os.Readlink.
+ for l := 128; ; l *= 2 {
+ b := make([]byte, l)
+ n, _, errno := syscall.Syscall6(
+ syscall.SYS_READLINKAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(syscall.StringBytePtr(""))),
+ uintptr(unsafe.Pointer(&b[0])),
+ uintptr(l),
+ 0, 0)
+ if n < 0 {
+ n = 0
+ }
+ if errno != 0 {
+ return "", errno
+ }
+ if n < uintptr(l) {
+ return string(b[:n]), nil
+ }
+ }
+}
+
+func unlinkAt(fd int, name string, dir bool) error {
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ var flags uintptr
+ if dir {
+ flags = linux.AT_REMOVEDIR
+ }
+ _, _, errno := syscall.Syscall(
+ syscall.SYS_UNLINKAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(namePtr)),
+ flags,
+ )
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
+ if omit {
+ return syscall.Timespec{0, linux.UTIME_OMIT}
+ }
+ if setSysTime {
+ return syscall.Timespec{0, linux.UTIME_NOW}
+ }
+ return syscall.NsecToTimespec(t.Nanoseconds())
+}
+
+func setTimestamps(fd int, ts fs.TimeSpec) error {
+ if ts.ATimeOmit && ts.MTimeOmit {
+ return nil
+ }
+ var sts [2]syscall.Timespec
+ sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime)
+ sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime)
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_UTIMENSAT,
+ uintptr(fd),
+ 0, /* path */
+ uintptr(unsafe.Pointer(&sts)),
+ 0, /* flags */
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+ var stat syscall.Stat_t
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return stat, err
+ }
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_NEWFSTATAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(namePtr)),
+ uintptr(unsafe.Pointer(&stat)),
+ uintptr(flags),
+ 0, 0)
+ if errno != 0 {
+ return stat, errno
+ }
+ return stat, nil
+}
diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go
new file mode 100644
index 000000000..c5f5c9c0d
--- /dev/null
+++ b/pkg/sentry/fs/host/wait_test.go
@@ -0,0 +1,70 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+func TestWait(t *testing.T) {
+ var fds [2]int
+ err := syscall.Pipe(fds[:])
+ if err != nil {
+ t.Fatalf("Unable to create pipe: %v", err)
+ }
+
+ defer syscall.Close(fds[1])
+
+ ctx := contexttest.Context(t)
+ file, err := NewFile(ctx, fds[0], fs.RootOwner)
+ if err != nil {
+ syscall.Close(fds[0])
+ t.Fatalf("NewFile failed: %v", err)
+ }
+
+ defer file.DecRef()
+
+ r := file.Readiness(waiter.EventIn)
+ if r != 0 {
+ t.Fatalf("File is ready for read when it shouldn't be.")
+ }
+
+ e, ch := waiter.NewChannelEntry(nil)
+ file.EventRegister(&e, waiter.EventIn)
+ defer file.EventUnregister(&e)
+
+ // Check that there are no notifications yet.
+ if len(ch) != 0 {
+ t.Fatalf("Channel is non-empty")
+ }
+
+ // Write to the pipe, so it should be writable now.
+ syscall.Write(fds[1], []byte{1})
+
+ // Check that we get a notification. We need to yield the current thread
+ // so that the fdnotifier can deliver notifications, so we use a
+ // 1-second timeout instead of just checking the length of the channel.
+ select {
+ case <-ch:
+ case <-time.After(1 * time.Second):
+ t.Fatalf("Channel not notified")
+ }
+}