summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/host
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2019-06-02 06:44:55 +0000
committergVisor bot <gvisor-bot@google.com>2019-06-02 06:44:55 +0000
commitceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/fs/host
parentdeb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/fs/host')
-rw-r--r--pkg/sentry/fs/host/control.go93
-rw-r--r--pkg/sentry/fs/host/descriptor.go120
-rw-r--r--pkg/sentry/fs/host/descriptor_state.go29
-rw-r--r--pkg/sentry/fs/host/device.go25
-rw-r--r--pkg/sentry/fs/host/file.go286
-rw-r--r--pkg/sentry/fs/host/fs.go339
-rwxr-xr-xpkg/sentry/fs/host/host_state_autogen.go142
-rw-r--r--pkg/sentry/fs/host/inode.go527
-rw-r--r--pkg/sentry/fs/host/inode_state.go79
-rw-r--r--pkg/sentry/fs/host/ioctl_unsafe.go56
-rw-r--r--pkg/sentry/fs/host/socket.go390
-rw-r--r--pkg/sentry/fs/host/socket_iovec.go113
-rw-r--r--pkg/sentry/fs/host/socket_state.go42
-rw-r--r--pkg/sentry/fs/host/socket_unsafe.go100
-rw-r--r--pkg/sentry/fs/host/tty.go351
-rw-r--r--pkg/sentry/fs/host/util.go197
-rw-r--r--pkg/sentry/fs/host/util_unsafe.go137
17 files changed, 3026 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go
new file mode 100644
index 000000000..9ebb9bbb3
--- /dev/null
+++ b/pkg/sentry/fs/host/control.go
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+type scmRights struct {
+ fds []int
+}
+
+func newSCMRights(fds []int) control.SCMRights {
+ return &scmRights{fds}
+}
+
+// Files implements control.SCMRights.Files.
+func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) {
+ n := max
+ var trunc bool
+ if l := len(c.fds); n > l {
+ n = l
+ } else if n < l {
+ trunc = true
+ }
+
+ rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n]))
+
+ // Only consume converted FDs (fdsToFiles may convert fewer than n FDs).
+ c.fds = c.fds[len(rf):]
+ return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (c *scmRights) Clone() transport.RightsControlMessage {
+ // Host rights never need to be cloned.
+ return nil
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (c *scmRights) Release() {
+ for _, fd := range c.fds {
+ syscall.Close(fd)
+ }
+ c.fds = nil
+}
+
+// If an error is encountered, only files created before the error will be
+// returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*fs.File {
+ files := make([]*fs.File, 0, len(fds))
+ for _, fd := range fds {
+ // Get flags. We do it here because they may be modified
+ // by subsequent functions.
+ fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+ if errno != 0 {
+ ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+ break
+ }
+
+ // Create the file backed by hostFD.
+ file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx))
+ if err != nil {
+ ctx.Warningf("Error creating file from host FD: %v", err)
+ break
+ }
+
+ // Set known flags.
+ file.SetFlags(fs.SettableFileFlags{
+ NonBlocking: fileFlags&syscall.O_NONBLOCK != 0,
+ })
+
+ files = append(files, file)
+ }
+ return files
+}
diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go
new file mode 100644
index 000000000..ffcd57a94
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// descriptor wraps a host fd.
+//
+// +stateify savable
+type descriptor struct {
+ // donated is true if the host fd was donated by another process.
+ donated bool
+
+ // If origFD >= 0, it is the host fd that this file was originally created
+ // from, which must be available at time of restore. The FD can be closed
+ // after descriptor is created. Only set if donated is true.
+ origFD int
+
+ // wouldBlock is true if value (below) points to a file that can
+ // return EWOULDBLOCK for operations that would block.
+ wouldBlock bool
+
+ // value is the wrapped host fd. It is never saved or restored
+ // directly. How it is restored depends on whether it was
+ // donated and the fs.MountSource it was originally
+ // opened/created from.
+ value int `state:"nosave"`
+}
+
+// newDescriptor returns a wrapped host file descriptor. On success,
+// the descriptor is registered for event notifications with queue.
+func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) {
+ ownedFD := fd
+ origFD := -1
+ if saveable {
+ var err error
+ ownedFD, err = syscall.Dup(fd)
+ if err != nil {
+ return nil, err
+ }
+ origFD = fd
+ }
+ if wouldBlock {
+ if err := syscall.SetNonblock(ownedFD, true); err != nil {
+ return nil, err
+ }
+ if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil {
+ return nil, err
+ }
+ }
+ return &descriptor{
+ donated: donated,
+ origFD: origFD,
+ wouldBlock: wouldBlock,
+ value: ownedFD,
+ }, nil
+}
+
+// initAfterLoad initializes the value of the descriptor after Load.
+func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error {
+ if d.donated {
+ var err error
+ d.value, err = syscall.Dup(d.origFD)
+ if err != nil {
+ return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err)
+ }
+ } else {
+ name, ok := mo.inodeMappings[id]
+ if !ok {
+ return fmt.Errorf("failed to find path for inode number %d", id)
+ }
+ fullpath := path.Join(mo.root, name)
+
+ var err error
+ d.value, err = open(nil, fullpath)
+ if err != nil {
+ return fmt.Errorf("failed to open %q: %v", fullpath, err)
+ }
+ }
+ if d.wouldBlock {
+ if err := syscall.SetNonblock(d.value, true); err != nil {
+ return err
+ }
+ if err := fdnotifier.AddFD(int32(d.value), queue); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Release releases all resources held by descriptor.
+func (d *descriptor) Release() {
+ if d.wouldBlock {
+ fdnotifier.RemoveFD(int32(d.value))
+ }
+ if err := syscall.Close(d.value); err != nil {
+ log.Warningf("error closing fd %d: %v", d.value, err)
+ }
+ d.value = -1
+}
diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go
new file mode 100644
index 000000000..8167390a9
--- /dev/null
+++ b/pkg/sentry/fs/host/descriptor_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+// beforeSave is invoked by stateify.
+func (d *descriptor) beforeSave() {
+ if d.donated && d.origFD < 0 {
+ panic("donated file descriptor cannot be saved")
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (d *descriptor) afterLoad() {
+ // value must be manually restored by the descriptor's parent using
+ // initAfterLoad.
+ d.value = -1
+}
diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go
new file mode 100644
index 000000000..055024c44
--- /dev/null
+++ b/pkg/sentry/fs/host/device.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+)
+
+// hostFileDevice is the host file virtual device.
+var hostFileDevice = device.NewAnonMultiDevice()
+
+// hostPipeDevice is the host pipe virtual device.
+var hostPipeDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go
new file mode 100644
index 000000000..ad0a3ec85
--- /dev/null
+++ b/pkg/sentry/fs/host/file.go
@@ -0,0 +1,286 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/secio"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// fileOperations implements fs.FileOperations for a host file descriptor.
+//
+// +stateify savable
+type fileOperations struct {
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosplice"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+
+ // iops are the Inode operations for this file.
+ iops *inodeOperations `state:"wait"`
+
+ // a scratch buffer for reading directory entries.
+ dirinfo *dirInfo `state:"nosave"`
+
+ // dirCursor is the directory cursor.
+ dirCursor string
+}
+
+// fileOperations implements fs.FileOperations.
+var _ fs.FileOperations = (*fileOperations)(nil)
+
+// NewFile creates a new File backed by the provided host file descriptor. If
+// NewFile succeeds, ownership of the FD is transferred to the returned File.
+//
+// The returned File cannot be saved, since there is no guarantee that the same
+// FD will exist or represent the same file at time of restore. If such a
+// guarantee does exist, use ImportFile instead.
+func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) {
+ return newFileFromDonatedFD(ctx, fd, mounter, false, false)
+}
+
+// ImportFile creates a new File backed by the provided host file descriptor.
+// Unlike NewFile, the file descriptor used by the File is duped from FD to
+// ensure that later changes to FD are not reflected by the fs.File.
+//
+// If the returned file is saved, it will be restored by re-importing the FD
+// originally passed to ImportFile. It is the restorer's responsibility to
+// ensure that the FD represents the same file.
+func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) {
+ return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY)
+}
+
+// newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is
+// saveable, then saveable is true.
+func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(donated, &s); err != nil {
+ return nil, err
+ }
+ flags, err := fileFlagsFromDonatedFD(donated)
+ if err != nil {
+ return nil, err
+ }
+ switch s.Mode & syscall.S_IFMT {
+ case syscall.S_IFSOCK:
+ if isTTY {
+ return nil, fmt.Errorf("cannot import host socket as TTY")
+ }
+
+ s, err := newSocket(ctx, donated, saveable)
+ if err != nil {
+ return nil, err
+ }
+ s.SetFlags(fs.SettableFileFlags{
+ NonBlocking: flags.NonBlocking,
+ })
+ return s, nil
+ default:
+ msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */)
+ inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */)
+ if err != nil {
+ return nil, err
+ }
+ iops := inode.InodeOperations.(*inodeOperations)
+
+ name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID)
+ dirent := fs.NewDirent(inode, name)
+ defer dirent.DecRef()
+
+ if isTTY {
+ return newTTYFile(ctx, dirent, flags, iops), nil
+ }
+
+ return newFile(ctx, dirent, flags, iops), nil
+ }
+}
+
+func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) {
+ flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0)
+ if errno != 0 {
+ log.Warningf("Failed to get file flags for donated FD %d (errno=%d)", donated, errno)
+ return fs.FileFlags{}, syscall.EIO
+ }
+ accmode := flags & syscall.O_ACCMODE
+ return fs.FileFlags{
+ Direct: flags&syscall.O_DIRECT != 0,
+ NonBlocking: flags&syscall.O_NONBLOCK != 0,
+ Sync: flags&syscall.O_SYNC != 0,
+ Append: flags&syscall.O_APPEND != 0,
+ Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR,
+ Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR,
+ }, nil
+}
+
+// newFile returns a new fs.File.
+func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
+ if !iops.ReturnsWouldBlock() {
+ // Allow reading/writing at an arbitrary offset for files
+ // that support it.
+ flags.Pread = true
+ flags.Pwrite = true
+ }
+ return fs.NewFile(ctx, dirent, flags, &fileOperations{iops: iops})
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ f.iops.fileState.queue.EventRegister(e, mask)
+ fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (f *fileOperations) EventUnregister(e *waiter.Entry) {
+ f.iops.fileState.queue.EventUnregister(e)
+ fdnotifier.UpdateFD(int32(f.iops.fileState.FD()))
+}
+
+// Readiness uses the poll() syscall to check the status of the underlying FD.
+func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask)
+}
+
+// Readdir implements fs.FileOperations.Readdir.
+func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+ root := fs.RootFromContext(ctx)
+ if root != nil {
+ defer root.DecRef()
+ }
+ dirCtx := &fs.DirCtx{
+ Serializer: serializer,
+ DirCursor: &f.dirCursor,
+ }
+ return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset())
+}
+
+// IterateDir implements fs.DirIterator.IterateDir.
+func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+ if f.dirinfo == nil {
+ f.dirinfo = new(dirInfo)
+ f.dirinfo.buf = make([]byte, usermem.PageSize)
+ }
+ entries, err := f.iops.readdirAll(f.dirinfo)
+ if err != nil {
+ return offset, err
+ }
+ count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries))
+ return offset + count, err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ // Would this file block?
+ if f.iops.ReturnsWouldBlock() {
+ // These files can't be memory mapped, assert this. This also
+ // means that writes do not need to synchronize with memory
+ // mappings nor metadata cached by this file's fs.Inode.
+ if canMap(file.Dirent.Inode) {
+ panic("files that can return EWOULDBLOCK cannot be memory mapped")
+ }
+ // Ignore the offset, these files don't support writing at
+ // an arbitrary offset.
+ writer := fd.NewReadWriter(f.iops.fileState.FD())
+ n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer})
+ if isBlockError(err) {
+ err = syserror.ErrWouldBlock
+ }
+ return n, err
+ }
+ if !file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+ writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+ return src.CopyInTo(ctx, safemem.FromIOWriter{writer})
+ }
+ return f.iops.cachingInodeOps.Write(ctx, src, offset)
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ // Would this file block?
+ if f.iops.ReturnsWouldBlock() {
+ // These files can't be memory mapped, assert this. This also
+ // means that reads do not need to synchronize with memory
+ // mappings nor metadata cached by this file's fs.Inode.
+ if canMap(file.Dirent.Inode) {
+ panic("files that can return EWOULDBLOCK cannot be memory mapped")
+ }
+ // Ignore the offset, these files don't support reading at
+ // an arbitrary offset.
+ reader := fd.NewReadWriter(f.iops.fileState.FD())
+ n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader})
+ if isBlockError(err) {
+ // If we got any data at all, return it as a "completed" partial read
+ // rather than retrying until complete.
+ if n != 0 {
+ err = nil
+ } else {
+ err = syserror.ErrWouldBlock
+ }
+ }
+ return n, err
+ }
+ if !file.Dirent.Inode.MountSource.Flags.ForcePageCache {
+ reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset)
+ return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader})
+ }
+ return f.iops.cachingInodeOps.Read(ctx, file, dst, offset)
+}
+
+// Fsync implements fs.FileOperations.Fsync.
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+ switch syncType {
+ case fs.SyncAll, fs.SyncData:
+ if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
+ return err
+ }
+ fallthrough
+ case fs.SyncBackingStorage:
+ return syscall.Fsync(f.iops.fileState.FD())
+ }
+ panic("invalid sync type")
+}
+
+// Flush implements fs.FileOperations.Flush.
+func (f *fileOperations) Flush(context.Context, *fs.File) error {
+ // This is a no-op because flushing the resource backing this
+ // file would mean closing it. We can't do that because other
+ // open files may depend on the backing host FD.
+ return nil
+}
+
+// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
+func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
+ if !canMap(file.Dirent.Inode) {
+ return syserror.ENODEV
+ }
+ return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts)
+}
+
+// Seek implements fs.FileOperations.Seek.
+func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+ return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor)
+}
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go
new file mode 100644
index 000000000..b1b8dc0b6
--- /dev/null
+++ b/pkg/sentry/fs/host/fs.go
@@ -0,0 +1,339 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host implements an fs.Filesystem for files backed by host
+// file descriptors.
+package host
+
+import (
+ "fmt"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FilesystemName is the name under which Filesystem is registered.
+const FilesystemName = "whitelistfs"
+
+const (
+ // whitelistKey is the mount option containing a comma-separated list
+ // of host paths to whitelist.
+ whitelistKey = "whitelist"
+
+ // rootPathKey is the mount option containing the root path of the
+ // mount.
+ rootPathKey = "root"
+
+ // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership.
+ dontTranslateOwnershipKey = "dont_translate_ownership"
+)
+
+// maxTraversals determines link traversals in building the whitelist.
+const maxTraversals = 10
+
+// Filesystem is a pseudo file system that is only available during the setup
+// to lock down the configurations. This filesystem should only be mounted at root.
+//
+// Think twice before exposing this to applications.
+//
+// +stateify savable
+type Filesystem struct {
+ // whitelist is a set of host paths to whitelist.
+ paths []string
+}
+
+var _ fs.Filesystem = (*Filesystem)(nil)
+
+// Name is the identifier of this file system.
+func (*Filesystem) Name() string {
+ return FilesystemName
+}
+
+// AllowUserMount prohibits users from using mount(2) with this file system.
+func (*Filesystem) AllowUserMount() bool {
+ return false
+}
+
+// AllowUserList allows this filesystem to be listed in /proc/filesystems.
+func (*Filesystem) AllowUserList() bool {
+ return true
+}
+
+// Flags returns that there is nothing special about this file system.
+func (*Filesystem) Flags() fs.FilesystemFlags {
+ return 0
+}
+
+// Mount returns an fs.Inode exposing the host file system. It is intended to be locked
+// down in PreExec below.
+func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) {
+ // Parse generic comma-separated key=value options.
+ options := fs.GenericMountSourceOptions(data)
+
+ // Grab the whitelist if one was specified.
+ // TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow
+ // no whitelist.
+ if wl, ok := options[whitelistKey]; ok {
+ f.paths = strings.Split(wl, "|")
+ delete(options, whitelistKey)
+ }
+
+ // If the rootPath was set, use it. Othewise default to the root of the
+ // host fs.
+ rootPath := "/"
+ if rp, ok := options[rootPathKey]; ok {
+ rootPath = rp
+ delete(options, rootPathKey)
+
+ // We must relativize the whitelisted paths to the new root.
+ for i, p := range f.paths {
+ rel, err := filepath.Rel(rootPath, p)
+ if err != nil {
+ return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath)
+ }
+ f.paths[i] = path.Join("/", rel)
+ }
+ }
+ fd, err := open(nil, rootPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to find root: %v", err)
+ }
+
+ var dontTranslateOwnership bool
+ if v, ok := options[dontTranslateOwnershipKey]; ok {
+ b, err := strconv.ParseBool(v)
+ if err != nil {
+ return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err)
+ }
+ dontTranslateOwnership = b
+ delete(options, dontTranslateOwnershipKey)
+ }
+
+ // Fail if the caller passed us more options than we know about.
+ if len(options) > 0 {
+ return nil, fmt.Errorf("unsupported mount options: %v", options)
+ }
+
+ // The mounting EUID/EGID will be cached by this file system. This will
+ // be used to assign ownership to files that we own.
+ owner := fs.FileOwnerFromContext(ctx)
+
+ // Construct the host file system mount and inode.
+ msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership)
+ return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */)
+}
+
+// InstallWhitelist locks down the MountNamespace to only the currently installed
+// Dirents and the given paths.
+func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error {
+ return installWhitelist(ctx, m, f.paths)
+}
+
+func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error {
+ if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") {
+ // Warning will be logged during filter installation if the empty
+ // whitelist matters (allows for host file access).
+ return nil
+ }
+
+ // Done tracks entries already added.
+ done := make(map[string]bool)
+ root := m.Root()
+ defer root.DecRef()
+
+ for i := 0; i < len(paths); i++ {
+ // Make sure the path is absolute. This is a sanity check.
+ if !path.IsAbs(paths[i]) {
+ return fmt.Errorf("path %q is not absolute", paths[i])
+ }
+
+ // We need to add all the intermediate paths, in case one of
+ // them is a symlink that needs to be resolved.
+ for j := 1; j <= len(paths[i]); j++ {
+ if j < len(paths[i]) && paths[i][j] != '/' {
+ continue
+ }
+ current := paths[i][:j]
+
+ // Lookup the given component in the tree.
+ remainingTraversals := uint(maxTraversals)
+ d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals)
+ if err != nil {
+ log.Warningf("populate failed for %q: %v", current, err)
+ continue
+ }
+
+ // It's critical that this DecRef happens after the
+ // freeze below. This ensures that the dentry is in
+ // place to be frozen. Otherwise, we freeze without
+ // these entries.
+ defer d.DecRef()
+
+ // Expand the last component if necessary.
+ if current == paths[i] {
+ // Is it a directory or symlink?
+ sattr := d.Inode.StableAttr
+ if fs.IsDir(sattr) {
+ for name := range childDentAttrs(ctx, d) {
+ paths = append(paths, path.Join(current, name))
+ }
+ }
+ if fs.IsSymlink(sattr) {
+ // Only expand symlinks once. The
+ // folder structure may contain
+ // recursive symlinks and we don't want
+ // to end up infinitely expanding this
+ // symlink. This is safe because this
+ // is the last component. If a later
+ // path wants to symlink something
+ // beneath this symlink that will still
+ // be handled by the FindLink above.
+ if done[current] {
+ continue
+ }
+
+ s, err := d.Inode.Readlink(ctx)
+ if err != nil {
+ log.Warningf("readlink failed for %q: %v", current, err)
+ continue
+ }
+ if path.IsAbs(s) {
+ paths = append(paths, s)
+ } else {
+ target := path.Join(path.Dir(current), s)
+ paths = append(paths, target)
+ }
+ }
+ }
+
+ // Only report this one once even though we may look
+ // it up more than once. If we whitelist /a/b,/a then
+ // /a will be "done" when it is looked up for /a/b,
+ // however we still need to expand all of its contents
+ // when whitelisting /a.
+ if !done[current] {
+ log.Debugf("whitelisted: %s", current)
+ }
+ done[current] = true
+ }
+ }
+
+ // Freeze the mount tree in place. This prevents any new paths from
+ // being opened and any old ones from being removed. If we do provide
+ // tmpfs mounts, we'll want to freeze/thaw those separately.
+ m.Freeze()
+ return nil
+}
+
+func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr {
+ dirname, _ := d.FullName(nil /* root */)
+ dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+ if err != nil {
+ log.Warningf("failed to open directory %q: %v", dirname, err)
+ return nil
+ }
+ dir.DecRef()
+ var stubSerializer fs.CollectEntriesSerializer
+ if err := dir.Readdir(ctx, &stubSerializer); err != nil {
+ log.Warningf("failed to iterate on host directory %q: %v", dirname, err)
+ return nil
+ }
+ delete(stubSerializer.Entries, ".")
+ delete(stubSerializer.Entries, "..")
+ return stubSerializer.Entries
+}
+
+// newMountSource constructs a new host fs.MountSource
+// relative to a root path. The root should match the mount point.
+func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource {
+ return fs.NewMountSource(&superOperations{
+ root: root,
+ inodeMappings: make(map[uint64]string),
+ mounter: mounter,
+ dontTranslateOwnership: dontTranslateOwnership,
+ }, filesystem, flags)
+}
+
+// superOperations implements fs.MountSourceOperations.
+//
+// +stateify savable
+type superOperations struct {
+ fs.SimpleMountSourceOperations
+
+ // root is the path of the mount point. All inode mappings
+ // are relative to this root.
+ root string
+
+ // inodeMappings contains mappings of fs.Inodes associated
+ // with this MountSource to paths under root.
+ inodeMappings map[uint64]string
+
+ // mounter is the cached EUID/EGID that mounted this file system.
+ mounter fs.FileOwner
+
+ // dontTranslateOwnership indicates whether to not translate file
+ // ownership.
+ //
+ // By default, files/directories owned by the sandbox uses UID/GID
+ // of the mounter. For files/directories that are not owned by the
+ // sandbox, file UID/GID is translated to a UID/GID which cannot
+ // be mapped in the sandboxed application's user namespace. The
+ // UID/GID will look like the nobody UID/GID (65534) but is not
+ // strictly owned by the user "nobody".
+ //
+ // If whitelistfs is a lower filesystem in an overlay, set
+ // dont_translate_ownership=true in mount options.
+ dontTranslateOwnership bool
+}
+
+var _ fs.MountSourceOperations = (*superOperations)(nil)
+
+// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings.
+func (m *superOperations) ResetInodeMappings() {
+ m.inodeMappings = make(map[uint64]string)
+}
+
+// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping.
+func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) {
+ // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs,
+ // because overlay copyUp may have changed them out from under us.
+ // So much for "immutable".
+ sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr
+ m.inodeMappings[sattr.InodeID] = path
+}
+
+// Keep implements fs.MountSourceOperations.Keep.
+//
+// TODO(b/72455313,b/77596690): It is possible to change the permissions on a
+// host file while it is in the dirent cache (say from RO to RW), but it is not
+// possible to re-open the file with more relaxed permissions, since the host
+// FD is already open and stored in the inode.
+//
+// Using the dirent LRU cache increases the odds that this bug is encountered.
+// Since host file access is relatively fast anyways, we disable the LRU cache
+// for host fs files. Once we can properly deal with permissions changes and
+// re-opening host files, we should revisit whether or not to make use of the
+// LRU cache.
+func (*superOperations) Keep(*fs.Dirent) bool {
+ return false
+}
+
+func init() {
+ fs.RegisterFilesystem(&Filesystem{})
+}
diff --git a/pkg/sentry/fs/host/host_state_autogen.go b/pkg/sentry/fs/host/host_state_autogen.go
new file mode 100755
index 000000000..22cfa1222
--- /dev/null
+++ b/pkg/sentry/fs/host/host_state_autogen.go
@@ -0,0 +1,142 @@
+// automatically generated by stateify.
+
+package host
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *descriptor) save(m state.Map) {
+ x.beforeSave()
+ m.Save("donated", &x.donated)
+ m.Save("origFD", &x.origFD)
+ m.Save("wouldBlock", &x.wouldBlock)
+}
+
+func (x *descriptor) load(m state.Map) {
+ m.Load("donated", &x.donated)
+ m.Load("origFD", &x.origFD)
+ m.Load("wouldBlock", &x.wouldBlock)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *fileOperations) beforeSave() {}
+func (x *fileOperations) save(m state.Map) {
+ x.beforeSave()
+ m.Save("iops", &x.iops)
+ m.Save("dirCursor", &x.dirCursor)
+}
+
+func (x *fileOperations) afterLoad() {}
+func (x *fileOperations) load(m state.Map) {
+ m.LoadWait("iops", &x.iops)
+ m.Load("dirCursor", &x.dirCursor)
+}
+
+func (x *Filesystem) beforeSave() {}
+func (x *Filesystem) save(m state.Map) {
+ x.beforeSave()
+ m.Save("paths", &x.paths)
+}
+
+func (x *Filesystem) afterLoad() {}
+func (x *Filesystem) load(m state.Map) {
+ m.Load("paths", &x.paths)
+}
+
+func (x *superOperations) beforeSave() {}
+func (x *superOperations) save(m state.Map) {
+ x.beforeSave()
+ m.Save("SimpleMountSourceOperations", &x.SimpleMountSourceOperations)
+ m.Save("root", &x.root)
+ m.Save("inodeMappings", &x.inodeMappings)
+ m.Save("mounter", &x.mounter)
+ m.Save("dontTranslateOwnership", &x.dontTranslateOwnership)
+}
+
+func (x *superOperations) afterLoad() {}
+func (x *superOperations) load(m state.Map) {
+ m.Load("SimpleMountSourceOperations", &x.SimpleMountSourceOperations)
+ m.Load("root", &x.root)
+ m.Load("inodeMappings", &x.inodeMappings)
+ m.Load("mounter", &x.mounter)
+ m.Load("dontTranslateOwnership", &x.dontTranslateOwnership)
+}
+
+func (x *inodeOperations) beforeSave() {}
+func (x *inodeOperations) save(m state.Map) {
+ x.beforeSave()
+ m.Save("fileState", &x.fileState)
+ m.Save("cachingInodeOps", &x.cachingInodeOps)
+}
+
+func (x *inodeOperations) afterLoad() {}
+func (x *inodeOperations) load(m state.Map) {
+ m.LoadWait("fileState", &x.fileState)
+ m.Load("cachingInodeOps", &x.cachingInodeOps)
+}
+
+func (x *inodeFileState) save(m state.Map) {
+ x.beforeSave()
+ if !state.IsZeroValue(x.queue) { m.Failf("queue is %v, expected zero", x.queue) }
+ m.Save("mops", &x.mops)
+ m.Save("descriptor", &x.descriptor)
+ m.Save("sattr", &x.sattr)
+ m.Save("savedUAttr", &x.savedUAttr)
+}
+
+func (x *inodeFileState) load(m state.Map) {
+ m.LoadWait("mops", &x.mops)
+ m.LoadWait("descriptor", &x.descriptor)
+ m.LoadWait("sattr", &x.sattr)
+ m.Load("savedUAttr", &x.savedUAttr)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *ConnectedEndpoint) save(m state.Map) {
+ x.beforeSave()
+ m.Save("queue", &x.queue)
+ m.Save("path", &x.path)
+ m.Save("ref", &x.ref)
+ m.Save("readClosed", &x.readClosed)
+ m.Save("writeClosed", &x.writeClosed)
+ m.Save("srfd", &x.srfd)
+ m.Save("stype", &x.stype)
+}
+
+func (x *ConnectedEndpoint) load(m state.Map) {
+ m.Load("queue", &x.queue)
+ m.Load("path", &x.path)
+ m.Load("ref", &x.ref)
+ m.Load("readClosed", &x.readClosed)
+ m.Load("writeClosed", &x.writeClosed)
+ m.LoadWait("srfd", &x.srfd)
+ m.Load("stype", &x.stype)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *TTYFileOperations) beforeSave() {}
+func (x *TTYFileOperations) save(m state.Map) {
+ x.beforeSave()
+ m.Save("fileOperations", &x.fileOperations)
+ m.Save("session", &x.session)
+ m.Save("fgProcessGroup", &x.fgProcessGroup)
+}
+
+func (x *TTYFileOperations) afterLoad() {}
+func (x *TTYFileOperations) load(m state.Map) {
+ m.Load("fileOperations", &x.fileOperations)
+ m.Load("session", &x.session)
+ m.Load("fgProcessGroup", &x.fgProcessGroup)
+}
+
+func init() {
+ state.Register("host.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load})
+ state.Register("host.fileOperations", (*fileOperations)(nil), state.Fns{Save: (*fileOperations).save, Load: (*fileOperations).load})
+ state.Register("host.Filesystem", (*Filesystem)(nil), state.Fns{Save: (*Filesystem).save, Load: (*Filesystem).load})
+ state.Register("host.superOperations", (*superOperations)(nil), state.Fns{Save: (*superOperations).save, Load: (*superOperations).load})
+ state.Register("host.inodeOperations", (*inodeOperations)(nil), state.Fns{Save: (*inodeOperations).save, Load: (*inodeOperations).load})
+ state.Register("host.inodeFileState", (*inodeFileState)(nil), state.Fns{Save: (*inodeFileState).save, Load: (*inodeFileState).load})
+ state.Register("host.ConnectedEndpoint", (*ConnectedEndpoint)(nil), state.Fns{Save: (*ConnectedEndpoint).save, Load: (*ConnectedEndpoint).load})
+ state.Register("host.TTYFileOperations", (*TTYFileOperations)(nil), state.Fns{Save: (*TTYFileOperations).save, Load: (*TTYFileOperations).load})
+}
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
new file mode 100644
index 000000000..7a230e426
--- /dev/null
+++ b/pkg/sentry/fs/host/inode.go
@@ -0,0 +1,527 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/secio"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// inodeOperations implements fs.InodeOperations for an fs.Inodes backed
+// by a host file descriptor.
+//
+// +stateify savable
+type inodeOperations struct {
+ fsutil.InodeNotVirtual `state:"nosave"`
+ fsutil.InodeNoExtendedAttributes `state:"nosave"`
+
+ // fileState implements fs.CachedFileObject. It exists
+ // to break a circular load dependency between inodeOperations
+ // and cachingInodeOps (below).
+ fileState *inodeFileState `state:"wait"`
+
+ // cachedInodeOps implements memmap.Mappable.
+ cachingInodeOps *fsutil.CachingInodeOperations
+
+ // readdirMu protects the file offset on the host FD. This is needed
+ // for readdir because getdents must use the kernel offset, so
+ // concurrent readdirs must be exclusive.
+ //
+ // All read/write functions pass the offset directly to the kernel and
+ // thus don't need a lock.
+ readdirMu sync.Mutex `state:"nosave"`
+}
+
+// inodeFileState implements fs.CachedFileObject and otherwise fully
+// encapsulates state that needs to be manually loaded on restore for
+// this file object.
+//
+// This unfortunate structure exists because fs.CachingInodeOperations
+// defines afterLoad and therefore cannot be lazily loaded (to break a
+// circular load dependency between it and inodeOperations). Even with
+// lazy loading, this approach defines the dependencies between objects
+// and the expected load behavior more concretely.
+//
+// +stateify savable
+type inodeFileState struct {
+ // Common file system state.
+ mops *superOperations `state:"wait"`
+
+ // descriptor is the backing host FD.
+ descriptor *descriptor `state:"wait"`
+
+ // Event queue for blocking operations.
+ queue waiter.Queue `state:"zerovalue"`
+
+ // sattr is used to restore the inodeOperations.
+ sattr fs.StableAttr `state:"wait"`
+
+ // savedUAttr is only allocated during S/R. It points to the save-time
+ // unstable attributes and is used to validate restore-time ones.
+ //
+ // Note that these unstable attributes are only used to detect cross-S/R
+ // external file system metadata changes. They may differ from the
+ // cached unstable attributes in cachingInodeOps, as that might differ
+ // from the external file system attributes if there had been WriteOut
+ // failures. S/R is transparent to Sentry and the latter will continue
+ // using its cached values after restore.
+ savedUAttr *fs.UnstableAttr
+}
+
+// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
+func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+ // TODO(jamieliu): Using safemem.FromIOReader here is wasteful for two
+ // reasons:
+ //
+ // - Using preadv instead of iterated preads saves on host system calls.
+ //
+ // - Host system calls can handle destination memory that would fault in
+ // gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true),
+ // so the buffering performed by FromIOReader is unnecessary.
+ //
+ // This also applies to the write path below.
+ return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts)
+}
+
+// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
+func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+ return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs)
+}
+
+// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
+ if mask.Empty() {
+ return nil
+ }
+ if mask.UID || mask.GID {
+ return syserror.EPERM
+ }
+ if mask.Perms {
+ if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil {
+ return err
+ }
+ }
+ if mask.Size {
+ if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil {
+ return err
+ }
+ }
+ if mask.AccessTime || mask.ModificationTime {
+ ts := fs.TimeSpec{
+ ATime: attr.AccessTime,
+ ATimeOmit: !mask.AccessTime,
+ MTime: attr.ModificationTime,
+ MTimeOmit: !mask.ModificationTime,
+ }
+ if err := setTimestamps(i.FD(), ts); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Sync implements fsutil.CachedFileObject.Sync.
+func (i *inodeFileState) Sync(ctx context.Context) error {
+ return syscall.Fsync(i.FD())
+}
+
+// FD implements fsutil.CachedFileObject.FD.
+func (i *inodeFileState) FD() int {
+ return i.descriptor.value
+}
+
+func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.FD(), &s); err != nil {
+ return fs.UnstableAttr{}, err
+ }
+ return unstableAttr(i.mops, &s), nil
+}
+
+// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
+func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error {
+ return syscall.Fallocate(i.FD(), 0, offset, length)
+}
+
+// inodeOperations implements fs.InodeOperations.
+var _ fs.InodeOperations = (*inodeOperations)(nil)
+
+// newInode returns a new fs.Inode backed by the host FD.
+func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) {
+ // Retrieve metadata.
+ var s syscall.Stat_t
+ err := syscall.Fstat(fd, &s)
+ if err != nil {
+ return nil, err
+ }
+
+ fileState := &inodeFileState{
+ mops: msrc.MountSourceOperations.(*superOperations),
+ sattr: stableAttr(&s),
+ }
+
+ // Initialize the wrapped host file descriptor.
+ fileState.descriptor, err = newDescriptor(
+ fd,
+ donated,
+ saveable,
+ wouldBlock(&s),
+ &fileState.queue,
+ )
+ if err != nil {
+ return nil, err
+ }
+
+ // Build the fs.InodeOperations.
+ uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
+ iops := &inodeOperations{
+ fileState: fileState,
+ cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+ }
+
+ // Return the fs.Inode.
+ return fs.NewInode(iops, msrc, fileState.sattr), nil
+}
+
+// Mappable implements fs.InodeOperations.Mappable.
+func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable {
+ if !canMap(inode) {
+ return nil
+ }
+ return i.cachingInodeOps
+}
+
+// ReturnsWouldBlock returns true if this host FD can return EWOULDBLOCK for
+// operations that would block.
+func (i *inodeOperations) ReturnsWouldBlock() bool {
+ return i.fileState.descriptor.wouldBlock
+}
+
+// Release implements fs.InodeOperations.Release.
+func (i *inodeOperations) Release(context.Context) {
+ i.fileState.descriptor.Release()
+ i.cachingInodeOps.Release()
+}
+
+// Lookup implements fs.InodeOperations.Lookup.
+func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
+ // Get a new FD relative to i at name.
+ fd, err := open(i, name)
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, syserror.ENOENT
+ }
+ return nil, err
+ }
+
+ inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+ if err != nil {
+ return nil, err
+ }
+
+ // Return the fs.Dirent.
+ return fs.NewDirent(inode, name), nil
+}
+
+// Create implements fs.InodeOperations.Create.
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
+ // Create a file relative to i at name.
+ //
+ // N.B. We always open this file O_RDWR regardless of flags because a
+ // future GetFile might want more access. Open allows this regardless
+ // of perm.
+ fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode())
+ if err != nil {
+ return nil, err
+ }
+
+ inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */)
+ if err != nil {
+ return nil, err
+ }
+
+ d := fs.NewDirent(inode, name)
+ defer d.DecRef()
+ return inode.GetFile(ctx, d, flags)
+}
+
+// CreateDirectory implements fs.InodeOperations.CreateDirectory.
+func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
+ return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode()))
+}
+
+// CreateLink implements fs.InodeOperations.CreateLink.
+func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error {
+ return createLink(i.fileState.FD(), oldname, newname)
+}
+
+// CreateHardLink implements fs.InodeOperations.CreateHardLink.
+func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
+ return syserror.EPERM
+}
+
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+ return syserror.EPERM
+}
+
+// Remove implements fs.InodeOperations.Remove.
+func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
+ return unlinkAt(i.fileState.FD(), name, false /* dir */)
+}
+
+// RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
+func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
+ return unlinkAt(i.fileState.FD(), name, true /* dir */)
+}
+
+// Rename implements fs.InodeOperations.Rename.
+func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
+ op, ok := oldParent.InodeOperations.(*inodeOperations)
+ if !ok {
+ return syscall.EXDEV
+ }
+ np, ok := newParent.InodeOperations.(*inodeOperations)
+ if !ok {
+ return syscall.EXDEV
+ }
+ return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName)
+}
+
+// Bind implements fs.InodeOperations.Bind.
+func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) {
+ return nil, syserror.EOPNOTSUPP
+}
+
+// BoundEndpoint implements fs.InodeOperations.BoundEndpoint.
+func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.BoundEndpoint {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ return newFile(ctx, d, flags, i), nil
+}
+
+// canMap returns true if this fs.Inode can be memory mapped.
+func canMap(inode *fs.Inode) bool {
+ // FIXME(b/38213152): Some obscure character devices can be mapped.
+ return fs.IsFile(inode.StableAttr)
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+ // When the kernel supports mapping host FDs, we do so to take
+ // advantage of the host page cache. We forego updating fs.Inodes
+ // because the host manages consistency of its own inode structures.
+ //
+ // For fs.Inodes that can never be mapped we take advantage of
+ // synchronizing metadata updates through host caches.
+ //
+ // So can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just obtain the attributes.
+ return i.fileState.unstableAttr(ctx)
+ }
+ // No, we're maintaining consistency of metadata ourselves.
+ return i.cachingInodeOps.UnstableAttr(ctx, inode)
+}
+
+// Check implements fs.InodeOperations.Check.
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+ return syserror.EPERM
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool {
+ // Can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just change the timestamps on the FD, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil
+ }
+ // Otherwise update our cached metadata.
+ return i.cachingInodeOps.SetPermissions(ctx, inode, f)
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
+ // Can we use host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then just change the timestamps on the FD, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return setTimestamps(i.fileState.FD(), ts)
+ }
+ // Otherwise update our cached metadata.
+ return i.cachingInodeOps.SetTimestamps(ctx, inode, ts)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
+ // Is the file not memory-mappable?
+ if !canMap(inode) {
+ // Then just change the file size on the FD, the host
+ // will synchronize the metadata update with any host
+ // inode and page cache.
+ return syscall.Ftruncate(i.fileState.FD(), size)
+ }
+ // Otherwise we need to go through cachingInodeOps, even if the host page
+ // cache is in use, to invalidate private copies of truncated pages.
+ return i.cachingInodeOps.Truncate(ctx, inode, size)
+}
+
+// Allocate implements fs.InodeOperations.Allocate.
+func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error {
+ // Is the file not memory-mappable?
+ if !canMap(inode) {
+ // Then just send the call to the FD, the host will synchronize the metadata
+ // update with any host inode and page cache.
+ return i.fileState.Allocate(ctx, offset, length)
+ }
+ // Otherwise we need to go through cachingInodeOps, even if the host page
+ // cache is in use, to invalidate private copies of truncated pages.
+ return i.cachingInodeOps.Allocate(ctx, offset, length)
+}
+
+// WriteOut implements fs.InodeOperations.WriteOut.
+func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+ // Have we been using host kernel metadata caches?
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
+ // Then the metadata is already up to date on the host.
+ return nil
+ }
+ // Otherwise we need to write out cached pages and attributes
+ // that are dirty.
+ return i.cachingInodeOps.WriteOut(ctx, inode)
+}
+
+// Readlink implements fs.InodeOperations.Readlink.
+func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+ return readLink(i.fileState.FD())
+}
+
+// Getlink implements fs.InodeOperations.Getlink.
+func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+ if !fs.IsSymlink(i.fileState.sattr) {
+ return nil, syserror.ENOLINK
+ }
+ return nil, fs.ErrResolveViaReadlink
+}
+
+// StatFS implements fs.InodeOperations.StatFS.
+func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) {
+ return fs.Info{}, syserror.ENOSYS
+}
+
+// AddLink implements fs.InodeOperations.AddLink.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
+// FIXME(b/63117438): Remove this from InodeOperations altogether.
+func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
+
+// readdirAll returns all of the directory entries in i.
+func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) {
+ i.readdirMu.Lock()
+ defer i.readdirMu.Unlock()
+
+ fd := i.fileState.FD()
+
+ // syscall.ReadDirent will use getdents, which will seek the file past
+ // the last directory entry. To read the directory entries a second
+ // time, we need to seek back to the beginning.
+ if _, err := syscall.Seek(fd, 0, 0); err != nil {
+ if err == syscall.ESPIPE {
+ // All directories should be seekable. If this file
+ // isn't seekable, it is not a directory and we should
+ // return that more sane error.
+ err = syscall.ENOTDIR
+ }
+ return nil, err
+ }
+
+ names := make([]string, 0, 100)
+ for {
+ // Refill the buffer if necessary
+ if d.bufp >= d.nbuf {
+ d.bufp = 0
+ // ReadDirent will just do a sys_getdents64 to the kernel.
+ n, err := syscall.ReadDirent(fd, d.buf)
+ if err != nil {
+ return nil, err
+ }
+ if n == 0 {
+ break // EOF
+ }
+ d.nbuf = n
+ }
+
+ var nb int
+ // Parse the dirent buffer we just get and return the directory names along
+ // with the number of bytes consumed in the buffer.
+ nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names)
+ d.bufp += nb
+ }
+
+ entries := make(map[string]fs.DentAttr)
+ for _, filename := range names {
+ // Lookup the type and host device and inode.
+ stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
+ if lerr == syscall.ENOENT {
+ // File disappeared between readdir and lstat.
+ // Just treat it as if it didn't exist.
+ continue
+ }
+
+ // There was a serious problem, we should probably report it.
+ if lerr != nil {
+ return nil, lerr
+ }
+
+ entries[filename] = fs.DentAttr{
+ Type: nodeType(&stat),
+ InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+ Device: stat.Dev,
+ Inode: stat.Ino,
+ }),
+ }
+ }
+ return entries, nil
+}
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
new file mode 100644
index 000000000..26cc755bc
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -0,0 +1,79 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inodeFileState) beforeSave() {
+ if !i.queue.IsEmpty() {
+ panic("event queue must be empty")
+ }
+ if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+ uattr, err := i.unstableAttr(context.Background())
+ if err != nil {
+ panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)})
+ }
+ i.savedUAttr = &uattr
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodeFileState) afterLoad() {
+ // Initialize the descriptor value.
+ if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil {
+ panic(fmt.Sprintf("failed to load value of descriptor: %v", err))
+ }
+
+ // Remap the inode number.
+ var s syscall.Stat_t
+ if err := syscall.Fstat(i.FD(), &s); err != nil {
+ panic(fs.ErrCorruption{fmt.Errorf("failed to get metadata for fd %d: %v", i.FD(), err)})
+ }
+ key := device.MultiDeviceKey{
+ Device: s.Dev,
+ Inode: s.Ino,
+ }
+ if !hostFileDevice.Load(key, i.sattr.InodeID) {
+ // This means there was a conflict at s.Dev and s.Ino with
+ // another inode mapping: two files that were unique on the
+ // saved filesystem are no longer unique on this filesystem.
+ // Since this violates the contract that filesystems cannot
+ // change across save and restore, error out.
+ panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)})
+ }
+
+ if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+ env, ok := fs.CurrentRestoreEnvironment()
+ if !ok {
+ panic("missing restore environment")
+ }
+ uattr := unstableAttr(i.mops, &s)
+ if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size {
+ panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)})
+ }
+ if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime {
+ panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)})
+ }
+ i.savedUAttr = nil
+ }
+}
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go
new file mode 100644
index 000000000..b5a85c4d9
--- /dev/null
+++ b/pkg/sentry/fs/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+ var t linux.Termios
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+ if errno != 0 {
+ return nil, errno
+ }
+ return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+ var w linux.Winsize
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+ if errno != 0 {
+ return nil, errno
+ }
+ return &w, nil
+}
+
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w)))
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
new file mode 100644
index 000000000..3ed137006
--- /dev/null
+++ b/pkg/sentry/fs/host/socket.go
@@ -0,0 +1,390 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
+ unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/syserr"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/unet"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// maxSendBufferSize is the maximum host send buffer size allowed for endpoint.
+//
+// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max).
+const maxSendBufferSize = 8 << 20
+
+// ConnectedEndpoint is a host FD backed implementation of
+// transport.ConnectedEndpoint and transport.Receiver.
+//
+// +stateify savable
+type ConnectedEndpoint struct {
+ queue *waiter.Queue
+ path string
+
+ // ref keeps track of references to a connectedEndpoint.
+ ref refs.AtomicRefCount
+
+ // mu protects fd, readClosed and writeClosed.
+ mu sync.RWMutex `state:"nosave"`
+
+ // file is an *fd.FD containing the FD backing this endpoint. It must be
+ // set to nil if it has been closed.
+ file *fd.FD `state:"nosave"`
+
+ // readClosed is true if the FD has read shutdown or if it has been closed.
+ readClosed bool
+
+ // writeClosed is true if the FD has write shutdown or if it has been
+ // closed.
+ writeClosed bool
+
+ // If srfd >= 0, it is the host FD that file was imported from.
+ srfd int `state:"wait"`
+
+ // stype is the type of Unix socket.
+ stype transport.SockType
+
+ // sndbuf is the size of the send buffer.
+ //
+ // N.B. When this is smaller than the host size, we present it via
+ // GetSockOpt and message splitting/rejection in SendMsg, but do not
+ // prevent lots of small messages from filling the real send buffer
+ // size on the host.
+ sndbuf int `state:"nosave"`
+}
+
+// init performs initialization required for creating new ConnectedEndpoints and
+// for restoring them.
+func (c *ConnectedEndpoint) init() *syserr.Error {
+ family, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+ if err != nil {
+ return syserr.FromError(err)
+ }
+
+ if family != syscall.AF_UNIX {
+ // We only allow Unix sockets.
+ return syserr.ErrInvalidEndpointState
+ }
+
+ stype, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE)
+ if err != nil {
+ return syserr.FromError(err)
+ }
+
+ if err := syscall.SetNonblock(c.file.FD(), true); err != nil {
+ return syserr.FromError(err)
+ }
+
+ sndbuf, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+ if err != nil {
+ return syserr.FromError(err)
+ }
+ if sndbuf > maxSendBufferSize {
+ log.Warningf("Socket send buffer too large: %d", sndbuf)
+ return syserr.ErrInvalidEndpointState
+ }
+
+ c.stype = transport.SockType(stype)
+ c.sndbuf = sndbuf
+
+ return nil
+}
+
+// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD
+// that will pretend to be bound at a given sentry path.
+//
+// The caller is responsible for calling Init(). Additionaly, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *syserr.Error) {
+ e := ConnectedEndpoint{
+ path: path,
+ queue: queue,
+ file: file,
+ srfd: -1,
+ }
+
+ if err := e.init(); err != nil {
+ return nil, err
+ }
+
+ // AtomicRefCounters start off with a single reference. We need two.
+ e.ref.IncRef()
+
+ return &e, nil
+}
+
+// Init will do initialization required without holding other locks.
+func (c *ConnectedEndpoint) Init() {
+ if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil {
+ panic(err)
+ }
+}
+
+// NewSocketWithDirent allocates a new unix socket with host endpoint.
+//
+// This is currently only used by unsaveable Gofer nodes.
+//
+// NewSocketWithDirent takes ownership of f on success.
+func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) {
+ f2 := fd.New(f.FD())
+ var q waiter.Queue
+ e, err := NewConnectedEndpoint(f2, &q, "" /* path */)
+ if err != nil {
+ f2.Release()
+ return nil, err.ToError()
+ }
+
+ // Take ownship of the FD.
+ f.Release()
+
+ e.Init()
+
+ ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
+
+ return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil
+}
+
+// newSocket allocates a new unix socket with host endpoint.
+func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) {
+ ownedfd := orgfd
+ srfd := -1
+ if saveable {
+ var err error
+ ownedfd, err = syscall.Dup(orgfd)
+ if err != nil {
+ return nil, err
+ }
+ srfd = orgfd
+ }
+ f := fd.New(ownedfd)
+ var q waiter.Queue
+ e, err := NewConnectedEndpoint(f, &q, "" /* path */)
+ if err != nil {
+ if saveable {
+ f.Close()
+ } else {
+ f.Release()
+ }
+ return nil, err.ToError()
+ }
+
+ e.srfd = srfd
+ e.Init()
+
+ ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
+
+ return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil
+}
+
+// Send implements transport.ConnectedEndpoint.Send.
+func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.writeClosed {
+ return 0, false, syserr.ErrClosedForSend
+ }
+
+ if !controlMessages.Empty() {
+ return 0, false, syserr.ErrInvalidEndpointState
+ }
+
+ // Since stream sockets don't preserve message boundaries, we can write
+ // only as much of the message as fits in the send buffer.
+ truncate := c.stype == transport.SockStream
+
+ n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate)
+ if n < totalLen && err == nil {
+ // The host only returns a short write if it would otherwise
+ // block (and only for stream sockets).
+ err = syserror.EAGAIN
+ }
+ if n > 0 && err != syserror.EAGAIN {
+ // The caller may need to block to send more data, but
+ // otherwise there isn't anything that can be done about an
+ // error with a partial write.
+ err = nil
+ }
+
+ // There is no need for the callee to call SendNotify because fdWriteVec
+ // uses the host's sendmsg(2) and the host kernel's queue.
+ return n, false, syserr.FromError(err)
+}
+
+// SendNotify implements transport.ConnectedEndpoint.SendNotify.
+func (c *ConnectedEndpoint) SendNotify() {}
+
+// CloseSend implements transport.ConnectedEndpoint.CloseSend.
+func (c *ConnectedEndpoint) CloseSend() {
+ c.mu.Lock()
+ c.writeClosed = true
+ c.mu.Unlock()
+}
+
+// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
+func (c *ConnectedEndpoint) CloseNotify() {}
+
+// Writable implements transport.ConnectedEndpoint.Writable.
+func (c *ConnectedEndpoint) Writable() bool {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.writeClosed {
+ return true
+ }
+ return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements transport.ConnectedEndpoint.Passcred.
+func (c *ConnectedEndpoint) Passcred() bool {
+ // We don't support credential passing for host sockets.
+ return false
+}
+
+// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress.
+func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+ return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil
+}
+
+// EventUpdate implements transport.ConnectedEndpoint.EventUpdate.
+func (c *ConnectedEndpoint) EventUpdate() {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.file.FD() != -1 {
+ fdnotifier.UpdateFD(int32(c.file.FD()))
+ }
+}
+
+// Recv implements transport.Receiver.Recv.
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.readClosed {
+ return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive
+ }
+
+ var cm unet.ControlMessage
+ if numRights > 0 {
+ cm.EnableFDs(int(numRights))
+ }
+
+ // N.B. Unix sockets don't have a receive buffer, the send buffer
+ // serves both purposes.
+ rl, ml, cl, cTrunc, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf)
+ if rl > 0 && err != nil {
+ // We got some data, so all we need to do on error is return
+ // the data that we got. Short reads are fine, no need to
+ // block.
+ err = nil
+ }
+ if err != nil {
+ return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+ }
+
+ // There is no need for the callee to call RecvNotify because fdReadVec uses
+ // the host's recvmsg(2) and the host kernel's queue.
+
+ // Trim the control data if we received less than the full amount.
+ if cl < uint64(len(cm)) {
+ cm = cm[:cl]
+ }
+
+ // Avoid extra allocations in the case where there isn't any control data.
+ if len(cm) == 0 {
+ return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+ }
+
+ fds, err := cm.ExtractFDs()
+ if err != nil {
+ return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+ }
+
+ if len(fds) == 0 {
+ return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+ }
+ return rl, ml, control.New(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil
+}
+
+// close releases all resources related to the endpoint.
+func (c *ConnectedEndpoint) close() {
+ fdnotifier.RemoveFD(int32(c.file.FD()))
+ c.file.Close()
+ c.file = nil
+}
+
+// RecvNotify implements transport.Receiver.RecvNotify.
+func (c *ConnectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements transport.Receiver.CloseRecv.
+func (c *ConnectedEndpoint) CloseRecv() {
+ c.mu.Lock()
+ c.readClosed = true
+ c.mu.Unlock()
+}
+
+// Readable implements transport.Receiver.Readable.
+func (c *ConnectedEndpoint) Readable() bool {
+ c.mu.RLock()
+ defer c.mu.RUnlock()
+ if c.readClosed {
+ return true
+ }
+ return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements transport.Receiver.SendQueuedSize.
+func (c *ConnectedEndpoint) SendQueuedSize() int64 {
+ // SendQueuedSize isn't supported for host sockets because we don't allow the
+ // sentry to call ioctl(2).
+ return -1
+}
+
+// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
+func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
+ // RecvQueuedSize isn't supported for host sockets because we don't allow the
+ // sentry to call ioctl(2).
+ return -1
+}
+
+// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize.
+func (c *ConnectedEndpoint) SendMaxQueueSize() int64 {
+ return int64(c.sndbuf)
+}
+
+// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize.
+func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
+ // N.B. Unix sockets don't use the receive buffer. We'll claim it is
+ // the same size as the send buffer.
+ return int64(c.sndbuf)
+}
+
+// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release.
+func (c *ConnectedEndpoint) Release() {
+ c.ref.DecRefWithDestructor(c.close)
+}
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
new file mode 100644
index 000000000..5efbb3ae8
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -0,0 +1,113 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// maxIovs is the maximum number of iovecs to pass to the host.
+var maxIovs = linux.UIO_MAXIOV
+
+// copyToMulti copies as many bytes from src to dst as possible.
+func copyToMulti(dst [][]byte, src []byte) {
+ for _, d := range dst {
+ done := copy(d, src)
+ src = src[done:]
+ if len(src) == 0 {
+ break
+ }
+ }
+}
+
+// copyFromMulti copies as many bytes from src to dst as possible.
+func copyFromMulti(dst []byte, src [][]byte) {
+ for _, s := range src {
+ done := copy(dst, s)
+ dst = dst[done:]
+ if len(dst) == 0 {
+ break
+ }
+ }
+}
+
+// buildIovec builds an iovec slice from the given []byte slice.
+//
+// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error.
+//
+// If length < the total length of bufs, err indicates why, even when returning
+// a truncated iovec.
+//
+// If intermediate != nil, iovecs references intermediate rather than bufs and
+// the caller must copy to/from bufs as necessary.
+func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) {
+ var iovsRequired int
+ for _, b := range bufs {
+ length += uintptr(len(b))
+ if len(b) > 0 {
+ iovsRequired++
+ }
+ }
+
+ stopLen := length
+ if length > uintptr(maxlen) {
+ if truncate {
+ stopLen = uintptr(maxlen)
+ err = syserror.EAGAIN
+ } else {
+ return 0, nil, nil, syserror.EMSGSIZE
+ }
+ }
+
+ if iovsRequired > maxIovs {
+ // The kernel will reject our call if we pass this many iovs.
+ // Use a single intermediate buffer instead.
+ b := make([]byte, stopLen)
+
+ return stopLen, []syscall.Iovec{{
+ Base: &b[0],
+ Len: uint64(stopLen),
+ }}, b, err
+ }
+
+ var total uintptr
+ iovecs = make([]syscall.Iovec, 0, iovsRequired)
+ for i := range bufs {
+ l := len(bufs[i])
+ if l == 0 {
+ continue
+ }
+
+ stop := l
+ if total+uintptr(stop) > stopLen {
+ stop = int(stopLen - total)
+ }
+
+ iovecs = append(iovecs, syscall.Iovec{
+ Base: &bufs[i][0],
+ Len: uint64(stop),
+ })
+
+ total += uintptr(stop)
+ if total >= stopLen {
+ break
+ }
+ }
+
+ return total, iovecs, nil, err
+}
diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go
new file mode 100644
index 000000000..5676c451a
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_state.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+)
+
+// beforeSave is invoked by stateify.
+func (c *ConnectedEndpoint) beforeSave() {
+ if c.srfd < 0 {
+ panic("only host file descriptors provided at sentry startup can be saved")
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (c *ConnectedEndpoint) afterLoad() {
+ f, err := syscall.Dup(c.srfd)
+ if err != nil {
+ panic(fmt.Sprintf("failed to dup restored FD %d: %v", c.srfd, err))
+ }
+ c.file = fd.New(f)
+ if err := c.init(); err != nil {
+ panic(fmt.Sprintf("Could not restore host socket FD %d: %v", c.srfd, err))
+ }
+ c.Init()
+}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
new file mode 100644
index 000000000..e57be0506
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+)
+
+// fdReadVec receives from fd to bufs.
+//
+// If the total length of bufs is > maxlen, fdReadVec will do a partial read
+// and err will indicate why the message was truncated.
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) {
+ flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+ if peek {
+ flags |= syscall.MSG_PEEK
+ }
+
+ // Always truncate the receive buffer. All socket types will truncate
+ // received messages.
+ length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true)
+ if err != nil && len(iovecs) == 0 {
+ // No partial write to do, return error immediately.
+ return 0, 0, 0, false, err
+ }
+
+ var msg syscall.Msghdr
+ if len(control) != 0 {
+ msg.Control = &control[0]
+ msg.Controllen = uint64(len(control))
+ }
+
+ if len(iovecs) != 0 {
+ msg.Iov = &iovecs[0]
+ msg.Iovlen = uint64(len(iovecs))
+ }
+
+ n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+ if e != 0 {
+ // N.B. prioritize the syscall error over the buildIovec error.
+ return 0, 0, 0, false, e
+ }
+
+ // Copy data back to bufs.
+ if intermediate != nil {
+ copyToMulti(bufs, intermediate)
+ }
+
+ controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
+
+ if n > length {
+ return length, n, msg.Controllen, controlTrunc, err
+ }
+
+ return n, n, msg.Controllen, controlTrunc, err
+}
+
+// fdWriteVec sends from bufs to fd.
+//
+// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
+// partial write and err will indicate why the message was truncated.
+func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) {
+ length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
+ if err != nil && len(iovecs) == 0 {
+ // No partial write to do, return error immediately.
+ return 0, length, err
+ }
+
+ // Copy data to intermediate buf.
+ if intermediate != nil {
+ copyFromMulti(intermediate, bufs)
+ }
+
+ var msg syscall.Msghdr
+ if len(iovecs) > 0 {
+ msg.Iov = &iovecs[0]
+ msg.Iovlen = uint64(len(iovecs))
+ }
+
+ n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+ if e != 0 {
+ // N.B. prioritize the syscall error over the buildIovec error.
+ return 0, length, e
+ }
+
+ return n, length, err
+}
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
new file mode 100644
index 000000000..e45b339f5
--- /dev/null
+++ b/pkg/sentry/fs/host/tty.go
@@ -0,0 +1,351 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TTYFileOperations implements fs.FileOperations for a host file descriptor
+// that wraps a TTY FD.
+//
+// +stateify savable
+type TTYFileOperations struct {
+ fileOperations
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // session is the session attached to this TTYFileOperations.
+ session *kernel.Session
+
+ // fgProcessGroup is the foreground process group that is currently
+ // connected to this TTY.
+ fgProcessGroup *kernel.ProcessGroup
+}
+
+// newTTYFile returns a new fs.File that wraps a TTY FD.
+func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File {
+ return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{
+ fileOperations: fileOperations{iops: iops},
+ })
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *TTYFileOperations) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if t.fgProcessGroup != nil {
+ panic("foreground process group is already set")
+ }
+ t.fgProcessGroup = pg
+ t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process for the TTY.
+func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.fgProcessGroup
+}
+
+// Read implements fs.FileOperations.Read.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+//
+// See drivers/tty/n_tty.c:n_tty_read()=>job_control().
+func (t *TTYFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // Are we allowed to do the read?
+ // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+ if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+ return 0, err
+ }
+
+ // Do the read.
+ return t.fileOperations.Read(ctx, file, dst, offset)
+}
+
+// Write implements fs.FileOperations.Write.
+func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // Are we allowed to do the write?
+ if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+ return 0, err
+ }
+ return t.fileOperations.Write(ctx, file, src, offset)
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *TTYFileOperations) Release() {
+ t.mu.Lock()
+ t.fgProcessGroup = nil
+ t.mu.Unlock()
+
+ t.fileOperations.Release()
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ // Ignore arg[0]. This is the real FD:
+ fd := t.fileOperations.iops.fileState.FD()
+ ioctl := args[1].Uint64()
+ switch ioctl {
+ case linux.TCGETS:
+ termios, err := ioctlGetTermios(fd)
+ if err != nil {
+ return 0, err
+ }
+ _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+ return 0, err
+ }
+
+ var termios linux.Termios
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+ err := ioctlSetTermios(fd, ioctl, &termios)
+ return 0, err
+
+ case linux.TIOCGPGRP:
+ // Args: pid_t *argp
+ // When successful, equivalent to *argp = tcgetpgrp(fd).
+ // Get the process group ID of the foreground process group on
+ // this terminal.
+
+ pidns := kernel.PIDNamespaceFromContext(ctx)
+ if pidns == nil {
+ return 0, syserror.ENOTTY
+ }
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // Map the ProcessGroup into a ProcessGroupID in the task's PID
+ // namespace.
+ pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+ _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case linux.TIOCSPGRP:
+ // Args: const pid_t *argp
+ // Equivalent to tcsetpgrp(fd, *argp).
+ // Set the foreground process group ID of this terminal.
+
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ return 0, syserror.ENOTTY
+ }
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // Check that we are allowed to set the process group.
+ if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+ // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from
+ // tty_check_change() to -ENOTTY.
+ if err == syserror.EIO {
+ return 0, syserror.ENOTTY
+ }
+ return 0, err
+ }
+
+ // Check that calling task's process group is in the TTY
+ // session.
+ if task.ThreadGroup().Session() != t.session {
+ return 0, syserror.ENOTTY
+ }
+
+ var pgID kernel.ProcessGroupID
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+
+ // pgID must be non-negative.
+ if pgID < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Process group with pgID must exist in this PID namespace.
+ pidns := task.PIDNamespace()
+ pg := pidns.ProcessGroupWithID(pgID)
+ if pg == nil {
+ return 0, syserror.ESRCH
+ }
+
+ // Check that new process group is in the TTY session.
+ if pg.Session() != t.session {
+ return 0, syserror.EPERM
+ }
+
+ t.fgProcessGroup = pg
+ return 0, nil
+
+ case linux.TIOCGWINSZ:
+ // Args: struct winsize *argp
+ // Get window size.
+ winsize, err := ioctlGetWinsize(fd)
+ if err != nil {
+ return 0, err
+ }
+ _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+
+ case linux.TIOCSWINSZ:
+ // Args: const struct winsize *argp
+ // Set window size.
+
+ // Unlike setting the termios, any process group (even
+ // background ones) can set the winsize.
+
+ var winsize linux.Winsize
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+ err := ioctlSetWinsize(fd, &winsize)
+ return 0, err
+
+ // Unimplemented commands.
+ case linux.TIOCSETD,
+ linux.TIOCSBRK,
+ linux.TIOCCBRK,
+ linux.TCSBRK,
+ linux.TCSBRKP,
+ linux.TIOCSTI,
+ linux.TIOCCONS,
+ linux.FIONBIO,
+ linux.TIOCEXCL,
+ linux.TIOCNXCL,
+ linux.TIOCGEXCL,
+ linux.TIOCNOTTY,
+ linux.TIOCSCTTY,
+ linux.TIOCGSID,
+ linux.TIOCGETD,
+ linux.TIOCVHANGUP,
+ linux.TIOCGDEV,
+ linux.TIOCMGET,
+ linux.TIOCMSET,
+ linux.TIOCMBIC,
+ linux.TIOCMBIS,
+ linux.TIOCGICOUNT,
+ linux.TCFLSH,
+ linux.TIOCSSERIAL,
+ linux.TIOCGPTPEER:
+
+ unimpl.EmitUnimplementedEvent(ctx)
+ fallthrough
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) error {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ // No task? Linux does not have an analog for this case, but
+ // tty_check_change is more of a blacklist of cases than a
+ // whitelist, and is surprisingly permissive. Allowing the
+ // change seems most appropriate.
+ return nil
+ }
+
+ tg := task.ThreadGroup()
+ pg := tg.ProcessGroup()
+
+ // If the session for the task is different than the session for the
+ // controlling TTY, then the change is allowed. Seems like a bad idea,
+ // but that's exactly what linux does.
+ if tg.Session() != t.fgProcessGroup.Session() {
+ return nil
+ }
+
+ // If we are the foreground process group, then the change is allowed.
+ if pg == t.fgProcessGroup {
+ return nil
+ }
+
+ // We are not the foreground process group.
+
+ // Is the provided signal blocked or ignored?
+ if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+ // If the signal is SIGTTIN, then we are attempting to read
+ // from the TTY. Don't send the signal and return EIO.
+ if sig == linux.SIGTTIN {
+ return syserror.EIO
+ }
+
+ // Otherwise, we are writing or changing terminal state. This is allowed.
+ return nil
+ }
+
+ // If the process group is an orphan, return EIO.
+ if pg.IsOrphan() {
+ return syserror.EIO
+ }
+
+ // Otherwise, send the signal to the process group and return ERESTARTSYS.
+ //
+ // Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+ // but this isn't necessary in gVisor because the rationale given in
+ // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+ // apply: the sentry will handle -ERESTARTSYS in
+ // kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+ //
+ // Linux ignores the result of kill_pgrp().
+ _ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+ return kernel.ERESTARTSYS
+}
diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go
new file mode 100644
index 000000000..94ff7708e
--- /dev/null
+++ b/pkg/sentry/fs/host/util.go
@@ -0,0 +1,197 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "os"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+func open(parent *inodeOperations, name string) (int, error) {
+ if parent == nil && !path.IsAbs(name) {
+ return -1, syserror.EINVAL
+ }
+ name = path.Clean(name)
+
+ // Don't follow through symlinks.
+ flags := syscall.O_NOFOLLOW
+
+ if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil {
+ return fd, nil
+ }
+ // Retry as read-only.
+ if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil {
+ return fd, nil
+ }
+
+ // Retry as write-only.
+ if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil {
+ return fd, nil
+ }
+
+ // Retry as a symlink, by including O_PATH as an option.
+ fd, err := openAt(parent, name, linux.O_PATH|flags, 0)
+ if err == nil {
+ return fd, nil
+ }
+
+ // Everything failed.
+ return -1, err
+}
+
+func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) {
+ if parent == nil {
+ return syscall.Open(name, flags, uint32(perm))
+ }
+ return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm))
+}
+
+func nodeType(s *syscall.Stat_t) fs.InodeType {
+ switch x := (s.Mode & syscall.S_IFMT); x {
+ case syscall.S_IFLNK:
+ return fs.Symlink
+ case syscall.S_IFIFO:
+ return fs.Pipe
+ case syscall.S_IFCHR:
+ return fs.CharacterDevice
+ case syscall.S_IFBLK:
+ return fs.BlockDevice
+ case syscall.S_IFSOCK:
+ return fs.Socket
+ case syscall.S_IFDIR:
+ return fs.Directory
+ case syscall.S_IFREG:
+ return fs.RegularFile
+ default:
+ // This shouldn't happen, but just in case...
+ log.Warningf("unknown host file type %d: assuming regular", x)
+ return fs.RegularFile
+ }
+}
+
+func wouldBlock(s *syscall.Stat_t) bool {
+ typ := nodeType(s)
+ return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice
+}
+
+func stableAttr(s *syscall.Stat_t) fs.StableAttr {
+ return fs.StableAttr{
+ Type: nodeType(s),
+ DeviceID: hostFileDevice.DeviceID(),
+ InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+ Device: s.Dev,
+ Inode: s.Ino,
+ }),
+ BlockSize: int64(s.Blksize),
+ }
+}
+
+func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner {
+ // User requested no translation, just return actual owner.
+ if mo.dontTranslateOwnership {
+ return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)}
+ }
+
+ // Show only IDs relevant to the sandboxed task. I.e. if we not own the
+ // file, no sandboxed task can own the file. In that case, we
+ // use OverflowID for UID, implying that the IDs are not mapped in the
+ // "root" user namespace.
+ //
+ // E.g.
+ // sandbox's host EUID/EGID is 1/1.
+ // some_dir's host UID/GID is 2/1.
+ // Task that mounted this fs has virtualized EUID/EGID 5/5.
+ //
+ // If you executed `ls -n` in the sandboxed task, it would show:
+ // drwxwrxwrx [...] 65534 5 [...] some_dir
+
+ // Files are owned by OverflowID by default.
+ owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)}
+
+ // If we own file on host, let mounting task's initial EUID own
+ // the file.
+ if s.Uid == hostUID {
+ owner.UID = mo.mounter.UID
+ }
+
+ // If our group matches file's group, make file's group match
+ // the mounting task's initial EGID.
+ for _, gid := range hostGIDs {
+ if s.Gid == gid {
+ owner.GID = mo.mounter.GID
+ break
+ }
+ }
+ return owner
+}
+
+func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr {
+ return fs.UnstableAttr{
+ Size: s.Size,
+ Usage: s.Blocks * 512,
+ Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)),
+ Owner: owner(mo, s),
+ AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec),
+ ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec),
+ StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec),
+ Links: s.Nlink,
+ }
+}
+
+type dirInfo struct {
+ buf []byte // buffer for directory I/O.
+ nbuf int // length of buf; return value from ReadDirent.
+ bufp int // location of next record in buf.
+}
+
+// isBlockError unwraps os errors and checks if they are caused by EAGAIN or
+// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+ if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK {
+ return true
+ }
+ if pe, ok := err.(*os.PathError); ok {
+ return isBlockError(pe.Err)
+ }
+ return false
+}
+
+func hostEffectiveKIDs() (uint32, []uint32, error) {
+ gids, err := os.Getgroups()
+ if err != nil {
+ return 0, nil, err
+ }
+ egids := make([]uint32, len(gids))
+ for i, gid := range gids {
+ egids[i] = uint32(gid)
+ }
+ return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil
+}
+
+var hostUID uint32
+var hostGIDs []uint32
+
+func init() {
+ hostUID, hostGIDs, _ = hostEffectiveKIDs()
+}
diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go
new file mode 100644
index 000000000..b95a57c3f
--- /dev/null
+++ b/pkg/sentry/fs/host/util_unsafe.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+)
+
+// NulByte is a single NUL byte. It is passed to readlinkat as an empty string.
+var NulByte byte = '\x00'
+
+func createLink(fd int, name string, linkName string) error {
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ linkNamePtr, err := syscall.BytePtrFromString(linkName)
+ if err != nil {
+ return err
+ }
+ _, _, errno := syscall.Syscall(
+ syscall.SYS_SYMLINKAT,
+ uintptr(unsafe.Pointer(namePtr)),
+ uintptr(fd),
+ uintptr(unsafe.Pointer(linkNamePtr)))
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func readLink(fd int) (string, error) {
+ // Buffer sizing copied from os.Readlink.
+ for l := 128; ; l *= 2 {
+ b := make([]byte, l)
+ n, _, errno := syscall.Syscall6(
+ syscall.SYS_READLINKAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(&NulByte)), // ""
+ uintptr(unsafe.Pointer(&b[0])),
+ uintptr(l),
+ 0, 0)
+ if errno != 0 {
+ return "", errno
+ }
+ if n < uintptr(l) {
+ return string(b[:n]), nil
+ }
+ }
+}
+
+func unlinkAt(fd int, name string, dir bool) error {
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ var flags uintptr
+ if dir {
+ flags = linux.AT_REMOVEDIR
+ }
+ _, _, errno := syscall.Syscall(
+ syscall.SYS_UNLINKAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(namePtr)),
+ flags,
+ )
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec {
+ if omit {
+ return syscall.Timespec{0, linux.UTIME_OMIT}
+ }
+ if setSysTime {
+ return syscall.Timespec{0, linux.UTIME_NOW}
+ }
+ return syscall.NsecToTimespec(t.Nanoseconds())
+}
+
+func setTimestamps(fd int, ts fs.TimeSpec) error {
+ if ts.ATimeOmit && ts.MTimeOmit {
+ return nil
+ }
+ var sts [2]syscall.Timespec
+ sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime)
+ sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime)
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_UTIMENSAT,
+ uintptr(fd),
+ 0, /* path */
+ uintptr(unsafe.Pointer(&sts)),
+ 0, /* flags */
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) {
+ var stat syscall.Stat_t
+ namePtr, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return stat, err
+ }
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_NEWFSTATAT,
+ uintptr(fd),
+ uintptr(unsafe.Pointer(namePtr)),
+ uintptr(unsafe.Pointer(&stat)),
+ uintptr(flags),
+ 0, 0)
+ if errno != 0 {
+ return stat, errno
+ }
+ return stat, nil
+}