diff options
author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/fs/host | |
parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/fs/host')
-rw-r--r-- | pkg/sentry/fs/host/control.go | 93 | ||||
-rw-r--r-- | pkg/sentry/fs/host/descriptor.go | 120 | ||||
-rw-r--r-- | pkg/sentry/fs/host/descriptor_state.go | 29 | ||||
-rw-r--r-- | pkg/sentry/fs/host/device.go | 25 | ||||
-rw-r--r-- | pkg/sentry/fs/host/file.go | 286 | ||||
-rw-r--r-- | pkg/sentry/fs/host/fs.go | 339 | ||||
-rwxr-xr-x | pkg/sentry/fs/host/host_state_autogen.go | 142 | ||||
-rw-r--r-- | pkg/sentry/fs/host/inode.go | 527 | ||||
-rw-r--r-- | pkg/sentry/fs/host/inode_state.go | 79 | ||||
-rw-r--r-- | pkg/sentry/fs/host/ioctl_unsafe.go | 56 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket.go | 390 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_iovec.go | 113 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_state.go | 42 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_unsafe.go | 100 | ||||
-rw-r--r-- | pkg/sentry/fs/host/tty.go | 351 | ||||
-rw-r--r-- | pkg/sentry/fs/host/util.go | 197 | ||||
-rw-r--r-- | pkg/sentry/fs/host/util_unsafe.go | 137 |
17 files changed, 3026 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go new file mode 100644 index 000000000..9ebb9bbb3 --- /dev/null +++ b/pkg/sentry/fs/host/control.go @@ -0,0 +1,93 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRights { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { + n := max + var trunc bool + if l := len(c.fds); n > l { + n = l + } else if n < l { + trunc = true + } + + rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements transport.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*fs.File { + files := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + // Set known flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: fileFlags&syscall.O_NONBLOCK != 0, + }) + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go new file mode 100644 index 000000000..ffcd57a94 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor.go @@ -0,0 +1,120 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// descriptor wraps a host fd. +// +// +stateify savable +type descriptor struct { + // donated is true if the host fd was donated by another process. + donated bool + + // If origFD >= 0, it is the host fd that this file was originally created + // from, which must be available at time of restore. The FD can be closed + // after descriptor is created. Only set if donated is true. + origFD int + + // wouldBlock is true if value (below) points to a file that can + // return EWOULDBLOCK for operations that would block. + wouldBlock bool + + // value is the wrapped host fd. It is never saved or restored + // directly. How it is restored depends on whether it was + // donated and the fs.MountSource it was originally + // opened/created from. + value int `state:"nosave"` +} + +// newDescriptor returns a wrapped host file descriptor. On success, +// the descriptor is registered for event notifications with queue. +func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { + ownedFD := fd + origFD := -1 + if saveable { + var err error + ownedFD, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + origFD = fd + } + if wouldBlock { + if err := syscall.SetNonblock(ownedFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil { + return nil, err + } + } + return &descriptor{ + donated: donated, + origFD: origFD, + wouldBlock: wouldBlock, + value: ownedFD, + }, nil +} + +// initAfterLoad initializes the value of the descriptor after Load. +func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error { + if d.donated { + var err error + d.value, err = syscall.Dup(d.origFD) + if err != nil { + return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) + } + } else { + name, ok := mo.inodeMappings[id] + if !ok { + return fmt.Errorf("failed to find path for inode number %d", id) + } + fullpath := path.Join(mo.root, name) + + var err error + d.value, err = open(nil, fullpath) + if err != nil { + return fmt.Errorf("failed to open %q: %v", fullpath, err) + } + } + if d.wouldBlock { + if err := syscall.SetNonblock(d.value, true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(d.value), queue); err != nil { + return err + } + } + return nil +} + +// Release releases all resources held by descriptor. +func (d *descriptor) Release() { + if d.wouldBlock { + fdnotifier.RemoveFD(int32(d.value)) + } + if err := syscall.Close(d.value); err != nil { + log.Warningf("error closing fd %d: %v", d.value, err) + } + d.value = -1 +} diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go new file mode 100644 index 000000000..8167390a9 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +// beforeSave is invoked by stateify. +func (d *descriptor) beforeSave() { + if d.donated && d.origFD < 0 { + panic("donated file descriptor cannot be saved") + } +} + +// afterLoad is invoked by stateify. +func (d *descriptor) afterLoad() { + // value must be manually restored by the descriptor's parent using + // initAfterLoad. + d.value = -1 +} diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go new file mode 100644 index 000000000..055024c44 --- /dev/null +++ b/pkg/sentry/fs/host/device.go @@ -0,0 +1,25 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// hostFileDevice is the host file virtual device. +var hostFileDevice = device.NewAnonMultiDevice() + +// hostPipeDevice is the host pipe virtual device. +var hostPipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go new file mode 100644 index 000000000..ad0a3ec85 --- /dev/null +++ b/pkg/sentry/fs/host/file.go @@ -0,0 +1,286 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// fileOperations implements fs.FileOperations for a host file descriptor. +// +// +stateify savable +type fileOperations struct { + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosplice"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // iops are the Inode operations for this file. + iops *inodeOperations `state:"wait"` + + // a scratch buffer for reading directory entries. + dirinfo *dirInfo `state:"nosave"` + + // dirCursor is the directory cursor. + dirCursor string +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile creates a new File backed by the provided host file descriptor. If +// NewFile succeeds, ownership of the FD is transferred to the returned File. +// +// The returned File cannot be saved, since there is no guarantee that the same +// FD will exist or represent the same file at time of restore. If such a +// guarantee does exist, use ImportFile instead. +func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, false, false) +} + +// ImportFile creates a new File backed by the provided host file descriptor. +// Unlike NewFile, the file descriptor used by the File is duped from FD to +// ensure that later changes to FD are not reflected by the fs.File. +// +// If the returned file is saved, it will be restored by re-importing the FD +// originally passed to ImportFile. It is the restorer's responsibility to +// ensure that the FD represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY) +} + +// newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is +// saveable, then saveable is true. +func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) { + var s syscall.Stat_t + if err := syscall.Fstat(donated, &s); err != nil { + return nil, err + } + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + switch s.Mode & syscall.S_IFMT { + case syscall.S_IFSOCK: + if isTTY { + return nil, fmt.Errorf("cannot import host socket as TTY") + } + + s, err := newSocket(ctx, donated, saveable) + if err != nil { + return nil, err + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags.NonBlocking, + }) + return s, nil + default: + msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) + inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) + if err != nil { + return nil, err + } + iops := inode.InodeOperations.(*inodeOperations) + + name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) + dirent := fs.NewDirent(inode, name) + defer dirent.DecRef() + + if isTTY { + return newTTYFile(ctx, dirent, flags, iops), nil + } + + return newFile(ctx, dirent, flags, iops), nil + } +} + +func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { + flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) + if errno != 0 { + log.Warningf("Failed to get file flags for donated FD %d (errno=%d)", donated, errno) + return fs.FileFlags{}, syscall.EIO + } + accmode := flags & syscall.O_ACCMODE + return fs.FileFlags{ + Direct: flags&syscall.O_DIRECT != 0, + NonBlocking: flags&syscall.O_NONBLOCK != 0, + Sync: flags&syscall.O_SYNC != 0, + Append: flags&syscall.O_APPEND != 0, + Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR, + Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR, + }, nil +} + +// newFile returns a new fs.File. +func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { + if !iops.ReturnsWouldBlock() { + // Allow reading/writing at an arbitrary offset for files + // that support it. + flags.Pread = true + flags.Pwrite = true + } + return fs.NewFile(ctx, dirent, flags, &fileOperations{iops: iops}) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.iops.fileState.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileOperations) EventUnregister(e *waiter.Entry) { + f.iops.fileState.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask) +} + +// Readdir implements fs.FileOperations.Readdir. +func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +} + +// IterateDir implements fs.DirIterator.IterateDir. +func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + if f.dirinfo == nil { + f.dirinfo = new(dirInfo) + f.dirinfo.buf = make([]byte, usermem.PageSize) + } + entries, err := f.iops.readdirAll(f.dirinfo) + if err != nil { + return offset, err + } + count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries)) + return offset + count, err +} + +// Write implements fs.FileOperations.Write. +func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that writes do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support writing at + // an arbitrary offset. + writer := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + } + return f.iops.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that reads do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support reading at + // an arbitrary offset. + reader := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + } + return f.iops.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + return syscall.Fsync(f.iops.fileState.FD()) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(context.Context, *fs.File) error { + // This is a no-op because flushing the resource backing this + // file would mean closing it. We can't do that because other + // open files may depend on the backing host FD. + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !canMap(file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. +func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go new file mode 100644 index 000000000..b1b8dc0b6 --- /dev/null +++ b/pkg/sentry/fs/host/fs.go @@ -0,0 +1,339 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host implements an fs.Filesystem for files backed by host +// file descriptors. +package host + +import ( + "fmt" + "path" + "path/filepath" + "strconv" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FilesystemName is the name under which Filesystem is registered. +const FilesystemName = "whitelistfs" + +const ( + // whitelistKey is the mount option containing a comma-separated list + // of host paths to whitelist. + whitelistKey = "whitelist" + + // rootPathKey is the mount option containing the root path of the + // mount. + rootPathKey = "root" + + // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership. + dontTranslateOwnershipKey = "dont_translate_ownership" +) + +// maxTraversals determines link traversals in building the whitelist. +const maxTraversals = 10 + +// Filesystem is a pseudo file system that is only available during the setup +// to lock down the configurations. This filesystem should only be mounted at root. +// +// Think twice before exposing this to applications. +// +// +stateify savable +type Filesystem struct { + // whitelist is a set of host paths to whitelist. + paths []string +} + +var _ fs.Filesystem = (*Filesystem)(nil) + +// Name is the identifier of this file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*Filesystem) AllowUserMount() bool { + return false +} + +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*Filesystem) AllowUserList() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns an fs.Inode exposing the host file system. It is intended to be locked +// down in PreExec below. +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { + // Parse generic comma-separated key=value options. + options := fs.GenericMountSourceOptions(data) + + // Grab the whitelist if one was specified. + // TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow + // no whitelist. + if wl, ok := options[whitelistKey]; ok { + f.paths = strings.Split(wl, "|") + delete(options, whitelistKey) + } + + // If the rootPath was set, use it. Othewise default to the root of the + // host fs. + rootPath := "/" + if rp, ok := options[rootPathKey]; ok { + rootPath = rp + delete(options, rootPathKey) + + // We must relativize the whitelisted paths to the new root. + for i, p := range f.paths { + rel, err := filepath.Rel(rootPath, p) + if err != nil { + return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath) + } + f.paths[i] = path.Join("/", rel) + } + } + fd, err := open(nil, rootPath) + if err != nil { + return nil, fmt.Errorf("failed to find root: %v", err) + } + + var dontTranslateOwnership bool + if v, ok := options[dontTranslateOwnershipKey]; ok { + b, err := strconv.ParseBool(v) + if err != nil { + return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err) + } + dontTranslateOwnership = b + delete(options, dontTranslateOwnershipKey) + } + + // Fail if the caller passed us more options than we know about. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that we own. + owner := fs.FileOwnerFromContext(ctx) + + // Construct the host file system mount and inode. + msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership) + return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */) +} + +// InstallWhitelist locks down the MountNamespace to only the currently installed +// Dirents and the given paths. +func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error { + return installWhitelist(ctx, m, f.paths) +} + +func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error { + if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") { + // Warning will be logged during filter installation if the empty + // whitelist matters (allows for host file access). + return nil + } + + // Done tracks entries already added. + done := make(map[string]bool) + root := m.Root() + defer root.DecRef() + + for i := 0; i < len(paths); i++ { + // Make sure the path is absolute. This is a sanity check. + if !path.IsAbs(paths[i]) { + return fmt.Errorf("path %q is not absolute", paths[i]) + } + + // We need to add all the intermediate paths, in case one of + // them is a symlink that needs to be resolved. + for j := 1; j <= len(paths[i]); j++ { + if j < len(paths[i]) && paths[i][j] != '/' { + continue + } + current := paths[i][:j] + + // Lookup the given component in the tree. + remainingTraversals := uint(maxTraversals) + d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals) + if err != nil { + log.Warningf("populate failed for %q: %v", current, err) + continue + } + + // It's critical that this DecRef happens after the + // freeze below. This ensures that the dentry is in + // place to be frozen. Otherwise, we freeze without + // these entries. + defer d.DecRef() + + // Expand the last component if necessary. + if current == paths[i] { + // Is it a directory or symlink? + sattr := d.Inode.StableAttr + if fs.IsDir(sattr) { + for name := range childDentAttrs(ctx, d) { + paths = append(paths, path.Join(current, name)) + } + } + if fs.IsSymlink(sattr) { + // Only expand symlinks once. The + // folder structure may contain + // recursive symlinks and we don't want + // to end up infinitely expanding this + // symlink. This is safe because this + // is the last component. If a later + // path wants to symlink something + // beneath this symlink that will still + // be handled by the FindLink above. + if done[current] { + continue + } + + s, err := d.Inode.Readlink(ctx) + if err != nil { + log.Warningf("readlink failed for %q: %v", current, err) + continue + } + if path.IsAbs(s) { + paths = append(paths, s) + } else { + target := path.Join(path.Dir(current), s) + paths = append(paths, target) + } + } + } + + // Only report this one once even though we may look + // it up more than once. If we whitelist /a/b,/a then + // /a will be "done" when it is looked up for /a/b, + // however we still need to expand all of its contents + // when whitelisting /a. + if !done[current] { + log.Debugf("whitelisted: %s", current) + } + done[current] = true + } + } + + // Freeze the mount tree in place. This prevents any new paths from + // being opened and any old ones from being removed. If we do provide + // tmpfs mounts, we'll want to freeze/thaw those separately. + m.Freeze() + return nil +} + +func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr { + dirname, _ := d.FullName(nil /* root */) + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + log.Warningf("failed to open directory %q: %v", dirname, err) + return nil + } + dir.DecRef() + var stubSerializer fs.CollectEntriesSerializer + if err := dir.Readdir(ctx, &stubSerializer); err != nil { + log.Warningf("failed to iterate on host directory %q: %v", dirname, err) + return nil + } + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries +} + +// newMountSource constructs a new host fs.MountSource +// relative to a root path. The root should match the mount point. +func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource { + return fs.NewMountSource(&superOperations{ + root: root, + inodeMappings: make(map[uint64]string), + mounter: mounter, + dontTranslateOwnership: dontTranslateOwnership, + }, filesystem, flags) +} + +// superOperations implements fs.MountSourceOperations. +// +// +stateify savable +type superOperations struct { + fs.SimpleMountSourceOperations + + // root is the path of the mount point. All inode mappings + // are relative to this root. + root string + + // inodeMappings contains mappings of fs.Inodes associated + // with this MountSource to paths under root. + inodeMappings map[uint64]string + + // mounter is the cached EUID/EGID that mounted this file system. + mounter fs.FileOwner + + // dontTranslateOwnership indicates whether to not translate file + // ownership. + // + // By default, files/directories owned by the sandbox uses UID/GID + // of the mounter. For files/directories that are not owned by the + // sandbox, file UID/GID is translated to a UID/GID which cannot + // be mapped in the sandboxed application's user namespace. The + // UID/GID will look like the nobody UID/GID (65534) but is not + // strictly owned by the user "nobody". + // + // If whitelistfs is a lower filesystem in an overlay, set + // dont_translate_ownership=true in mount options. + dontTranslateOwnership bool +} + +var _ fs.MountSourceOperations = (*superOperations)(nil) + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (m *superOperations) ResetInodeMappings() { + m.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + m.inodeMappings[sattr.InodeID] = path +} + +// Keep implements fs.MountSourceOperations.Keep. +// +// TODO(b/72455313,b/77596690): It is possible to change the permissions on a +// host file while it is in the dirent cache (say from RO to RW), but it is not +// possible to re-open the file with more relaxed permissions, since the host +// FD is already open and stored in the inode. +// +// Using the dirent LRU cache increases the odds that this bug is encountered. +// Since host file access is relatively fast anyways, we disable the LRU cache +// for host fs files. Once we can properly deal with permissions changes and +// re-opening host files, we should revisit whether or not to make use of the +// LRU cache. +func (*superOperations) Keep(*fs.Dirent) bool { + return false +} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} diff --git a/pkg/sentry/fs/host/host_state_autogen.go b/pkg/sentry/fs/host/host_state_autogen.go new file mode 100755 index 000000000..22cfa1222 --- /dev/null +++ b/pkg/sentry/fs/host/host_state_autogen.go @@ -0,0 +1,142 @@ +// automatically generated by stateify. + +package host + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *descriptor) save(m state.Map) { + x.beforeSave() + m.Save("donated", &x.donated) + m.Save("origFD", &x.origFD) + m.Save("wouldBlock", &x.wouldBlock) +} + +func (x *descriptor) load(m state.Map) { + m.Load("donated", &x.donated) + m.Load("origFD", &x.origFD) + m.Load("wouldBlock", &x.wouldBlock) + m.AfterLoad(x.afterLoad) +} + +func (x *fileOperations) beforeSave() {} +func (x *fileOperations) save(m state.Map) { + x.beforeSave() + m.Save("iops", &x.iops) + m.Save("dirCursor", &x.dirCursor) +} + +func (x *fileOperations) afterLoad() {} +func (x *fileOperations) load(m state.Map) { + m.LoadWait("iops", &x.iops) + m.Load("dirCursor", &x.dirCursor) +} + +func (x *Filesystem) beforeSave() {} +func (x *Filesystem) save(m state.Map) { + x.beforeSave() + m.Save("paths", &x.paths) +} + +func (x *Filesystem) afterLoad() {} +func (x *Filesystem) load(m state.Map) { + m.Load("paths", &x.paths) +} + +func (x *superOperations) beforeSave() {} +func (x *superOperations) save(m state.Map) { + x.beforeSave() + m.Save("SimpleMountSourceOperations", &x.SimpleMountSourceOperations) + m.Save("root", &x.root) + m.Save("inodeMappings", &x.inodeMappings) + m.Save("mounter", &x.mounter) + m.Save("dontTranslateOwnership", &x.dontTranslateOwnership) +} + +func (x *superOperations) afterLoad() {} +func (x *superOperations) load(m state.Map) { + m.Load("SimpleMountSourceOperations", &x.SimpleMountSourceOperations) + m.Load("root", &x.root) + m.Load("inodeMappings", &x.inodeMappings) + m.Load("mounter", &x.mounter) + m.Load("dontTranslateOwnership", &x.dontTranslateOwnership) +} + +func (x *inodeOperations) beforeSave() {} +func (x *inodeOperations) save(m state.Map) { + x.beforeSave() + m.Save("fileState", &x.fileState) + m.Save("cachingInodeOps", &x.cachingInodeOps) +} + +func (x *inodeOperations) afterLoad() {} +func (x *inodeOperations) load(m state.Map) { + m.LoadWait("fileState", &x.fileState) + m.Load("cachingInodeOps", &x.cachingInodeOps) +} + +func (x *inodeFileState) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.queue) { m.Failf("queue is %v, expected zero", x.queue) } + m.Save("mops", &x.mops) + m.Save("descriptor", &x.descriptor) + m.Save("sattr", &x.sattr) + m.Save("savedUAttr", &x.savedUAttr) +} + +func (x *inodeFileState) load(m state.Map) { + m.LoadWait("mops", &x.mops) + m.LoadWait("descriptor", &x.descriptor) + m.LoadWait("sattr", &x.sattr) + m.Load("savedUAttr", &x.savedUAttr) + m.AfterLoad(x.afterLoad) +} + +func (x *ConnectedEndpoint) save(m state.Map) { + x.beforeSave() + m.Save("queue", &x.queue) + m.Save("path", &x.path) + m.Save("ref", &x.ref) + m.Save("readClosed", &x.readClosed) + m.Save("writeClosed", &x.writeClosed) + m.Save("srfd", &x.srfd) + m.Save("stype", &x.stype) +} + +func (x *ConnectedEndpoint) load(m state.Map) { + m.Load("queue", &x.queue) + m.Load("path", &x.path) + m.Load("ref", &x.ref) + m.Load("readClosed", &x.readClosed) + m.Load("writeClosed", &x.writeClosed) + m.LoadWait("srfd", &x.srfd) + m.Load("stype", &x.stype) + m.AfterLoad(x.afterLoad) +} + +func (x *TTYFileOperations) beforeSave() {} +func (x *TTYFileOperations) save(m state.Map) { + x.beforeSave() + m.Save("fileOperations", &x.fileOperations) + m.Save("session", &x.session) + m.Save("fgProcessGroup", &x.fgProcessGroup) +} + +func (x *TTYFileOperations) afterLoad() {} +func (x *TTYFileOperations) load(m state.Map) { + m.Load("fileOperations", &x.fileOperations) + m.Load("session", &x.session) + m.Load("fgProcessGroup", &x.fgProcessGroup) +} + +func init() { + state.Register("host.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load}) + state.Register("host.fileOperations", (*fileOperations)(nil), state.Fns{Save: (*fileOperations).save, Load: (*fileOperations).load}) + state.Register("host.Filesystem", (*Filesystem)(nil), state.Fns{Save: (*Filesystem).save, Load: (*Filesystem).load}) + state.Register("host.superOperations", (*superOperations)(nil), state.Fns{Save: (*superOperations).save, Load: (*superOperations).load}) + state.Register("host.inodeOperations", (*inodeOperations)(nil), state.Fns{Save: (*inodeOperations).save, Load: (*inodeOperations).load}) + state.Register("host.inodeFileState", (*inodeFileState)(nil), state.Fns{Save: (*inodeFileState).save, Load: (*inodeFileState).load}) + state.Register("host.ConnectedEndpoint", (*ConnectedEndpoint)(nil), state.Fns{Save: (*ConnectedEndpoint).save, Load: (*ConnectedEndpoint).load}) + state.Register("host.TTYFileOperations", (*TTYFileOperations)(nil), state.Fns{Save: (*TTYFileOperations).save, Load: (*TTYFileOperations).load}) +} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go new file mode 100644 index 000000000..7a230e426 --- /dev/null +++ b/pkg/sentry/fs/host/inode.go @@ -0,0 +1,527 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// inodeOperations implements fs.InodeOperations for an fs.Inodes backed +// by a host file descriptor. +// +// +stateify savable +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachedInodeOps implements memmap.Mappable. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects the file offset on the host FD. This is needed + // for readdir because getdents must use the kernel offset, so + // concurrent readdirs must be exclusive. + // + // All read/write functions pass the offset directly to the kernel and + // thus don't need a lock. + readdirMu sync.Mutex `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +// +// +stateify savable +type inodeFileState struct { + // Common file system state. + mops *superOperations `state:"wait"` + + // descriptor is the backing host FD. + descriptor *descriptor `state:"wait"` + + // Event queue for blocking operations. + queue waiter.Queue `state:"zerovalue"` + + // sattr is used to restore the inodeOperations. + sattr fs.StableAttr `state:"wait"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + // TODO(jamieliu): Using safemem.FromIOReader here is wasteful for two + // reasons: + // + // - Using preadv instead of iterated preads saves on host system calls. + // + // - Host system calls can handle destination memory that would fault in + // gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true), + // so the buffering performed by FromIOReader is unnecessary. + // + // This also applies to the write path below. + return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + if mask.UID || mask.GID { + return syserror.EPERM + } + if mask.Perms { + if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { + return err + } + } + if mask.Size { + if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil { + return err + } + } + if mask.AccessTime || mask.ModificationTime { + ts := fs.TimeSpec{ + ATime: attr.AccessTime, + ATimeOmit: !mask.AccessTime, + MTime: attr.ModificationTime, + MTimeOmit: !mask.ModificationTime, + } + if err := setTimestamps(i.FD(), ts); err != nil { + return err + } + } + return nil +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + return syscall.Fsync(i.FD()) +} + +// FD implements fsutil.CachedFileObject.FD. +func (i *inodeFileState) FD() int { + return i.descriptor.value +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + return fs.UnstableAttr{}, err + } + return unstableAttr(i.mops, &s), nil +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error { + return syscall.Fallocate(i.FD(), 0, offset, length) +} + +// inodeOperations implements fs.InodeOperations. +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// newInode returns a new fs.Inode backed by the host FD. +func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { + // Retrieve metadata. + var s syscall.Stat_t + err := syscall.Fstat(fd, &s) + if err != nil { + return nil, err + } + + fileState := &inodeFileState{ + mops: msrc.MountSourceOperations.(*superOperations), + sattr: stableAttr(&s), + } + + // Initialize the wrapped host file descriptor. + fileState.descriptor, err = newDescriptor( + fd, + donated, + saveable, + wouldBlock(&s), + &fileState.queue, + ) + if err != nil { + return nil, err + } + + // Build the fs.InodeOperations. + uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s) + iops := &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache), + } + + // Return the fs.Inode. + return fs.NewInode(iops, msrc, fileState.sattr), nil +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if !canMap(inode) { + return nil + } + return i.cachingInodeOps +} + +// ReturnsWouldBlock returns true if this host FD can return EWOULDBLOCK for +// operations that would block. +func (i *inodeOperations) ReturnsWouldBlock() bool { + return i.fileState.descriptor.wouldBlock +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(context.Context) { + i.fileState.descriptor.Release() + i.cachingInodeOps.Release() +} + +// Lookup implements fs.InodeOperations.Lookup. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Get a new FD relative to i at name. + fd, err := open(i, name) + if err != nil { + if err == syserror.ENOENT { + return nil, syserror.ENOENT + } + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + // Return the fs.Dirent. + return fs.NewDirent(inode, name), nil +} + +// Create implements fs.InodeOperations.Create. +func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create a file relative to i at name. + // + // N.B. We always open this file O_RDWR regardless of flags because a + // future GetFile might want more access. Open allows this regardless + // of perm. + fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode()) + if err != nil { + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + d := fs.NewDirent(inode, name) + defer d.DecRef() + return inode.GetFile(ctx, d, flags) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode())) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + return createLink(i.fileState.FD(), oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.EPERM +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.EPERM +} + +// Remove implements fs.InodeOperations.Remove. +func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, false /* dir */) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, true /* dir */) +} + +// Rename implements fs.InodeOperations.Rename. +func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + op, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + np, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName) +} + +// Bind implements fs.InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.EOPNOTSUPP +} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.BoundEndpoint { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return newFile(ctx, d, flags, i), nil +} + +// canMap returns true if this fs.Inode can be memory mapped. +func canMap(inode *fs.Inode) bool { + // FIXME(b/38213152): Some obscure character devices can be mapped. + return fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + // When the kernel supports mapping host FDs, we do so to take + // advantage of the host page cache. We forego updating fs.Inodes + // because the host manages consistency of its own inode structures. + // + // For fs.Inodes that can never be mapped we take advantage of + // synchronizing metadata updates through host caches. + // + // So can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just obtain the attributes. + return i.fileState.unstableAttr(ctx) + } + // No, we're maintaining consistency of metadata ourselves. + return i.cachingInodeOps.UnstableAttr(ctx, inode) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EPERM +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the FD, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetPermissions(ctx, inode, f) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the FD, the host + // will synchronize the metadata update with any host + // inode and page cache. + return setTimestamps(i.fileState.FD(), ts) + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just change the file size on the FD, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Ftruncate(i.fileState.FD(), size) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Truncate(ctx, inode, size) +} + +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just send the call to the FD, the host will synchronize the metadata + // update with any host inode and page cache. + return i.fileState.Allocate(ctx, offset, length) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Allocate(ctx, offset, length) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + // Have we been using host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then the metadata is already up to date on the host. + return nil + } + // Otherwise we need to write out cached pages and attributes + // that are dirty. + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + return readLink(i.fileState.FD()) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} + +// AddLink implements fs.InodeOperations.AddLink. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (i *inodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (i *inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} + +// readdirAll returns all of the directory entries in i. +func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + + fd := i.fileState.FD() + + // syscall.ReadDirent will use getdents, which will seek the file past + // the last directory entry. To read the directory entries a second + // time, we need to seek back to the beginning. + if _, err := syscall.Seek(fd, 0, 0); err != nil { + if err == syscall.ESPIPE { + // All directories should be seekable. If this file + // isn't seekable, it is not a directory and we should + // return that more sane error. + err = syscall.ENOTDIR + } + return nil, err + } + + names := make([]string, 0, 100) + for { + // Refill the buffer if necessary + if d.bufp >= d.nbuf { + d.bufp = 0 + // ReadDirent will just do a sys_getdents64 to the kernel. + n, err := syscall.ReadDirent(fd, d.buf) + if err != nil { + return nil, err + } + if n == 0 { + break // EOF + } + d.nbuf = n + } + + var nb int + // Parse the dirent buffer we just get and return the directory names along + // with the number of bytes consumed in the buffer. + nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names) + d.bufp += nb + } + + entries := make(map[string]fs.DentAttr) + for _, filename := range names { + // Lookup the type and host device and inode. + stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW) + if lerr == syscall.ENOENT { + // File disappeared between readdir and lstat. + // Just treat it as if it didn't exist. + continue + } + + // There was a serious problem, we should probably report it. + if lerr != nil { + return nil, lerr + } + + entries[filename] = fs.DentAttr{ + Type: nodeType(&stat), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: stat.Dev, + Inode: stat.Ino, + }), + } + } + return entries, nil +} diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go new file mode 100644 index 000000000..26cc755bc --- /dev/null +++ b/pkg/sentry/fs/host/inode_state.go @@ -0,0 +1,79 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// beforeSave is invoked by stateify. +func (i *inodeFileState) beforeSave() { + if !i.queue.IsEmpty() { + panic("event queue must be empty") + } + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + uattr, err := i.unstableAttr(context.Background()) + if err != nil { + panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)}) + } + i.savedUAttr = &uattr + } +} + +// afterLoad is invoked by stateify. +func (i *inodeFileState) afterLoad() { + // Initialize the descriptor value. + if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil { + panic(fmt.Sprintf("failed to load value of descriptor: %v", err)) + } + + // Remap the inode number. + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + panic(fs.ErrCorruption{fmt.Errorf("failed to get metadata for fd %d: %v", i.FD(), err)}) + } + key := device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + } + if !hostFileDevice.Load(key, i.sattr.InodeID) { + // This means there was a conflict at s.Dev and s.Ino with + // another inode mapping: two files that were unique on the + // saved filesystem are no longer unique on this filesystem. + // Since this violates the contract that filesystems cannot + // change across save and restore, error out. + panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)}) + } + + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstableAttr(i.mops, &s) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)}) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)}) + } + i.savedUAttr = nil + } +} diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go new file mode 100644 index 000000000..b5a85c4d9 --- /dev/null +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go new file mode 100644 index 000000000..3ed137006 --- /dev/null +++ b/pkg/sentry/fs/host/socket.go @@ -0,0 +1,390 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. +// +// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). +const maxSendBufferSize = 8 << 20 + +// ConnectedEndpoint is a host FD backed implementation of +// transport.ConnectedEndpoint and transport.Receiver. +// +// +stateify savable +type ConnectedEndpoint struct { + queue *waiter.Queue + path string + + // ref keeps track of references to a connectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd, readClosed and writeClosed. + mu sync.RWMutex `state:"nosave"` + + // file is an *fd.FD containing the FD backing this endpoint. It must be + // set to nil if it has been closed. + file *fd.FD `state:"nosave"` + + // readClosed is true if the FD has read shutdown or if it has been closed. + readClosed bool + + // writeClosed is true if the FD has write shutdown or if it has been + // closed. + writeClosed bool + + // If srfd >= 0, it is the host FD that file was imported from. + srfd int `state:"wait"` + + // stype is the type of Unix socket. + stype transport.SockType + + // sndbuf is the size of the send buffer. + // + // N.B. When this is smaller than the host size, we present it via + // GetSockOpt and message splitting/rejection in SendMsg, but do not + // prevent lots of small messages from filling the real send buffer + // size on the host. + sndbuf int `state:"nosave"` +} + +// init performs initialization required for creating new ConnectedEndpoints and +// for restoring them. +func (c *ConnectedEndpoint) init() *syserr.Error { + family, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return syserr.FromError(err) + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return syserr.ErrInvalidEndpointState + } + + stype, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return syserr.FromError(err) + } + + if err := syscall.SetNonblock(c.file.FD(), true); err != nil { + return syserr.FromError(err) + } + + sndbuf, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return syserr.FromError(err) + } + if sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", sndbuf) + return syserr.ErrInvalidEndpointState + } + + c.stype = transport.SockType(stype) + c.sndbuf = sndbuf + + return nil +} + +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD +// that will pretend to be bound at a given sentry path. +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *syserr.Error) { + e := ConnectedEndpoint{ + path: path, + queue: queue, + file: file, + srfd: -1, + } + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + + return &e, nil +} + +// Init will do initialization required without holding other locks. +func (c *ConnectedEndpoint) Init() { + if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil { + panic(err) + } +} + +// NewSocketWithDirent allocates a new unix socket with host endpoint. +// +// This is currently only used by unsaveable Gofer nodes. +// +// NewSocketWithDirent takes ownership of f on success. +func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { + f2 := fd.New(f.FD()) + var q waiter.Queue + e, err := NewConnectedEndpoint(f2, &q, "" /* path */) + if err != nil { + f2.Release() + return nil, err.ToError() + } + + // Take ownship of the FD. + f.Release() + + e.Init() + + ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + + return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil +} + +// newSocket allocates a new unix socket with host endpoint. +func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) { + ownedfd := orgfd + srfd := -1 + if saveable { + var err error + ownedfd, err = syscall.Dup(orgfd) + if err != nil { + return nil, err + } + srfd = orgfd + } + f := fd.New(ownedfd) + var q waiter.Queue + e, err := NewConnectedEndpoint(f, &q, "" /* path */) + if err != nil { + if saveable { + f.Close() + } else { + f.Release() + } + return nil, err.ToError() + } + + e.srfd = srfd + e.Init() + + ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + + return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil +} + +// Send implements transport.ConnectedEndpoint.Send. +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return 0, false, syserr.ErrClosedForSend + } + + if !controlMessages.Empty() { + return 0, false, syserr.ErrInvalidEndpointState + } + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := c.stype == transport.SockStream + + n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). + err = syserror.EAGAIN + } + if n > 0 && err != syserror.EAGAIN { + // The caller may need to block to send more data, but + // otherwise there isn't anything that can be done about an + // error with a partial write. + err = nil + } + + // There is no need for the callee to call SendNotify because fdWriteVec + // uses the host's sendmsg(2) and the host kernel's queue. + return n, false, syserr.FromError(err) +} + +// SendNotify implements transport.ConnectedEndpoint.SendNotify. +func (c *ConnectedEndpoint) SendNotify() {} + +// CloseSend implements transport.ConnectedEndpoint.CloseSend. +func (c *ConnectedEndpoint) CloseSend() { + c.mu.Lock() + c.writeClosed = true + c.mu.Unlock() +} + +// CloseNotify implements transport.ConnectedEndpoint.CloseNotify. +func (c *ConnectedEndpoint) CloseNotify() {} + +// Writable implements transport.ConnectedEndpoint.Writable. +func (c *ConnectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements transport.ConnectedEndpoint.Passcred. +func (c *ConnectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil +} + +// EventUpdate implements transport.ConnectedEndpoint.EventUpdate. +func (c *ConnectedEndpoint) EventUpdate() { + c.mu.RLock() + defer c.mu.RUnlock() + if c.file.FD() != -1 { + fdnotifier.UpdateFD(int32(c.file.FD())) + } +} + +// Recv implements transport.Receiver.Recv. +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive + } + + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cl, cTrunc, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf) + if rl > 0 && err != nil { + // We got some data, so all we need to do on error is return + // the data that we got. Short reads are fine, no need to + // block. + err = nil + } + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + // There is no need for the callee to call RecvNotify because fdReadVec uses + // the host's recvmsg(2) and the host kernel's queue. + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. + if len(cm) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + if len(fds) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + } + return rl, ml, control.New(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil +} + +// close releases all resources related to the endpoint. +func (c *ConnectedEndpoint) close() { + fdnotifier.RemoveFD(int32(c.file.FD())) + c.file.Close() + c.file = nil +} + +// RecvNotify implements transport.Receiver.RecvNotify. +func (c *ConnectedEndpoint) RecvNotify() {} + +// CloseRecv implements transport.Receiver.CloseRecv. +func (c *ConnectedEndpoint) CloseRecv() { + c.mu.Lock() + c.readClosed = true + c.mu.Unlock() +} + +// Readable implements transport.Receiver.Readable. +func (c *ConnectedEndpoint) Readable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0 +} + +// SendQueuedSize implements transport.Receiver.SendQueuedSize. +func (c *ConnectedEndpoint) SendQueuedSize() int64 { + // SendQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// RecvQueuedSize implements transport.Receiver.RecvQueuedSize. +func (c *ConnectedEndpoint) RecvQueuedSize() int64 { + // RecvQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize. +func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { + return int64(c.sndbuf) +} + +// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize. +func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + return int64(c.sndbuf) +} + +// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. +func (c *ConnectedEndpoint) Release() { + c.ref.DecRefWithDestructor(c.close) +} diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go new file mode 100644 index 000000000..5efbb3ae8 --- /dev/null +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -0,0 +1,113 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// maxIovs is the maximum number of iovecs to pass to the host. +var maxIovs = linux.UIO_MAXIOV + +// copyToMulti copies as many bytes from src to dst as possible. +func copyToMulti(dst [][]byte, src []byte) { + for _, d := range dst { + done := copy(d, src) + src = src[done:] + if len(src) == 0 { + break + } + } +} + +// copyFromMulti copies as many bytes from src to dst as possible. +func copyFromMulti(dst []byte, src [][]byte) { + for _, s := range src { + done := copy(dst, s) + dst = dst[done:] + if len(dst) == 0 { + break + } + } +} + +// buildIovec builds an iovec slice from the given []byte slice. +// +// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. +// +// If length < the total length of bufs, err indicates why, even when returning +// a truncated iovec. +// +// If intermediate != nil, iovecs references intermediate rather than bufs and +// the caller must copy to/from bufs as necessary. +func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) { + var iovsRequired int + for _, b := range bufs { + length += uintptr(len(b)) + if len(b) > 0 { + iovsRequired++ + } + } + + stopLen := length + if length > uintptr(maxlen) { + if truncate { + stopLen = uintptr(maxlen) + err = syserror.EAGAIN + } else { + return 0, nil, nil, syserror.EMSGSIZE + } + } + + if iovsRequired > maxIovs { + // The kernel will reject our call if we pass this many iovs. + // Use a single intermediate buffer instead. + b := make([]byte, stopLen) + + return stopLen, []syscall.Iovec{{ + Base: &b[0], + Len: uint64(stopLen), + }}, b, err + } + + var total uintptr + iovecs = make([]syscall.Iovec, 0, iovsRequired) + for i := range bufs { + l := len(bufs[i]) + if l == 0 { + continue + } + + stop := l + if total+uintptr(stop) > stopLen { + stop = int(stopLen - total) + } + + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(stop), + }) + + total += uintptr(stop) + if total >= stopLen { + break + } + } + + return total, iovecs, nil, err +} diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go new file mode 100644 index 000000000..5676c451a --- /dev/null +++ b/pkg/sentry/fs/host/socket_state.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" +) + +// beforeSave is invoked by stateify. +func (c *ConnectedEndpoint) beforeSave() { + if c.srfd < 0 { + panic("only host file descriptors provided at sentry startup can be saved") + } +} + +// afterLoad is invoked by stateify. +func (c *ConnectedEndpoint) afterLoad() { + f, err := syscall.Dup(c.srfd) + if err != nil { + panic(fmt.Sprintf("failed to dup restored FD %d: %v", c.srfd, err)) + } + c.file = fd.New(f) + if err := c.init(); err != nil { + panic(fmt.Sprintf("Could not restore host socket FD %d: %v", c.srfd, err)) + } + c.Init() +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go new file mode 100644 index 000000000..e57be0506 --- /dev/null +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +// fdReadVec receives from fd to bufs. +// +// If the total length of bufs is > maxlen, fdReadVec will do a partial read +// and err will indicate why the message was truncated. +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + // Always truncate the receive buffer. All socket types will truncate + // received messages. + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, 0, 0, false, err + } + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, 0, 0, false, e + } + + // Copy data back to bufs. + if intermediate != nil { + copyToMulti(bufs, intermediate) + } + + controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC + + if n > length { + return length, n, msg.Controllen, controlTrunc, err + } + + return n, n, msg.Controllen, controlTrunc, err +} + +// fdWriteVec sends from bufs to fd. +// +// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a +// partial write and err will indicate why the message was truncated. +func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) { + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, length, err + } + + // Copy data to intermediate buf. + if intermediate != nil { + copyFromMulti(intermediate, bufs) + } + + var msg syscall.Msghdr + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, length, e + } + + return n, length, err +} diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go new file mode 100644 index 000000000..e45b339f5 --- /dev/null +++ b/pkg/sentry/fs/host/tty.go @@ -0,0 +1,351 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TTYFileOperations implements fs.FileOperations for a host file descriptor +// that wraps a TTY FD. +// +// +stateify savable +type TTYFileOperations struct { + fileOperations + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // session is the session attached to this TTYFileOperations. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. + fgProcessGroup *kernel.ProcessGroup +} + +// newTTYFile returns a new fs.File that wraps a TTY FD. +func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { + return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ + fileOperations: fileOperations{iops: iops}, + }) +} + +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *TTYFileOperations) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. +func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Read implements fs.FileOperations.Read. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +// +// See drivers/tty/n_tty.c:n_tty_read()=>job_control(). +func (t *TTYFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileOperations.Read(ctx, file, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the write? + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + return t.fileOperations.Write(ctx, file, src, offset) +} + +// Release implements fs.FileOperations.Release. +func (t *TTYFileOperations) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileOperations.Release() +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.fileOperations.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Map the ProcessGroup into a ProcessGroupID in the task's PID + // namespace. + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from + // tty_check_change() to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY + // session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Check that new process group is in the TTY session. + if pg.Session() != t.session { + return 0, syserror.EPERM + } + + t.fgProcessGroup = pg + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + + // Unlike setting the termios, any process group (even + // background ones) can set the winsize. + + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + // Unimplemented commands. + case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough + default: + return 0, syserror.ENOTTY + } +} + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + // + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) + return kernel.ERESTARTSYS +} diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go new file mode 100644 index 000000000..94ff7708e --- /dev/null +++ b/pkg/sentry/fs/host/util.go @@ -0,0 +1,197 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func open(parent *inodeOperations, name string) (int, error) { + if parent == nil && !path.IsAbs(name) { + return -1, syserror.EINVAL + } + name = path.Clean(name) + + // Don't follow through symlinks. + flags := syscall.O_NOFOLLOW + + if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil { + return fd, nil + } + // Retry as read-only. + if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil { + return fd, nil + } + + // Retry as write-only. + if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil { + return fd, nil + } + + // Retry as a symlink, by including O_PATH as an option. + fd, err := openAt(parent, name, linux.O_PATH|flags, 0) + if err == nil { + return fd, nil + } + + // Everything failed. + return -1, err +} + +func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) { + if parent == nil { + return syscall.Open(name, flags, uint32(perm)) + } + return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm)) +} + +func nodeType(s *syscall.Stat_t) fs.InodeType { + switch x := (s.Mode & syscall.S_IFMT); x { + case syscall.S_IFLNK: + return fs.Symlink + case syscall.S_IFIFO: + return fs.Pipe + case syscall.S_IFCHR: + return fs.CharacterDevice + case syscall.S_IFBLK: + return fs.BlockDevice + case syscall.S_IFSOCK: + return fs.Socket + case syscall.S_IFDIR: + return fs.Directory + case syscall.S_IFREG: + return fs.RegularFile + default: + // This shouldn't happen, but just in case... + log.Warningf("unknown host file type %d: assuming regular", x) + return fs.RegularFile + } +} + +func wouldBlock(s *syscall.Stat_t) bool { + typ := nodeType(s) + return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice +} + +func stableAttr(s *syscall.Stat_t) fs.StableAttr { + return fs.StableAttr{ + Type: nodeType(s), + DeviceID: hostFileDevice.DeviceID(), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + }), + BlockSize: int64(s.Blksize), + } +} + +func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner { + // User requested no translation, just return actual owner. + if mo.dontTranslateOwnership { + return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)} + } + + // Show only IDs relevant to the sandboxed task. I.e. if we not own the + // file, no sandboxed task can own the file. In that case, we + // use OverflowID for UID, implying that the IDs are not mapped in the + // "root" user namespace. + // + // E.g. + // sandbox's host EUID/EGID is 1/1. + // some_dir's host UID/GID is 2/1. + // Task that mounted this fs has virtualized EUID/EGID 5/5. + // + // If you executed `ls -n` in the sandboxed task, it would show: + // drwxwrxwrx [...] 65534 5 [...] some_dir + + // Files are owned by OverflowID by default. + owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)} + + // If we own file on host, let mounting task's initial EUID own + // the file. + if s.Uid == hostUID { + owner.UID = mo.mounter.UID + } + + // If our group matches file's group, make file's group match + // the mounting task's initial EGID. + for _, gid := range hostGIDs { + if s.Gid == gid { + owner.GID = mo.mounter.GID + break + } + } + return owner +} + +func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: s.Size, + Usage: s.Blocks * 512, + Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), + Owner: owner(mo, s), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + Links: s.Nlink, + } +} + +type dirInfo struct { + buf []byte // buffer for directory I/O. + nbuf int // length of buf; return value from ReadDirent. + bufp int // location of next record in buf. +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} + +func hostEffectiveKIDs() (uint32, []uint32, error) { + gids, err := os.Getgroups() + if err != nil { + return 0, nil, err + } + egids := make([]uint32, len(gids)) + for i, gid := range gids { + egids[i] = uint32(gid) + } + return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil +} + +var hostUID uint32 +var hostGIDs []uint32 + +func init() { + hostUID, hostGIDs, _ = hostEffectiveKIDs() +} diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go new file mode 100644 index 000000000..b95a57c3f --- /dev/null +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -0,0 +1,137 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +// NulByte is a single NUL byte. It is passed to readlinkat as an empty string. +var NulByte byte = '\x00' + +func createLink(fd int, name string, linkName string) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + linkNamePtr, err := syscall.BytePtrFromString(linkName) + if err != nil { + return err + } + _, _, errno := syscall.Syscall( + syscall.SYS_SYMLINKAT, + uintptr(unsafe.Pointer(namePtr)), + uintptr(fd), + uintptr(unsafe.Pointer(linkNamePtr))) + if errno != 0 { + return errno + } + return nil +} + +func readLink(fd int) (string, error) { + // Buffer sizing copied from os.Readlink. + for l := 128; ; l *= 2 { + b := make([]byte, l) + n, _, errno := syscall.Syscall6( + syscall.SYS_READLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(&NulByte)), // "" + uintptr(unsafe.Pointer(&b[0])), + uintptr(l), + 0, 0) + if errno != 0 { + return "", errno + } + if n < uintptr(l) { + return string(b[:n]), nil + } + } +} + +func unlinkAt(fd int, name string, dir bool) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + var flags uintptr + if dir { + flags = linux.AT_REMOVEDIR + } + _, _, errno := syscall.Syscall( + syscall.SYS_UNLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + flags, + ) + if errno != 0 { + return errno + } + return nil +} + +func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec { + if omit { + return syscall.Timespec{0, linux.UTIME_OMIT} + } + if setSysTime { + return syscall.Timespec{0, linux.UTIME_NOW} + } + return syscall.NsecToTimespec(t.Nanoseconds()) +} + +func setTimestamps(fd int, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + var sts [2]syscall.Timespec + sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime) + sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime) + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(&sts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} |