diff options
Diffstat (limited to 'pkg/sentry/fs/host')
-rw-r--r-- | pkg/sentry/fs/host/BUILD | 104 | ||||
-rw-r--r-- | pkg/sentry/fs/host/control.go | 90 | ||||
-rw-r--r-- | pkg/sentry/fs/host/descriptor.go | 118 | ||||
-rw-r--r-- | pkg/sentry/fs/host/descriptor_state.go | 29 | ||||
-rw-r--r-- | pkg/sentry/fs/host/device.go | 25 | ||||
-rw-r--r-- | pkg/sentry/fs/host/file.go | 371 | ||||
-rw-r--r-- | pkg/sentry/fs/host/fs.go | 327 | ||||
-rw-r--r-- | pkg/sentry/fs/host/fs_test.go | 383 | ||||
-rw-r--r-- | pkg/sentry/fs/host/inode.go | 506 | ||||
-rw-r--r-- | pkg/sentry/fs/host/inode_state.go | 79 | ||||
-rw-r--r-- | pkg/sentry/fs/host/inode_test.go | 112 | ||||
-rw-r--r-- | pkg/sentry/fs/host/ioctl_unsafe.go | 39 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket.go | 471 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_state.go | 39 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_test.go | 401 | ||||
-rw-r--r-- | pkg/sentry/fs/host/socket_unsafe.go | 82 | ||||
-rw-r--r-- | pkg/sentry/fs/host/util.go | 197 | ||||
-rw-r--r-- | pkg/sentry/fs/host/util_unsafe.go | 137 | ||||
-rw-r--r-- | pkg/sentry/fs/host/wait_test.go | 70 |
19 files changed, 3580 insertions, 0 deletions
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD new file mode 100644 index 000000000..97b64daed --- /dev/null +++ b/pkg/sentry/fs/host/BUILD @@ -0,0 +1,104 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "host_state", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "file.go", + "fs.go", + "inode.go", + "inode_state.go", + "socket.go", + "socket_state.go", + ], + out = "host_state.go", + package = "host", +) + +go_library( + name = "host", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "device.go", + "file.go", + "fs.go", + "host_state.go", + "inode.go", + "inode_state.go", + "ioctl_unsafe.go", + "socket.go", + "socket_state.go", + "socket_unsafe.go", + "util.go", + "util_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/rawfile", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "host_test", + size = "small", + srcs = [ + "fs_test.go", + "inode_test.go", + "socket_test.go", + "wait_test.go", + ], + embed = [":host"], + deps = [ + "//pkg/fd", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/socket", + "//pkg/sentry/usermem", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go new file mode 100644 index 000000000..d2b007ab2 --- /dev/null +++ b/pkg/sentry/fs/host/control.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRights { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles { + n := max + if l := len(c.fds); n > l { + n = l + } + + rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf +} + +// Clone implements unix.RightsControlMessage.Clone. +func (c *scmRights) Clone() unix.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements unix.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*fs.File { + files := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + // Set known flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: fileFlags&syscall.O_NONBLOCK != 0, + }) + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go new file mode 100644 index 000000000..613bd06e8 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// descriptor wraps a host fd. +type descriptor struct { + // donated is true if the host fd was donated by another process. + donated bool + + // If origFD >= 0, it is the host fd that this file was + // originally created from, which must be available at time + // of restore. Only valid if donated is true. + origFD int + + // wouldBlock is true if value (below) points to a file that can + // return EWOULDBLOCK for operations that would block. + wouldBlock bool + + // value is the wrapped host fd. It is never saved or restored + // directly. How it is restored depends on whether it was + // donated and the fs.MountSource it was originally + // opened/created from. + value int `state:"nosave"` +} + +// newDescriptor returns a wrapped host file descriptor. On success, +// the descriptor is registered for event notifications with queue. +func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { + ownedFD := fd + origFD := -1 + if saveable { + var err error + ownedFD, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + origFD = fd + } + if wouldBlock { + if err := syscall.SetNonblock(ownedFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil { + return nil, err + } + } + return &descriptor{ + donated: donated, + origFD: origFD, + wouldBlock: wouldBlock, + value: ownedFD, + }, nil +} + +// initAfterLoad initializes the value of the descriptor after Load. +func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error { + if d.donated { + var err error + d.value, err = syscall.Dup(d.origFD) + if err != nil { + return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) + } + } else { + name, ok := mo.inodeMappings[id] + if !ok { + return fmt.Errorf("failed to find path for inode number %d", id) + } + fullpath := path.Join(mo.root, name) + + var err error + d.value, err = open(nil, fullpath) + if err != nil { + return fmt.Errorf("failed to open %q: %v", fullpath, err) + } + } + if d.wouldBlock { + if err := syscall.SetNonblock(d.value, true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(d.value), queue); err != nil { + return err + } + } + return nil +} + +// Release releases all resources held by descriptor. +func (d *descriptor) Release() { + if d.wouldBlock { + fdnotifier.RemoveFD(int32(d.value)) + } + if err := syscall.Close(d.value); err != nil { + log.Warningf("error closing fd %d: %v", d.value, err) + } + d.value = -1 +} diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go new file mode 100644 index 000000000..7fb274451 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +// beforeSave is invoked by stateify. +func (d *descriptor) beforeSave() { + if d.donated && d.origFD < 0 { + panic("donated file descriptor cannot be saved") + } +} + +// afterLoad is invoked by stateify. +func (d *descriptor) afterLoad() { + // value must be manually restored by the descriptor's parent using + // initAfterLoad. + d.value = -1 +} diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go new file mode 100644 index 000000000..f2a0b6b15 --- /dev/null +++ b/pkg/sentry/fs/host/device.go @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// hostFileDevice is the host file virtual device. +var hostFileDevice = device.NewAnonMultiDevice() + +// hostPipeDevice is the host pipe virtual device. +var hostPipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go new file mode 100644 index 000000000..bdf844337 --- /dev/null +++ b/pkg/sentry/fs/host/file.go @@ -0,0 +1,371 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// fileOperations implements fs.FileOperations for a host file descriptor. +type fileOperations struct { + fsutil.NoopRelease `state:"nosave"` + + // iops are the Inode operations for this file. + iops *inodeOperations `state:"wait"` + + // a scratch buffer for reading directory entries. + dirinfo *dirInfo `state:"nosave"` + + // dirCursor is the directory cursor. + dirCursor string + + // allowIoctl determines whether ioctls should be passed through to the + // host. + allowIoctl bool +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile creates a new File backed by the provided host file descriptor. If +// NewFile succeeds, ownership of the fd is transferred to the returned File. +// +// The returned File cannot be saved, since there is no guarantee that the same +// fd will exist or represent the same file at time of restore. If such a +// guarantee does exist, use ImportFile instead. +func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, false, false) +} + +// ImportFile creates a new File backed by the provided host file descriptor. +// Unlike NewFile, the file descriptor used by the File is duped from fd to +// ensure that later changes to fd are not reflected by the fs.File. +// +// If the returned file is saved, it will be restored by re-importing the fd +// originally passed to ImportFile. It is the restorer's responsibility to +// ensure that the fd represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl) +} + +// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is +// saveable, then saveable is true. +func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) { + var s syscall.Stat_t + if err := syscall.Fstat(donated, &s); err != nil { + return nil, err + } + switch s.Mode & syscall.S_IFMT { + case syscall.S_IFSOCK: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + s, err := newSocket(ctx, donated, saveable) + if err != nil { + return nil, err + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags.NonBlocking, + }) + return s, nil + default: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) + inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) + if err != nil { + return nil, err + } + iops := inode.InodeOperations.(*inodeOperations) + + name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) + dirent := fs.NewDirent(inode, name) + defer dirent.DecRef() + + return newFile(ctx, dirent, flags, iops, allowIoctl), nil + } +} + +func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { + flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) + if errno != 0 { + log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno) + return fs.FileFlags{}, syscall.EIO + } + accmode := flags & syscall.O_ACCMODE + return fs.FileFlags{ + Direct: flags&syscall.O_DIRECT != 0, + NonBlocking: flags&syscall.O_NONBLOCK != 0, + Sync: flags&syscall.O_SYNC != 0, + Append: flags&syscall.O_APPEND != 0, + Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR, + Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR, + }, nil +} + +// newFile returns a new fs.File. +func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File { + if !iops.ReturnsWouldBlock() { + // Allow reading/writing at an arbitrary offset for files + // that support it. + flags.Pread = true + flags.Pwrite = true + } + return fs.NewFile(ctx, dirent, flags, &fileOperations{ + iops: iops, + allowIoctl: allowIoctl, + }) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.iops.fileState.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileOperations) EventUnregister(e *waiter.Entry) { + f.iops.fileState.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask) +} + +// Readdir implements fs.FileOperations.Readdir. +func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +} + +// IterateDir implements fs.DirIterator.IterateDir. +func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + if f.dirinfo == nil { + f.dirinfo = new(dirInfo) + f.dirinfo.buf = make([]byte, usermem.PageSize) + } + entries, err := f.iops.readdirAll(f.dirinfo) + if err != nil { + return offset, err + } + count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries)) + return offset + count, err +} + +// Write implements fs.FileOperations.Write. +func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that writes do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support writing at + // an arbitrary offset. + writer := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + } + return f.iops.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that reads do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support reading at + // an arbitrary offset. + reader := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + } + return f.iops.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + return syscall.Fsync(f.iops.fileState.FD()) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(context.Context, *fs.File) error { + // This is a no-op because flushing the resource backing this + // file would mean closing it. We can't do that because other + // open files may depend on the backing host fd. + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !canMap(file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. +func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} + +// Ioctl implements fs.FileOperations.Iocotl. +func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + if !f.allowIoctl { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: + fd := f.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case unix.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TCSETS, unix.TCSETSW: + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case unix.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + t := kernel.TaskFromContext(ctx) + if t == nil { + panic(fmt.Sprintf("cannot get thread group from context %v", ctx)) + } + tid := t.ThreadID() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + // Not much we can do with this one at the moment, so we just + // lie and pretend everything is great. Bash and Sh seem fine + // with this. + log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") + return 0, nil + + case unix.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + var winsize unix.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go new file mode 100644 index 000000000..ffd55a5ab --- /dev/null +++ b/pkg/sentry/fs/host/fs.go @@ -0,0 +1,327 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host implements an fs.Filesystem for files backed by host +// file descriptors. +package host + +import ( + "fmt" + "path" + "path/filepath" + "strconv" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FilesystemName is the name under which Filesystem is registered. +const FilesystemName = "whitelistfs" + +const ( + // whitelistKey is the mount option containing a comma-separated list + // of host paths to whitelist. + whitelistKey = "whitelist" + + // rootPathKey is the mount option containing the root path of the + // mount. + rootPathKey = "root" + + // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership. + dontTranslateOwnershipKey = "dont_translate_ownership" +) + +// maxTraversals determines link traversals in building the whitelist. +const maxTraversals = 10 + +// Filesystem is a pseudo file system that is only available during the setup +// to lock down the configurations. This filesystem should only be mounted at root. +// +// Think twice before exposing this to applications. +type Filesystem struct { + // whitelist is a set of host paths to whitelist. + paths []string +} + +// Name is the identifier of this file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*Filesystem) AllowUserMount() bool { + return false +} + +// Flags returns that there is nothing special about this file system. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns an fs.Inode exposing the host file system. It is intended to be locked +// down in PreExec below. +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // Parse generic comma-separated key=value options. + options := fs.GenericMountSourceOptions(data) + + // Grab the whitelist if one was specified. + // TODO: require another option "testonly" in order to allow + // no whitelist. + if wl, ok := options[whitelistKey]; ok { + f.paths = strings.Split(wl, "|") + delete(options, whitelistKey) + } + + // If the rootPath was set, use it. Othewise default to the root of the + // host fs. + rootPath := "/" + if rp, ok := options[rootPathKey]; ok { + rootPath = rp + delete(options, rootPathKey) + + // We must relativize the whitelisted paths to the new root. + for i, p := range f.paths { + rel, err := filepath.Rel(rootPath, p) + if err != nil { + return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath) + } + f.paths[i] = path.Join("/", rel) + } + } + fd, err := open(nil, rootPath) + if err != nil { + return nil, fmt.Errorf("failed to find root: %v", err) + } + + var dontTranslateOwnership bool + if v, ok := options[dontTranslateOwnershipKey]; ok { + b, err := strconv.ParseBool(v) + if err != nil { + return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err) + } + dontTranslateOwnership = b + delete(options, dontTranslateOwnershipKey) + } + + // Fail if the caller passed us more options than we know about. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that we own. + owner := fs.FileOwnerFromContext(ctx) + + // Construct the host file system mount and inode. + msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership) + return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */) +} + +// InstallWhitelist locks down the MountNamespace to only the currently installed +// Dirents and the given paths. +func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error { + return installWhitelist(ctx, m, f.paths) +} + +func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error { + if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") { + // Warning will be logged during filter installation if the empty + // whitelist matters (allows for host file access). + return nil + } + + // Done tracks entries already added. + done := make(map[string]bool) + root := m.Root() + defer root.DecRef() + + for i := 0; i < len(paths); i++ { + // Make sure the path is absolute. This is a sanity check. + if !path.IsAbs(paths[i]) { + return fmt.Errorf("path %q is not absolute", paths[i]) + } + + // We need to add all the intermediate paths, in case one of + // them is a symlink that needs to be resolved. + for j := 1; j <= len(paths[i]); j++ { + if j < len(paths[i]) && paths[i][j] != '/' { + continue + } + current := paths[i][:j] + + // Lookup the given component in the tree. + d, err := m.FindLink(ctx, root, nil, current, maxTraversals) + if err != nil { + log.Warningf("populate failed for %q: %v", current, err) + continue + } + + // It's critical that this DecRef happens after the + // freeze below. This ensures that the dentry is in + // place to be frozen. Otherwise, we freeze without + // these entries. + defer d.DecRef() + + // Expand the last component if necessary. + if current == paths[i] { + // Is it a directory or symlink? + sattr := d.Inode.StableAttr + if fs.IsDir(sattr) { + for name := range childDentAttrs(ctx, d) { + paths = append(paths, path.Join(current, name)) + } + } + if fs.IsSymlink(sattr) { + // Only expand symlinks once. The + // folder structure may contain + // recursive symlinks and we don't want + // to end up infinitely expanding this + // symlink. This is safe because this + // is the last component. If a later + // path wants to symlink something + // beneath this symlink that will still + // be handled by the FindLink above. + if done[current] { + continue + } + + s, err := d.Inode.Readlink(ctx) + if err != nil { + log.Warningf("readlink failed for %q: %v", current, err) + continue + } + if path.IsAbs(s) { + paths = append(paths, s) + } else { + target := path.Join(path.Dir(current), s) + paths = append(paths, target) + } + } + } + + // Only report this one once even though we may look + // it up more than once. If we whitelist /a/b,/a then + // /a will be "done" when it is looked up for /a/b, + // however we still need to expand all of its contents + // when whitelisting /a. + if !done[current] { + log.Debugf("whitelisted: %s", current) + } + done[current] = true + } + } + + // Freeze the mount tree in place. This prevents any new paths from + // being opened and any old ones from being removed. If we do provide + // tmpfs mounts, we'll want to freeze/thaw those separately. + m.Freeze() + return nil +} + +func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr { + dirname, _ := d.FullName(nil /* root */) + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + log.Warningf("failed to open directory %q: %v", dirname, err) + return nil + } + dir.DecRef() + var stubSerializer fs.CollectEntriesSerializer + if err := dir.Readdir(ctx, &stubSerializer); err != nil { + log.Warningf("failed to iterate on host directory %q: %v", dirname, err) + return nil + } + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries +} + +// newMountSource constructs a new host fs.MountSource +// relative to a root path. The root should match the mount point. +func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource { + return fs.NewMountSource(&superOperations{ + root: root, + inodeMappings: make(map[uint64]string), + mounter: mounter, + dontTranslateOwnership: dontTranslateOwnership, + }, filesystem, flags) +} + +// superOperations implements fs.MountSourceOperations. +type superOperations struct { + fs.SimpleMountSourceOperations `state:"nosave"` + + // root is the path of the mount point. All inode mappings + // are relative to this root. + root string + + // inodeMappings contains mappings of fs.Inodes associated + // with this MountSource to paths under root. + inodeMappings map[uint64]string + + // mounter is the cached EUID/EGID that mounted this file system. + mounter fs.FileOwner + + // dontTranslateOwnership indicates whether to not translate file + // ownership. + // + // By default, files/directories owned by the sandbox uses UID/GID + // of the mounter. For files/directories that are not owned by the + // sandbox, file UID/GID is translated to a UID/GID which cannot + // be mapped in the sandboxed application's user namespace. The + // UID/GID will look like the nobody UID/GID (65534) but is not + // strictly owned by the user "nobody". + // + // If whitelistfs is a lower filesystem in an overlay, set + // dont_translate_ownership=true in mount options. + dontTranslateOwnership bool +} + +var _ fs.MountSourceOperations = (*superOperations)(nil) + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (m *superOperations) ResetInodeMappings() { + m.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + m.inodeMappings[sattr.InodeID] = path +} + +// Keep implements fs.MountSourceOperations.Keep. +// +// TODO: It is possible to change the permissions on a +// host file while it is in the dirent cache (say from RO to RW), but it is not +// possible to re-open the file with more relaxed permissions, since the host +// FD is already open and stored in the inode. +// +// Using the dirent LRU cache increases the odds that this bug is encountered. +// Since host file access is relatively fast anyways, we disable the LRU cache +// for host fs files. Once we can properly deal with permissions changes and +// re-opening host files, we should revisit whether or not to make use of the +// LRU cache. +func (*superOperations) Keep(*fs.Dirent) bool { + return false +} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go new file mode 100644 index 000000000..c000afc49 --- /dev/null +++ b/pkg/sentry/fs/host/fs_test.go @@ -0,0 +1,383 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "reflect" + "sort" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// newTestMountNamespace creates a MountNamespace with a ramfs root. +// It returns the host folder created, which should be removed when done. +func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) { + p, err := ioutil.TempDir("", "root") + if err != nil { + return nil, "", err + } + + fd, err := open(nil, p) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + ctx := contexttest.Context(t) + root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + mm, err := fs.NewMountNamespace(ctx, root) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + return mm, p, nil +} + +// createTestDirs populates the root with some test files and directories. +// /a/a1.txt +// /a/a2.txt +// /b/b1.txt +// /b/c/c1.txt +// /symlinks/normal.txt +// /symlinks/to_normal.txt -> /symlinks/normal.txt +// /symlinks/recursive -> /symlinks +func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error { + r := m.Root() + defer r.DecRef() + + if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + a, err := r.Walk(ctx, r, "a") + if err != nil { + return err + } + defer a.DecRef() + + a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a1.DecRef() + + a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a2.DecRef() + + if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + b, err := r.Walk(ctx, r, "b") + if err != nil { + return err + } + defer b.DecRef() + + b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + b1.DecRef() + + if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + c, err := b.Walk(ctx, r, "c") + if err != nil { + return err + } + defer c.DecRef() + + c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + c1.DecRef() + + if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + symlinks, err := r.Walk(ctx, r, "symlinks") + if err != nil { + return err + } + defer symlinks.DecRef() + + normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + normal.DecRef() + + if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil { + return err + } + + if err := symlinks.CreateLink(ctx, r, "/symlinks", "recursive"); err != nil { + return err + } + + return nil +} + +// allPaths returns a slice of all paths of entries visible in the rootfs. +func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) { + var paths []string + root := m.Root() + defer root.DecRef() + + d, err := m.FindLink(ctx, root, nil, base, 1) + if err != nil { + t.Logf("FindLink failed for %q", base) + return paths, err + } + defer d.DecRef() + + if fs.IsDir(d.Inode.StableAttr) { + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return nil, fmt.Errorf("failed to open directory %q: %v", base, err) + } + iter, ok := dir.FileOperations.(fs.DirIterator) + if !ok { + return nil, fmt.Errorf("cannot directly iterate on host directory %q", base) + } + dirCtx := &fs.DirCtx{ + Serializer: noopDentrySerializer{}, + } + if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil { + return nil, err + } + for name := range dirCtx.DentAttrs() { + if name == "." || name == ".." { + continue + } + + fullName := path.Join(base, name) + paths = append(paths, fullName) + + // Recurse. + subpaths, err := allPaths(ctx, t, m, fullName) + if err != nil { + return paths, err + } + paths = append(paths, subpaths...) + } + } + + return paths, nil +} + +type noopDentrySerializer struct{} + +func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error { + return nil +} +func (noopDentrySerializer) Written() int { + return 4096 +} + +// pathsEqual returns true if the two string slices contain the same entries. +func pathsEqual(got, want []string) bool { + sort.Strings(got) + sort.Strings(want) + + if len(got) != len(want) { + return false + } + + for i := range got { + if got[i] != want[i] { + return false + } + } + + return true +} + +func TestWhitelist(t *testing.T) { + for _, test := range []struct { + // description of the test. + desc string + // paths are the paths to whitelist + paths []string + // want are all of the directory entries that should be + // visible (nothing beyond this set should be visible). + want []string + }{ + { + desc: "root", + paths: []string{"/"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"}, + }, + { + desc: "top-level directories", + paths: []string{"/a", "/b"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (1/2)", + paths: []string{"/b", "/b/c"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (2/2)", + paths: []string{"/b/c", "/b"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file", + paths: []string{"/b/c/c1.txt"}, + want: []string{"/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file and directory", + paths: []string{"/a/a1.txt", "/b/c"}, + want: []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "symlink", + paths: []string{"/symlinks/to_normal.txt"}, + want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"}, + }, + { + desc: "recursive symlink", + paths: []string{"/symlinks/recursive/normal.txt"}, + want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"}, + }, + } { + t.Run(test.desc, func(t *testing.T) { + m, p, err := newTestMountNamespace(t) + if err != nil { + t.Errorf("Failed to create MountNamespace: %v", err) + } + defer os.RemoveAll(p) + + ctx := withRoot(contexttest.RootContext(t), m.Root()) + if err := createTestDirs(ctx, t, m); err != nil { + t.Errorf("Failed to create test dirs: %v", err) + } + + if err := installWhitelist(ctx, m, test.paths); err != nil { + t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err) + } + + got, err := allPaths(ctx, t, m, "/") + if err != nil { + t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err) + } + + if !pathsEqual(got, test.want) { + t.Errorf("For paths %v got %v want %v", test.paths, got, test.want) + } + }) + } +} + +func TestRootPath(t *testing.T) { + // Create a temp dir, which will be the root of our mounted fs. + rootPath, err := ioutil.TempDir(os.TempDir(), "root") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + defer os.RemoveAll(rootPath) + + // Create two files inside the new root, one which will be whitelisted + // and one not. + whitelisted, err := ioutil.TempFile(rootPath, "white") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + if _, err := ioutil.TempFile(rootPath, "black"); err != nil { + t.Fatalf("TempFile failed: %v", err) + } + + // Create a mount with a root path and single whitelisted file. + hostFS := &Filesystem{} + ctx := contexttest.Context(t) + data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) + inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data) + if err != nil { + t.Fatalf("Mount failed: %v", err) + } + mm, err := fs.NewMountNamespace(ctx, inode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + if err := hostFS.InstallWhitelist(ctx, mm); err != nil { + t.Fatalf("InstallWhitelist failed: %v", err) + } + + // Get the contents of the root directory. + rootDir := mm.Root() + rctx := withRoot(ctx, rootDir) + f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{}) + if err != nil { + t.Fatalf("GetFile failed: %v", err) + } + c := &fs.CollectEntriesSerializer{} + if err := f.Readdir(rctx, c); err != nil { + t.Fatalf("Readdir failed: %v", err) + } + + // We should have only our whitelisted file, plus the dots. + want := []string{path.Base(whitelisted.Name()), ".", ".."} + got := c.Order + sort.Strings(want) + sort.Strings(got) + if !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got %v, wanted %v", got, want) + } +} + +type rootContext struct { + context.Context + root *fs.Dirent +} + +// withRoot returns a copy of ctx with the given root. +func withRoot(ctx context.Context, root *fs.Dirent) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case fs.CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go new file mode 100644 index 000000000..226bc5164 --- /dev/null +++ b/pkg/sentry/fs/host/inode.go @@ -0,0 +1,506 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// inodeOperations implements fs.InodeOperations for an fs.Inodes backed +// by a host file descriptor. +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachedInodeOps implements memmap.Mappable. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects the file offset on the host FD. This is needed + // for readdir because getdents must use the kernel offset, so + // concurrent readdirs must be exclusive. + // + // All read/write functions pass the offset directly to the kernel and + // thus don't need a lock. + readdirMu sync.Mutex `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +type inodeFileState struct { + // Common file system state. + mops *superOperations `state:"wait"` + + // descriptor is the backing host fd. + descriptor *descriptor `state:"wait"` + + // Event queue for blocking operations. + queue waiter.Queue `state:"nosave"` + + // sattr is used to restore the inodeOperations. + sattr fs.StableAttr `state:"wait"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + // TODO: Using safemem.FromIOReader here is wasteful for two + // reasons: + // + // - Using preadv instead of iterated preads saves on host system calls. + // + // - Host system calls can handle destination memory that would fault in + // gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true), + // so the buffering performed by FromIOReader is unnecessary. + // + // This also applies to the write path below. + return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + if mask.UID || mask.GID { + return syserror.EPERM + } + if mask.Perms { + if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { + return err + } + } + if mask.Size { + if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil { + return err + } + } + if mask.AccessTime || mask.ModificationTime { + ts := fs.TimeSpec{ + ATime: attr.AccessTime, + ATimeOmit: !mask.AccessTime, + MTime: attr.ModificationTime, + MTimeOmit: !mask.ModificationTime, + } + if err := setTimestamps(i.FD(), ts); err != nil { + return err + } + } + return nil +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + return syscall.Fsync(i.FD()) +} + +// FD implements fsutil.CachedFileObject.FD. +func (i *inodeFileState) FD() int { + return i.descriptor.value +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + return fs.UnstableAttr{}, err + } + return unstableAttr(i.mops, &s), nil +} + +// inodeOperations implements fs.InodeOperations. +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// newInode returns a new fs.Inode backed by the host fd. +func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { + // Retrieve metadata. + var s syscall.Stat_t + err := syscall.Fstat(fd, &s) + if err != nil { + return nil, err + } + + fileState := &inodeFileState{ + mops: msrc.MountSourceOperations.(*superOperations), + sattr: stableAttr(&s), + } + + // Initialize the wrapped host file descriptor. + fileState.descriptor, err = newDescriptor( + fd, + donated, + saveable, + wouldBlock(&s), + &fileState.queue, + ) + if err != nil { + return nil, err + } + + // Build the fs.InodeOperations. + uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s) + iops := &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache), + } + + // Return the fs.Inode. + return fs.NewInode(iops, msrc, fileState.sattr), nil +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if !canMap(inode) { + return nil + } + return i.cachingInodeOps +} + +// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK +// for operations that would block. +func (i *inodeOperations) ReturnsWouldBlock() bool { + return i.fileState.descriptor.wouldBlock +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(context.Context) { + i.fileState.descriptor.Release() + i.cachingInodeOps.Release() +} + +// Lookup implements fs.InodeOperations.Lookup. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Get a new fd relative to i at name. + fd, err := open(i, name) + if err != nil { + if err == syserror.ENOENT { + return nil, syserror.ENOENT + } + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + // Return the fs.Dirent. + return fs.NewDirent(inode, name), nil +} + +// Create implements fs.InodeOperations.Create. +func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create a file relative to i at name. + // + // N.B. We always open this file O_RDWR regardless of flags because a + // future GetFile might want more access. Open allows this regardless + // of perm. + fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode()) + if err != nil { + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + d := fs.NewDirent(inode, name) + defer d.DecRef() + return inode.GetFile(ctx, d, flags) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode())) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + return createLink(i.fileState.FD(), oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.EPERM +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// Remove implements fs.InodeOperations.Remove. +func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, false /* dir */) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, true /* dir */) +} + +// Rename implements fs.InodeOperations.Rename. +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + op, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + np, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName) +} + +// Bind implements fs.InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return newFile(ctx, d, flags, i, false), nil +} + +// canMap returns true if this fs.Inode can be memory mapped. +func canMap(inode *fs.Inode) bool { + // FIXME: Some obscure character devices can be mapped. + return fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + // When the kernel supports mapping host FDs, we do so to take + // advantage of the host page cache. We forego updating fs.Inodes + // because the host manages consistency of its own inode structures. + // + // For fs.Inodes that can never be mapped we take advantage of + // synchronizing metadata updates through host caches. + // + // So can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just obtain the attributes. + return i.fileState.unstableAttr(ctx) + } + // No, we're maintaining consistency of metadata ourselves. + return i.cachingInodeOps.UnstableAttr(ctx, inode) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EPERM +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetPermissions(ctx, inode, f) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return setTimestamps(i.fileState.FD(), ts) + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just change the file size on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Ftruncate(i.fileState.FD(), size) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Truncate(ctx, inode, size) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + // Have we been using host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then the metadata is already up to date on the host. + return nil + } + // Otherwise we need to write out cached pages and attributes + // that are dirty. + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + return readLink(i.fileState.FD()) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} + +// AddLink implements fs.InodeOperations.AddLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} + +// readdirAll returns all of the directory entries in i. +func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + + fd := i.fileState.FD() + + // syscall.ReadDirent will use getdents, which will seek the file past + // the last directory entry. To read the directory entries a second + // time, we need to seek back to the beginning. + if _, err := syscall.Seek(fd, 0, 0); err != nil { + if err == syscall.ESPIPE { + // All directories should be seekable. If this file + // isn't seekable, it is not a directory and we should + // return that more sane error. + err = syscall.ENOTDIR + } + return nil, err + } + + names := make([]string, 0, 100) + for { + // Refill the buffer if necessary + if d.bufp >= d.nbuf { + d.bufp = 0 + // ReadDirent will just do a sys_getdents64 to the kernel. + n, err := syscall.ReadDirent(fd, d.buf) + if err != nil { + return nil, err + } + if n == 0 { + break // EOF + } + d.nbuf = n + } + + var nb int + // Parse the dirent buffer we just get and return the directory names along + // with the number of bytes consumed in the buffer. + nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names) + d.bufp += nb + } + + entries := make(map[string]fs.DentAttr) + for _, filename := range names { + // Lookup the type and host device and inode. + stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW) + if lerr == syscall.ENOENT { + // File disappeared between readdir and lstat. + // Just treat it as if it didn't exist. + continue + } + + // There was a serious problem, we should probably report it. + if lerr != nil { + return nil, lerr + } + + entries[filename] = fs.DentAttr{ + Type: nodeType(&stat), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: stat.Dev, + Inode: stat.Ino, + }), + } + } + return entries, nil +} diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go new file mode 100644 index 000000000..80066512a --- /dev/null +++ b/pkg/sentry/fs/host/inode_state.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// beforeSave is invoked by stateify. +func (i *inodeFileState) beforeSave() { + if !i.queue.IsEmpty() { + panic("event queue must be empty") + } + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + uattr, err := i.unstableAttr(context.Background()) + if err != nil { + panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)) + } + i.savedUAttr = &uattr + } +} + +// afterLoad is invoked by stateify. +func (i *inodeFileState) afterLoad() { + // Initialize the descriptor value. + if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil { + panic(fmt.Sprintf("failed to load value of descriptor: %v", err)) + } + + // Remap the inode number. + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err)) + } + key := device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + } + if !hostFileDevice.Load(key, i.sattr.InodeID) { + // This means there was a conflict at s.Dev and s.Ino with + // another inode mapping: two files that were unique on the + // saved filesystem are no longer unique on this filesystem. + // Since this violates the contract that filesystems cannot + // change across save and restore, error out. + panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice)) + } + + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstableAttr(i.mops, &s) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + } + i.savedUAttr = nil + } +} diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go new file mode 100644 index 000000000..0ff87c418 --- /dev/null +++ b/pkg/sentry/fs/host/inode_test.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "io/ioutil" + "os" + "path" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// TestMultipleReaddir verifies that multiple Readdir calls return the same +// thing if they use different dir contexts. +func TestMultipleReaddir(t *testing.T) { + p, err := ioutil.TempDir("", "readdir") + if err != nil { + t.Fatalf("Failed to create test dir: %v", err) + } + defer os.RemoveAll(p) + + f, err := os.Create(path.Join(p, "a.txt")) + if err != nil { + t.Fatalf("Failed to create a.txt: %v", err) + } + f.Close() + + f, err = os.Create(path.Join(p, "b.txt")) + if err != nil { + t.Fatalf("Failed to create b.txt: %v", err) + } + f.Close() + + fd, err := open(nil, p) + if err != nil { + t.Fatalf("Failed to open %q: %v", p, err) + } + ctx := contexttest.Context(t) + n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + t.Fatalf("Failed to create inode: %v", err) + } + + dirent := fs.NewDirent(n, "readdir") + openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("Failed to get file: %v", err) + } + defer openFile.DecRef() + + c1 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil { + t.Fatalf("First Readdir failed: %v", err) + } + + c2 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil { + t.Errorf("Second Readdir failed: %v", err) + } + + if _, ok := c1.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs()) + } + if _, ok := c1.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs()) + } + + if _, ok := c2.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs()) + } + if _, ok := c2.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs()) + } +} + +// TestCloseFD verifies fds will be closed. +func TestCloseFD(t *testing.T) { + var p [2]int + if err := syscall.Pipe(p[0:]); err != nil { + t.Fatalf("Failed to create pipe %v", err) + } + defer syscall.Close(p[0]) + defer syscall.Close(p[1]) + + // Use the write-end because we will detect if it's closed on the read end. + ctx := contexttest.Context(t) + file, err := NewFile(ctx, p[1], fs.RootOwner) + if err != nil { + t.Fatalf("Failed to create File: %v", err) + } + file.DecRef() + + s := make([]byte, 10) + if c, err := syscall.Read(p[0], s); c != 0 || err != nil { + t.Errorf("want 0, nil (EOF) from read end, got %v, %v", c, err) + } +} diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go new file mode 100644 index 000000000..3c07c3850 --- /dev/null +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go new file mode 100644 index 000000000..8e36ed7ee --- /dev/null +++ b/pkg/sentry/fs/host/socket.go @@ -0,0 +1,471 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// endpoint encapsulates the state needed to represent a host Unix socket. +type endpoint struct { + queue waiter.Queue `state:"nosave"` + + // stype is the type of Unix socket. (Ex: unix.SockStream, + // unix.SockSeqpacket, unix.SockDgram) + stype unix.SockType `state:"nosave"` + + // fd is the host fd backing this file. + fd int `state:"nosave"` + + // If srfd >= 0, it is the host fd that fd was imported from. + srfd int `state:"wait"` +} + +func (e *endpoint) init() error { + family, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return err + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return syserror.EINVAL + } + + stype, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return err + } + + if err := syscall.SetNonblock(e.fd, true); err != nil { + return err + } + + e.stype = unix.SockType(stype) + if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil { + return err + } + return nil +} + +// newEndpoint creates a new host endpoint. +func newEndpoint(fd int, srfd int) (*endpoint, error) { + ep := &endpoint{fd: fd, srfd: srfd} + if err := ep.init(); err != nil { + return nil, err + } + return ep, nil +} + +// newSocket allocates a new unix socket with host endpoint. +func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) { + ownedfd := fd + srfd := -1 + if saveable { + var err error + ownedfd, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + srfd = fd + } + ep, err := newEndpoint(ownedfd, srfd) + if err != nil { + if saveable { + syscall.Close(ownedfd) + } + return nil, err + } + return unixsocket.New(ctx, ep), nil +} + +// NewSocketWithDirent allocates a new unix socket with host endpoint. +// +// This is currently only used by unsaveable Gofer nodes. +// +// NewSocketWithDirent takes ownership of f on success. +func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { + ep, err := newEndpoint(f.FD(), -1) + if err != nil { + return nil, err + } + + // Take ownship of the FD. + f.Release() + + return unixsocket.NewWithDirent(ctx, d, ep, flags), nil +} + +// Close implements unix.Endpoint.Close. +func (e *endpoint) Close() { + fdnotifier.RemoveFD(int32(e.fd)) + syscall.Close(e.fd) + e.fd = -1 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { + e.queue.EventRegister(we, mask) + fdnotifier.UpdateFD(int32(e.fd)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *endpoint) EventUnregister(we *waiter.Entry) { + e.queue.EventUnregister(we) + fdnotifier.UpdateFD(int32(e.fd)) +} + +// Readiness implements unix.Endpoint.Readiness. +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(e.fd), mask) +} + +// Type implements unix.Endpoint.Type. +func (e *endpoint) Type() unix.SockType { + return e.stype +} + +// Connect implements unix.Endpoint.Connect. +func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Bind implements unix.Endpoint.Bind. +func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Listen implements unix.Endpoint.Listen. +func (e *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Accept implements unix.Endpoint.Accept. +func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrInvalidEndpointState +} + +// Shutdown implements unix.Endpoint.Shutdown. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// GetSockOpt implements unix.Endpoint.GetSockOpt. +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + _, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) + return translateError(err) + case *tcpip.PasscredOption: + // We don't support passcred on host sockets. + *o = 0 + return nil + case *tcpip.SendBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + *o = tcpip.SendBufferSizeOption(v) + return translateError(err) + case *tcpip.ReceiveBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) + *o = tcpip.ReceiveBufferSizeOption(v) + return translateError(err) + case *tcpip.ReuseAddressOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) + *o = tcpip.ReuseAddressOption(v) + return translateError(err) + case *tcpip.ReceiveQueueSizeOption: + return tcpip.ErrQueueSizeNotSupported + } + return tcpip.ErrInvalidEndpointState +} + +// SetSockOpt implements unix.Endpoint.SetSockOpt. +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetLocalAddress implements unix.Endpoint.GetLocalAddress. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// Passcred returns whether or not the SO_PASSCRED socket option is +// enabled on this end. +func (e *endpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// ConnectedPasscred returns whether or not the SO_PASSCRED socket option +// is enabled on the connected end. +func (e *endpoint) ConnectedPasscred() bool { + // We don't support credential passing for host sockets. + return false +} + +// SendMsg implements unix.Endpoint.SendMsg. +func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) { + if to != nil { + return 0, tcpip.ErrInvalidEndpointState + } + return sendMsg(e.fd, data, controlMessages) +} + +func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) { + if !controlMessages.Empty() { + return 0, tcpip.ErrInvalidEndpointState + } + n, err := fdWriteVec(fd, data) + return n, translateError(err) +} + +// RecvMsg implements unix.Endpoint.RecvMsg. +func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { + return recvMsg(e.fd, data, numRights, peek, addr) +} + +func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek) + if err == syscall.EAGAIN { + return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock + } + if err != nil { + return 0, 0, unix.ControlMessages{}, translateError(err) + } + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. + if len(cm) == 0 { + return rl, ml, unix.ControlMessages{}, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, unix.ControlMessages{}, translateError(err) + } + + if len(fds) == 0 { + return rl, ml, unix.ControlMessages{}, nil + } + return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil +} + +// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint +// backed by a host FD that will pretend to be bound at a given sentry path. +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) { + if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil { + return nil, nil, translateError(err) + } + + e := &connectedEndpoint{path: path, queue: queue, file: file} + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + + return e, e, nil +} + +// connectedEndpoint is a host FD backed implementation of +// unix.ConnectedEndpoint and unix.Receiver. +// +// connectedEndpoint does not support save/restore for now. +type connectedEndpoint struct { + queue *waiter.Queue + path string + + // ref keeps track of references to a connectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd, readClosed and writeClosed. + mu sync.RWMutex + + // file is an *fd.FD containing the FD backing this endpoint. It must be + // set to nil if it has been closed. + file *fd.FD + + // readClosed is true if the FD has read shutdown or if it has been closed. + readClosed bool + + // writeClosed is true if the FD has write shutdown or if it has been + // closed. + writeClosed bool +} + +// Send implements unix.ConnectedEndpoint.Send. +func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return 0, false, tcpip.ErrClosedForSend + } + n, err := sendMsg(c.file.FD(), data, controlMessages) + // There is no need for the callee to call SendNotify because sendMsg uses + // the host's sendmsg(2) and the host kernel's queue. + return n, false, err +} + +// SendNotify implements unix.ConnectedEndpoint.SendNotify. +func (c *connectedEndpoint) SendNotify() {} + +// CloseSend implements unix.ConnectedEndpoint.CloseSend. +func (c *connectedEndpoint) CloseSend() { + c.mu.Lock() + c.writeClosed = true + c.mu.Unlock() +} + +// CloseNotify implements unix.ConnectedEndpoint.CloseNotify. +func (c *connectedEndpoint) CloseNotify() {} + +// Writable implements unix.ConnectedEndpoint.Writable. +func (c *connectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements unix.ConnectedEndpoint.Passcred. +func (c *connectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. +func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil +} + +// EventUpdate implements unix.ConnectedEndpoint.EventUpdate. +func (c *connectedEndpoint) EventUpdate() { + c.mu.RLock() + defer c.mu.RUnlock() + if c.file.FD() != -1 { + fdnotifier.UpdateFD(int32(c.file.FD())) + } +} + +// Recv implements unix.Receiver.Recv. +func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive + } + rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil) + // There is no need for the callee to call RecvNotify because recvMsg uses + // the host's recvmsg(2) and the host kernel's queue. + return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err +} + +// close releases all resources related to the endpoint. +func (c *connectedEndpoint) close() { + fdnotifier.RemoveFD(int32(c.file.FD())) + c.file.Close() + c.file = nil +} + +// RecvNotify implements unix.Receiver.RecvNotify. +func (c *connectedEndpoint) RecvNotify() {} + +// CloseRecv implements unix.Receiver.CloseRecv. +func (c *connectedEndpoint) CloseRecv() { + c.mu.Lock() + c.readClosed = true + c.mu.Unlock() +} + +// Readable implements unix.Receiver.Readable. +func (c *connectedEndpoint) Readable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0 +} + +// SendQueuedSize implements unix.Receiver.SendQueuedSize. +func (c *connectedEndpoint) SendQueuedSize() int64 { + // SendQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// RecvQueuedSize implements unix.Receiver.RecvQueuedSize. +func (c *connectedEndpoint) RecvQueuedSize() int64 { + // RecvQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. +func (c *connectedEndpoint) SendMaxQueueSize() int64 { + v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return -1 + } + return int64(v) +} + +// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. +func (c *connectedEndpoint) RecvMaxQueueSize() int64 { + v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF) + if err != nil { + return -1 + } + return int64(v) +} + +// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. +func (c *connectedEndpoint) Release() { + c.ref.DecRefWithDestructor(c.close) +} + +func translateError(err error) *tcpip.Error { + if err == nil { + return nil + } + return rawfile.TranslateErrno(err.(syscall.Errno)) +} diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go new file mode 100644 index 000000000..6acabd55a --- /dev/null +++ b/pkg/sentry/fs/host/socket_state.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" +) + +// beforeSave is invoked by stateify. +func (ep *endpoint) beforeSave() { + if ep.srfd < 0 { + panic("only host file descriptors provided at sentry startup can be saved") + } +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + fd, err := syscall.Dup(ep.srfd) + if err != nil { + panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err)) + } + ep.fd = fd + if err := ep.init(); err != nil { + panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err)) + } +} diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go new file mode 100644 index 000000000..80c46dcfa --- /dev/null +++ b/pkg/sentry/fs/host/socket_test.go @@ -0,0 +1,401 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "reflect" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +var ( + // Make sure that connectedEndpoint implements unix.ConnectedEndpoint. + _ = unix.ConnectedEndpoint(new(connectedEndpoint)) + + // Make sure that connectedEndpoint implements unix.Receiver. + _ = unix.Receiver(new(connectedEndpoint)) +) + +func getFl(fd int) (uint32, error) { + fl, _, err := syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if err == 0 { + return uint32(fl), nil + } + return 0, err +} + +func TestSocketIsBlocking(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + + fl, err := getFl(pair[0]) + if err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[1]) + } + sock, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) failed => %v", pair[0], err) + } + defer sock.DecRef() + // Test that the socket now is non blocking. + if fl, err = getFl(pair[0]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK { + t.Errorf("Expected socket %v to have becoming non blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Errorf("Did not expect socket %v to become non blocking", pair[1]) + } +} + +func TestSocketWritev(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + buf := []byte("hello world\n") + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != int64(len(buf)) { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketWritevLen0(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != 0 { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketSendMsgLen0(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + sfile, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile.DecRef() + + s := sfile.FileOperations.(socket.Socket) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{}) + if n != 0 { + t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) + } + + if terr != nil { + t.Fatalf("socket sendmsg() failed: %v", terr) + } +} + +func TestListen(t *testing.T) { + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile1.DecRef() + socket1 := sfile1.FileOperations.(socket.Socket) + + sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[1], err) + } + defer sfile2.DecRef() + socket2 := sfile2.FileOperations.(socket.Socket) + + // Socketpairs can not be listened to. + if err := socket1.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket1.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + if err := socket2.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket2.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + + // Create a Unix socket, do not bind it. + sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile3, err := newSocket(contexttest.Context(t), sock, false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", sock, err) + } + defer sfile3.DecRef() + socket3 := sfile3.FileOperations.(socket.Socket) + + // This socket is not bound so we can't listen on it. + if err := socket3.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket3.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } +} + +func TestSend(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { + t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) + } +} + +func TestRecv(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive { + t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive) + } +} + +func TestPasscred(t *testing.T) { + e := connectedEndpoint{} + if got, want := e.Passcred(), false; got != want { + t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) + } +} + +func TestGetLocalAddress(t *testing.T) { + e := connectedEndpoint{path: "foo"} + want := tcpip.FullAddress{Addr: tcpip.Address("foo")} + if got, err := e.GetLocalAddress(); err != nil || got != want { + t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) + } +} + +func TestQueuedSize(t *testing.T) { + e := connectedEndpoint{} + tests := []struct { + name string + f func() int64 + }{ + {"SendQueuedSize", e.SendQueuedSize}, + {"RecvQueuedSize", e.RecvQueuedSize}, + } + + for _, test := range tests { + if got, want := test.f(), int64(-1); got != want { + t.Errorf("Got %#v.%s() = %d, want = %d", e, test.name, got, want) + } + } +} + +func TestReadable(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if got, want := e.Readable(), true; got != want { + t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want) + } +} + +func TestWritable(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if got, want := e.Writable(), true; got != want { + t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want) + } +} + +func TestRelease(t *testing.T) { + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + want := &connectedEndpoint{queue: c.queue} + want.ref.DecRef() + fdnotifier.AddFD(int32(c.file.FD()), nil) + c.Release() + if !reflect.DeepEqual(c, want) { + t.Errorf("got = %#v, want = %#v", c, want) + } +} + +func TestClose(t *testing.T) { + type testCase struct { + name string + cep *connectedEndpoint + addFD bool + f func() + want *connectedEndpoint + } + + var tests []testCase + + // nil is the value used by connectedEndpoint to indicate a closed file. + // Non-nil files are used to check if the file gets closed. + + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "Second CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = append(tests, testCase{ + name: "Second CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = append(tests, testCase{ + name: "CloseSend then CloseRecv", + cep: c, + addFD: true, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "CloseRecv then CloseSend", + cep: c, + addFD: true, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + for _, test := range tests { + if test.addFD { + fdnotifier.AddFD(int32(test.cep.file.FD()), nil) + } + if test.f(); !reflect.DeepEqual(test.cep, test.want) { + t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want) + } + } +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go new file mode 100644 index 000000000..bf8da6867 --- /dev/null +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +// buildIovec builds an iovec slice from the given []byte slice. +func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) { + var length uintptr + iovecs := make([]syscall.Iovec, 0, 10) + for i := range bufs { + if l := len(bufs[i]); l > 0 { + length += uintptr(l) + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(l), + }) + } + } + return length, iovecs +} + +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + length, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + return 0, 0, 0, e + } + + if n > length { + return length, n, msg.Controllen, nil + } + + return n, n, msg.Controllen, nil +} + +func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) { + _, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + return 0, e + } + + return n, nil +} diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go new file mode 100644 index 000000000..74c703eb7 --- /dev/null +++ b/pkg/sentry/fs/host/util.go @@ -0,0 +1,197 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func open(parent *inodeOperations, name string) (int, error) { + if parent == nil && !path.IsAbs(name) { + return -1, syserror.EINVAL + } + name = path.Clean(name) + + // Don't follow through symlinks. + flags := syscall.O_NOFOLLOW + + if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil { + return fd, nil + } + // Retry as read-only. + if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil { + return fd, nil + } + + // Retry as write-only. + if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil { + return fd, nil + } + + // Retry as a symlink, by including O_PATH as an option. + fd, err := openAt(parent, name, linux.O_PATH|flags, 0) + if err == nil { + return fd, nil + } + + // Everything failed. + return -1, err +} + +func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) { + if parent == nil { + return syscall.Open(name, flags, uint32(perm)) + } + return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm)) +} + +func nodeType(s *syscall.Stat_t) fs.InodeType { + switch x := (s.Mode & syscall.S_IFMT); x { + case syscall.S_IFLNK: + return fs.Symlink + case syscall.S_IFIFO: + return fs.Pipe + case syscall.S_IFCHR: + return fs.CharacterDevice + case syscall.S_IFBLK: + return fs.BlockDevice + case syscall.S_IFSOCK: + return fs.Socket + case syscall.S_IFDIR: + return fs.Directory + case syscall.S_IFREG: + return fs.RegularFile + default: + // This shouldn't happen, but just in case... + log.Warningf("unknown host file type %d: assuming regular", x) + return fs.RegularFile + } +} + +func wouldBlock(s *syscall.Stat_t) bool { + typ := nodeType(s) + return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice +} + +func stableAttr(s *syscall.Stat_t) fs.StableAttr { + return fs.StableAttr{ + Type: nodeType(s), + DeviceID: hostFileDevice.DeviceID(), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + }), + BlockSize: int64(s.Blksize), + } +} + +func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner { + // User requested no translation, just return actual owner. + if mo.dontTranslateOwnership { + return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)} + } + + // Show only IDs relevant to the sandboxed task. I.e. if we not own the + // file, no sandboxed task can own the file. In that case, we + // use OverflowID for UID, implying that the IDs are not mapped in the + // "root" user namespace. + // + // E.g. + // sandbox's host EUID/EGID is 1/1. + // some_dir's host UID/GID is 2/1. + // Task that mounted this fs has virtualized EUID/EGID 5/5. + // + // If you executed `ls -n` in the sandboxed task, it would show: + // drwxwrxwrx [...] 65534 5 [...] some_dir + + // Files are owned by OverflowID by default. + owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)} + + // If we own file on host, let mounting task's initial EUID own + // the file. + if s.Uid == hostUID { + owner.UID = mo.mounter.UID + } + + // If our group matches file's group, make file's group match + // the mounting task's initial EGID. + for _, gid := range hostGIDs { + if s.Gid == gid { + owner.GID = mo.mounter.GID + break + } + } + return owner +} + +func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: s.Size, + Usage: s.Blocks * 512, + Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), + Owner: owner(mo, s), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + Links: s.Nlink, + } +} + +type dirInfo struct { + buf []byte // buffer for directory I/O. + nbuf int // length of buf; return value from ReadDirent. + bufp int // location of next record in buf. +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} + +func hostEffectiveKIDs() (uint32, []uint32, error) { + gids, err := os.Getgroups() + if err != nil { + return 0, nil, err + } + egids := make([]uint32, len(gids)) + for i, gid := range gids { + egids[i] = uint32(gid) + } + return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil +} + +var hostUID uint32 +var hostGIDs []uint32 + +func init() { + hostUID, hostGIDs, _ = hostEffectiveKIDs() +} diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go new file mode 100644 index 000000000..c38d2392d --- /dev/null +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +func createLink(fd int, name string, linkName string) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + linkNamePtr, err := syscall.BytePtrFromString(linkName) + if err != nil { + return err + } + _, _, errno := syscall.Syscall( + syscall.SYS_SYMLINKAT, + uintptr(unsafe.Pointer(namePtr)), + uintptr(fd), + uintptr(unsafe.Pointer(linkNamePtr))) + if errno != 0 { + return errno + } + return nil +} + +func readLink(fd int) (string, error) { + // Buffer sizing copied from os.Readlink. + for l := 128; ; l *= 2 { + b := make([]byte, l) + n, _, errno := syscall.Syscall6( + syscall.SYS_READLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(syscall.StringBytePtr(""))), + uintptr(unsafe.Pointer(&b[0])), + uintptr(l), + 0, 0) + if n < 0 { + n = 0 + } + if errno != 0 { + return "", errno + } + if n < uintptr(l) { + return string(b[:n]), nil + } + } +} + +func unlinkAt(fd int, name string, dir bool) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + var flags uintptr + if dir { + flags = linux.AT_REMOVEDIR + } + _, _, errno := syscall.Syscall( + syscall.SYS_UNLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + flags, + ) + if errno != 0 { + return errno + } + return nil +} + +func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec { + if omit { + return syscall.Timespec{0, linux.UTIME_OMIT} + } + if setSysTime { + return syscall.Timespec{0, linux.UTIME_NOW} + } + return syscall.NsecToTimespec(t.Nanoseconds()) +} + +func setTimestamps(fd int, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + var sts [2]syscall.Timespec + sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime) + sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime) + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(&sts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go new file mode 100644 index 000000000..c5f5c9c0d --- /dev/null +++ b/pkg/sentry/fs/host/wait_test.go @@ -0,0 +1,70 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestWait(t *testing.T) { + var fds [2]int + err := syscall.Pipe(fds[:]) + if err != nil { + t.Fatalf("Unable to create pipe: %v", err) + } + + defer syscall.Close(fds[1]) + + ctx := contexttest.Context(t) + file, err := NewFile(ctx, fds[0], fs.RootOwner) + if err != nil { + syscall.Close(fds[0]) + t.Fatalf("NewFile failed: %v", err) + } + + defer file.DecRef() + + r := file.Readiness(waiter.EventIn) + if r != 0 { + t.Fatalf("File is ready for read when it shouldn't be.") + } + + e, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&e, waiter.EventIn) + defer file.EventUnregister(&e) + + // Check that there are no notifications yet. + if len(ch) != 0 { + t.Fatalf("Channel is non-empty") + } + + // Write to the pipe, so it should be writable now. + syscall.Write(fds[1], []byte{1}) + + // Check that we get a notification. We need to yield the current thread + // so that the fdnotifier can deliver notifications, so we use a + // 1-second timeout instead of just checking the length of the channel. + select { + case <-ch: + case <-time.After(1 * time.Second): + t.Fatalf("Channel not notified") + } +} |