diff options
Diffstat (limited to 'pkg/sentry/fs')
71 files changed, 1609 insertions, 1585 deletions
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index b060a12ff..ab1424c95 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -222,8 +222,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { - log.Warningf("copy up failed to lookup directory: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to lookup directory: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } defer childUpper.DecRef() @@ -242,8 +242,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { - log.Warningf("copy up failed to lookup symlink: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to lookup symlink: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } defer childUpper.DecRef() @@ -256,23 +256,23 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { // Bring file attributes up to date. This does not include size, which will be // brought up to date with copyContentsLocked. if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { - log.Warningf("copy up failed to copy up attributes: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to copy up attributes: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } // Copy the entire file. if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { - log.Warningf("copy up failed to copy up contents: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to copy up contents: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } lowerMappable := next.Inode.overlay.lower.Mappable() upperMappable := childUpperInode.Mappable() if lowerMappable != nil && upperMappable == nil { - log.Warningf("copy up failed: cannot ensure memory mapping coherence") - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence") + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } @@ -324,12 +324,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { return nil } -// cleanupUpper removes name from parent, and panics if it is unsuccessful. -func cleanupUpper(ctx context.Context, parent *Inode, name string) { +// cleanupUpper is called when copy-up fails. It logs the copy-up error and +// attempts to remove name from parent. If that fails, then it panics. +func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) { + log.Warningf(copyUpErr.Error()) if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil { // Unfortunately we don't have much choice. We shouldn't // willingly give the caller access to a nonsense filesystem. - panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err)) + panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err)) } } diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 4c4b7d5cc..9379a4d7b 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -9,6 +9,7 @@ go_library( "device.go", "fs.go", "full.go", + "net_tun.go", "null.go", "random.go", "tty.go", @@ -19,15 +20,20 @@ go_library( "//pkg/context", "//pkg/rand", "//pkg/safemem", + "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", + "//pkg/sentry/socket/netstack", "//pkg/syserror", + "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 35bd23991..acbd401a0 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,8 +67,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo }) } -func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) +func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode { + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -111,7 +112,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // A devpts is typically mounted at /dev/pts to provide // pseudoterminal support. Place an empty directory there for // the devpts to be mounted over. - "pts": newDirectory(ctx, msrc), + "pts": newDirectory(ctx, nil, msrc), // Similarly, applications expect a ptmx device at /dev/ptmx // connected to the terminals provided by /dev/pts/. Rather // than creating a device directly (which requires a hairy @@ -126,6 +127,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor), } + if isNetTunSupported(inet.StackFromContext(ctx)) { + contents["net"] = newDirectory(ctx, map[string]*fs.Inode{ + "tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor), + }, msrc) + } + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go new file mode 100644 index 000000000..dc7ad075a --- /dev/null +++ b/pkg/sentry/fs/dev/net_tun.go @@ -0,0 +1,177 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + netTunDevMajor = 10 + netTunDevMinor = 200 +) + +// +stateify savable +type netTunInodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes +} + +var _ fs.InodeOperations = (*netTunInodeOperations)(nil) + +func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations { + return &netTunInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } +} + +// GetFile implements fs.InodeOperations.GetFile. +func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil +} + +// +stateify savable +type netTunFileOperations struct { + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + device tun.Device +} + +var _ fs.FileOperations = (*netTunFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (fops *netTunFileOperations) Release() { + fops.device.Release() +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + request := args[1].Uint() + data := args[2].Pointer() + + switch request { + case linux.TUNSETIFF: + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + if !t.HasCapability(linux.CAP_NET_ADMIN) { + return 0, syserror.EPERM + } + stack, ok := t.NetworkContext().(*netstack.Stack) + if !ok { + return 0, syserror.EINVAL + } + + var req linux.IFReq + if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + flags := usermem.ByteOrder.Uint16(req.Data[:]) + return 0, fops.device.SetIff(stack.Stack, req.Name(), flags) + + case linux.TUNGETIFF: + var req linux.IFReq + + copy(req.IFName[:], fops.device.Name()) + + // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when + // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. + flags := fops.device.Flags() | linux.IFF_NOFILTER + usermem.ByteOrder.PutUint16(req.Data[:], flags) + + _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +// Write implements fs.FileOperations.Write. +func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + data := make([]byte, src.NumBytes()) + if _, err := src.CopyIn(ctx, data); err != nil { + return 0, err + } + return fops.device.Write(data) +} + +// Read implements fs.FileOperations.Read. +func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + data, err := fops.device.Read() + if err != nil { + return 0, err + } + n, err := dst.CopyOut(ctx, data) + if n > 0 && n < len(data) { + // Not an error for partial copying. Packet truncated. + err = nil + } + return int64(n), err +} + +// Readiness implements watier.Waitable.Readiness. +func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fops.device.Readiness(mask) +} + +// EventRegister implements watier.Waitable.EventRegister. +func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fops.device.EventRegister(e, mask) +} + +// EventUnregister implements watier.Waitable.EventUnregister. +func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) { + fops.device.EventUnregister(e) +} + +// isNetTunSupported returns whether /dev/net/tun device is supported for s. +func isNetTunSupported(s inet.Stack) bool { + _, ok := s.(*netstack.Stack) + return ok +} diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index acab0411a..65be12175 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -17,7 +17,6 @@ package fs import ( "fmt" "path" - "sort" "sync/atomic" "syscall" @@ -121,9 +120,6 @@ type Dirent struct { // deleted may be set atomically when removed. deleted int32 - // frozen indicates this entry can't walk to unknown nodes. - frozen bool - // mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED. mounted bool @@ -253,8 +249,7 @@ func (d *Dirent) IsNegative() bool { return d.Inode == nil } -// hashChild will hash child into the children list of its new parent d, carrying over -// any "frozen" state from d. +// hashChild will hash child into the children list of its new parent d. // // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must // validate the returned unhashed weak reference. Common cases: @@ -282,9 +277,6 @@ func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) { d.IncRef() } - // Carry over parent's frozen state. - child.frozen = d.frozen - return d.hashChildParentSet(child) } @@ -320,9 +312,9 @@ func (d *Dirent) SyncAll(ctx context.Context) { // There is nothing to sync for a read-only filesystem. if !d.Inode.MountSource.Flags.ReadOnly { - // FIXME(b/34856369): This should be a mount traversal, not a - // Dirent traversal, because some Inodes that need to be synced - // may no longer be reachable by name (after sys_unlink). + // NOTE(b/34856369): This should be a mount traversal, not a Dirent + // traversal, because some Inodes that need to be synced may no longer + // be reachable by name (after sys_unlink). // // Write out metadata, dirty page cached pages, and sync disk/remote // caches. @@ -400,38 +392,6 @@ func (d *Dirent) MountRoot() *Dirent { return mountRoot } -// Freeze prevents this dirent from walking to more nodes. Freeze is applied -// recursively to all children. -// -// If this particular Dirent represents a Virtual node, then Walks and Creates -// may proceed as before. -// -// Freeze can only be called before the application starts running, otherwise -// the root it might be out of sync with the application root if modified by -// sys_chroot. -func (d *Dirent) Freeze() { - d.mu.Lock() - defer d.mu.Unlock() - if d.frozen { - // Already frozen. - return - } - d.frozen = true - - // Take a reference when freezing. - for _, w := range d.children { - if child := w.Get(); child != nil { - // NOTE: We would normally drop the reference here. But - // instead we're hanging on to it. - ch := child.(*Dirent) - ch.Freeze() - } - } - - // Drop all expired weak references. - d.flush() -} - // descendantOf returns true if the receiver dirent is equal to, or a // descendant of, the argument dirent. // @@ -524,11 +484,6 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl w.Drop() } - // Are we allowed to do the lookup? - if d.frozen && !d.Inode.IsVirtual() { - return nil, syscall.ENOENT - } - // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { @@ -659,11 +614,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi return nil, syscall.EEXIST } - // Are we frozen? - if d.frozen && !d.Inode.IsVirtual() { - return nil, syscall.ENOENT - } - // Try the create. We need to trust the file system to return EEXIST (or something // that will translate to EEXIST) if name already exists. file, err := d.Inode.Create(ctx, d, name, flags, perms) @@ -727,11 +677,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c return syscall.EEXIST } - // Are we frozen? - if d.frozen && !d.Inode.IsVirtual() { - return syscall.ENOENT - } - // Remove any negative Dirent. We've already asserted above with d.exists // that the only thing remaining here can be a negative Dirent. if w, ok := d.children[name]; ok { @@ -862,49 +807,6 @@ func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) { return dot, dot } -// readdirFrozen returns readdir results based solely on the frozen children. -func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) { - // Collect attrs for "." and "..". - attrs := make(map[string]DentAttr) - names := []string{".", ".."} - attrs["."], attrs[".."] = d.GetDotAttrs(root) - - // Get info from all children. - d.mu.Lock() - defer d.mu.Unlock() - for name, w := range d.children { - if child := w.Get(); child != nil { - defer child.DecRef() - - // Skip negative children. - if child.(*Dirent).IsNegative() { - continue - } - - sattr := child.(*Dirent).Inode.StableAttr - attrs[name] = DentAttr{ - Type: sattr.Type, - InodeID: sattr.InodeID, - } - names = append(names, name) - } - } - - sort.Strings(names) - - if int(offset) >= len(names) { - return offset, nil - } - names = names[int(offset):] - for _, name := range names { - if err := dirCtx.DirEmit(name, attrs[name]); err != nil { - return offset, err - } - offset++ - } - return offset, nil -} - // DirIterator is an open directory containing directory entries that can be read. type DirIterator interface { // IterateDir emits directory entries by calling dirCtx.EmitDir, beginning @@ -964,10 +866,6 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, return offset, nil } - if d.frozen { - return d.readdirFrozen(root, offset, dirCtx) - } - // Collect attrs for "." and "..". dot, dotdot := d.GetDotAttrs(root) @@ -1068,11 +966,6 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err return nil, syserror.EINVAL } - // Are we frozen? - if d.parent.frozen && !d.parent.Inode.IsVirtual() { - return nil, syserror.ENOENT - } - // Dirent that'll replace d. // // Note that NewDirent returns with one reference taken; the reference @@ -1101,11 +994,6 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { return syserror.ENOENT } - // Are we frozen? - if d.parent.frozen && !d.parent.Inode.IsVirtual() { - return syserror.ENOENT - } - // Remount our former child in its place. // // As replacement used to be our child, it must already have the right @@ -1135,11 +1023,6 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath unlock := d.lockDirectory() defer unlock() - // Are we frozen? - if d.frozen && !d.Inode.IsVirtual() { - return syscall.ENOENT - } - // Try to walk to the node. child, err := d.walk(ctx, root, name, false /* may unlock */) if err != nil { @@ -1201,11 +1084,6 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) unlock := d.lockDirectory() defer unlock() - // Are we frozen? - if d.frozen && !d.Inode.IsVirtual() { - return syscall.ENOENT - } - // Check for dots. if name == "." { // Rejected as the last component by rmdir(2). @@ -1438,8 +1316,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName }, nil } -func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { - uattr, err := dir.Inode.UnstableAttr(ctx) +func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { + uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { return syserror.EPERM } @@ -1465,30 +1343,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { return syserror.EPERM } -// MayDelete determines whether `name`, a child of `dir`, can be deleted or +// MayDelete determines whether `name`, a child of `d`, can be deleted or // renamed by `ctx`. // // Compare Linux kernel fs/namei.c:may_delete. -func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { - if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { +func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error { + if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { return err } - victim, err := dir.Walk(ctx, root, name) + unlock := d.lockDirectory() + defer unlock() + + victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { return err } defer victim.DecRef() - return mayDelete(ctx, dir, victim) + return d.mayDelete(ctx, victim) } // mayDelete determines whether `victim`, a child of `dir`, can be deleted or // renamed by `ctx`. // // Preconditions: `dir` is writable and executable by `ctx`. -func mayDelete(ctx context.Context, dir, victim *Dirent) error { - if err := checkSticky(ctx, dir, victim); err != nil { +func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error { + if err := d.checkSticky(ctx, victim); err != nil { return err } @@ -1516,15 +1397,6 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string return err } - // Are we frozen? - // TODO(jamieliu): Is this the right errno? - if oldParent.frozen && !oldParent.Inode.IsVirtual() { - return syscall.ENOENT - } - if newParent.frozen && !newParent.Inode.IsVirtual() { - return syscall.ENOENT - } - // Do we have general permission to remove from oldParent and // create/replace in newParent? if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { @@ -1542,7 +1414,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string defer renamed.DecRef() // Check that the renamed dirent is deletable. - if err := mayDelete(ctx, oldParent, renamed); err != nil { + if err := oldParent.mayDelete(ctx, renamed); err != nil { return err } @@ -1580,7 +1452,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // across the Rename, so must call DecRef manually (no defer). // Check that we can delete replaced. - if err := mayDelete(ctx, newParent, replaced); err != nil { + if err := newParent.mayDelete(ctx, replaced); err != nil { replaced.DecRef() return err } diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 25514ace4..33de32c69 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.SetPrev(nil) - d.SetNext(nil) d.DecRef() c.currentSize-- if c.limit != nil { diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 5aff0cc95..a0082ecca 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -119,7 +119,7 @@ func TestNewPipe(t *testing.T) { continue } if flags := p.flags; test.flags != flags { - t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags) + t.Errorf("%s: got file flags %v, want %v", test.desc, flags, test.flags) continue } if len(test.readAheadBuffer) != len(p.readAheadBuffer) { @@ -136,7 +136,7 @@ func TestNewPipe(t *testing.T) { continue } if !fdnotifier.HasFD(int32(f.FD())) { - t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD) + t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD()) } } } diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 78100e448..2a278fbe3 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -44,7 +44,7 @@ var ( RecordWaitTime = false reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") - readWait = metric.MustCreateNewUint64Metric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") + readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") ) // IncrementWait increments the given wait time metric, if enabled. @@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error if !f.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. if f.Flags().Append { @@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // offset." unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) defer unlockAppendMu() - if f.Flags().Append { if err := f.offsetForAppend(ctx, &offset); err != nil { return 0, err @@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 return f.FileOperations.Write(ctx, f, src, offset) } -// offsetForAppend sets the given offset to the end of the file. +// offsetForAppend atomically sets the given offset to the end of the file. // -// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing. +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for +// writing. func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { uattr, err := f.Dirent.Inode.UnstableAttr(ctx) if err != nil { @@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { } // Update the offset. - *offset = uattr.Size + atomic.StoreInt64(offset, uattr.Size) return nil } diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index a76d87e3a..1971cc680 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -175,90 +175,6 @@ func TestReaddirRevalidation(t *testing.T) { } } -// TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with -// a frozen dirent tree does not make Readdir calls to the underlying files. -// This is a regression test for b/114808269. -func TestReaddirOverlayFrozen(t *testing.T) { - ctx := contexttest.Context(t) - - // Create an overlay with two directories, each with two files. - upper := newTestRamfsDir(ctx, []dirContent{{name: "upper-file1"}, {name: "upper-file2"}}, nil) - lower := newTestRamfsDir(ctx, []dirContent{{name: "lower-file1"}, {name: "lower-file2"}}, nil) - overlayInode := fs.NewTestOverlayDir(ctx, upper, lower, false) - - // Set that overlay as the root. - root := fs.NewDirent(ctx, overlayInode, "root") - ctx = &rootContext{ - Context: ctx, - root: root, - } - - // Check that calling Readdir on the root now returns all 4 files (2 - // from each layer in the overlay). - rootFile, err := root.Inode.GetFile(ctx, root, fs.FileFlags{Read: true}) - if err != nil { - t.Fatalf("root.Inode.GetFile failed: %v", err) - } - defer rootFile.DecRef() - ser := &fs.CollectEntriesSerializer{} - if err := rootFile.Readdir(ctx, ser); err != nil { - t.Fatalf("rootFile.Readdir failed: %v", err) - } - if got, want := ser.Order, []string{".", "..", "lower-file1", "lower-file2", "upper-file1", "upper-file2"}; !reflect.DeepEqual(got, want) { - t.Errorf("Readdir got names %v, want %v", got, want) - } - - // Readdir should have been called on upper and lower. - upperDir := upper.InodeOperations.(*dir) - lowerDir := lower.InodeOperations.(*dir) - if !upperDir.ReaddirCalled { - t.Errorf("upperDir.ReaddirCalled got %v, want true", upperDir.ReaddirCalled) - } - if !lowerDir.ReaddirCalled { - t.Errorf("lowerDir.ReaddirCalled got %v, want true", lowerDir.ReaddirCalled) - } - - // Reset. - upperDir.ReaddirCalled = false - lowerDir.ReaddirCalled = false - - // Take references on "upper-file1" and "lower-file1", pinning them in - // the dirent tree. - for _, name := range []string{"upper-file1", "lower-file1"} { - if _, err := root.Walk(ctx, root, name); err != nil { - t.Fatalf("root.Walk(%q) failed: %v", name, err) - } - // Don't drop a reference on the returned dirent so that it - // will stay in the tree. - } - - // Freeze the dirent tree. - root.Freeze() - - // Seek back to the beginning of the file. - if _, err := rootFile.Seek(ctx, fs.SeekSet, 0); err != nil { - t.Fatalf("error seeking to beginning of directory: %v", err) - } - - // Calling Readdir on the root now will return only the pinned - // children. - ser = &fs.CollectEntriesSerializer{} - if err := rootFile.Readdir(ctx, ser); err != nil { - t.Fatalf("rootFile.Readdir failed: %v", err) - } - if got, want := ser.Order, []string{".", "..", "lower-file1", "upper-file1"}; !reflect.DeepEqual(got, want) { - t.Errorf("Readdir got names %v, want %v", got, want) - } - - // Readdir should NOT have been called on upper or lower. - if upperDir.ReaddirCalled { - t.Errorf("upperDir.ReaddirCalled got %v, want false", upperDir.ReaddirCalled) - } - if lowerDir.ReaddirCalled { - t.Errorf("lowerDir.ReaddirCalled got %v, want false", lowerDir.ReaddirCalled) - } -} - type rootContext struct { context.Context root *fs.Dirent diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index bdba6efe5..d2dbff268 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -42,9 +42,10 @@ // Dirent.dirMu // Dirent.mu // DirentCache.mu -// Locks in InodeOperations implementations or overlayEntry // Inode.Watches.mu (see `Inotify` for other lock ordering) // MountSource.mu +// Inode.appendMu +// Locks in InodeOperations implementations or overlayEntry // // If multiple Dirent or MountSource locks must be taken, locks in the parent must be // taken before locks in their children. diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 6564fd0c6..dd6f5aba6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" ) // FrameRefSetFunctions implements segment.Functions for FrameRefSet. @@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } + +// IncRefAndAccount adds a reference on the range fr. All newly inserted segments +// are accounted as host page cache memory mappings. +func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { + seg, gap := refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + refs.MergeAdjacent(fr) + return + } + } +} + +// DecRefAndAccount removes a reference on the range fr and untracks segments +// that are removed from memory accounting. +func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { + seg := refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index daecc4ffe..1922ff08c 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -259,8 +259,8 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, ui // RemoveXattr implements fs.InodeOperations.RemoveXattr. func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error { - i.mu.RLock() - defer i.mu.RUnlock() + i.mu.Lock() + defer i.mu.Unlock() if _, ok := i.xattrs[name]; ok { delete(i.xattrs, name) return nil diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore new file mode 100644 index 000000000..2d19fc766 --- /dev/null +++ b/pkg/sentry/fs/g3doc/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md new file mode 100644 index 000000000..2ca84dd74 --- /dev/null +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -0,0 +1,263 @@ +# Foreword + +This document describes an on-going project to support FUSE filesystems within +the sentry. This is intended to become the final documentation for this +subsystem, and is therefore written in the past tense. However FUSE support is +currently incomplete and the document will be updated as things progress. + +# FUSE: Filesystem in Userspace + +The sentry supports dispatching filesystem operations to a FUSE server, allowing +FUSE filesystem to be used with a sandbox. + +## Overview + +FUSE has two main components: + +1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards + filesystem operations (usually initiated by syscalls) to the server. + +2. A server, which is a userspace daemon that implements the actual filesystem. + +The sentry implements the client component, which allows a server daemon running +within the sandbox to implement a filesystem within the sandbox. + +A FUSE filesystem is initialized with `mount(2)`, typically with the help of a +utility like `fusermount(1)`. Various mount options exist for establishing +ownership and access permissions on the filesystem, but the most important mount +option is a file descriptor used to establish communication between the client +and server. + +The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation, +the client and server use the FUSE protocol described in `fuse(4)` to service +filesystem operations. See the "Protocol" section below for more information +about this protocol. The core of the sentry support for FUSE is the client-side +implementation of this protocol. + +## FUSE in the Sentry + +The sentry's FUSE client targets VFS2 and has the following components: + +- An implementation of `/dev/fuse`. + +- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting + VFS2, one point of contention may be the lack of inodes in VFS2. We can + tentatively implement a kernfs-based filesystem to bridge the gap in APIs. + The kernfs base functionality can serve the role of the Linux inode cache + and, the filesystem can map VFS2 syscalls to kernfs inode operations; see + the `kernfs.Inode` interface. + +The FUSE protocol lends itself well to marshaling with `go_marshal`. The various +request and response packets can be defined in the ABI package and converted to +and from the wire format using `go_marshal`. + +### Design Goals + +- While filesystem performance is always important, the sentry's FUSE support + is primarily concerned with compatibility, with performance as a secondary + concern. + +- Avoiding deadlocks from a hung server daemon. + +- Consider the potential for denial of service from a malicious server daemon. + Protecting itself from userspace is already a design goal for the sentry, + but needs additional consideration for FUSE. Normally, an operating system + doesn't rely on userspace to make progress with filesystem operations. Since + this changes with FUSE, it opens up the possibility of creating a chain of + dependencies controlled by userspace, which could affect an entire sandbox. + For example: a FUSE op can block a syscall, which could be holding a + subsystem lock, which can then block another task goroutine. + +### Milestones + +Below are some broad goals to aim for while implementing FUSE in the sentry. +Many FUSE ops can be grouped into broad categories of functionality, and most +ops can be implemented in parallel. + +#### Minimal client that can mount a trivial FUSE filesystem. + +- Implement `/dev/fuse` - a character device used to establish an FD for + communication between the sentry and the server daemon. + +- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. + +#### Read-only mount with basic file operations + +- Implement the majority of file, directory and file descriptor FUSE ops. For + this milestone, we can skip uncommon or complex operations like mmap, mknod, + file locking, poll, and extended attributes. We can stub these out along + with any ops that modify the filesystem. The exact list of required ops are + to be determined, but the goal is to mount a real filesystem as read-only, + and be able to read contents from the filesystem in the sentry. + +#### Full read-write support + +- Implement the remaining FUSE ops and decide if we can omit rarely used + operations like ioctl. + +# Appendix + +## FUSE Protocol + +The FUSE protocol is a request-response protocol. All requests are initiated by +the client. The wire-format for the protocol is raw C structs serialized to +memory. + +All FUSE requests begin with the following request header: + +```c +struct fuse_in_header { + uint32_t len; // Length of the request, including this header. + uint32_t opcode; // Requested operation. + uint64_t unique; // A unique identifier for this request. + uint64_t nodeid; // ID of the filesystem object being operated on. + uint32_t uid; // UID of the requesting process. + uint32_t gid; // GID of the requesting process. + uint32_t pid; // PID of the requesting process. + uint32_t padding; +}; +``` + +The request is then followed by a payload specific to the `opcode`. + +All responses begin with this response header: + +```c +struct fuse_out_header { + uint32_t len; // Length of the response, including this header. + int32_t error; // Status of the request, 0 if success. + uint64_t unique; // The unique identifier from the corresponding request. +}; +``` + +The response payload also depends on the request `opcode`. If `error != 0`, the +response payload must be empty. + +### Operations + +The following is a list of all FUSE operations used in `fuse_in_header.opcode` +as of Linux v4.4, and a brief description of their purpose. These are defined in +`uapi/linux/fuse.h`. Many of these have a corresponding request and response +payload struct; `fuse(4)` has details for some of these. We also note how these +operations map to the sentry virtual filesystem. + +#### FUSE meta-operations + +These operations are specific to FUSE and don't have a corresponding action in a +generic filesystem. + +- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the + first message sent by the client after mount. This is used for version and + feature negotiation. This is related to `mount(2)`. +- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`. +- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the + `fuse_in_header.unique` value provided in the corresponding request header. + The client can send at most one of these per request, and will enter an + uninterruptible wait for a reply. The server is expected to reply promptly. +- `FUSE_FORGET`: A hint to the server that server should evict the indicate + node from any caches. This is wired up to `(struct + super_operations).evict_inode` in Linux, which is in turned hooked as the + inode cache shrinker which is typically triggered by system memory pressure. +- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`. + +#### Filesystem Syscalls + +These FUSE ops map directly to an equivalent filesystem syscall, or family of +syscalls. The relevant syscalls have a similar name to the operation, unless +otherwise noted. + +Node creation: + +- `FUSE_MKNOD` +- `FUSE_MKDIR` +- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which + atomically creates and opens a node. + +Node attributes and extended attributes: + +- `FUSE_GETATTR` +- `FUSE_SETATTR` +- `FUSE_SETXATTR` +- `FUSE_GETXATTR` +- `FUSE_LISTXATTR` +- `FUSE_REMOVEXATTR` + +Node link manipulation: + +- `FUSE_READLINK` +- `FUSE_LINK` +- `FUSE_SYMLINK` +- `FUSE_UNLINK` + +Directory operations: + +- `FUSE_RMDIR` +- `FUSE_RENAME` +- `FUSE_RENAME2` +- `FUSE_OPENDIR`: `open(2)` for directories. +- `FUSE_RELEASEDIR`: `close(2)` for directories. +- `FUSE_READDIR` +- `FUSE_READDIRPLUS` +- `FUSE_FSYNCDIR`: `fsync(2)` for directories. +- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is + reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path + component to a node. However the returned identifier is opaque to the + client. The server must remember this mapping, as this is how the client + will reference the node in the future. + +File operations: + +- `FUSE_OPEN`: `open(2)` for files. +- `FUSE_RELEASE`: `close(2)` for files. +- `FUSE_FSYNC` +- `FUSE_FALLOCATE` +- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`. +- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`. + +File locking: + +- `FUSE_GETLK` +- `FUSE_SETLK` +- `FUSE_SETLKW` +- `FUSE_COPY_FILE_RANGE` + +File descriptor operations: + +- `FUSE_IOCTL` +- `FUSE_POLL` +- `FUSE_LSEEK` + +Filesystem operations: + +- `FUSE_STATFS` + +#### Permissions + +- `FUSE_ACCESS` is used to check if a node is accessible, as part of many + syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the + sentry. + +#### I/O Operations + +These ops are used to read and write file pages. They're used to implement both +I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`. + +- `FUSE_READ` +- `FUSE_WRITE` + +#### Miscellaneous + +- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is + closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` + syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the + sentry. +- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed. +- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?] + +# References + +- [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html) +- [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html) +- [The reference implementation of the Linux FUSE (Filesystem in Userspace) + interface](https://github.com/libfuse/libfuse) +- [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h) diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 6db4b762d..d481baf77 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -75,10 +75,18 @@ func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner // task's EUID/EGID. owner := mounter if valid.UID { - owner.UID = auth.KUID(pattr.UID) + if pattr.UID.Ok() { + owner.UID = auth.KUID(pattr.UID) + } else { + owner.UID = auth.KUID(auth.OverflowUID) + } } if valid.GID { - owner.GID = auth.KGID(pattr.GID) + if pattr.GID.Ok() { + owner.GID = auth.KGID(pattr.GID) + } else { + owner.GID = auth.KGID(auth.OverflowGID) + } } return owner } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 23296f246..b2fcab127 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -37,9 +37,9 @@ var ( opens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.") opensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.") reads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") - readWait9P = metric.MustCreateNewUint64Metric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") + readWait9P = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") readsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") - readWaitHost = metric.MustCreateNewUint64Metric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") + readWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") ) // fileOperations implements fs.FileOperations for a remote file system. diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index ff96b28ba..edd6576aa 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -34,7 +34,6 @@ func (f *fileOperations) afterLoad() { flags := f.flags flags.Truncate = false - // TODO(b/38173783): Context is not plumbed to save/restore. f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), flags, f.inodeOperations.cachingInodeOps) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 9d41fcbdb..8ae2d78d7 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -60,8 +60,7 @@ const ( limitHostFDTranslationKey = "limit_host_fd_translation" // overlayfsStaleRead if present closes cached readonly file after the first - // write. This is done to workaround a limitation of overlayfs in kernels - // before 4.19 where open FDs are not updated after the file is copied up. + // write. This is done to workaround a limitation of Linux overlayfs. overlayfsStaleRead = "overlayfs_stale_read" ) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index 9f7c3e89f..fc14249be 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -57,7 +57,6 @@ func (h *handles) DecRef() { } } } - // FIXME(b/38173783): Context is not plumbed here. if err := h.File.close(context.Background()); err != nil { log.Warningf("error closing p9 file: %v", err) } diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 1c934981b..a016c896e 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -273,7 +273,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle // operations on the old will see the new data. Then, make the new handle take // ownereship of the old FD and mark the old readHandle to not close the FD // when done. - if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil { + if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), syscall.O_CLOEXEC); err != nil { return err } @@ -710,13 +710,10 @@ func init() { } // AddLink implements InodeOperations.AddLink, but is currently a noop. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (*inodeOperations) AddLink() {} // DropLink implements InodeOperations.DropLink, but is currently a noop. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (*inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 238f7804c..a3402e343 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -123,7 +123,6 @@ func (i *inodeFileState) afterLoad() { // beforeSave. return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings)) } - // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index a35c3a23d..cf9800100 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -16,7 +16,6 @@ package gofer import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -68,7 +67,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Get a p9.File for name. qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { - if err == syscall.ENOENT { + if err == syserror.ENOENT { if cp.cacheNegativeDirents() { // Return a negative Dirent. It will stay cached until something // is created over it. @@ -207,7 +206,7 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { - return syscall.EXDEV + return syserror.EXDEV } if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { @@ -251,7 +250,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, } if i.session().overrides == nil { - return nil, syscall.EOPNOTSUPP + return nil, syserror.EOPNOTSUPP } // Stabilize the override map while creation is in progress. @@ -280,7 +279,7 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st // N.B. FIFOs use major/minor numbers 0. if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || err != syscall.EPERM { + if i.session().overrides == nil || err != syserror.EPERM { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -427,17 +426,16 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent return syserror.ENAMETOOLONG } - // Unwrap the new parent to a *inodeOperations. - newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV + // Don't allow renames across different mounts. + if newParent.MountSource != oldParent.MountSource { + return syserror.EXDEV } + // Unwrap the new parent to a *inodeOperations. + newParentInodeOperations := newParent.InodeOperations.(*inodeOperations) + // Unwrap the old parent to a *inodeOperations. - oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV - } + oldParentInodeOperations := oldParent.InodeOperations.(*inodeOperations) // Do the rename. if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil { diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index f6b3ef178..b5efc86f2 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -190,9 +190,9 @@ type session struct { // be socket/pipe files. This allows unix domain sockets and named pipes to // be used with paths that belong to a gofer. // - // TODO(gvisor.dev/issue/1200): there are few possible races with someone - // stat'ing the file and another deleting it concurrently, where the file - // will not be reported as socket file. + // There are a few possible races with someone stat'ing the file and another + // deleting it concurrently, where the file will not be reported as socket + // file. overrides *overrideMaps `state:"wait"` } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 111da59f9..2d398b753 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -104,7 +104,6 @@ func (s *session) afterLoad() { // If private unix sockets are enabled, create and fill the session's endpoint // maps. if opts.privateunixsocket { - // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} if err = s.restoreEndpointMaps(ctx); err != nil { diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 10ba2f5f0..40f2c1cad 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -47,6 +47,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return &endpoint{inode, i.fileState.file.file, path} } +// LINT.IfChange + // endpoint is a Gofer-backed transport.BoundEndpoint. // // An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() @@ -146,3 +148,5 @@ func (e *endpoint) Release() { func (e *endpoint) Passcred() bool { return false } + +// LINT.ThenChange(../../fsimpl/gofer/socket.go) diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index 2d8d3a2ea..47a6c69bf 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -20,17 +20,29 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error { if ts.ATimeOmit && ts.MTimeOmit { return nil } + + // Replace requests to use the "system time" with the current time to + // ensure that timestamps remain consistent with the remote + // filesystem. + now := ktime.NowFromContext(ctx) + if ts.ATimeSetSystemTime { + ts.ATime = now + } + if ts.MTimeSetSystemTime { + ts.MTime = now + } mask := p9.SetAttrMask{ ATime: !ts.ATimeOmit, - ATimeNotSystemTime: !ts.ATimeSetSystemTime, + ATimeNotSystemTime: true, MTime: !ts.MTimeOmit, - MTimeNotSystemTime: !ts.MTimeSetSystemTime, + MTimeNotSystemTime: true, } as, ans := ts.ATime.Unix() ms, mns := ts.MTime.Unix() diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 21003ea45..aabce6cc9 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -10,7 +10,7 @@ go_library( "descriptor_state.go", "device.go", "file.go", - "fs.go", + "host.go", "inode.go", "inode_state.go", "ioctl_unsafe.go", @@ -62,18 +62,15 @@ go_test( size = "small", srcs = [ "descriptor_test.go", - "fs_test.go", "inode_test.go", "socket_test.go", "wait_test.go", ], library = ":host", deps = [ - "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/sentry/contexttest", - "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 1658979fc..39299b7e4 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) +// LINT.IfChange + type scmRights struct { fds []int } @@ -76,7 +78,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File { } // Create the file backed by hostFD. - file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + file, err := NewFile(ctx, fd) if err != nil { ctx.Warningf("Error creating file from host FD: %v", err) break @@ -91,3 +93,5 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File { } return files } + +// LINT.ThenChange(../../fsimpl/host/control.go) diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 2a4d1b291..cfdce6a74 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -16,7 +16,6 @@ package host import ( "fmt" - "path" "syscall" "gvisor.dev/gvisor/pkg/fdnotifier" @@ -28,12 +27,9 @@ import ( // // +stateify savable type descriptor struct { - // donated is true if the host fd was donated by another process. - donated bool - // If origFD >= 0, it is the host fd that this file was originally created // from, which must be available at time of restore. The FD can be closed - // after descriptor is created. Only set if donated is true. + // after descriptor is created. origFD int // wouldBlock is true if value (below) points to a file that can @@ -41,15 +37,13 @@ type descriptor struct { wouldBlock bool // value is the wrapped host fd. It is never saved or restored - // directly. How it is restored depends on whether it was - // donated and the fs.MountSource it was originally - // opened/created from. + // directly. value int `state:"nosave"` } // newDescriptor returns a wrapped host file descriptor. On success, // the descriptor is registered for event notifications with queue. -func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { +func newDescriptor(fd int, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { ownedFD := fd origFD := -1 if saveable { @@ -69,7 +63,6 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue * } } return &descriptor{ - donated: donated, origFD: origFD, wouldBlock: wouldBlock, value: ownedFD, @@ -77,25 +70,11 @@ func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue * } // initAfterLoad initializes the value of the descriptor after Load. -func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error { - if d.donated { - var err error - d.value, err = syscall.Dup(d.origFD) - if err != nil { - return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) - } - } else { - name, ok := mo.inodeMappings[id] - if !ok { - return fmt.Errorf("failed to find path for inode number %d", id) - } - fullpath := path.Join(mo.root, name) - - var err error - d.value, err = open(nil, fullpath) - if err != nil { - return fmt.Errorf("failed to open %q: %v", fullpath, err) - } +func (d *descriptor) initAfterLoad(id uint64, queue *waiter.Queue) error { + var err error + d.value, err = syscall.Dup(d.origFD) + if err != nil { + return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) } if d.wouldBlock { if err := syscall.SetNonblock(d.value, true); err != nil { diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go index 8167390a9..e880582ab 100644 --- a/pkg/sentry/fs/host/descriptor_state.go +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -16,7 +16,7 @@ package host // beforeSave is invoked by stateify. func (d *descriptor) beforeSave() { - if d.donated && d.origFD < 0 { + if d.origFD < 0 { panic("donated file descriptor cannot be saved") } } diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index 4205981f5..d8e4605b6 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -47,10 +47,10 @@ func TestDescriptorRelease(t *testing.T) { // FD ownership is transferred to the descritor. queue := &waiter.Queue{} - d, err := newDescriptor(fd, false /* donated*/, tc.saveable, tc.wouldBlock, queue) + d, err := newDescriptor(fd, tc.saveable, tc.wouldBlock, queue) if err != nil { syscall.Close(fd) - t.Fatalf("newDescriptor(%d, %t, false, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err) + t.Fatalf("newDescriptor(%d, %t, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err) } if tc.saveable { if d.origFD < 0 { diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index e08f56d04..3e48b8b2c 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -60,8 +60,8 @@ var _ fs.FileOperations = (*fileOperations)(nil) // The returned File cannot be saved, since there is no guarantee that the same // FD will exist or represent the same file at time of restore. If such a // guarantee does exist, use ImportFile instead. -func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, false, false) +func NewFile(ctx context.Context, fd int) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, false, false) } // ImportFile creates a new File backed by the provided host file descriptor. @@ -71,13 +71,13 @@ func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error // If the returned file is saved, it will be restored by re-importing the FD // originally passed to ImportFile. It is the restorer's responsibility to // ensure that the FD represents the same file. -func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY) +func ImportFile(ctx context.Context, fd int, isTTY bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, true, isTTY) } // newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is // saveable, then saveable is true. -func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) { +func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool) (*fs.File, error) { var s syscall.Stat_t if err := syscall.Fstat(donated, &s); err != nil { return nil, err @@ -101,8 +101,8 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner }) return s, nil default: - msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) - inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) + msrc := fs.NewNonCachingMountSource(ctx, &filesystem{}, fs.MountSourceFlags{}) + inode, err := newInode(ctx, msrc, donated, saveable) if err != nil { return nil, err } diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go deleted file mode 100644 index d3e8e3a36..000000000 --- a/pkg/sentry/fs/host/fs.go +++ /dev/null @@ -1,339 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package host implements an fs.Filesystem for files backed by host -// file descriptors. -package host - -import ( - "fmt" - "path" - "path/filepath" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// FilesystemName is the name under which Filesystem is registered. -const FilesystemName = "whitelistfs" - -const ( - // whitelistKey is the mount option containing a comma-separated list - // of host paths to whitelist. - whitelistKey = "whitelist" - - // rootPathKey is the mount option containing the root path of the - // mount. - rootPathKey = "root" - - // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership. - dontTranslateOwnershipKey = "dont_translate_ownership" -) - -// maxTraversals determines link traversals in building the whitelist. -const maxTraversals = 10 - -// Filesystem is a pseudo file system that is only available during the setup -// to lock down the configurations. This filesystem should only be mounted at root. -// -// Think twice before exposing this to applications. -// -// +stateify savable -type Filesystem struct { - // whitelist is a set of host paths to whitelist. - paths []string -} - -var _ fs.Filesystem = (*Filesystem)(nil) - -// Name is the identifier of this file system. -func (*Filesystem) Name() string { - return FilesystemName -} - -// AllowUserMount prohibits users from using mount(2) with this file system. -func (*Filesystem) AllowUserMount() bool { - return false -} - -// AllowUserList allows this filesystem to be listed in /proc/filesystems. -func (*Filesystem) AllowUserList() bool { - return true -} - -// Flags returns that there is nothing special about this file system. -func (*Filesystem) Flags() fs.FilesystemFlags { - return 0 -} - -// Mount returns an fs.Inode exposing the host file system. It is intended to be locked -// down in PreExec below. -func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { - // Parse generic comma-separated key=value options. - options := fs.GenericMountSourceOptions(data) - - // Grab the whitelist if one was specified. - // TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow - // no whitelist. - if wl, ok := options[whitelistKey]; ok { - f.paths = strings.Split(wl, "|") - delete(options, whitelistKey) - } - - // If the rootPath was set, use it. Othewise default to the root of the - // host fs. - rootPath := "/" - if rp, ok := options[rootPathKey]; ok { - rootPath = rp - delete(options, rootPathKey) - - // We must relativize the whitelisted paths to the new root. - for i, p := range f.paths { - rel, err := filepath.Rel(rootPath, p) - if err != nil { - return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath) - } - f.paths[i] = path.Join("/", rel) - } - } - fd, err := open(nil, rootPath) - if err != nil { - return nil, fmt.Errorf("failed to find root: %v", err) - } - - var dontTranslateOwnership bool - if v, ok := options[dontTranslateOwnershipKey]; ok { - b, err := strconv.ParseBool(v) - if err != nil { - return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err) - } - dontTranslateOwnership = b - delete(options, dontTranslateOwnershipKey) - } - - // Fail if the caller passed us more options than we know about. - if len(options) > 0 { - return nil, fmt.Errorf("unsupported mount options: %v", options) - } - - // The mounting EUID/EGID will be cached by this file system. This will - // be used to assign ownership to files that we own. - owner := fs.FileOwnerFromContext(ctx) - - // Construct the host file system mount and inode. - msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership) - return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */) -} - -// InstallWhitelist locks down the MountNamespace to only the currently installed -// Dirents and the given paths. -func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error { - return installWhitelist(ctx, m, f.paths) -} - -func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error { - if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") { - // Warning will be logged during filter installation if the empty - // whitelist matters (allows for host file access). - return nil - } - - // Done tracks entries already added. - done := make(map[string]bool) - root := m.Root() - defer root.DecRef() - - for i := 0; i < len(paths); i++ { - // Make sure the path is absolute. This is a sanity check. - if !path.IsAbs(paths[i]) { - return fmt.Errorf("path %q is not absolute", paths[i]) - } - - // We need to add all the intermediate paths, in case one of - // them is a symlink that needs to be resolved. - for j := 1; j <= len(paths[i]); j++ { - if j < len(paths[i]) && paths[i][j] != '/' { - continue - } - current := paths[i][:j] - - // Lookup the given component in the tree. - remainingTraversals := uint(maxTraversals) - d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals) - if err != nil { - log.Warningf("populate failed for %q: %v", current, err) - continue - } - - // It's critical that this DecRef happens after the - // freeze below. This ensures that the dentry is in - // place to be frozen. Otherwise, we freeze without - // these entries. - defer d.DecRef() - - // Expand the last component if necessary. - if current == paths[i] { - // Is it a directory or symlink? - sattr := d.Inode.StableAttr - if fs.IsDir(sattr) { - for name := range childDentAttrs(ctx, d) { - paths = append(paths, path.Join(current, name)) - } - } - if fs.IsSymlink(sattr) { - // Only expand symlinks once. The - // folder structure may contain - // recursive symlinks and we don't want - // to end up infinitely expanding this - // symlink. This is safe because this - // is the last component. If a later - // path wants to symlink something - // beneath this symlink that will still - // be handled by the FindLink above. - if done[current] { - continue - } - - s, err := d.Inode.Readlink(ctx) - if err != nil { - log.Warningf("readlink failed for %q: %v", current, err) - continue - } - if path.IsAbs(s) { - paths = append(paths, s) - } else { - target := path.Join(path.Dir(current), s) - paths = append(paths, target) - } - } - } - - // Only report this one once even though we may look - // it up more than once. If we whitelist /a/b,/a then - // /a will be "done" when it is looked up for /a/b, - // however we still need to expand all of its contents - // when whitelisting /a. - if !done[current] { - log.Debugf("whitelisted: %s", current) - } - done[current] = true - } - } - - // Freeze the mount tree in place. This prevents any new paths from - // being opened and any old ones from being removed. If we do provide - // tmpfs mounts, we'll want to freeze/thaw those separately. - m.Freeze() - return nil -} - -func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr { - dirname, _ := d.FullName(nil /* root */) - dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - log.Warningf("failed to open directory %q: %v", dirname, err) - return nil - } - dir.DecRef() - var stubSerializer fs.CollectEntriesSerializer - if err := dir.Readdir(ctx, &stubSerializer); err != nil { - log.Warningf("failed to iterate on host directory %q: %v", dirname, err) - return nil - } - delete(stubSerializer.Entries, ".") - delete(stubSerializer.Entries, "..") - return stubSerializer.Entries -} - -// newMountSource constructs a new host fs.MountSource -// relative to a root path. The root should match the mount point. -func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource { - return fs.NewMountSource(ctx, &superOperations{ - root: root, - inodeMappings: make(map[uint64]string), - mounter: mounter, - dontTranslateOwnership: dontTranslateOwnership, - }, filesystem, flags) -} - -// superOperations implements fs.MountSourceOperations. -// -// +stateify savable -type superOperations struct { - fs.SimpleMountSourceOperations - - // root is the path of the mount point. All inode mappings - // are relative to this root. - root string - - // inodeMappings contains mappings of fs.Inodes associated - // with this MountSource to paths under root. - inodeMappings map[uint64]string - - // mounter is the cached EUID/EGID that mounted this file system. - mounter fs.FileOwner - - // dontTranslateOwnership indicates whether to not translate file - // ownership. - // - // By default, files/directories owned by the sandbox uses UID/GID - // of the mounter. For files/directories that are not owned by the - // sandbox, file UID/GID is translated to a UID/GID which cannot - // be mapped in the sandboxed application's user namespace. The - // UID/GID will look like the nobody UID/GID (65534) but is not - // strictly owned by the user "nobody". - // - // If whitelistfs is a lower filesystem in an overlay, set - // dont_translate_ownership=true in mount options. - dontTranslateOwnership bool -} - -var _ fs.MountSourceOperations = (*superOperations)(nil) - -// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. -func (m *superOperations) ResetInodeMappings() { - m.inodeMappings = make(map[uint64]string) -} - -// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. -func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { - // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, - // because overlay copyUp may have changed them out from under us. - // So much for "immutable". - sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr - m.inodeMappings[sattr.InodeID] = path -} - -// Keep implements fs.MountSourceOperations.Keep. -// -// TODO(b/72455313,b/77596690): It is possible to change the permissions on a -// host file while it is in the dirent cache (say from RO to RW), but it is not -// possible to re-open the file with more relaxed permissions, since the host -// FD is already open and stored in the inode. -// -// Using the dirent LRU cache increases the odds that this bug is encountered. -// Since host file access is relatively fast anyways, we disable the LRU cache -// for host fs files. Once we can properly deal with permissions changes and -// re-opening host files, we should revisit whether or not to make use of the -// LRU cache. -func (*superOperations) Keep(*fs.Dirent) bool { - return false -} - -func init() { - fs.RegisterFilesystem(&Filesystem{}) -} diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go deleted file mode 100644 index 3111d2df9..000000000 --- a/pkg/sentry/fs/host/fs_test.go +++ /dev/null @@ -1,380 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package host - -import ( - "fmt" - "io/ioutil" - "os" - "path" - "reflect" - "sort" - "testing" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" -) - -// newTestMountNamespace creates a MountNamespace with a ramfs root. -// It returns the host folder created, which should be removed when done. -func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) { - p, err := ioutil.TempDir("", "root") - if err != nil { - return nil, "", err - } - - fd, err := open(nil, p) - if err != nil { - os.RemoveAll(p) - return nil, "", err - } - ctx := contexttest.Context(t) - root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) - if err != nil { - os.RemoveAll(p) - return nil, "", err - } - mm, err := fs.NewMountNamespace(ctx, root) - if err != nil { - os.RemoveAll(p) - return nil, "", err - } - return mm, p, nil -} - -// createTestDirs populates the root with some test files and directories. -// /a/a1.txt -// /a/a2.txt -// /b/b1.txt -// /b/c/c1.txt -// /symlinks/normal.txt -// /symlinks/to_normal.txt -> /symlinks/normal.txt -// /symlinks/recursive -> /symlinks -func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error { - r := m.Root() - defer r.DecRef() - - if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil { - return err - } - - a, err := r.Walk(ctx, r, "a") - if err != nil { - return err - } - defer a.DecRef() - - a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) - if err != nil { - return err - } - a1.DecRef() - - a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) - if err != nil { - return err - } - a2.DecRef() - - if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil { - return err - } - - b, err := r.Walk(ctx, r, "b") - if err != nil { - return err - } - defer b.DecRef() - - b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) - if err != nil { - return err - } - b1.DecRef() - - if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil { - return err - } - - c, err := b.Walk(ctx, r, "c") - if err != nil { - return err - } - defer c.DecRef() - - c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) - if err != nil { - return err - } - c1.DecRef() - - if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil { - return err - } - - symlinks, err := r.Walk(ctx, r, "symlinks") - if err != nil { - return err - } - defer symlinks.DecRef() - - normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) - if err != nil { - return err - } - normal.DecRef() - - if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil { - return err - } - - return symlinks.CreateLink(ctx, r, "/symlinks", "recursive") -} - -// allPaths returns a slice of all paths of entries visible in the rootfs. -func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) { - var paths []string - root := m.Root() - defer root.DecRef() - - maxTraversals := uint(1) - d, err := m.FindLink(ctx, root, nil, base, &maxTraversals) - if err != nil { - t.Logf("FindLink failed for %q", base) - return paths, err - } - defer d.DecRef() - - if fs.IsDir(d.Inode.StableAttr) { - dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - return nil, fmt.Errorf("failed to open directory %q: %v", base, err) - } - iter, ok := dir.FileOperations.(fs.DirIterator) - if !ok { - return nil, fmt.Errorf("cannot directly iterate on host directory %q", base) - } - dirCtx := &fs.DirCtx{ - Serializer: noopDentrySerializer{}, - } - if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil { - return nil, err - } - for name := range dirCtx.DentAttrs() { - if name == "." || name == ".." { - continue - } - - fullName := path.Join(base, name) - paths = append(paths, fullName) - - // Recurse. - subpaths, err := allPaths(ctx, t, m, fullName) - if err != nil { - return paths, err - } - paths = append(paths, subpaths...) - } - } - - return paths, nil -} - -type noopDentrySerializer struct{} - -func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error { - return nil -} -func (noopDentrySerializer) Written() int { - return 4096 -} - -// pathsEqual returns true if the two string slices contain the same entries. -func pathsEqual(got, want []string) bool { - sort.Strings(got) - sort.Strings(want) - - if len(got) != len(want) { - return false - } - - for i := range got { - if got[i] != want[i] { - return false - } - } - - return true -} - -func TestWhitelist(t *testing.T) { - for _, test := range []struct { - // description of the test. - desc string - // paths are the paths to whitelist - paths []string - // want are all of the directory entries that should be - // visible (nothing beyond this set should be visible). - want []string - }{ - { - desc: "root", - paths: []string{"/"}, - want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"}, - }, - { - desc: "top-level directories", - paths: []string{"/a", "/b"}, - want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, - }, - { - desc: "nested directories (1/2)", - paths: []string{"/b", "/b/c"}, - want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, - }, - { - desc: "nested directories (2/2)", - paths: []string{"/b/c", "/b"}, - want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, - }, - { - desc: "single file", - paths: []string{"/b/c/c1.txt"}, - want: []string{"/b", "/b/c", "/b/c/c1.txt"}, - }, - { - desc: "single file and directory", - paths: []string{"/a/a1.txt", "/b/c"}, - want: []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"}, - }, - { - desc: "symlink", - paths: []string{"/symlinks/to_normal.txt"}, - want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"}, - }, - { - desc: "recursive symlink", - paths: []string{"/symlinks/recursive/normal.txt"}, - want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"}, - }, - } { - t.Run(test.desc, func(t *testing.T) { - m, p, err := newTestMountNamespace(t) - if err != nil { - t.Errorf("Failed to create MountNamespace: %v", err) - } - defer os.RemoveAll(p) - - ctx := withRoot(contexttest.RootContext(t), m.Root()) - if err := createTestDirs(ctx, t, m); err != nil { - t.Errorf("Failed to create test dirs: %v", err) - } - - if err := installWhitelist(ctx, m, test.paths); err != nil { - t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err) - } - - got, err := allPaths(ctx, t, m, "/") - if err != nil { - t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err) - } - - if !pathsEqual(got, test.want) { - t.Errorf("For paths %v got %v want %v", test.paths, got, test.want) - } - }) - } -} - -func TestRootPath(t *testing.T) { - // Create a temp dir, which will be the root of our mounted fs. - rootPath, err := ioutil.TempDir(os.TempDir(), "root") - if err != nil { - t.Fatalf("TempDir failed: %v", err) - } - defer os.RemoveAll(rootPath) - - // Create two files inside the new root, one which will be whitelisted - // and one not. - whitelisted, err := ioutil.TempFile(rootPath, "white") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - if _, err := ioutil.TempFile(rootPath, "black"); err != nil { - t.Fatalf("TempFile failed: %v", err) - } - - // Create a mount with a root path and single whitelisted file. - hostFS := &Filesystem{} - ctx := contexttest.Context(t) - data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) - inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data, nil) - if err != nil { - t.Fatalf("Mount failed: %v", err) - } - mm, err := fs.NewMountNamespace(ctx, inode) - if err != nil { - t.Fatalf("NewMountNamespace failed: %v", err) - } - if err := hostFS.InstallWhitelist(ctx, mm); err != nil { - t.Fatalf("InstallWhitelist failed: %v", err) - } - - // Get the contents of the root directory. - rootDir := mm.Root() - rctx := withRoot(ctx, rootDir) - f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{}) - if err != nil { - t.Fatalf("GetFile failed: %v", err) - } - c := &fs.CollectEntriesSerializer{} - if err := f.Readdir(rctx, c); err != nil { - t.Fatalf("Readdir failed: %v", err) - } - - // We should have only our whitelisted file, plus the dots. - want := []string{path.Base(whitelisted.Name()), ".", ".."} - got := c.Order - sort.Strings(want) - sort.Strings(got) - if !reflect.DeepEqual(got, want) { - t.Errorf("Readdir got %v, wanted %v", got, want) - } -} - -type rootContext struct { - context.Context - root *fs.Dirent -} - -// withRoot returns a copy of ctx with the given root. -func withRoot(ctx context.Context, root *fs.Dirent) context.Context { - return &rootContext{ - Context: ctx, - root: root, - } -} - -// Value implements Context.Value. -func (rc rootContext) Value(key interface{}) interface{} { - switch key { - case fs.CtxRoot: - rc.root.IncRef() - return rc.root - default: - return rc.Context.Value(key) - } -} diff --git a/pkg/sentry/fs/host/host.go b/pkg/sentry/fs/host/host.go new file mode 100644 index 000000000..081ba1dd8 --- /dev/null +++ b/pkg/sentry/fs/host/host.go @@ -0,0 +1,59 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host supports file descriptors imported directly. +package host + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystem is a host filesystem. +// +// +stateify savable +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name under which the filesystem is registered. +const FilesystemName = "host" + +// Name is the name of the filesystem. +func (*filesystem) Name() string { + return FilesystemName +} + +// Mount returns an error. Mounting hostfs is not allowed. +func (*filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, dataObj interface{}) (*fs.Inode, error) { + return nil, syserror.EPERM +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*filesystem) AllowUserMount() bool { + return false +} + +// AllowUserList prohibits this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return false +} + +// Flags returns that there is nothing special about this file system. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 6fa39caab..62f1246aa 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -17,12 +17,10 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -69,9 +67,6 @@ type inodeOperations struct { // // +stateify savable type inodeFileState struct { - // Common file system state. - mops *superOperations `state:"wait"` - // descriptor is the backing host FD. descriptor *descriptor `state:"wait"` @@ -160,7 +155,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err if err := syscall.Fstat(i.FD(), &s); err != nil { return fs.UnstableAttr{}, err } - return unstableAttr(i.mops, &s), nil + return unstableAttr(&s), nil } // Allocate implements fsutil.CachedFileObject.Allocate. @@ -172,7 +167,7 @@ func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error var _ fs.InodeOperations = (*inodeOperations)(nil) // newInode returns a new fs.Inode backed by the host FD. -func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { +func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool) (*fs.Inode, error) { // Retrieve metadata. var s syscall.Stat_t err := syscall.Fstat(fd, &s) @@ -181,24 +176,17 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, } fileState := &inodeFileState{ - mops: msrc.MountSourceOperations.(*superOperations), sattr: stableAttr(&s), } // Initialize the wrapped host file descriptor. - fileState.descriptor, err = newDescriptor( - fd, - donated, - saveable, - wouldBlock(&s), - &fileState.queue, - ) + fileState.descriptor, err = newDescriptor(fd, saveable, wouldBlock(&s), &fileState.queue) if err != nil { return nil, err } // Build the fs.InodeOperations. - uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s) + uattr := unstableAttr(&s) iops := &inodeOperations{ fileState: fileState, cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{ @@ -232,54 +220,23 @@ func (i *inodeOperations) Release(context.Context) { // Lookup implements fs.InodeOperations.Lookup. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - // Get a new FD relative to i at name. - fd, err := open(i, name) - if err != nil { - if err == syserror.ENOENT { - return nil, syserror.ENOENT - } - return nil, err - } - - inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) - if err != nil { - return nil, err - } - - // Return the fs.Dirent. - return fs.NewDirent(ctx, inode, name), nil + return nil, syserror.ENOENT } // Create implements fs.InodeOperations.Create. func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { - // Create a file relative to i at name. - // - // N.B. We always open this file O_RDWR regardless of flags because a - // future GetFile might want more access. Open allows this regardless - // of perm. - fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode()) - if err != nil { - return nil, err - } - - inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) - if err != nil { - return nil, err - } + return nil, syserror.EPERM - d := fs.NewDirent(ctx, inode, name) - defer d.DecRef() - return inode.GetFile(ctx, d, flags) } // CreateDirectory implements fs.InodeOperations.CreateDirectory. func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode())) + return syserror.EPERM } // CreateLink implements fs.InodeOperations.CreateLink. func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { - return createLink(i.fileState.FD(), oldname, newname) + return syserror.EPERM } // CreateHardLink implements fs.InodeOperations.CreateHardLink. @@ -294,25 +251,17 @@ func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePe // Remove implements fs.InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return unlinkAt(i.fileState.FD(), name, false /* dir */) + return syserror.EPERM } // RemoveDirectory implements fs.InodeOperations.RemoveDirectory. func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { - return unlinkAt(i.fileState.FD(), name, true /* dir */) + return syserror.EPERM } // Rename implements fs.InodeOperations.Rename. func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { - op, ok := oldParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV - } - np, ok := newParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV - } - return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName) + return syserror.EPERM } // Bind implements fs.InodeOperations.Bind. @@ -448,82 +397,17 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { } // AddLink implements fs.InodeOperations.AddLink. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) AddLink() {} // DropLink implements fs.InodeOperations.DropLink. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} // readdirAll returns all of the directory entries in i. func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) { - i.readdirMu.Lock() - defer i.readdirMu.Unlock() - - fd := i.fileState.FD() - - // syscall.ReadDirent will use getdents, which will seek the file past - // the last directory entry. To read the directory entries a second - // time, we need to seek back to the beginning. - if _, err := syscall.Seek(fd, 0, 0); err != nil { - if err == syscall.ESPIPE { - // All directories should be seekable. If this file - // isn't seekable, it is not a directory and we should - // return that more sane error. - err = syscall.ENOTDIR - } - return nil, err - } - - names := make([]string, 0, 100) - for { - // Refill the buffer if necessary - if d.bufp >= d.nbuf { - d.bufp = 0 - // ReadDirent will just do a sys_getdents64 to the kernel. - n, err := syscall.ReadDirent(fd, d.buf) - if err != nil { - return nil, err - } - if n == 0 { - break // EOF - } - d.nbuf = n - } - - var nb int - // Parse the dirent buffer we just get and return the directory names along - // with the number of bytes consumed in the buffer. - nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names) - d.bufp += nb - } - - entries := make(map[string]fs.DentAttr) - for _, filename := range names { - // Lookup the type and host device and inode. - stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW) - if lerr == syscall.ENOENT { - // File disappeared between readdir and lstat. - // Just treat it as if it didn't exist. - continue - } - - // There was a serious problem, we should probably report it. - if lerr != nil { - return nil, lerr - } - - entries[filename] = fs.DentAttr{ - Type: nodeType(&stat), - InodeID: hostFileDevice.Map(device.MultiDeviceKey{ - Device: stat.Dev, - Inode: stat.Ino, - }), - } - } - return entries, nil + // We only support non-directory file descriptors that have been + // imported, so just claim that this isn't a directory, even if it is. + return nil, syscall.ENOTDIR } diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index 299e0e0b0..1adbd4562 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -18,29 +18,14 @@ import ( "fmt" "syscall" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" ) -// beforeSave is invoked by stateify. -func (i *inodeFileState) beforeSave() { - if !i.queue.IsEmpty() { - panic("event queue must be empty") - } - if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { - uattr, err := i.unstableAttr(context.Background()) - if err != nil { - panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)}) - } - i.savedUAttr = &uattr - } -} - // afterLoad is invoked by stateify. func (i *inodeFileState) afterLoad() { // Initialize the descriptor value. - if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil { + if err := i.descriptor.initAfterLoad(i.sattr.InodeID, &i.queue); err != nil { panic(fmt.Sprintf("failed to load value of descriptor: %v", err)) } @@ -61,19 +46,4 @@ func (i *inodeFileState) afterLoad() { // change across save and restore, error out. panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)}) } - - if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { - env, ok := fs.CurrentRestoreEnvironment() - if !ok { - panic("missing restore environment") - } - uattr := unstableAttr(i.mops, &s) - if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { - panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)}) - } - if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { - panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)}) - } - i.savedUAttr = nil - } } diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 7221bc825..c507f57eb 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -15,79 +15,12 @@ package host import ( - "io/ioutil" - "os" - "path" "syscall" "testing" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" ) -// TestMultipleReaddir verifies that multiple Readdir calls return the same -// thing if they use different dir contexts. -func TestMultipleReaddir(t *testing.T) { - p, err := ioutil.TempDir("", "readdir") - if err != nil { - t.Fatalf("Failed to create test dir: %v", err) - } - defer os.RemoveAll(p) - - f, err := os.Create(path.Join(p, "a.txt")) - if err != nil { - t.Fatalf("Failed to create a.txt: %v", err) - } - f.Close() - - f, err = os.Create(path.Join(p, "b.txt")) - if err != nil { - t.Fatalf("Failed to create b.txt: %v", err) - } - f.Close() - - fd, err := open(nil, p) - if err != nil { - t.Fatalf("Failed to open %q: %v", p, err) - } - ctx := contexttest.Context(t) - n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) - if err != nil { - t.Fatalf("Failed to create inode: %v", err) - } - - dirent := fs.NewDirent(ctx, n, "readdir") - openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true}) - if err != nil { - t.Fatalf("Failed to get file: %v", err) - } - defer openFile.DecRef() - - c1 := &fs.DirCtx{DirCursor: new(string)} - if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c1, 0); err != nil { - t.Fatalf("First Readdir failed: %v", err) - } - - c2 := &fs.DirCtx{DirCursor: new(string)} - if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, dirent, c2, 0); err != nil { - t.Errorf("Second Readdir failed: %v", err) - } - - if _, ok := c1.DentAttrs()["a.txt"]; !ok { - t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs()) - } - if _, ok := c1.DentAttrs()["b.txt"]; !ok { - t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs()) - } - - if _, ok := c2.DentAttrs()["a.txt"]; !ok { - t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs()) - } - if _, ok := c2.DentAttrs()["b.txt"]; !ok { - t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs()) - } -} - // TestCloseFD verifies fds will be closed. func TestCloseFD(t *testing.T) { var p [2]int @@ -99,7 +32,7 @@ func TestCloseFD(t *testing.T) { // Use the write-end because we will detect if it's closed on the read end. ctx := contexttest.Context(t) - file, err := NewFile(ctx, p[1], fs.RootOwner) + file, err := NewFile(ctx, p[1]) if err != nil { t.Fatalf("Failed to create File: %v", err) } diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 271582e54..150ac8e19 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) +// LINT.IfChange + func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) @@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error { } return nil } + +// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go) diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 06fc2d80a..b6e94583e 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -37,6 +37,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // maxSendBufferSize is the maximum host send buffer size allowed for endpoint. // // N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). @@ -388,3 +390,5 @@ func (c *ConnectedEndpoint) Release() { // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. func (c *ConnectedEndpoint) CloseUnread() {} + +// LINT.ThenChange(../../fsimpl/host/socket.go) diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index af6955675..5c18dbd5e 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // maxIovs is the maximum number of iovecs to pass to the host. var maxIovs = linux.UIO_MAXIOV @@ -111,3 +113,5 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec return total, iovecs, nil, err } + +// LINT.ThenChange(../../fsimpl/host/socket_iovec.go) diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index eb4afe520..affdbcacb 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -199,14 +199,14 @@ func TestListen(t *testing.T) { } func TestPasscred(t *testing.T) { - e := ConnectedEndpoint{} + e := &ConnectedEndpoint{} if got, want := e.Passcred(), false; got != want { t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) } } func TestGetLocalAddress(t *testing.T) { - e := ConnectedEndpoint{path: "foo"} + e := &ConnectedEndpoint{path: "foo"} want := tcpip.FullAddress{Addr: tcpip.Address("foo")} if got, err := e.GetLocalAddress(); err != nil || got != want { t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) @@ -214,7 +214,7 @@ func TestGetLocalAddress(t *testing.T) { } func TestQueuedSize(t *testing.T) { - e := ConnectedEndpoint{} + e := &ConnectedEndpoint{} tests := []struct { name string f func() int64 diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index f3bbed7ea..5d4f312cf 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -19,6 +19,8 @@ import ( "unsafe" ) +// LINT.IfChange + // fdReadVec receives from fd to bufs. // // If the total length of bufs is > maxlen, fdReadVec will do a partial read @@ -99,3 +101,5 @@ func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int6 return int64(n), length, err } + +// LINT.ThenChange(../../fsimpl/host/socket_unsafe.go) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 3f218b4a7..cb91355ab 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // TTYFileOperations implements fs.FileOperations for a host file descriptor // that wraps a TTY FD. // @@ -43,6 +45,7 @@ type TTYFileOperations struct { // connected to this TTY. fgProcessGroup *kernel.ProcessGroup + // termios contains the terminal attributes for this TTY. termios linux.KernelTermios } @@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } + +// LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index e37e687c6..1b0356930 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -16,7 +16,6 @@ package host import ( "os" - "path" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -28,45 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func open(parent *inodeOperations, name string) (int, error) { - if parent == nil && !path.IsAbs(name) { - return -1, syserror.EINVAL - } - name = path.Clean(name) - - // Don't follow through symlinks. - flags := syscall.O_NOFOLLOW - - if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil { - return fd, nil - } - // Retry as read-only. - if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil { - return fd, nil - } - - // Retry as write-only. - if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil { - return fd, nil - } - - // Retry as a symlink, by including O_PATH as an option. - fd, err := openAt(parent, name, linux.O_PATH|flags, 0) - if err == nil { - return fd, nil - } - - // Everything failed. - return -1, err -} - -func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) { - if parent == nil { - return syscall.Open(name, flags, uint32(perm)) - } - return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm)) -} - func nodeType(s *syscall.Stat_t) fs.InodeType { switch x := (s.Mode & syscall.S_IFMT); x { case syscall.S_IFLNK: @@ -107,51 +67,19 @@ func stableAttr(s *syscall.Stat_t) fs.StableAttr { } } -func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner { - // User requested no translation, just return actual owner. - if mo.dontTranslateOwnership { - return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)} +func owner(s *syscall.Stat_t) fs.FileOwner { + return fs.FileOwner{ + UID: auth.KUID(s.Uid), + GID: auth.KGID(s.Gid), } - - // Show only IDs relevant to the sandboxed task. I.e. if we not own the - // file, no sandboxed task can own the file. In that case, we - // use OverflowID for UID, implying that the IDs are not mapped in the - // "root" user namespace. - // - // E.g. - // sandbox's host EUID/EGID is 1/1. - // some_dir's host UID/GID is 2/1. - // Task that mounted this fs has virtualized EUID/EGID 5/5. - // - // If you executed `ls -n` in the sandboxed task, it would show: - // drwxwrxwrx [...] 65534 5 [...] some_dir - - // Files are owned by OverflowID by default. - owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)} - - // If we own file on host, let mounting task's initial EUID own - // the file. - if s.Uid == hostUID { - owner.UID = mo.mounter.UID - } - - // If our group matches file's group, make file's group match - // the mounting task's initial EGID. - for _, gid := range hostGIDs { - if s.Gid == gid { - owner.GID = mo.mounter.GID - break - } - } - return owner } -func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { +func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr { return fs.UnstableAttr{ Size: s.Size, Usage: s.Blocks * 512, Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), - Owner: owner(mo, s), + Owner: owner(s), AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), @@ -165,6 +93,8 @@ type dirInfo struct { bufp int // location of next record in buf. } +// LINT.IfChange + // isBlockError unwraps os errors and checks if they are caused by EAGAIN or // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. func isBlockError(err error) bool { @@ -177,6 +107,8 @@ func isBlockError(err error) bool { return false } +// LINT.ThenChange(../../fsimpl/host/util.go) + func hostEffectiveKIDs() (uint32, []uint32, error) { gids, err := os.Getgroups() if err != nil { diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index 3ab36b088..23bd35d64 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -26,26 +26,6 @@ import ( // NulByte is a single NUL byte. It is passed to readlinkat as an empty string. var NulByte byte = '\x00' -func createLink(fd int, name string, linkName string) error { - namePtr, err := syscall.BytePtrFromString(name) - if err != nil { - return err - } - linkNamePtr, err := syscall.BytePtrFromString(linkName) - if err != nil { - return err - } - _, _, errno := syscall.Syscall( - syscall.SYS_SYMLINKAT, - uintptr(unsafe.Pointer(namePtr)), - uintptr(fd), - uintptr(unsafe.Pointer(linkNamePtr))) - if errno != 0 { - return errno - } - return nil -} - func readLink(fd int) (string, error) { // Buffer sizing copied from os.Readlink. for l := 128; ; l *= 2 { @@ -66,27 +46,6 @@ func readLink(fd int) (string, error) { } } -func unlinkAt(fd int, name string, dir bool) error { - namePtr, err := syscall.BytePtrFromString(name) - if err != nil { - return err - } - var flags uintptr - if dir { - flags = linux.AT_REMOVEDIR - } - _, _, errno := syscall.Syscall( - syscall.SYS_UNLINKAT, - uintptr(fd), - uintptr(unsafe.Pointer(namePtr)), - flags, - ) - if errno != 0 { - return errno - } - return nil -} - func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec { if omit { return syscall.Timespec{0, linux.UTIME_OMIT} diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index d49c3a635..ce397a5e3 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -20,7 +20,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/waiter" ) @@ -34,7 +33,7 @@ func TestWait(t *testing.T) { defer syscall.Close(fds[1]) ctx := contexttest.Context(t) - file, err := NewFile(ctx, fds[0], fs.RootOwner) + file, err := NewFile(ctx, fds[0]) if err != nil { syscall.Close(fds[0]) t.Fatalf("NewFile failed: %v", err) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 55fb71c16..a34fbc946 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -102,7 +102,6 @@ func (i *Inode) DecRef() { // destroy releases the Inode and releases the msrc reference taken. func (i *Inode) destroy() { - // FIXME(b/38173783): Context is not plumbed here. ctx := context.Background() if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is @@ -397,8 +396,6 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { // AddLink calls i.InodeOperations.AddLink. func (i *Inode) AddLink() { if i.overlay != nil { - // FIXME(b/63117438): Remove this from InodeOperations altogether. - // // This interface is only used by ramfs to update metadata of // children. These filesystems should _never_ have overlay // Inodes cached as children. So explicitly disallow this diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 5ada33a32..537c8d257 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -231,7 +231,8 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st upperFile.Dirent.Inode.IncRef() entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false) if err != nil { - cleanupUpper(ctx, o.upper, name) + werr := fmt.Errorf("newOverlayEntry failed: %v", err) + cleanupUpper(ctx, o.upper, name, werr) return nil, err } diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 928c90aa0..e3a715c1f 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i } var writeLen int64 - for event := i.events.Front(); event != nil; event = event.Next() { + for it := i.events.Front(); it != nil; { + event := it + it = it.Next() + // Does the buffer have enough remaining space to hold the event we're // about to write out? if dst.NumBytes() < int64(event.sizeOf()) { diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index e672a438c..a3d10770b 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { - groot := g.Root() - name, _ := groot.FullName(root) - groot.DecRef() - gotStr[i] = name - gotPaths[name] = struct{}{} + if groot := g.Root(); groot != nil { + name, _ := groot.FullName(root) + groot.DecRef() + gotStr[i] = name + gotPaths[name] = struct{}{} + } } if len(got) != len(want) { return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 574a2cc91..3f2bd0e87 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -17,13 +17,9 @@ package fs import ( "fmt" "math" - "path" - "strings" "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -100,10 +96,14 @@ func newUndoMount(d *Dirent) *Mount { } } -// Root returns the root dirent of this mount. Callers must call DecRef on the -// returned dirent. +// Root returns the root dirent of this mount. +// +// This may return nil if the mount has already been free. Callers must handle this +// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent. func (m *Mount) Root() *Dirent { - m.root.IncRef() + if !m.root.TryIncRef() { + return nil + } return m.root } @@ -269,19 +269,6 @@ func (mns *MountNamespace) DecRef() { mns.DecRefWithDestructor(mns.destroy) } -// Freeze freezes the entire mount tree. -func (mns *MountNamespace) Freeze() { - mns.mu.Lock() - defer mns.mu.Unlock() - - // We only want to freeze Dirents with active references, not Dirents referenced - // by a mount's MountSource. - mns.flushMountSourceRefsLocked() - - // Freeze the entire shebang. - mns.root.Freeze() -} - // withMountLocked prevents further walks to `node`, because `node` is about to // be a mount point. func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error { @@ -634,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) { defer mns.mu.Unlock() mns.root.SyncAll(ctx) } - -// ResolveExecutablePath resolves the given executable name given a set of -// paths that might contain it. -func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) { - // Absolute paths can be used directly. - if path.IsAbs(name) { - return name, nil - } - - // Paths with '/' in them should be joined to the working directory, or - // to the root if working directory is not set. - if strings.IndexByte(name, '/') > 0 { - if wd == "" { - wd = "/" - } - if !path.IsAbs(wd) { - return "", fmt.Errorf("working directory %q must be absolute", wd) - } - return path.Join(wd, name), nil - } - - // Otherwise, We must lookup the name in the paths, starting from the - // calling context's root directory. - root := RootFromContext(ctx) - if root == nil { - // Caller has no root. Don't bother traversing anything. - return "", syserror.ENOENT - } - defer root.DecRef() - for _, p := range paths { - binPath := path.Join(p, name) - traversals := uint(linux.MaxSymlinkTraversals) - d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) - if err == syserror.ENOENT || err == syserror.EACCES { - // Didn't find it here. - continue - } - if err != nil { - return "", err - } - defer d.DecRef() - - // Check that it is a regular file. - if !IsRegular(d.Inode.StableAttr) { - continue - } - - // Check whether we can read and execute the found file. - if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { - log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) - continue - } - return path.Join("/", p, name), nil - } - return "", syserror.ENOENT -} - -// GetPath returns the PATH as a slice of strings given the environment -// variables. -func GetPath(env []string) []string { - const prefix = "PATH=" - for _, e := range env { - if strings.HasPrefix(e, prefix) { - return strings.Split(strings.TrimPrefix(e, prefix), ":") - } - } - return nil -} diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 280093c5e..77c2c5c0e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 465b47da9..91617267d 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -58,12 +58,16 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) var buf bytes.Buffer fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. - fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree/1024) + fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(&buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(&buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index c10888100..1fc9c703c 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { }) for _, m := range ms { mroot := m.Root() + if mroot == nil { + continue // No longer valid. + } mountPath, desc := mroot.FullName(rootDir) mroot.DecRef() if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue } - fn(mountPath, m) } } @@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se var buf bytes.Buffer forEachMount(mif.t, func(mountPath string, m *fs.Mount) { + mroot := m.Root() + if mroot == nil { + return // No longer valid. + } + defer mroot.DecRef() + // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) @@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se // (3) Major:Minor device ID. We don't have a superblock, so we // just use the root inode device number. - mroot := m.Root() - defer mroot.DecRef() - sa := mroot.Inode.StableAttr fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) @@ -165,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string { // NOTE(b/147673608): If the mount is a cgroup, we also need to include // the cgroup name in the options. For now we just read that from the // path. - // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // + // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we // should get this value from the cgroup itself, and not rely on the // path. if msrc.FilesystemType == "cgroup" { @@ -207,6 +213,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan // // The "needs dump"and fsck flags are always 0, which is allowed. root := m.Root() + if root == nil { + return // No longer valid. + } defer root.DecRef() flags := root.Inode.MountSource.Flags diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 6f2775344..bd18177d4 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -40,44 +40,48 @@ import ( // LINT.IfChange -// newNet creates a new proc net entry. -func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { +// newNetDir creates a new proc net entry. +func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + k := t.Kernel() + var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + if s := t.NetworkNamespace().Stack(); s != nil { + // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task + // network namespace. contents = map[string]*fs.Inode{ - "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), - "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), + "dev": seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc), + "snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc), // The following files are simple stubs until they are // implemented in netstack, if the file contains a // header the stub is just the header otherwise it is // an empty file. - "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), + "arp": newStaticProcInode(t, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), - "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), - "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), - "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), - "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), + "netlink": newStaticProcInode(t, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), + "netstat": newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), + "packet": newStaticProcInode(t, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), + "protocols": newStaticProcInode(t, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), // Linux sets psched values to: nsec per usec, psched // tick in ns, 1000000, high res timer ticks per sec // (ClockGetres returns 1ns resolution). - "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), - "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function\n")), - "route": seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc), - "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), - "udp": seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc), - "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), + "psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), + "ptype": newStaticProcInode(t, msrc, []byte("Type Device Function\n")), + "route": seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc), + "tcp": seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc), + "udp": seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc), + "unix": seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc), } if s.SupportsIPv6() { - contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) - contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) - contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc) - contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) + contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc) + contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte("")) + contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc) + contents["udp6"] = newStaticProcInode(t, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) } } - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(t, d, msrc, fs.SpecialDirectory, t) } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. @@ -834,4 +838,4 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se return data, 0 } -// LINT.ThenChange(../../fsimpl/proc/tasks_net.go) +// LINT.ThenChange(../../fsimpl/proc/task_net.go) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c8abb5052..c659224a7 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -70,6 +70,7 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), "mounts": newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil), + "net": newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/net"), msrc, fs.Symlink, nil), "self": newSelf(ctx, pidns, msrc), "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), "thread-self": newThreadSelf(ctx, pidns, msrc), @@ -86,7 +87,6 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string } // Add more contents that need proc to be initialized. - p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc)) p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0772d4ae4..702fdd392 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -80,7 +80,7 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir } // Truncate implements fs.InodeOperations.Truncate. -func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { +func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error { return nil } @@ -196,7 +196,7 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f } // Truncate implements fs.InodeOperations.Truncate. -func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error { +func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { return nil } @@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), "core": p.newSysNetCore(ctx, msrc, s), diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ca020e11e..4bbe90198 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -56,14 +57,23 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { return m, nil } +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + // taskDir represents a task-level directory. // // +stateify savable type taskDir struct { ramfs.Dir - t *kernel.Task - pidns *kernel.PIDNamespace + t *kernel.Task } var _ fs.InodeOperations = (*taskDir)(nil) @@ -71,24 +81,27 @@ var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - "io": newIO(t, msrc, isThreadGroup), - "maps": newMaps(t, msrc), - "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "ns": newNamespaceDir(t, msrc), - "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), - "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, p.pidns), - "uid_map": newUIDMap(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "net": newNetDir(t, msrc), + "ns": newNamespaceDir(t, msrc), + "oom_score": newOOMScore(t, msrc), + "oom_score_adj": newOOMScoreAdj(t, msrc), + "smaps": newSmaps(t, msrc), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), + "statm": newStatm(t, msrc), + "status": newStatus(t, msrc, p.pidns), + "uid_map": newUIDMap(t, msrc), } if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) @@ -249,12 +262,13 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(t, exeSymlink, msrc, fs.Symlink, t) } -func (e *exe) executable() (d *fs.Dirent, err error) { +func (e *exe) executable() (file fsbridge.File, err error) { + if err := checkTaskState(e.t); err != nil { + return nil, err + } e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO(b/34851096): Check shouldn't allow Readlink once the - // Task is zombied. err = syserror.EACCES return } @@ -262,9 +276,9 @@ func (e *exe) executable() (d *fs.Dirent, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). - d = mm.Executable() - if d == nil { - err = syserror.ENOENT + file = mm.Executable() + if file == nil { + err = syserror.ESRCH } }) return @@ -283,15 +297,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } defer exec.DecRef() - root := fs.RootFromContext(ctx) - if root == nil { - // This doesn't correspond to anything in Linux because the vfs is - // global there. - return "", syserror.EINVAL - } - defer root.DecRef() - n, _ := exec.FullName(root) - return n, nil + return exec.PathnameWithDeleted(ctx), nil } // namespaceSymlink represents a symlink in the namespacefs, such as the files @@ -317,11 +323,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs. return newProcInode(t, n, msrc, fs.Symlink, t) } +// Readlink reads the symlink value. +func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if err := checkTaskState(n.t); err != nil { + return "", err + } + return n.Symlink.Readlink(ctx, inode) +} + // Getlink implements fs.InodeOperations.Getlink. func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { return nil, syserror.EACCES } + if err := checkTaskState(n.t); err != nil { + return nil, err + } // Create a new regular file to fake the namespace file. iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC) @@ -803,4 +820,95 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc return int64(n), err } +// newOOMScore returns a oom_score file. It is a stub that always returns 0. +// TODO(gvisor.dev/issue/1967) +func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newStaticProcInode(t, msrc, []byte("0\n")) +} + +// oomScoreAdj is a file containing the oom_score adjustment for a task. +// +// +stateify savable +type oomScoreAdj struct { + fsutil.SimpleFileInode + + t *kernel.Task +} + +// +stateify savable +type oomScoreAdjFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + t *kernel.Task +} + +// newOOMScoreAdj returns a oom_score_adj file. +func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + i := &oomScoreAdj{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(t, i, msrc, fs.SpecialFile, t) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil +} + +// Read implements fs.FileOperations.Read. +func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj()) + if offset >= int64(buf.Len()) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, buf.Bytes()[offset:]) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } + if err := f.t.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} + // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d5be56c3f..bc117ca6a 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -44,9 +44,6 @@ const ( // lookup. cacheRevalidate = "revalidate" - // TODO(edahlgren/mpratt): support a tmpfs size limit. - // size = "size" - // Permissions that exceed modeMask will be rejected. modeMask = 01777 diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 25abbc151..1dc75291d 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -39,7 +39,7 @@ var ( opensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") opensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") reads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") - readWait = metric.MustCreateNewUint64Metric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") + readWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") ) // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 3c2b583ae..b095312fe 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -39,14 +39,13 @@ var fsInfo = fs.Info{ // rename implements fs.InodeOperations.Rename for tmpfs nodes. func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { - op, ok := oldParent.InodeOperations.(*Dir) - if !ok { - return syserror.EXDEV - } - np, ok := newParent.InodeOperations.(*Dir) - if !ok { + // Don't allow renames across different mounts. + if newParent.MountSource != oldParent.MountSource { return syserror.EXDEV } + + op := oldParent.InodeOperations.(*Dir) + np := newParent.InodeOperations.(*Dir) return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName, replacement) } diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 12b1c6097..2e9dd2d55 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -27,6 +27,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // canonMaxBytes is the number of bytes that fit into a single line of // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE @@ -443,3 +445,5 @@ func (l *lineDiscipline) peek(b []byte) int { } return size } + +// LINT.ThenChange(../../fsimpl/devpts/line_discipline.go) diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index f62da49bd..fe07fa929 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // masterInodeOperations are the fs.InodeOperations for the master end of the // Terminal (ptmx file). // @@ -232,3 +234,5 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { unimpl.EmitUnimplementedEvent(ctx) } } + +// LINT.ThenChange(../../fsimpl/devpts/master.go) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 1ca79c0b2..ceabb9b1e 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 @@ -234,3 +236,5 @@ func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) } + +// LINT.ThenChange(../../fsimpl/devpts/queue.go) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 6a2dbc576..9871f6fc6 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // slaveInodeOperations are the fs.InodeOperations for the slave end of the // Terminal (pts file). // @@ -172,3 +174,5 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem return 0, syserror.ENOTTY } } + +// LINT.ThenChange(../../fsimpl/devpts/slave.go) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 5883f26db..ddcccf4da 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Terminal is a pseudoterminal. // // +stateify savable @@ -126,3 +128,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { } return tm.slaveKTTY } + +// LINT.ThenChange(../../fsimpl/devpts/terminal.go) diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD new file mode 100644 index 000000000..bd5dac373 --- /dev/null +++ b/pkg/sentry/fs/user/BUILD @@ -0,0 +1,39 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "user", + srcs = [ + "path.go", + "user.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/log", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "user_test", + size = "small", + srcs = ["user_test.go"], + library = ":user", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go new file mode 100644 index 000000000..fbd4547a7 --- /dev/null +++ b/pkg/sentry/fs/user/path.go @@ -0,0 +1,169 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package user + +import ( + "fmt" + "path" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ResolveExecutablePath resolves the given executable name given the working +// dir and environment. +func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs.MountNamespace, envv []string, wd, name string) (string, error) { + // Absolute paths can be used directly. + if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // calling context's root directory. + paths := getPath(envv) + + root := fs.RootFromContext(ctx) + if root == nil { + // Caller has no root. Don't bother traversing anything. + return "", syserror.ENOENT + } + defer root.DecRef() + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + traversals := uint(linux.MaxSymlinkTraversals) + d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + defer d.DecRef() + + // Check that it is a regular file. + if !fs.IsRegular(d.Inode.StableAttr) { + continue + } + + // Check whether we can read and execute the found file. + if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) + continue + } + return path.Join("/", p, name), nil + } + + // Couldn't find it. + return "", syserror.ENOENT +} + +// ResolveExecutablePathVFS2 resolves the given executable name given the +// working dir and environment. +func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, envv []string, wd, name string) (string, error) { + // Absolute paths can be used directly. + if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // calling context's root directory. + paths := getPath(envv) + + root := mns.Root() + defer root.DecRef() + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + dentry.DecRef() + + return binPath, nil + } + + // Couldn't find it. + return "", syserror.ENOENT +} + +// getPath returns the PATH as a slice of strings given the environment +// variables. +func getPath(env []string) []string { + const prefix = "PATH=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.Split(strings.TrimPrefix(e, prefix), ":") + } + } + return nil +} diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go new file mode 100644 index 000000000..f4d525523 --- /dev/null +++ b/pkg/sentry/fs/user/user.go @@ -0,0 +1,239 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package user contains methods for resolving filesystem paths based on the +// user and their environment. +package user + +import ( + "bufio" + "fmt" + "io" + "strconv" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +type fileReader struct { + // Ctx is the context for the file reader. + Ctx context.Context + + // File is the file to read from. + File *fs.File +} + +// Read implements io.Reader.Read. +func (r *fileReader) Read(buf []byte) (int, error) { + n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf)) + return int(n), err +} + +// getExecUserHome returns the home directory of the executing user read from +// /etc/passwd as read from the container filesystem. +func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) { + // The default user home directory to return if no user matching the user + // if found in the /etc/passwd found in the image. + const defaultHome = "/" + + // Open the /etc/passwd file from the dirent via the root mount namespace. + mnsRoot := rootMns.Root() + maxTraversals := uint(linux.MaxSymlinkTraversals) + dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals) + if err != nil { + // NOTE: Ignore errors opening the passwd file. If the passwd file + // doesn't exist we will return the default home directory. + return defaultHome, nil + } + defer dirent.DecRef() + + // Check read permissions on the file. + if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { + // NOTE: Ignore permissions errors here and return default root dir. + return defaultHome, nil + } + + // Only open regular files. We don't open other files like named pipes as + // they may block and might present some attack surface to the container. + // Note that runc does not seem to do this kind of checking. + if !fs.IsRegular(dirent.Inode.StableAttr) { + return defaultHome, nil + } + + f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false}) + if err != nil { + return "", err + } + defer f.DecRef() + + r := &fileReader{ + Ctx: ctx, + File: f, + } + + return findHomeInPasswd(uint32(uid), r, defaultHome) +} + +type fileReaderVFS2 struct { + ctx context.Context + fd *vfs.FileDescription +} + +func (r *fileReaderVFS2) Read(buf []byte) (int, error) { + n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + return int(n), err +} + +func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) { + const defaultHome = "/" + + root := mns.Root() + defer root.DecRef() + + creds := auth.CredentialsFromContext(ctx) + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse("/etc/passwd"), + } + + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + } + + fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts) + if err != nil { + return defaultHome, nil + } + defer fd.DecRef() + + r := &fileReaderVFS2{ + ctx: ctx, + fd: fd, + } + + homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) + if err != nil { + return "", err + } + + return homeDir, nil +} + +// MaybeAddExecUserHome returns a new slice with the HOME enviroment variable +// set if the slice does not already contain it, otherwise it returns the +// original slice unmodified. +func MaybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { + // Check if the envv already contains HOME. + for _, env := range envv { + if strings.HasPrefix(env, "HOME=") { + // We have it. Return the original slice unmodified. + return envv, nil + } + } + + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + homeDir, err := getExecUserHome(ctx, mns, uid) + if err != nil { + return nil, fmt.Errorf("error reading exec user: %v", err) + } + + return append(envv, "HOME="+homeDir), nil +} + +// MaybeAddExecUserHomeVFS2 returns a new slice with the HOME enviroment +// variable set if the slice does not already contain it, otherwise it returns +// the original slice unmodified. +func MaybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { + // Check if the envv already contains HOME. + for _, env := range envv { + if strings.HasPrefix(env, "HOME=") { + // We have it. Return the original slice unmodified. + return envv, nil + } + } + + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid) + if err != nil { + return nil, fmt.Errorf("error reading exec user: %v", err) + } + return append(envv, "HOME="+homeDir), nil +} + +// findHomeInPasswd parses a passwd file and returns the given user's home +// directory. This function does it's best to replicate the runc's behavior. +func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { + s := bufio.NewScanner(passwd) + + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // Pull out part of passwd entry. Loosely parse the passwd entry as some + // passwd files could be poorly written and for compatibility with runc. + // + // Per 'man 5 passwd' + // /etc/passwd contains one line for each user account, with seven + // fields delimited by colons (“:”). These fields are: + // + // - login name + // - optional encrypted password + // - numerical user ID + // - numerical group ID + // - user name or comment field + // - user home directory + // - optional user command interpreter + parts := strings.Split(line, ":") + + found := false + homeDir := "" + for i, p := range parts { + switch i { + case 2: + parsedUID, err := strconv.ParseUint(p, 10, 32) + if err == nil && parsedUID == uint64(uid) { + found = true + } + case 5: + homeDir = p + } + } + if found { + // NOTE: If the uid is present but the home directory is not + // present in the /etc/passwd entry we return an empty string. This + // is, for better or worse, what runc does. + return homeDir, nil + } + } + + return defaultHome, nil +} diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go new file mode 100644 index 000000000..7d8e9ac7c --- /dev/null +++ b/pkg/sentry/fs/user/user_test.go @@ -0,0 +1,198 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package user + +import ( + "fmt" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/usermem" +) + +// createEtcPasswd creates /etc/passwd with the given contents and mode. If +// mode is empty, then no file will be created. If mode is not a regular file +// mode, then contents is ignored. +func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode linux.FileMode) error { + if err := root.CreateDirectory(ctx, root, "etc", fs.FilePermsFromMode(0755)); err != nil { + return err + } + etc, err := root.Walk(ctx, root, "etc") + if err != nil { + return err + } + defer etc.DecRef() + switch mode.FileType() { + case 0: + // Don't create anything. + return nil + case linux.S_IFREG: + passwd, err := etc.Create(ctx, root, "passwd", fs.FileFlags{Write: true}, fs.FilePermsFromMode(mode)) + if err != nil { + return err + } + defer passwd.DecRef() + if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil { + return err + } + return nil + case linux.S_IFDIR: + return etc.CreateDirectory(ctx, root, "passwd", fs.FilePermsFromMode(mode)) + case linux.S_IFIFO: + return etc.CreateFifo(ctx, root, "passwd", fs.FilePermsFromMode(mode)) + default: + return fmt.Errorf("unknown file type %x", mode.FileType()) + } +} + +// TestGetExecUserHome tests the getExecUserHome function. +func TestGetExecUserHome(t *testing.T) { + tests := map[string]struct { + uid auth.KUID + passwdContents string + passwdMode linux.FileMode + expected string + }{ + "success": { + uid: 1000, + passwdContents: "adin::1000:1111::/home/adin:/bin/sh", + passwdMode: linux.S_IFREG | 0666, + expected: "/home/adin", + }, + "no_perms": { + uid: 1000, + passwdContents: "adin::1000:1111::/home/adin:/bin/sh", + passwdMode: linux.S_IFREG, + expected: "/", + }, + "no_passwd": { + uid: 1000, + expected: "/", + }, + "directory": { + uid: 1000, + passwdMode: linux.S_IFDIR | 0666, + expected: "/", + }, + // Currently we don't allow named pipes. + "named_pipe": { + uid: 1000, + passwdMode: linux.S_IFIFO | 0666, + expected: "/", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + ctx := contexttest.Context(t) + msrc := fs.NewPseudoMountSource(ctx) + rootInode := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc) + + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + defer mns.DecRef() + root := mns.Root() + defer root.DecRef() + ctx = fs.WithRoot(ctx, root) + + if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil { + t.Fatalf("createEtcPasswd failed: %v", err) + } + + got, err := getExecUserHome(ctx, mns, tc.uid) + if err != nil { + t.Fatalf("failed to get user home: %v", err) + } + + if got != tc.expected { + t.Fatalf("expected %v, got: %v", tc.expected, got) + } + }) + } +} + +// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing. +func TestFindHomeInPasswd(t *testing.T) { + tests := map[string]struct { + uid uint32 + passwd string + expected string + def string + }{ + "empty": { + uid: 1000, + passwd: "", + expected: "/", + def: "/", + }, + "whitespace": { + uid: 1000, + passwd: " ", + expected: "/", + def: "/", + }, + "full": { + uid: 1000, + passwd: "adin::1000:1111::/home/adin:/bin/sh", + expected: "/home/adin", + def: "/", + }, + // For better or worse, this is how runc works. + "partial": { + uid: 1000, + passwd: "adin::1000:1111:", + expected: "", + def: "/", + }, + "multiple": { + uid: 1001, + passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh", + expected: "/home/ian", + def: "/", + }, + "duplicate": { + uid: 1000, + passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh", + expected: "/home/adin", + def: "/", + }, + "empty_lines": { + uid: 1001, + passwd: "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh", + expected: "/home/ian", + def: "/", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def) + if err != nil { + t.Fatalf("error parsing passwd: %v", err) + } + if tc.expected != got { + t.Fatalf("expected %v, got: %v", tc.expected, got) + } + }) + } +} |