diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
35 files changed, 1062 insertions, 326 deletions
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 48eaccdbc..afea58f65 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -476,7 +476,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err @@ -485,7 +485,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { _, _, err := fs.walk(rp, false) if err != nil { return "", err diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index d15a36709..99d1e3f8f 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) @@ -54,3 +54,13 @@ go_library( "//pkg/usermem", ], ) + +go_test( + name = "gofer_test", + srcs = ["gofer_test.go"], + library = ":gofer", + deps = [ + "//pkg/p9", + "//pkg/sentry/contexttest", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5dbfc6250..49d9f859b 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -56,14 +56,19 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.mu.Lock() defer 
fd.mu.Unlock() + d := fd.dentry() if fd.dirents == nil { - ds, err := fd.dentry().getDirents(ctx) + ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(fd.vfsfd.Mount()) + } + for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 269624362..cd744bf5e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -356,7 +356,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name); err != nil { return err } - parent.touchCMtime(ctx) + if fs.opts.interop != InteropModeShared { + parent.touchCMtime() + } delete(parent.negativeChildren, name) parent.dirents = nil return nil @@ -435,14 +437,19 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b flags := uint32(0) if dir { if child != nil && !child.isDir() { + vfsObj.AbortDeleteDentry(childVFSD) return syserror.ENOTDIR } flags = linux.AT_REMOVEDIR } else { if child != nil && child.isDir() { + vfsObj.AbortDeleteDentry(childVFSD) return syserror.EISDIR } if rp.MustBeDir() { + if childVFSD != nil { + vfsObj.AbortDeleteDentry(childVFSD) + } return syserror.ENOTDIR } } @@ -454,7 +461,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return err } if fs.opts.interop != InteropModeShared { - parent.touchCMtime(ctx) + parent.touchCMtime() if dir { parent.decLinks() } @@ -802,7 +809,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving d.IncRef() // reference held by child on its parent d d.vfsd.InsertChild(&child.vfsd, name) if d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) delete(d.negativeChildren, name) d.dirents = nil } @@ -834,6 +840,9 @@ func (d *dentry) createAndOpenChildLocked(ctx 
context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } + if d.fs.opts.interop != InteropModeShared { + d.touchCMtime() + } return childVFSFD, nil } @@ -975,6 +984,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.decLinks() newParent.incLinks() } + oldParent.touchCMtime() + newParent.touchCMtime() + renamed.touchCtime() } vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) return nil @@ -1068,7 +1080,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1076,11 +1088,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ if err != nil { return nil, err } - return d.listxattr(ctx) + return d.listxattr(ctx, rp.Credentials(), size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1088,7 +1100,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam if err != nil { return "", err } - return d.getxattr(ctx, name) + return d.getxattr(ctx, rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. 
@@ -1100,7 +1112,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.setxattr(ctx, &opts) + return d.setxattr(ctx, rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -1112,7 +1124,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.removexattr(ctx, name) + return d.removexattr(ctx, rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8e41b6b1c..2485cdb53 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -34,6 +34,7 @@ package gofer import ( "fmt" "strconv" + "strings" "sync" "sync/atomic" "syscall" @@ -44,6 +45,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -72,6 +74,9 @@ type filesystem struct { // client is the client used by this filesystem. client is immutable. client *p9.Client + // clock is a realtime clock used to set timestamps in file operations. + clock ktime.Clock + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -376,6 +381,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, client: client, + clock: ktime.RealtimeClockFromContext(ctx), dentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -439,7 +445,8 @@ type dentry struct { // refs is the reference count. Each dentry holds a reference on its // parent, even if disowned. 
refs is accessed using atomic memory - // operations. + // operations. When refs reaches 0, the dentry may be added to the cache or + // destroyed. If refs==-1 the dentry has already been destroyed. refs int64 // fs is the owning filesystem. fs is immutable. @@ -779,10 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // data, so there's no cache to truncate either.) return nil } - now, haveNow := nowFromContext(ctx) - if !haveNow { - ctx.Warningf("gofer.dentry.setStat: current time not available") - } + now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -794,25 +798,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if setLocalAtime { if stat.Atime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.atime, now) - } + atomic.StoreInt64(&d.atime, now) } else { atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) } } if setLocalMtime { if stat.Mtime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.mtime, now) - } + atomic.StoreInt64(&d.mtime, now) } else { atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) } } - if haveNow { - atomic.StoreInt64(&d.ctime, now) - } + atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { d.dataMu.Lock() oldSize := d.size @@ -864,7 +862,7 @@ func (d *dentry) IncRef() { func (d *dentry) TryIncRef() bool { for { refs := atomic.LoadInt64(&d.refs) - if refs == 0 { + if refs <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { @@ -887,13 +885,20 @@ func (d *dentry) DecRef() { // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // +// It may be called on a destroyed dentry. 
For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// // Preconditions: d.fs.renameMu must be locked for writing. func (d *dentry) checkCachingLocked() { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) - if atomic.LoadInt64(&d.refs) != 0 { + refs := atomic.LoadInt64(&d.refs) + if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -901,6 +906,10 @@ func (d *dentry) checkCachingLocked() { } return } + if refs == -1 { + // Dentry has already been destroyed. + return + } // Non-child dentries with zero references are no longer reachable by path // resolution and should be dropped immediately. if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { @@ -953,9 +962,22 @@ func (d *dentry) checkCachingLocked() { } } +// destroyLocked destroys the dentry. It may flush dirty pages from cache, +// close the p9 file, and remove the reference on the parent dentry. +// // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. 
+ atomic.StoreInt64(&d.refs, -1) + case -1: + panic("dentry.destroyLocked() called on already destroyed dentry") + default: + panic("dentry.destroyLocked() called with references on the dentry") + } + ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { @@ -975,7 +997,10 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() - d.file.close(ctx) + if !d.file.isNil() { + d.file.close(ctx) + d.file = p9file{} + } // Remove d from the set of all dentries. d.fs.syncMu.Lock() delete(d.fs.dentries, d) @@ -1000,21 +1025,50 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listxattr(ctx context.Context) ([]string, error) { - return nil, syserror.ENOTSUP +// We only support xattrs prefixed with "user." (see b/148380782). Currently, +// there is no need to expose any other xattrs through a gofer. +func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + xattrMap, err := d.file.listXattr(ctx, size) + if err != nil { + return nil, err + } + xattrs := make([]string, 0, len(xattrMap)) + for x := range xattrMap { + if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + xattrs = append(xattrs, x) + } + } + return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { - // TODO(jamieliu): add vfs.GetxattrOptions.Size - return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { +func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + 
if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, name string) error { - return syserror.ENOTSUP +func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + return d.file.removeXattr(ctx, name) } // Preconditions: d.isRegularFile() || d.isDirectory(). @@ -1065,7 +1119,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // description, but this doesn't matter since they refer to the // same file (unless d.fs.opts.overlayfsStaleRead is true, // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) h.close(ctx) @@ -1165,21 +1219,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { - return fd.dentry().listxattr(ctx) +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) } // Getxattr implements vfs.FileDescriptionImpl.Getxattr. 
-func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { - return fd.dentry().getxattr(ctx, name) +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, &opts) + return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, name) + return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go new file mode 100644 index 000000000..82bc239db --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +func TestDestroyIdempotent(t *testing.T) { + fs := filesystem{ + dentries: make(map[*dentry]struct{}), + opts: filesystemOptions{ + // Test relies on no dentry being held in the cache. 
+ maxCachedDentries: 0, + }, + } + + ctx := contexttest.Context(t) + attr := &p9.Attr{ + Mode: p9.ModeRegular, + } + mask := p9.AttrMask{ + Mode: true, + Size: true, + } + parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + + child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + parent.IncRef() // reference held by child on its parent. + parent.vfsd.InsertChild(&child.vfsd, "child") + + child.checkCachingLocked() + if got := atomic.LoadInt64(&child.refs); got != -1 { + t.Fatalf("child.refs=%d, want: -1", got) + } + // Parent will also be destroyed when child reference is removed. + if got := atomic.LoadInt64(&parent.refs); got != -1 { + t.Fatalf("parent.refs=%d, want: -1", got) + } + child.checkCachingLocked() + child.checkCachingLocked() +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 755ac2985..87f0b877f 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt return err } +func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := f.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { ctx.UninterruptibleSleepStart(false) val, err := f.file.GetXattr(name, size) @@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) return err } +func (f p9file) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, 
offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := f.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 3593eb1d5..857f7c74e 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } return n, err } @@ -139,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). - if now, ok := nowFromContext(ctx); ok { - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - } + d.touchCMtimeLocked() } if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Write dirty cached pages that will be touched by the write back to diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 274f7346f..507e0e276 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -76,7 +76,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. 
if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) @@ -117,7 +117,7 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Do a buffered write. See rationale in PRead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) + d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index adf43be60..2ec819f86 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool { // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, mnt) + d.touchAtime(mnt) d.dataMu.Lock() if d.haveTarget { target := d.target diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 7598ec6a8..2608e7e1d 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -18,8 +18,6 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -38,23 +36,12 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -func nowFromContext(ctx context.Context) (int64, bool) { - if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { - return clock.Now().Nanoseconds(), true - } - return 0, false -} - // Preconditions: fs.interop != InteropModeShared. 
-func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { +func (d *dentry) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now, ok := nowFromContext(ctx) - if !ok { - mnt.EndWrite() - return - } + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) d.metadataMu.Unlock() @@ -63,13 +50,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { // Preconditions: fs.interop != InteropModeShared. The caller has successfully // called vfs.Mount.CheckBeginWrite(). -func (d *dentry) touchCMtime(ctx context.Context) { - now, ok := nowFromContext(ctx) - if !ok { - return - } +func (d *dentry) touchCtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime() { + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) d.metadataMu.Unlock() } + +func (d *dentry) touchCMtimeLocked() { + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7d9dcd4c9..fe14476f1 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -74,31 +74,33 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err } // Retrieve metadata. - var s syscall.Stat_t - if err := syscall.Fstat(hostFD, &s); err != nil { + var s unix.Stat_t + if err := unix.Fstat(hostFD, &s); err != nil { return nil, err } fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets. 
- isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK + + // Determine if hostFD is seekable. If not, this syscall will return ESPIPE + // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character + // devices. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE i := &inode{ hostFD: hostFD, - isStream: isStream, + seekable: seekable, isTTY: isTTY, canMap: canMap(uint32(fileType)), ino: fs.NextIno(), - mode: fileMode, - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. + // For simplicity, set offset to 0. Technically, we should use the existing + // offset on the host if the file is seekable. offset: 0, } - // These files can't be memory mapped, assert this. - if i.isStream && i.canMap { + // Non-seekable files can't be memory mapped, assert this. + if !i.seekable && i.canMap { panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } @@ -124,12 +126,12 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // isStream is true if the host fd points to a file representing a stream, + // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return // EWOULDBLOCK for I/O operations. // // This field is initialized at creation time and is immutable. - isStream bool + seekable bool // isTTY is true if this file represents a TTY. // @@ -146,20 +148,6 @@ type inode struct { // This field is initialized at creation time and is immutable. ino uint64 - // modeMu protects mode. - modeMu sync.Mutex - - // mode is a cached version of the file mode on the host. Note that it may - // become out of date if the mode is changed on the host, e.g. with chmod. 
- // - // Generally, it is better to retrieve the mode from the host through an - // fstat syscall. We only use this value in inode.Mode(), which cannot - // return an error, if the syscall to host fails. - // - // FIXME(b/152294168): Plumb error into Inode.Mode() return value so we - // can get rid of this. - mode linux.FileMode - // offsetMu protects offset. offsetMu sync.Mutex @@ -192,10 +180,11 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a // Mode implements kernfs.Inode. func (i *inode) Mode() linux.FileMode { mode, _, _, err := i.getPermissions() + // Retrieving the mode from the host fd using fstat(2) should not fail. + // If the syscall does not succeed, something is fundamentally wrong. if err != nil { - return i.mode + panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) } - return linux.FileMode(mode) } @@ -205,11 +194,6 @@ func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) { if err := syscall.Fstat(i.hostFD, &s); err != nil { return 0, 0, 0, err } - - // Update cached mode. - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil } @@ -289,12 +273,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.Ino = i.ino } - // Update cached mode. - if (mask&linux.STATX_TYPE != 0) && (mask&linux.STATX_MODE != 0) { - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() - } return ls, nil } @@ -361,9 +339,6 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { return err } - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() } if m&linux.STATX_SIZE != 0 { if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { @@ -481,8 +456,7 @@ func (f *fileDescription) Release() { // PRead implements FileDescriptionImpl. 
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } @@ -492,8 +466,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off // Read implements FileDescriptionImpl. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read @@ -538,8 +511,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off // PWrite implements FileDescriptionImpl. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } @@ -549,8 +521,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of // Write implements FileDescriptionImpl. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) if isBlockError(err) { err = syserror.ErrWouldBlock @@ -593,8 +564,7 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs // allow directory fds to be imported at all. 
func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a429fa23d..baf81b4db 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -63,6 +63,9 @@ afterSymlink: rp.Advance() return nextVFSD, nil } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } d.dirMu.Lock() nextVFSD, err := rp.ResolveChild(vfsd, name) if err != nil { @@ -76,16 +79,22 @@ afterSymlink: } // Resolve any symlink at current path component. if rp.ShouldFollowSymlink() && next.isSymlink() { - // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". - target, err := next.inode.Readlink(ctx) + targetVD, targetPathname, err := next.inode.Getlink(ctx) if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { - return nil, err + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() + if err != nil { + return nil, err + } + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { + return nil, err + } } goto afterSymlink - } rp.Advance() return &next.vfsd, nil @@ -191,6 +200,9 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if pc == "." || pc == ".." { return "", syserror.EEXIST } + if len(pc) > linux.NAME_MAX { + return "", syserror.ENAMETOOLONG + } childVFSD, err := rp.ResolveChild(parentVFSD, pc) if err != nil { return "", err @@ -433,6 +445,9 @@ afterTrailingSymlink: if pc == "." || pc == ".." { return nil, syserror.EISDIR } + if len(pc) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } // Determine whether or not we need to create a file. 
childVFSD, err := rp.ResolveChild(parentVFSD, pc) if err != nil { @@ -461,19 +476,25 @@ afterTrailingSymlink: } childDentry := childVFSD.Impl().(*Dentry) childInode := childDentry.inode - if rp.ShouldFollowSymlink() { - if childDentry.isSymlink() { - target, err := childInode.Readlink(ctx) + if rp.ShouldFollowSymlink() && childDentry.isSymlink() { + targetVD, targetPathname, err := childInode.Getlink(ctx) + if err != nil { + return nil, err + } + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { return nil, err } - // rp.Final() may no longer be true since we now need to resolve the - // symlink target. - goto afterTrailingSymlink } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink } if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err @@ -661,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issue/1193): actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } @@ -742,7 +763,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() @@ -755,7 +776,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. 
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 5c84b10c9..65f09af5d 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -181,6 +181,11 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } +// Getlink implements Inode.Getlink. +func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, "", syserror.EINVAL +} + // InodeAttrs partially implements the Inode interface, specifically the // inodeMetadata sub interface. InodeAttrs provides functionality related to // inode attributes. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 2cefef020..ad76b9f64 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -414,7 +414,21 @@ type inodeDynamicLookup interface { } type inodeSymlink interface { - // Readlink resolves the target of a symbolic link. If an inode is not a + // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. Readlink(ctx context.Context) (string, error) + + // Getlink returns the target of a symbolic link, as used by path + // resolution: + // + // - If the inode is a "magic link" (a link whose target is most accurately + // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "", + // nil). A reference is taken on the returned VirtualDentry. + // + // - If the inode is an ordinary symlink, Getlink returns (zero-value + // VirtualDentry, symlink target, nil). 
+ // + // - If the inode is not a symlink, Getlink returns (zero-value + // VirtualDentry, "", EINVAL). + Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 5918d3309..018aa503c 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -55,6 +55,11 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } +// Getlink implements Inode.Getlink. +func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, s.target, nil +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD new file mode 100644 index 000000000..0d411606f --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "pipefs", + srcs = ["pipefs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go new file mode 100644 index 000000000..faf3179bc --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -0,0 +1,148 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipefs provides the filesystem implementation backing +// Kernel.PipeMount. +package pipefs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type filesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (filesystemType) Name() string { + return "pipefs" +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("pipefs.filesystemType.GetFilesystem should never be called") +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + // TODO(gvisor.dev/issue/1193): + // + // - kernfs does not provide a way to implement statfs, from which we + // should indicate PIPEFS_MAGIC. + // + // - kernfs does not provide a way to override names for + // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic + // name fmt.Sprintf("pipe:[%d]", inode.ino). +} + +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by +// pipefs. 
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeNoopRefCount + + pipe *pipe.VFSPipe + + ino uint64 + uid auth.KUID + gid auth.KGID + // We use the creation timestamp for all of atime, mtime, and ctime. + ctime ktime.Time +} + +func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { + creds := auth.CredentialsFromContext(ctx) + return &inode{ + pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + ino: fs.NextIno(), + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + ctime: ktime.NowFromContext(ctx), + } +} + +const pipeMode = 0600 | linux.S_IFIFO + +// CheckPermissions implements kernfs.Inode.CheckPermissions. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) +} + +// Mode implements kernfs.Inode.Mode. +func (i *inode) Mode() linux.FileMode { + return pipeMode +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) + return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + // TODO(gvisor.dev/issue/1197): Device number. + }, nil +} + +// SetStat implements kernfs.Inode.SetStat. 
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // FIXME(b/38173783): kernfs does not plumb Context here. + return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags) +} + +// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read +// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + inode := newInode(ctx, fs) + var d kernfs.Dentry + d.Init(inode) + defer d.DecRef() + return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 8156984eb..17c1342b5 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", - "//pkg/sentry/fs", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index aee2a4392..888afc0fd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -214,22 +214,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData { return &ioData{ioUsage: t} } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { - // Namespace symlinks should contain the namespace name and the inode number - // for the namespace instance, so for example user:[123456]. 
We currently fake - // the inode number by sticking the symlink inode in its place. - target := fmt.Sprintf("%s:[%d]", ns, ino) - - inode := &kernfs.StaticSymlink{} - // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) - - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d -} - // newCgroupData creates inode that shows cgroup information. // From man 7 cgroups: "For each cgroup hierarchy of which the process is a // member, there is one entry containing three colon-separated fields: diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 76bfc5307..046265eca 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -30,34 +30,35 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -type fdDir struct { - inoGen InoGenerator - task *kernel.Task - - // When produceSymlinks is set, dirents produces for the FDs are reported - // as symlink. Otherwise, they are reported as regular files. 
- produceSymlink bool -} - -func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) { - fd, err := strconv.ParseUint(name, 10, 64) - if err != nil { - return nil, kernel.FDFlags{}, syserror.ENOENT - } - +func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { var ( file *vfs.FileDescription flags kernel.FDFlags ) - i.task.WithMuLocked(func(t *kernel.Task) { - if fdTable := t.FDTable(); fdTable != nil { - file, flags = fdTable.GetVFS2(int32(fd)) + t.WithMuLocked(func(t *kernel.Task) { + if fdt := t.FDTable(); fdt != nil { + file, flags = fdt.GetVFS2(fd) } }) + return file, flags +} + +func taskFDExists(t *kernel.Task, fd int32) bool { + file, _ := getTaskFD(t, fd) if file == nil { - return nil, kernel.FDFlags{}, syserror.ENOENT + return false } - return file, flags, nil + file.DecRef() + return true +} + +type fdDir struct { + inoGen InoGenerator + task *kernel.Task + + // When produceSymlinks is set, dirents produces for the FDs are reported + // as symlink. Otherwise, they are reported as regular files. + produceSymlink bool } // IterDirents implements kernfs.inodeDynamicLookup. @@ -128,11 +129,15 @@ func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // Lookup implements kernfs.inodeDynamicLookup. 
func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - file, _, err := i.lookup(name) + fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, err + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT } - taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno()) + taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) return taskDentry.VFSDentry(), nil } @@ -169,19 +174,22 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia // // +stateify savable type fdSymlink struct { - refs.AtomicRefCount kernfs.InodeAttrs + kernfs.InodeNoopRefCount kernfs.InodeSymlink - file *vfs.FileDescription + task *kernel.Task + fd int32 } var _ kernfs.Inode = (*fdSymlink)(nil) -func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry { - file.IncRef() - inode := &fdSymlink{file: file} - inode.Init(creds, ino, linux.ModeSymlink|0777) +func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { + inode := &fdSymlink{ + task: task, + fd: fd, + } + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -189,21 +197,25 @@ func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64 } func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return "", syserror.ENOENT + } + defer file.DecRef() root := vfs.RootFromContext(ctx) defer root.DecRef() - - vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() - return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry()) -} - -func (s *fdSymlink) DecRef() { - s.AtomicRefCount.DecRefWithDestructor(func() { - s.Destroy() - }) + return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } -func (s *fdSymlink) Destroy() { - s.file.DecRef() +func (s 
*fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return vfs.VirtualDentry{}, "", syserror.ENOENT + } + defer file.DecRef() + vd := file.VirtualDentry() + vd.IncRef() + return vd, "", nil } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. @@ -238,12 +250,18 @@ func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // Lookup implements kernfs.inodeDynamicLookup. func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - file, flags, err := i.lookup(name) + fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, err + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + data := &fdInfoData{ + task: i.task, + fd: fd, } - - data := &fdInfoData{file: file, flags: flags} dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) return dentry.VFSDentry(), nil } @@ -262,26 +280,23 @@ type fdInfoData struct { kernfs.DynamicBytesFile refs.AtomicRefCount - file *vfs.FileDescription - flags kernel.FDFlags + task *kernel.Task + fd int32 } var _ dynamicInode = (*fdInfoData)(nil) -func (d *fdInfoData) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) -} - -func (d *fdInfoData) destroy() { - d.file.DecRef() -} - // Generate implements vfs.DynamicBytesSource.Generate. func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + file, descriptorFlags := getTaskFD(d.task, d.fd) + if file == nil { + return syserror.ENOENT + } + defer file.DecRef() // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. 
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt - flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags() + flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index df0d1bcc5..2c6f8bdfc 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -64,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { return m, nil } +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + type bufferWriter struct { buf *bytes.Buffer } @@ -610,12 +620,31 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { return exec.PathnameWithDeleted(ctx), nil } +// Getlink implements kernfs.Inode.Getlink. +func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + + exec, err := s.executable() + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef() + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + func (s *exeSymlink) executable() (file fsbridge.File, err error) { + if err := checkTaskState(s.task); err != nil { + return nil, err + } + s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO(b/34851096): Check shouldn't allow Readlink once the - // Task is zombied. err = syserror.EACCES return } @@ -625,7 +654,7 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { // (with locks held). 
file = mm.Executable() if file == nil { - err = syserror.ENOENT + err = syserror.ESRCH } }) return @@ -692,3 +721,41 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } + +type namespaceSymlink struct { + kernfs.StaticSymlink + + task *kernel.Task +} + +func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. + target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &namespaceSymlink{task: task} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} + +// Readlink implements Inode. +func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { + if err := checkTaskState(s.task); err != nil { + return "", err + } + return s.StaticSymlink.Readlink(ctx) +} + +// Getlink implements Inode.Getlink. 
+func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + return s.StaticSymlink.Getlink(ctx) +} diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 373a7b17d..6595fcee6 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -32,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" @@ -206,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil) func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { s.DecRef() // Not a unix socket. 
continue } - sops := sfile.FileOperations.(*unix.SocketOperations) + sops := s.Impl().(*unix.SocketVFS2) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "<unknown>" } @@ -234,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } } + // Get inode number. + var ino uint64 + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve ino for socket file: %v", statErr) + } else { + ino = stat.Ino + } + // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the @@ -252,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // the definition of this struct changes over time. // // For now, we always redact this pointer. - fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. + s.Refs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. - sfile.InodeID(), // Inode. + ino, // Inode. 
) // Path @@ -341,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() @@ -398,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -413,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "%8d ", 0) // Field: inode. 
- fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -499,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) for _, se := range d.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef() @@ -551,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. 
- uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -670,9 +688,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { if line.prefix == "Tcp" { tcp := stat.(*inet.StatSNMPTCP) // "Tcp" needs special processing because MaxConn is signed. RFC 2012. 
- fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) } else { - fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) } } return nil diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 882c1981e..4621e2de0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -63,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } +func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM @@ -101,6 +106,11 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } +func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. 
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD new file mode 100644 index 000000000..52084ddb5 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "sockfs", + srcs = ["sockfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go new file mode 100644 index 000000000..3f7ad1d65 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -0,0 +1,102 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sockfs provides a filesystem implementation for anonymous sockets. +package sockfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// NewFilesystem creates a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. 
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{}) + if err != nil { + panic("failed to create sockfs filesystem") + } + return fs +} + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj, fsType) + return fs.VFSFilesystem(), nil, nil +} + +// Name implements FilesystemType.Name. +// +// Note that registering sockfs is unnecessary, except for the fact that it +// will not show up under /proc/filesystems as a result. This is a very minor +// discrepancy from Linux. +func (filesystemType) Name() string { + return "sockfs" +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// inode implements kernfs.Inode. +// +// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are +// not included in InodeAttrs) to store the numbers of the appropriate +// socket device. Override InodeAttrs.Stat() accordingly. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeAttrs + kernfs.InodeNoopRefCount +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ENXIO +} + +// InitSocket initializes a socket FileDescription, with a corresponding +// Dentry in mnt. +// +// fd should be the FileDescription associated with socketImpl, i.e. its first +// field. mnt should be the global socket mount, Kernel.socketMount. 
+func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error { + fsimpl := mnt.Filesystem().Impl() + fs := fsimpl.(*kernfs.Filesystem) + + // File mode matches net/socket.c:sock_alloc. + filemode := linux.FileMode(linux.S_IFSOCK | 0600) + i := &inode{} + i.Init(creds, fs.NextIno(), filemode) + + d := &kernfs.Dentry{} + d.Init(i) + + opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true} + if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil { + return err + } + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 6ea35affb..4e6cd3491 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -24,6 +24,7 @@ go_library( "filesystem.go", "named_pipe.go", "regular_file.go", + "socket_file.go", "symlink.go", "tmpfs.go", ], @@ -50,6 +51,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", + "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 383133e44..651912169 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -168,7 +168,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { } } -func BenchmarkVFS2MemfsStat(b *testing.B) { +func BenchmarkVFS2TmpfsStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) @@ -362,7 +362,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { } } -func BenchmarkVFS2MemfsMountStat(b *testing.B) { +func BenchmarkVFS2TmpfsMountStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 37c75ab64..45712c9b9 100644 --- 
a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -68,6 +68,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs.mu.Lock() defer fs.mu.Unlock() + fd.inode().touchAtime(fd.vfsfd.Mount()) + if fd.off == 0 { if err := cb.Handle(vfs.Dirent{ Name: ".", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e678ecc37..452c4e2e0 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -46,6 +46,9 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { return nil, err } afterSymlink: + if len(rp.Component()) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } nextVFSD, err := rp.ResolveComponent(&d.vfsd) if err != nil { return nil, err @@ -57,7 +60,7 @@ afterSymlink: } next := nextVFSD.Impl().(*dentry) if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issues/1197): Symlink traversals updates + // TODO(gvisor.dev/issue/1197): Symlink traversals updates // access time. if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err @@ -133,6 +136,9 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if name == "." || name == ".." { return syserror.EEXIST } + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), // because if the child exists we want to return EEXIST immediately instead // of attempting symlink/mount traversal. @@ -142,7 +148,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if !dir && rp.MustBeDir() { return syserror.ENOENT } - // In memfs, the only way to cause a dentry to be disowned is by removing + // In tmpfs, the only way to cause a dentry to be disowned is by removing // it from the filesystem, so this check is equivalent to checking if // parent has been removed. 
if parent.vfsd.IsDisowned() { @@ -153,7 +159,11 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return err } defer mnt.EndWrite() - return create(parent, name) + if err := create(parent, name); err != nil { + return err + } + parent.inode.touchCMtime() + return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. @@ -251,8 +261,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v case linux.S_IFCHR: childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFSOCK: - // Not yet supported. - return syserror.EPERM + childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint) default: return syserror.EINVAL } @@ -328,7 +337,12 @@ afterTrailingSymlink: child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) parent.vfsd.InsertChild(&child.vfsd, name) parent.inode.impl.(*directory).childList.PushBack(child) - return child.open(ctx, rp, &opts, true) + fd, err := child.open(ctx, rp, &opts, true) + if err != nil { + return nil, err + } + parent.inode.touchCMtime() + return fd, nil } if err != nil { return nil, err @@ -378,9 +392,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // Can't open symlinks without O_PATH (which is unimplemented). 
return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) + case *socketFile: + return nil, syserror.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } @@ -398,6 +414,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if !ok { return "", syserror.EINVAL } + symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil } @@ -515,7 +532,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.inode.decLinksLocked() newParent.inode.incLinksLocked() } - // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory + oldParent.inode.touchCMtime() + newParent.inode.touchCMtime() + renamed.inode.touchCtime() + // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory // sizes. vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil @@ -565,6 +585,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error parent.inode.decLinksLocked() // from child's ".." child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(childVFSD) + parent.inode.touchCMtime() return nil } @@ -600,7 +621,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issues/1197): Actually implement statfs. + // TODO(gvisor.dev/issue/1197): Actually implement statfs. 
return linux.Statfs{}, syserror.ENOSYS } @@ -654,62 +675,68 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parent.inode.impl.(*directory).childList.Remove(child) child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(childVFSD) + parent.inode.touchCMtime() return nil } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -// -// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { - return nil, syserror.ECONNREFUSED + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + switch impl := d.inode.impl.(type) { + case *socketFile: + return impl.ep, nil + default: + return nil, syserror.ECONNREFUSED + } } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return nil, err } - // TODO(b/127675828): support extended attributes - return nil, syserror.ENOTSUP + return d.inode.listxattr(size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return "", err } - // TODO(b/127675828): support extended attributes - return "", syserror.ENOTSUP + return d.inode.getxattr(rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. 
func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.setxattr(rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.removexattr(rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 2c5c739df..8d77b3fa8 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,10 +16,8 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -33,27 +31,8 @@ type namedPipe struct { // * fs.mu must be locked. // * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { - file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} + file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} file.inode.init(file, fs, creds, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } - -// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented -// entirely via struct embedding. 
-type namedPipeFD struct { - fileDescription - - *pipe.VFSPipeFD -} - -func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - var err error - var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) - if err != nil { - return nil, err - } - fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil -} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 26cd65605..57e5e28ec 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -286,7 +286,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw := getRegularFileReadWriter(f, offset) n, err := dst.CopyOutFrom(ctx, rw) putRegularFileReadWriter(rw) - return int64(n), err + fd.inode().touchAtime(fd.vfsfd.Mount()) + return n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -323,6 +324,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) + fd.inode().touchCMtimeLocked() f.inode.mu.Unlock() putRegularFileReadWriter(rw) return n, err diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go new file mode 100644 index 000000000..25c2321af --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -0,0 +1,34 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" +) + +// socketFile is a socket (=S_IFSOCK) tmpfs file. +type socketFile struct { + inode inode + ep transport.BoundEndpoint +} + +func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode { + file := &socketFile{ep: ep} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index ebe035dee..d4f59ee5b 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -140,7 +140,7 @@ func TestSetStatAtime(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. 
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -155,7 +155,7 @@ func TestSetStatAtime(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) @@ -169,7 +169,7 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -205,7 +205,7 @@ func TestSetStat(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. 
if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -220,7 +220,7 @@ func TestSetStat(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index afd9f8533..82c709b43 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -27,6 +27,7 @@ package tmpfs import ( "fmt" "math" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -37,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -77,6 +79,11 @@ type FilesystemOpts struct { // RootSymlinkTarget is the target of the root symlink. Only valid if // RootFileType == S_IFLNK. RootSymlinkTarget string + + // FilesystemType allows setting a different FilesystemType for this + // tmpfs filesystem. This allows tmpfs to "impersonate" other + // filesystems, like ramdiskfs and cgroupfs. + FilesystemType vfs.FilesystemType } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
@@ -91,15 +98,22 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt clock: clock, } - fs.vfsfs.Init(vfsObj, &fstype, &fs) - - typ := uint16(linux.S_IFDIR) + rootFileType := uint16(linux.S_IFDIR) + newFSType := vfs.FilesystemType(&fstype) tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) - if ok && tmpfsOpts.RootFileType != 0 { - typ = tmpfsOpts.RootFileType + if ok { + if tmpfsOpts.RootFileType != 0 { + rootFileType = tmpfsOpts.RootFileType + } + if tmpfsOpts.FilesystemType != nil { + newFSType = tmpfsOpts.FilesystemType + } } + + fs.vfsfs.Init(vfsObj, newFSType, &fs) + var root *inode - switch typ { + switch rootFileType { case linux.S_IFREG: root = fs.newRegularFile(creds, 0777) case linux.S_IFLNK: @@ -107,7 +121,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = fs.newDirectory(creds, 01777) default: - return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", typ) + return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &fs.newDentry(root).vfsd, nil } @@ -174,6 +188,11 @@ type inode struct { // filesystem.RmdirAt() drops the reference. refs int64 + // xattrs implements extended attributes. + // + // TODO(b/148380782): Support xattrs other than user.* + xattrs memxattr.SimpleExtendedAttributes + // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. 
mu sync.Mutex @@ -228,7 +247,7 @@ func (i *inode) incLinksLocked() { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink == maxLinks { - panic("memfs.inode.incLinksLocked() called with maximum link count") + panic("tmpfs.inode.incLinksLocked() called with maximum link count") } atomic.AddUint32(&i.nlink, 1) } @@ -303,7 +322,7 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issues/1197): Device number. + // TODO(gvisor.dev/issue/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS @@ -319,7 +338,7 @@ func (i *inode) statTo(stat *linux.Statx) { case *deviceFile: stat.RdevMajor = impl.major stat.RdevMinor = impl.minor - case *directory, *namedPipe: + case *socketFile, *directory, *namedPipe: // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) @@ -338,6 +357,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return err } i.mu.Lock() + defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool @@ -373,29 +393,41 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EINVAL } } + now := i.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { - atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + if stat.Atime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.atime, now) + } else { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { - atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + if stat.Mtime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.mtime, now) + } else { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. 
needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { - atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + if stat.Ctime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.ctime, now) + } else { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } - now := i.clock.Now().Nanoseconds() if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } - i.mu.Unlock() + return nil } @@ -454,6 +486,8 @@ func (i *inode) direntType() uint8 { return linux.DT_DIR case *symlink: return linux.DT_LNK + case *socketFile: + return linux.DT_SOCK case *deviceFile: switch impl.kind { case vfs.BlockDevice: @@ -472,6 +506,92 @@ func (i *inode) isDir() bool { return linux.FileMode(i.mode).FileType() == linux.S_IFDIR } +func (i *inode) touchAtime(mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.atime, now) + i.mu.Unlock() + mnt.EndWrite() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCMtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds +// inode.mu. 
+func (i *inode) touchCMtimeLocked() { + now := i.clock.Now().Nanoseconds() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) +} + +func (i *inode) listxattr(size uint64) ([]string, error) { + return i.xattrs.Listxattr(size) +} + +func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := i.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return "", syserror.ENODATA + } + return i.xattrs.Getxattr(opts) +} + +func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Setxattr(opts) +} + +func (i *inode) removexattr(creds *auth.Credentials, name string) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Removexattr(name) +} + +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. +func (i *inode) userXattrSupported() bool { + filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) + return filetype == linux.S_IFREG || filetype == linux.S_IFDIR +} + // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -499,3 +619,23 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) creds := auth.CredentialsFromContext(ctx) return fd.inode().setStat(ctx, creds, &opts.Stat) } + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. 
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listxattr(size) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) +} |