diff options
Diffstat (limited to 'pkg/sentry/fsimpl/gofer')
-rw-r--r-- | pkg/sentry/fsimpl/gofer/BUILD | 39 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/directory.go | 231 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/filesystem.go | 720 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/gofer.go | 571 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/gofer_test.go | 63 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/handle.go | 5 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/host_named_pipe.go | 97 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/p9file.go | 14 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/pagemath.go | 31 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/regular_file.go | 68 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/socket.go | 146 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/special_file.go | 115 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/symlink.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/time.go | 40 |
15 files changed, 1595 insertions, 613 deletions
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4ba76a1e8..f5f35a3bc 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) @@ -15,18 +15,30 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "gofer", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "gofer", srcs = [ "dentry_list.go", "directory.go", "filesystem.go", + "fstree.go", "gofer.go", "handle.go", - "handle_unsafe.go", + "host_named_pipe.go", "p9file.go", - "pagemath.go", "regular_file.go", + "socket.go", "special_file.go", "symlink.go", "time.go", @@ -36,20 +48,41 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fd", + "//pkg/fdnotifier", "//pkg/fspath", "//pkg/log", "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/syserr", "//pkg/syserror", "//pkg/unet", "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "gofer_test", + srcs = ["gofer_test.go"], + library = ":gofer", + deps = [ + "//pkg/p9", + "//pkg/sentry/contexttest", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 6d4ebc2bf..b98218753 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -15,27 +15,100 @@ package gofer import ( + "fmt" "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } -// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != -// InteropModeShared. -func (d *dentry) cacheNegativeChildLocked(name string) { - if d.negativeChildren == nil { - d.negativeChildren = make(map[string]struct{}) +// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. +// d.isDir(). child must be a newly-created dentry that has never had a parent. +func (d *dentry) cacheNewChildLocked(child *dentry, name string) { + d.IncRef() // reference held by child on its parent + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*dentry) } - d.negativeChildren[name] = struct{}{} + d.children[name] = child +} + +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) cacheNegativeLookupLocked(name string) { + // Don't cache negative lookups if InteropModeShared is in effect (since + // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // case the only files in the directory are those for which a dentry exists + // in d.children). Instead, just delete any previously-cached dentry. + if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + delete(d.children, name) + return + } + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = nil +} + +type createSyntheticOpts struct { + name string + mode linux.FileMode + kuid auth.KUID + kgid auth.KGID + + // The endpoint for a synthetic socket. endpoint should be nil if the file + // being created is not a socket. + endpoint transport.BoundEndpoint + + // pipe should be nil if the file being created is not a pipe. + pipe *pipe.VFSPipe +} + +// createSyntheticChildLocked creates a synthetic file with the given name +// in d. +// +// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain +// a child with the given name. +func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { + d2 := &dentry{ + refs: 1, // held by d + fs: d.fs, + mode: uint32(opts.mode), + uid: uint32(opts.kuid), + gid: uint32(opts.kgid), + blockSize: usermem.PageSize, // arbitrary + handle: handle{ + fd: -1, + }, + nlink: uint32(2), + } + switch opts.mode.FileType() { + case linux.S_IFDIR: + // Nothing else needs to be done. + case linux.S_IFSOCK: + d2.endpoint = opts.endpoint + case linux.S_IFIFO: + d2.pipe = opts.pipe + default: + panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) + } + d2.pf.dentry = d2 + d2.vfsd.Init(d2) + + d.cacheNewChildLocked(d2, opts.name) + d.syntheticChildren++ } type directoryFD struct { @@ -56,17 +129,22 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.mu.Lock() defer fd.mu.Unlock() + d := fd.dentry() if fd.dirents == nil { - ds, err := fd.dentry().getDirents(ctx) + ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } + if d.cachedMetadataAuthoritative() { + d.touchAtime(fd.vfsfd.Mount()) + } + for fd.off < int64(len(fd.dirents)) { - if !cb.Handle(fd.dirents[fd.off]) { - return nil + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err } fd.off++ } @@ -75,23 +153,21 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Preconditions: d.isDir(). There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { - // 9P2000.L's readdir does not specify behavior in the presence of - // concurrent mutation of an iterated directory, so implementations may - // duplicate or omit entries in this case, which violates POSIX semantics. - // Thus we read all directory entries while holding d.dirMu to exclude - // directory mutations. (Note that it is impossible for the client to - // exclude concurrent mutation from other remote filesystem users. Since - // there is no way to detect if the server has incorrectly omitted - // directory entries, we simply assume that the server is well-behaved - // under InteropModeShared.) This is inconsistent with Linux (which appears - // to assume that directory fids have the correct semantics, and translates - // struct file_operations::readdir calls directly to readdir RPCs), but is - // consistent with VFS1. - // - // NOTE(b/135560623): In particular, some gofer implementations may not - // retain state between calls to Readdir, so may not provide a coherent - // directory stream across in the presence of mutation. + // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the + // presence of concurrent mutation of an iterated directory, so + // implementations may duplicate or omit entries in this case, which + // violates POSIX semantics. Thus we read all directory entries while + // holding d.dirMu to exclude directory mutations. (Note that it is + // impossible for the client to exclude concurrent mutation from other + // remote filesystem users. Since there is no way to detect if the server + // has incorrectly omitted directory entries, we simply assume that the + // server is well-behaved under InteropModeShared.) This is inconsistent + // with Linux (which appears to assume that directory fids have the correct + // semantics, and translates struct file_operations::readdir calls directly + // to readdir RPCs), but is consistent with VFS1. + // filesystem.renameMu is needed for d.parent, and must be locked before + // dentry.dirMu. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() d.dirMu.Lock() @@ -102,7 +178,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. - parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", @@ -117,50 +193,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } - off := uint64(0) - const count = 64 * 1024 // for consistency with the vfs1 client - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if !d.handleReadable { - // This should not be possible because a readable handle should have - // been opened when the calling directoryFD was opened. - panic("gofer.dentry.getDirents called without a readable handle") - } - for { - p9ds, err := d.handle.file.readdir(ctx, off, count) - if err != nil { - return nil, err + var realChildren map[string]struct{} + if !d.isSynthetic() { + if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + // Record the set of children d actually has so that we don't emit + // duplicate entries for synthetic children. + realChildren = make(map[string]struct{}) } - if len(p9ds) == 0 { - // Cache dirents for future directoryFDs if permitted. - if d.fs.opts.interop != InteropModeShared { - d.dirents = dirents + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + if !d.handleReadable { + // This should not be possible because a readable handle should + // have been opened when the calling directoryFD was opened. + d.handleMu.RUnlock() + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err + } + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break } - return dirents, nil + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } + } + off = p9ds[len(p9ds)-1].Offset } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { + } + // Emit entries for synthetic children. + if d.syntheticChildren != 0 { + for _, child := range d.children { + if child == nil || !child.isSynthetic() { continue } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: p9d.QID.Path, - NextOff: int64(len(dirents) + 1), - } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if _, ok := realChildren[child.name]; ok { + continue } - dirents = append(dirents, dirent) + dirents = append(dirents, vfs.Dirent{ + Name: child.name, + Type: uint8(atomic.LoadUint32(&child.mode) >> 12), + Ino: child.ino, + NextOff: int64(len(dirents) + 1), + }) } - off = p9ds[len(p9ds)-1].Offset } + // Cache dirents for future directoryFDs if permitted. + if d.cachedMetadataAuthoritative() { + d.dirents = dirents + } + return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 138adb9f7..36e0e1856 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,20 +21,28 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current dentries and special files. + // Snapshot current syncable dentries and special files. fs.syncMu.Lock() - ds := make([]*dentry, 0, len(fs.dentries)) - for d := range fs.dentries { + ds := make([]*dentry, 0, len(fs.syncableDentries)) + for d := range fs.syncableDentries { + d.IncRef() ds = append(ds, d) } sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { + sffd.vfsfd.IncRef() sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -45,9 +53,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - if !d.TryIncRef() { - continue - } err := d.syncSharedHandle(ctx) d.DecRef() if err != nil && retErr == nil { @@ -58,9 +63,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if !sffd.vfsfd.TryIncRef() { - continue - } err := sffd.Sync(ctx) sffd.vfsfd.DecRef() if err != nil && retErr == nil { @@ -112,13 +114,15 @@ func putDentrySlice(ds *[]*dentry) { // to *ds. // // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached -// metadata must be up to date. +// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata +// must be up to date. +// +// Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -128,39 +132,42 @@ afterSymlink: return d, nil } if name == ".." { - parentVFSD, err := rp.ResolveParent(&d.vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil } - parent := parentVFSD.Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // We must assume that parentVFSD is correct, because if d has been - // moved elsewhere in the remote filesystem so that its parent has - // changed, we have no way of determining its new parent's location - // in the filesystem. Get updated metadata for parentVFSD. - _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + // We must assume that d.parent is correct, because if d has been moved + // elsewhere in the remote filesystem so that its parent has changed, + // we have no way of determining its new parent's location in the + // filesystem. + // + // Call rp.CheckMount() before updating d.parent's metadata, since if + // we traverse to another mount then d.parent's metadata is irrelevant. + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + if d != d.parent && !d.cachedMetadataAuthoritative() { + _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err } - parent.updateFromP9Attrs(attrMask, &attr) + d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() - return parent, nil + return d.parent, nil } - childVFSD, err := rp.ResolveChild(&d.vfsd, name) - if err != nil { - return nil, err - } - // FIXME(jamieliu): Linux performs revalidation before mount lookup - // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), - // __follow_mount_rcu()). - child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds) if err != nil { return nil, err } if child == nil { return nil, syserror.ENOENT } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { @@ -175,38 +182,37 @@ afterSymlink: return child, nil } -// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) -// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be -// nil) to verify that the returned child (or lack thereof) is correct. If no file -// exists at name, revalidateChildLocked returns (nil, nil). +// getChildLocked returns a dentry representing the child of parent with the +// given name. If no such child exists, getChildLocked returns (nil, nil). // // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. // parent.isDir(). name is not "." or "..". // -// Postconditions: If revalidateChildLocked returns a non-nil dentry, its -// cached metadata is up to date. -func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { - if childVFSD != nil && fs.opts.interop != InteropModeShared { - // We have a cached dentry that is assumed to be correct. - return childVFSD.Impl().(*dentry), nil - } - // We either don't have a cached dentry or need to verify that it's still - // correct, either of which requires a remote lookup. Check if this name is - // valid before performing the lookup. +// Postconditions: If getChildLocked returns a non-nil dentry, its cached +// metadata is up to date. +func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, syserror.ENAMETOOLONG } - // Check if we've already cached this lookup with a negative result. - if _, ok := parent.negativeChildren[name]; ok { - return nil, nil + child, ok := parent.children[name] + if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() { + // Whether child is nil or not, it is cached information that is + // assumed to be correct. + return child, nil } - // Perform the remote lookup. + // We either don't have cached information or need to verify that it's + // still correct, either of which requires a remote lookup. Check if this + // name is valid before performing the lookup. + return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) +} + +// Preconditions: As for getChildLocked. !parent.isSynthetic(). +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { return nil, err } - if childVFSD != nil { - child := childVFSD.Impl().(*dentry) + if child != nil { if !file.isNil() && qid.Path == child.ino { // The file at this path hasn't changed. Just update cached // metadata. @@ -214,29 +220,44 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir child.updateFromP9Attrs(attrMask, &attr) return child, nil } - // The file at this path has changed or no longer exists. Remove - // the stale dentry from the tree, and re-evaluate its caching - // status (i.e. if it has 0 references, drop it). - vfsObj.ForceDeleteDentry(childVFSD) + if file.isNil() && child.isSynthetic() { + // We have a synthetic file, and no remote file has arisen to + // replace it. + return child, nil + } + // The file at this path has changed or no longer exists. Mark the + // dentry invalidated, and re-evaluate its caching status (i.e. if it + // has 0 references, drop it). Wait to update parent.children until we + // know what to replace the existing dentry with (i.e. one of the + // returns below), to avoid a redundant map access. + vfsObj.InvalidateDentry(&child.vfsd) + if child.isSynthetic() { + // Normally we don't mark invalidated dentries as deleted since + // they may still exist (but at a different path), and also for + // consistency with Linux. However, synthetic files are guaranteed + // to become unreachable if their dentries are invalidated, so + // treat their invalidation as deletion. + child.setDeleted() + parent.syntheticChildren-- + child.decRefLocked() + parent.dirents = nil + } *ds = appendDentry(*ds, child) - childVFSD = nil } if file.isNil() { // No file exists at this path now. Cache the negative lookup if // allowed. - if fs.opts.interop != InteropModeShared { - parent.cacheNegativeChildLocked(name) - } + parent.cacheNegativeLookupLocked(name) return nil, nil } // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) + delete(parent.children, name) return nil, err } - parent.IncRef() // reference held by child on its parent - parent.vfsd.InsertChild(&child.vfsd, name) + parent.cacheNewChildLocked(child, name) // For now, child has 0 references, so our caller should call // child.checkCachingLocked(). *ds = appendDentry(*ds, child) @@ -248,8 +269,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == -// InteropModeShared, then d's cached metadata must be up to date. +// Preconditions: fs.renameMu must be locked. !rp.Done(). If +// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to +// date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -271,7 +293,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // Preconditions: fs.renameMu must be locked. func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { // Get updated metadata for rp.Start() as required by fs.stepLocked(). if err := d.updateFromGetattr(ctx); err != nil { return nil, err @@ -293,16 +315,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } // doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. +// createInRemoteDir (if the parent directory is a real remote directory) or +// createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -313,12 +336,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - if parent.isDeleted() { - return syserror.ENOENT - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -329,6 +349,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if !dir && rp.MustBeDir() { return syserror.ENOENT } + if parent.isDeleted() { + return syserror.ENOENT + } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err @@ -336,6 +359,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer mnt.EndWrite() parent.dirMu.Lock() defer parent.dirMu.Unlock() + if parent.isSynthetic() { + if child := parent.children[name]; child != nil { + return syserror.EEXIST + } + if createInSyntheticDir == nil { + return syserror.EPERM + } + if err := createInSyntheticDir(parent, name); err != nil { + return err + } + parent.touchCMtime() + parent.dirents = nil + return nil + } if fs.opts.interop == InteropModeShared { // The existence of a dentry at name would be inconclusive because the // file it represents may have been deleted from the remote filesystem, @@ -344,18 +381,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time // it's used. - return create(parent, name) + return createInRemoteDir(parent, name) } - if parent.vfsd.Child(name) != nil { + if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := create(parent, name); err != nil { + if err := createInRemoteDir(parent, name); err != nil { return err } - parent.touchCMtime(ctx) - delete(parent.negativeChildren, name) + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } + parent.touchCMtime() parent.dirents = nil return nil } @@ -366,7 +406,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -377,7 +417,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -400,66 +440,110 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() - childVFSD := parent.vfsd.Child(name) - var child *dentry + child, ok := parent.children[name] + if ok && child == nil { + return syserror.ENOENT + } // We only need a dentry representing the file at name if it can be a mount - // point. If childVFSD is nil, then it can't be a mount point. If childVFSD - // is non-nil but stale, the actual file can't be a mount point either; we + // point. If child is nil, then it can't be a mount point. If child is + // non-nil but stale, the actual file can't be a mount point either; we // detect this case by just speculatively calling PrepareDeleteDentry and // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). - if childVFSD != nil { - child = childVFSD.Impl().(*dentry) - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if child != nil { + child.dirMu.Lock() + defer child.dirMu.Unlock() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + if parent.cachedMetadataAuthoritative() { + return err + } + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) if err != nil { return err } if child != nil { - childVFSD = &child.vfsd - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - } else { - childVFSD = nil } } - } else if _, ok := parent.negativeChildren[name]; ok { - return syserror.ENOENT } flags := uint32(0) + // If a dentry exists, use it for best-effort checks on its deletability. if dir { - if child != nil && !child.isDir() { - return syserror.ENOTDIR + if child != nil { + // child must be an empty directory. + if child.syntheticChildren != 0 { + // This is definitely not an empty directory, irrespective of + // fs.opts.interop. + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + // If InteropModeShared is in effect and the first call to + // PrepareDeleteDentry above succeeded, then child wasn't + // revalidated (so we can't expect its file type to be correct) and + // individually revalidating its children (to confirm that they + // still exist) would be a waste of time. + if child.cachedMetadataAuthoritative() { + if !child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTDIR + } + for _, grandchild := range child.children { + if grandchild != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + } + } } flags = linux.AT_REMOVEDIR } else { + // child must be a non-directory file. if child != nil && child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR } if rp.MustBeDir() { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } return syserror.ENOTDIR } } - err = parent.file.unlinkAt(ctx, name, flags) - if err != nil { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if parent.isSynthetic() { + if child == nil { + return syserror.ENOENT + } + } else { + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err } - return err - } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime(ctx) - parent.cacheNegativeChildLocked(name) - parent.dirents = nil } if child != nil { + vfsObj.CommitDeleteDentry(&child.vfsd) child.setDeleted() - vfsObj.CommitDeleteDentry(childVFSD) + if child.isSynthetic() { + parent.syntheticChildren-- + child.decRefLocked() + } ds = appendDentry(ds, child) } + parent.cacheNegativeLookupLocked(name) + if parent.cachedMetadataAuthoritative() { + parent.dirents = nil + parent.touchCMtime() + if dir { + parent.decLinks() + } + } return nil } @@ -498,6 +582,18 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { putDentrySlice(*ds) } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry @@ -511,7 +607,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -525,7 +621,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -548,15 +644,42 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } // 9P2000.L supports hard links, but we don't. return syserror.EPERM - }) + }, nil) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() - _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err + if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) + } + if fs.opts.interop != InteropModeShared { + parent.incLinks() + } + return nil + }, func(parent *dentry, name string) error { + if !opts.ForSyntheticMountpoint { + // Can't create non-synthetic files in synthetic directories. + return syserror.EPERM + } + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) + parent.incLinks() + return nil }) } @@ -565,8 +688,32 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // If the gofer does not allow creating a socket or pipe, create a + // synthetic one, i.e. one that is kept entirely in memory. + if err == syserror.EPERM { + switch opts.Mode.FileType() { + case linux.S_IFSOCK: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + endpoint: opts.Endpoint, + }) + return nil + case linux.S_IFIFO: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + }) + return nil + } + } return err - }) + }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -586,7 +733,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by fs.stepLocked(). if err := start.updateFromGetattr(ctx); err != nil { return nil, err @@ -602,14 +749,18 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, &ds) if err == syserror.ENOENT && mayCreate { - fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + if parent.isSynthetic() { + parent.dirMu.Unlock() + return nil, syserror.EPERM + } + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() return fd, err } @@ -639,24 +790,25 @@ afterTrailingSymlink: // Preconditions: fs.renameMu must be locked. func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } mnt := rp.Mount() - filetype := d.fileType() - switch { - case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { - return nil, err - } - fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { - return nil, err + switch d.fileType() { + case linux.S_IFREG: + if !d.fs.opts.regularFilesUseSpecialFileFD { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil } - return &fd.vfsfd, nil - case filetype == linux.S_IFDIR: + case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, syserror.EISDIR @@ -668,39 +820,100 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { - return nil, err + if !d.isSynthetic() { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } } fd := &directoryFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil - case filetype == linux.S_IFLNK: + case linux.S_IFLNK: // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP - default: - if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL + case linux.S_IFSOCK: + if d.isSynthetic() { + return nil, syserror.ENXIO } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) - if err != nil { - return nil, err + if d.fs.iopts.OpenSocketsByConnecting { + return d.connectSocketLocked(ctx, opts) } - fd := &specialFileFD{ - handle: h, + case linux.S_IFIFO: + if d.isSynthetic() { + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + } + return d.openSpecialFileLocked(ctx, mnt, opts) +} + +func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + fdObj.Close() + return nil, err + } + fdObj.Release() + return fd, nil +} + +func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + // We assume that the server silently inserts O_NONBLOCK in the open flags + // for all named pipes (because all existing gofers do this). + // + // NOTE(b/133875563): This makes named pipe opens racy, because the + // mechanisms for translating nonblocking to blocking opens can only detect + // the instantaneous presence of a peer holding the other end of the pipe + // open, not whether the pipe was *previously* opened by a peer that has + // since closed its end. + isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 +retry: + h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) + if err != nil { + if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO { + // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails + // with ENXIO if opening the same named pipe with O_WRONLY would + // block because there are no readers of the pipe. + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return nil, err + } + goto retry + } + return nil, err + } + if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { + if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil { h.close(ctx) return nil, err } - return &fd.vfsfd, nil } + fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + if err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. -func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { +// !d.isSynthetic(). +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { @@ -721,7 +934,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // Filter file creation flags and O_LARGEFILE out; the create RPC already + // has the semantics of O_CREAT|O_EXCL, while some servers will choke on + // O_LARGEFILE. + createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) return nil, err @@ -729,7 +946,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Then we need to walk to the file we just created to get a non-open fid // representing it, and to get its metadata. This must use d.file since, as // explained above, dirfile was invalidated by dirfile.Create(). - walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) if err != nil { openFile.close(ctx) if fdobj != nil { @@ -737,17 +954,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } - // Sanity-check that we walked to the file we created. - if createQID.Path != walkQID.Path { - // Probably due to concurrent remote filesystem mutation? - ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, syserror.EAGAIN - } // Construct the new dentry. child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) @@ -759,6 +965,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } + *ds = appendDentry(*ds, child) // Incorporate the fid that was opened by lcreate. useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { @@ -771,22 +978,15 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } - // Take a reference on the new dentry to be held by the new file - // description. (This reference also means that the new dentry is not - // eligible for caching yet, so we don't need to append to a dentry slice.) - child.refs = 1 // Insert the dentry into the tree. - d.IncRef() // reference held by child on its parent d - d.vfsd.InsertChild(&child.vfsd, name) - if d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) - delete(d.negativeChildren, name) + d.cacheNewChildLocked(child, name) + if d.cachedMetadataAuthoritative() { + d.touchCMtime() d.dirents = nil } // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription - mnt.IncRef() if useRegularFileFD { fd := ®ularFileFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ @@ -796,17 +996,16 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { - fd := &specialFileFD{ - handle: handle{ - file: openFile, - fd: -1, - }, + h := handle{ + file: openFile, + fd: -1, } if fdobj != nil { - fd.handle.fd = int32(fdobj.Release()) + h.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - fd.handle.close(ctx) + fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + if err != nil { + h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd @@ -857,12 +1056,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateFromGetattr(ctx); err != nil { return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -870,7 +1069,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() - renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds) if err != nil { return err } @@ -878,11 +1077,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } if renamed.isDir() { - if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -893,7 +1092,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() @@ -902,52 +1101,99 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - // This is similar to unlinkAt, except: - // - // - We revalidate the replaced dentry unconditionally for simplicity. - // - // - If rp.MustBeDir(), then we need a dentry representing the replaced - // file regardless to confirm that it's a directory. - if replacedVFSD != nil || rp.MustBeDir() { - replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) - if err != nil { - return err - } - if replaced != nil { - if replaced.isDir() { - if !renamed.isDir() { - return syserror.EISDIR - } - } else { - if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR - } + replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) + if err != nil { + return err + } + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR } - replacedVFSD = &replaced.vfsd } else { - replacedVFSD = nil + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } } } if oldParent == newParent && oldName == newName { return nil } - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { - vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) - return err + + // Update the remote filesystem. + if !renamed.isSynthetic() { + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } else if replaced != nil && !replaced.isSynthetic() { + // We are replacing an existing real file with a synthetic one, so we + // need to unlink the former. + flags := uint32(0) + if replaced.isDir() { + flags = linux.AT_REMOVEDIR + } + if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + + // Update the dentry tree. + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + if replaced != nil { + replaced.setDeleted() + if replaced.isSynthetic() { + newParent.syntheticChildren-- + replaced.decRefLocked() + } + ds = appendDentry(ds, replaced) } - if fs.opts.interop != InteropModeShared { - oldParent.cacheNegativeChildLocked(oldName) + oldParent.cacheNegativeLookupLocked(oldName) + // We don't use newParent.cacheNewChildLocked() since we don't want to mess + // with reference counts and queue oldParent for checkCachingLocked if the + // parent isn't actually changing. + if oldParent != newParent { + ds = appendDentry(ds, oldParent) + newParent.IncRef() + if renamed.isSynthetic() { + oldParent.syntheticChildren-- + newParent.syntheticChildren++ + } + } + renamed.parent = newParent + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + + // Update metadata. + if renamed.cachedMetadataAuthoritative() { + renamed.touchCtime() + } + if oldParent.cachedMetadataAuthoritative() { oldParent.dirents = nil - delete(newParent.negativeChildren, newName) + oldParent.touchCMtime() + if renamed.isDir() { + oldParent.decLinks() + } + } + if newParent.cachedMetadataAuthoritative() { newParent.dirents = nil + newParent.touchCMtime() + if renamed.isDir() { + newParent.incLinks() + } } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -994,6 +1240,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } + // If d is synthetic, invoke statfs on the first ancestor of d that isn't. + for d.isSynthetic() { + d = d.parent + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1023,7 +1273,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. @@ -1031,8 +1281,34 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if d.isSocket() { + if !d.isSynthetic() { + d.IncRef() + return &endpoint{ + dentry: d, + file: d.file.file, + path: opts.Addr, + }, nil + } + return d.endpoint, nil + } + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1040,11 +1316,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ if err != nil { return nil, err } - return d.listxattr(ctx) + return d.listxattr(ctx, rp.Credentials(), size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1052,7 +1328,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam if err != nil { return "", err } - return d.getxattr(ctx, name) + return d.getxattr(ctx, rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. @@ -1064,7 +1340,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.setxattr(ctx, &opts) + return d.setxattr(ctx, rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -1076,12 +1352,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.removexattr(ctx, name) + return d.removexattr(ctx, rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index d0552bd99..3f3bd56f0 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -27,13 +27,15 @@ // dentry.handleMu // dentry.dataMu // -// Locking dentry.dirMu in multiple dentries requires holding -// filesystem.renameMu for writing. +// Locking dentry.dirMu in multiple dentries requires that either ancestor +// dentries are locked before descendant dentries, or that filesystem.renameMu +// is locked for writing. package gofer import ( "fmt" "strconv" + "strings" "sync" "sync/atomic" "syscall" @@ -44,14 +46,20 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" ) +// Name is the default filesystem name. +const Name = "9p" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -64,16 +72,17 @@ type filesystem struct { mfp pgalloc.MemoryFileProvider // Immutable options. - opts filesystemOptions + opts filesystemOptions + iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. client *p9.Client - // uid and gid are the effective KUID and KGID of the filesystem's creator, - // and are used as the owner and group for files that don't specify one. - // uid and gid are immutable. - uid auth.KUID - gid auth.KGID + // clock is a realtime clock used to set timestamps in file operations. + clock ktime.Clock + + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 // renameMu serves two purposes: // @@ -94,11 +103,12 @@ type filesystem struct { cachedDentries dentryList cachedDentriesLen uint64 - // dentries contains all dentries in this filesystem. specialFileFDs - // contains all open specialFileFDs. These fields are protected by syncMu. - syncMu sync.Mutex - dentries map[*dentry]struct{} - specialFileFDs map[*specialFileFD]struct{} + // syncableDentries contains all dentries in this filesystem for which + // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. + // These fields are protected by syncMu. + syncMu sync.Mutex + syncableDentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} } type filesystemOptions struct { @@ -106,6 +116,8 @@ type filesystemOptions struct { fd int aname string interop InteropMode // derived from the "cache" mount option + dfltuid auth.KUID + dfltgid auth.KGID msize uint32 version string @@ -127,9 +139,12 @@ type filesystemOptions struct { // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote // filesystem may not be coherent with writable host FDs opened later, so - // mappings of the former must be replaced by mappings of the latter. This - // is usually only the case when the remote filesystem is an overlayfs - // mount on Linux < 4.19. + // all uses of the former must be replaced by uses of the latter. This is + // usually only the case when the remote filesystem is a Linux overlayfs + // mount. (Prior to Linux 4.18, patch series centered on commit + // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were + // incoherent between pre-copy-up and post-copy-up FDs; after that patch + // series, only memory mappings are incoherent.) overlayfsStaleRead bool // If regularFilesUseSpecialFileFD is true, application FDs representing @@ -179,7 +194,8 @@ const ( // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // - // - The client must verify cached filesystem state before using it. + // - The client must verify ("revalidate") cached filesystem state before + // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. @@ -196,6 +212,34 @@ const ( InteropModeShared ) +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +type InternalFilesystemOptions struct { + // If LeakConnection is true, do not close the connection to the server + // when the Filesystem is released. This is necessary for deployments in + // which servers can handle only a single client and report failure if that + // client disconnects. + LeakConnection bool + + // If OpenSocketsByConnecting is true, silently translate attempts to open + // files identifying as sockets to connect RPCs. + OpenSocketsByConnecting bool +} + +// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default +// UIDs and GIDs used for files that do not provide a specific owner or group +// respectively. +const ( + // uint32(-2) doesn't work in Go. + _V9FS_DEFUID = auth.KUID(4294967294) + _V9FS_DEFGID = auth.KGID(4294967294) +) + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) @@ -276,6 +320,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + // Parse the default UID and GID. + fsopts.dfltuid = _V9FS_DEFUID + if dfltuidstr, ok := mopts["dfltuid"]; ok { + delete(mopts, "dfltuid") + dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr) + return nil, nil, syserror.EINVAL + } + // In Linux, dfltuid is interpreted as a UID and is converted to a KUID + // in the caller's user namespace, but goferfs isn't + // application-mountable. + fsopts.dfltuid = auth.KUID(dfltuid) + } + fsopts.dfltgid = _V9FS_DEFGID + if dfltgidstr, ok := mopts["dfltgid"]; ok { + delete(mopts, "dfltgid") + dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr) + return nil, nil, syserror.EINVAL + } + fsopts.dfltgid = auth.KGID(dfltgid) + } + // Parse the 9P message size. fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M if msizestr, ok := mopts["msize"]; ok { @@ -329,6 +398,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } + // Handle internal options. + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if opts.InternalData != nil && !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) + return nil, nil, syserror.EINVAL + } + // If !ok, iopts being the zero value is correct. + // Establish a connection with the server. conn, err := unet.NewSocket(fsopts.fd) if err != nil { @@ -362,16 +439,23 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Construct the filesystem object. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } fs := &filesystem{ - mfp: mfp, - opts: fsopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, - client: client, - dentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), + mfp: mfp, + opts: fsopts, + iopts: iopts, + client: client, + clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), } - fs.vfsfs.Init(vfsObj, fs) + fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) @@ -395,7 +479,7 @@ func (fs *filesystem) Release() { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() - for d := range fs.dentries { + for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() if d.handleWritable { @@ -421,8 +505,12 @@ func (fs *filesystem) Release() { // fs. fs.syncMu.Unlock() - // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() + if !fs.iopts.LeakConnection { + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() + } + + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -430,16 +518,33 @@ type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its - // parent, even if disowned. refs is accessed using atomic memory - // operations. + // parent, even if disowned. An additional reference is held on all + // synthetic dentries until they are unlinked or invalidated. When refs + // reaches 0, the dentry may be added to the cache or destroyed. If refs == + // -1, the dentry has already been destroyed. refs is accessed using atomic + // memory operations. refs int64 // fs is the owning filesystem. fs is immutable. fs *filesystem + // parent is this dentry's parent directory. Each dentry holds a reference + // on its parent. If this dentry is a filesystem root, parent is nil. + // parent is protected by filesystem.renameMu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.renameMu. + name string + // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. + // + // If file.isNil(), this dentry represents a synthetic file, i.e. a file + // that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. file p9file // If deleted is non-zero, the file represented by this dentry has been @@ -454,15 +559,26 @@ type dentry struct { dirMu sync.Mutex - // If this dentry represents a directory, and InteropModeShared is not in - // effect, negativeChildren is a set of child names in this directory that - // are known not to exist. negativeChildren is protected by dirMu. - negativeChildren map[string]struct{} - - // If this dentry represents a directory, InteropModeShared is not in - // effect, and dirents is not nil, it is a cache of all entries in the - // directory, in the order they were returned by the server. dirents is - // protected by dirMu. + // If this dentry represents a directory, children contains: + // + // - Mappings of child filenames to dentries representing those children. + // + // - Mappings of child filenames that are known not to exist to nil + // dentries (only if InteropModeShared is not in effect and the directory + // is not synthetic). + // + // children is protected by dirMu. + children map[string]*dentry + + // If this dentry represents a directory, syntheticChildren is the number + // of child dentries for which dentry.isSynthetic() == true. + // syntheticChildren is protected by dirMu. + syntheticChildren int + + // If this dentry represents a directory, + // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it + // is a cache of all entries in the directory, in the order they were + // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent // Cached metadata; protected by metadataMu and accessed using atomic @@ -482,6 +598,11 @@ type dentry struct { // locked to mutate it). size uint64 + // nlink counts the number of hard links to this dentry. It's updated and + // accessed using atomic operations. It's not protected by metadataMu like the + // other metadata fields. + nlink uint32 + mapsMu sync.Mutex // If this dentry represents a regular file, mappings tracks mappings of @@ -533,6 +654,14 @@ type dentry struct { // and target are protected by dataMu. haveTarget bool target string + + // If this dentry represents a synthetic socket file, endpoint is the + // transport endpoint bound to this file. + endpoint transport.BoundEndpoint + + // If this dentry represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -554,6 +683,8 @@ func dentryAttrMask() p9.AttrMask { // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. +// +// Preconditions: !file.isNil(). func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") @@ -569,8 +700,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma file: file, ino: qid.Path, mode: uint32(attr.Mode), - uid: uint32(fs.uid), - gid: uint32(fs.gid), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, handle: handle{ fd: -1, @@ -578,10 +709,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma } d.pf.dentry = d if mask.UID { - d.uid = uint32(attr.UID) + d.uid = dentryUIDFromP9UID(attr.UID) } if mask.GID { - d.gid = uint32(attr.GID) + d.gid = dentryGIDFromP9GID(attr.GID) } if mask.Size { d.size = attr.Size @@ -601,14 +732,25 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma if mask.BTime { d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) } + if mask.NLink { + d.nlink = uint32(attr.NLink) + } d.vfsd.Init(d) fs.syncMu.Lock() - fs.dentries[d] = struct{}{} + fs.syncableDentries[d] = struct{}{} fs.syncMu.Unlock() return d, nil } +func (d *dentry) isSynthetic() bool { + return d.file.isNil() +} + +func (d *dentry) cachedMetadataAuthoritative() bool { + return d.fs.opts.interop != InteropModeShared || d.isSynthetic() +} + // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { @@ -621,10 +763,10 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.mode, uint32(attr.Mode)) } if mask.UID { - atomic.StoreUint32(&d.uid, uint32(attr.UID)) + atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID)) } if mask.GID { - atomic.StoreUint32(&d.gid, uint32(attr.GID)) + atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID)) } // There is no P9_GETATTR_* bit for I/O block size. if attr.BlockSize != 0 { @@ -642,6 +784,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if mask.BTime { atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) } + if mask.NLink { + atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) + } if mask.Size { d.dataMu.Lock() atomic.StoreUint64(&d.size, attr.Size) @@ -650,6 +795,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { d.metadataMu.Unlock() } +// Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { // Use d.handle.file, which represents a 9P fid that has been opened, in // preference to d.file, which represents a 9P fid that has not. This may @@ -684,10 +830,7 @@ func (d *dentry) fileType() uint32 { func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) - stat.Nlink = 1 - if d.isDir() { - stat.Nlink = 2 - } + stat.Nlink = atomic.LoadUint32(&d.nlink) stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) @@ -700,7 +843,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(jamieliu): device number + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = d.fs.devMinor } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { @@ -710,7 +854,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -719,53 +864,63 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin defer mnt.EndWrite() setLocalAtime := false setLocalMtime := false - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { // Timestamp updates will be handled locally. setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW + + // Prepare for truncate. + if stat.Mask&linux.STATX_SIZE != 0 { + switch d.mode & linux.S_IFMT { + case linux.S_IFREG: + if !setLocalMtime { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } } } d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask != 0 { - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - return err + if !d.isSynthetic() { + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since + // it'll be overwritten by revalidation before the next time it's + // used anyway. (InteropModeShared inhibits client caching of + // regular file data, so there's no cache to truncate either.) + return nil } } - if d.fs.opts.interop == InteropModeShared { - // There's no point to updating d's metadata in this case since it'll - // be overwritten by revalidation before the next time it's used - // anyway. (InteropModeShared inhibits client caching of regular file - // data, so there's no cache to truncate either.) - return nil - } - now, haveNow := nowFromContext(ctx) - if !haveNow { - ctx.Warningf("gofer.dentry.setStat: current time not available") - } + now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -777,25 +932,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if setLocalAtime { if stat.Atime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.atime, now) - } + atomic.StoreInt64(&d.atime, now) } else { atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) } } if setLocalMtime { if stat.Mtime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.mtime, now) - } + atomic.StoreInt64(&d.mtime, now) } else { atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) } } - if haveNow { - atomic.StoreInt64(&d.ctime, now) - } + atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { d.dataMu.Lock() oldSize := d.size @@ -807,8 +956,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // so we can't race with Write or another truncate.) d.dataMu.Unlock() if d.size < oldSize { - oldpgend := pageRoundUp(oldSize) - newpgend := pageRoundUp(d.size) + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ @@ -832,8 +981,22 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return nil } -func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +func dentryUIDFromP9UID(uid p9.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromP9GID(gid p9.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) } // IncRef implements vfs.DentryImpl.IncRef. @@ -847,7 +1010,7 @@ func (d *dentry) IncRef() { func (d *dentry) TryIncRef() bool { for { refs := atomic.LoadInt64(&d.refs) - if refs == 0 { + if refs <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { @@ -867,16 +1030,44 @@ func (d *dentry) DecRef() { } } +// decRefLocked decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefLocked() { + if refs := atomic.AddInt64(&d.refs, -1); refs < 0 { + panic("gofer.dentry.decRefLocked() called without holding a reference") + } +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// // Preconditions: d.fs.renameMu must be locked for writing. func (d *dentry) checkCachingLocked() { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) - if atomic.LoadInt64(&d.refs) != 0 { + refs := atomic.LoadInt64(&d.refs) + if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -884,9 +1075,13 @@ func (d *dentry) checkCachingLocked() { } return } - // Non-child dentries with zero references are no longer reachable by path - // resolution and should be dropped immediately. - if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + if refs == -1 { + // Dentry has already been destroyed. + return + } + // Deleted and invalidated dentries with zero references are no longer + // reachable by path resolution and should be dropped immediately. + if d.vfsd.IsDead() { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -911,34 +1106,45 @@ func (d *dentry) checkCachingLocked() { d.fs.cachedDentries.Remove(victim) d.fs.cachedDentriesLen-- victim.cached = false - // victim.refs may have become non-zero from an earlier path - // resolution since it was inserted into fs.cachedDentries; see - // dentry.incRefLocked(). Either way, we brought - // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so - // we don't loop. + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) == 0 { - if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { - victimParent := victimParentVFSD.Impl().(*dentry) - victimParent.dirMu.Lock() - if !victim.vfsd.IsDisowned() { - // victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount - // points. - d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update // victimParent.dirents etc. } - victimParent.dirMu.Unlock() + victim.parent.dirMu.Unlock() } victim.destroyLocked() } + // Whether or not victim was destroyed, we brought fs.cachedDentriesLen + // back down to fs.opts.maxCachedDentries, so we don't loop. } } +// destroyLocked destroys the dentry. It may flushes dirty pages from cache, +// close p9 file and remove reference on parent dentry. +// // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("dentry.destroyLocked() called on already destroyed dentry") + default: + panic("dentry.destroyLocked() called with references on the dentry") + } + ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { @@ -958,17 +1164,20 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() - d.file.close(ctx) - // Remove d from the set of all dentries. - d.fs.syncMu.Lock() - delete(d.fs.dentries, d) - d.fs.syncMu.Unlock() - // Drop the reference held by d on its parent. - if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { - parent := parentVFSD.Impl().(*dentry) - // This is parent.DecRef() without recursive locking of d.fs.renameMu. - if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { - parent.checkCachingLocked() + + if !d.file.isNil() { + d.file.close(ctx) + d.file = p9file{} + // Remove d from the set of syncable dentries. + d.fs.syncMu.Lock() + delete(d.fs.syncableDentries, d) + d.fs.syncMu.Unlock() + } + // Drop the reference held by d on its parent without recursively locking + // d.fs.renameMu. + if d.parent != nil { + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkCachingLocked() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } @@ -983,24 +1192,65 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listxattr(ctx context.Context) ([]string, error) { - return nil, syserror.ENOTSUP +// We only support xattrs prefixed with "user." (see b/148380782). Currently, +// there is no need to expose any other xattrs through a gofer. +func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + if d.file.isNil() { + return nil, nil + } + xattrMap, err := d.file.listXattr(ctx, size) + if err != nil { + return nil, err + } + xattrs := make([]string, 0, len(xattrMap)) + for x := range xattrMap { + if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + xattrs = append(xattrs, x) + } + } + return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { - // TODO(jamieliu): add vfs.GetxattrOptions.Size - return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if d.file.isNil() { + return "", syserror.ENODATA + } + if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { +func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if d.file.isNil() { + return syserror.EPERM + } + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, name string) error { - return syserror.ENOTSUP +func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if d.file.isNil() { + return syserror.EPERM + } + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + return d.file.removeXattr(ctx, name) } -// Preconditions: d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1042,13 +1292,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // using the old file descriptor, preventing us from safely // closing it. We could handle this by invalidating existing // memmap.Translations, but this is expensive. Instead, use - // dup2() to make the old file descriptor refer to the new file + // dup3 to make the old file descriptor refer to the new file // description, then close the new file descriptor (which is no // longer needed). Racing callers may use the old or new file // description, but this doesn't matter since they refer to the // same file (unless d.fs.opts.overlayfsStaleRead is true, // which we handle separately). - if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil { + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) h.close(ctx) @@ -1091,6 +1341,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool return nil } +// incLinks increments link count. +// +// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. +func (d *dentry) incLinks() { + v := atomic.AddUint32(&d.nlink, 1) + if v < 2 { + panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + } +} + +// decLinks decrements link count. +// +// Preconditions: d.nlink > 1. +func (d *dentry) decLinks() { + v := atomic.AddUint32(&d.nlink, ^uint32(0)) + if v == 0 { + panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + } +} + // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -1109,7 +1379,8 @@ func (fd *fileDescription) dentry() *dentry { // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() - if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) + if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { @@ -1127,21 +1398,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { - return fd.dentry().listxattr(ctx) +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) } // Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { - return fd.dentry().getxattr(ctx, name) +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, &opts) + return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, name) + return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go new file mode 100644 index 000000000..adff39490 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +func TestDestroyIdempotent(t *testing.T) { + fs := filesystem{ + syncableDentries: make(map[*dentry]struct{}), + opts: filesystemOptions{ + // Test relies on no dentry being held in the cache. + maxCachedDentries: 0, + }, + } + + ctx := contexttest.Context(t) + attr := &p9.Attr{ + Mode: p9.ModeRegular, + } + mask := p9.AttrMask{ + Mode: true, + Size: true, + } + parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + + child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + parent.cacheNewChildLocked(child, "child") + + child.checkCachingLocked() + if got := atomic.LoadInt64(&child.refs); got != -1 { + t.Fatalf("child.refs=%d, want: -1", got) + } + // Parent will also be destroyed when child reference is removed. + if got := atomic.LoadInt64(&parent.refs); got != -1 { + t.Fatalf("parent.refs=%d, want: -1", got) + } + child.checkCachingLocked() + child.checkCachingLocked() +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index cfe66f797..724a3f1f7 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPreadv(h.fd, dsts, int64(offset)) + n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } @@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPwritev(h.fd, srcs, int64(offset)) + n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go deleted file mode 100644 index 19560ab26..000000000 --- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safemem" -) - -// Preconditions: !dsts.IsEmpty(). -func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if dsts.NumBlocks() == 1 { - // Use pread() instead of preadv() to avoid iovec allocation and - // copying. - dst := dsts.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(dsts) - n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} - -// Preconditions: !srcs.IsEmpty(). -func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if srcs.NumBlocks() == 1 { - // Use pwrite() instead of pwritev() to avoid iovec allocation and - // copying. - src := srcs.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(srcs) - n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go new file mode 100644 index 000000000..7294de7d6 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "sync" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create +// pipes after sentry initialization due to syscall filters. +var ( + tempPipeMu sync.Mutex + tempPipeReadFD int + tempPipeWriteFD int + tempPipeBuf [1]byte +) + +func init() { + var pipeFDs [2]int + if err := unix.Pipe(pipeFDs[:]); err != nil { + panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err)) + } + tempPipeReadFD = pipeFDs[0] + tempPipeWriteFD = pipeFDs[1] +} + +func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { + for { + ok, err := nonblockingPipeHasWriter(fd) + if err != nil { + return err + } + if ok { + return nil + } + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return err + } + } +} + +func nonblockingPipeHasWriter(fd int32) (bool, error) { + tempPipeMu.Lock() + defer tempPipeMu.Unlock() + // Copy 1 byte from fd into the temporary pipe. + n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) + if err == syserror.EAGAIN { + // The pipe represented by fd is empty, but has a writer. + return true, nil + } + if err != nil { + return false, err + } + if n == 0 { + // The pipe represented by fd is empty and has no writer. + return false, nil + } + // The pipe represented by fd is non-empty, so it either has, or has + // previously had, a writer. Remove the byte copied to the temporary pipe + // before returning. + if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 { + panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err)) + } + return true, nil +} + +func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { + t := time.NewTimer(100 * time.Millisecond) + defer t.Stop() + cancel := ctx.SleepStart() + select { + case <-t.C: + ctx.SleepFinish(true) + return nil + case <-cancel: + ctx.SleepFinish(false) + return syserror.ErrInterrupted + } +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 755ac2985..87f0b877f 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt return err } +func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := f.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { ctx.UninterruptibleSleepStart(false) val, err := f.file.GetXattr(name, size) @@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) return err } +func (f p9file) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := f.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go deleted file mode 100644 index 847cb0784..000000000 --- a/pkg/sentry/fsimpl/gofer/pagemath.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// This are equivalent to usermem.Addr.RoundDown/Up, but without the -// potentially truncating conversion to usermem.Addr. This is necessary because -// there is no way to define generic "PageRoundDown/Up" functions in Go. - -func pageRoundDown(x uint64) uint64 { - return x &^ (usermem.PageSize - 1) -} - -func pageRoundUp(x uint64) uint64 { - return pageRoundDown(x + usermem.PageSize - 1) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 54c1031a7..0d10cf7ac 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } return n, err } @@ -126,6 +126,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) d := fd.dentry() d.metadataMu.Lock() @@ -134,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). - if now, ok := nowFromContext(ctx); ok { - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - } + d.touchCMtimeLocked() } if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Write dirty cached pages that will be touched by the write back to @@ -146,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } // Remove touched pages from the cache. - pgstart := pageRoundDown(uint64(offset)) - pgend := pageRoundUp(uint64(offset + src.NumBytes())) - if pgend < pgstart { + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} @@ -304,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. + gapEnd, _ := usermem.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ - Start: pageRoundDown(gapMR.Start), - End: pageRoundUp(gapMR.End), + Start: usermem.PageRoundDown(gapMR.Start), + End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) @@ -361,8 +364,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro rw.d.handleMu.RLock() if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) - rw.d.handleMu.RUnlock() rw.off += n + rw.d.dataMu.Lock() + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() return n, err } @@ -662,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. - pgend := pageRoundUp(d.size) + pgend, _ := usermem.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -809,43 +819,15 @@ type dentryPlatformFile struct { // IncRef implements platform.File.IncRef. func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { d.dataMu.Lock() - seg, gap := d.fdRefs.Find(fr.Start) - for { - switch { - case seg.Ok() && seg.Start() < fr.End: - seg = d.fdRefs.Isolate(seg, fr) - seg.SetValue(seg.Value() + 1) - seg, gap = seg.NextNonEmpty() - case gap.Ok() && gap.Start() < fr.End: - newRange := gap.Range().Intersect(fr) - usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) - seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() - default: - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - return - } - } + d.fdRefs.IncRefAndAccount(fr) + d.dataMu.Unlock() } // DecRef implements platform.File.DecRef. func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { d.dataMu.Lock() - seg := d.fdRefs.FindSegment(fr.Start) - - for seg.Ok() && seg.Start() < fr.End { - seg = d.fdRefs.Isolate(seg, fr) - if old := seg.Value(); old == 1 { - usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) - seg = d.fdRefs.Remove(seg).NextSegment() - } else { - seg.SetValue(old - 1) - seg = seg.NextSegment() - } - } - d.fdRefs.MergeAdjacent(fr) + d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() - } // MapInternal implements platform.File.MapInternal. diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go new file mode 100644 index 000000000..d6dbe9092 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -0,0 +1,146 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isSocket() bool { + return d.fileType() == linux.S_IFSOCK +} + +// endpoint is a Gofer-backed transport.BoundEndpoint. +// +// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // dentry is the filesystem dentry which produced this endpoint. + dentry *dentry + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. + path string +} + +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { + switch t { + case linux.SOCK_STREAM: + return p9.StreamSocket, true + case linux.SOCK_SEQPACKET: + return p9.SeqpacketSocket, true + case linux.SOCK_DGRAM: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { + cf, ok := sockTypeToP9(ce.Type()) + if !ok { + return syserr.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + + // Check connecting state. + if ce.Connected() { + ce.Unlock() + return syserr.ErrAlreadyConnected + } + if ce.Listening() { + ce.Unlock() + return syserr.ErrInvalidEndpointState + } + + c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + if err != nil { + ce.Unlock() + return err + } + + returnConnect(c, c) + ce.Unlock() + if err := c.Init(); err != nil { + return syserr.FromError(err) + } + + return nil +} + +// UnidirectionalConnect implements +// transport.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { + c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + if err != nil { + return nil, err + } + + if err := c.Init(); err != nil { + return nil, syserr.FromError(err) + } + + // We don't need the receiver. + c.CloseRecv() + c.Release() + + return c, nil +} + +func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + hostFile, err := e.file.Connect(flags) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + // Dup the fd so that the new endpoint can manage its lifetime. + hostFD, err := syscall.Dup(hostFile.FD()) + if err != nil { + log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) + return nil, syserr.FromError(err) + } + // After duplicating, we no longer need hostFile. + hostFile.Close() + + c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + if serr != nil { + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr) + return nil, serr + } + return c, nil +} + +// Release implements transport.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.dentry.DecRef() +} + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 08c691c47..ff6126b87 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -19,33 +19,69 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) -// specialFileFD implements vfs.FileDescriptionImpl for files other than -// regular files, directories, and symlinks: pipes, sockets, etc. It is also -// used for regular files when filesystemOptions.specialRegularFiles is in -// effect. specialFileFD differs from regularFileFD by using per-FD handles -// instead of shared per-dentry handles, and never buffering I/O. +// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device +// special files, and (when filesystemOptions.specialRegularFiles is in effect) +// regular files. specialFileFD differs from regularFileFD by using per-FD +// handles instead of shared per-dentry handles, and never buffering I/O. type specialFileFD struct { fileDescription - // handle is immutable. + // handle is used for file I/O. handle is immutable. handle handle - // off is the file offset. off is protected by mu. (POSIX 2.9.7 only - // requires operations using the file offset to be atomic for regular files - // and symlinks; however, since specialFileFD may be used for regular - // files, we apply this atomicity unconditionally.) + // seekable is true if this file description represents a file for which + // file offset is significant, i.e. a regular file. seekable is immutable. + seekable bool + + // mayBlock is true if this file description represents a file for which + // queue may send I/O readiness events. mayBlock is immutable. + mayBlock bool + queue waiter.Queue + + // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex off int64 } +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { + ftype := d.fileType() + seekable := ftype == linux.S_IFREG + mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK + fd := &specialFileFD{ + handle: h, + seekable: seekable, + mayBlock: mayBlock, + } + if mayBlock && h.fd >= 0 { + if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { + return nil, err + } + } + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { + if mayBlock && h.fd >= 0 { + fdnotifier.RemoveFD(h.fd) + } + return nil, err + } + return fd, nil +} + // Release implements vfs.FileDescriptionImpl.Release. func (fd *specialFileFD) Release() { + if fd.mayBlock && fd.handle.fd >= 0 { + fdnotifier.RemoveFD(fd.handle.fd) + } fd.handle.close(context.Background()) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() @@ -61,9 +97,35 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { return fd.handle.file.flush(ctx) } +// Readiness implements waiter.Waitable.Readiness. +func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + if fd.mayBlock { + return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) + } + return fd.fileDescription.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + if fd.mayBlock { + fd.queue.EventRegister(e, mask) + return + } + fd.fileDescription.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { + if fd.mayBlock { + fd.queue.EventUnregister(e) + return + } + fd.fileDescription.EventUnregister(e) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { @@ -76,10 +138,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } if n == 0 { return 0, err } @@ -91,6 +156,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Read implements vfs.FileDescriptionImpl.Read. func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if !fd.seekable { + return fd.PRead(ctx, dst, -1, opts) + } + fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n @@ -100,16 +169,24 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } + if fd.seekable { + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + } + // Do a buffered write. See rationale in PRead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) + d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. @@ -117,11 +194,18 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } return int64(n), err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + if !fd.seekable { + return fd.PWrite(ctx, src, -1, opts) + } + fd.mu.Lock() n, err := fd.PWrite(ctx, src, fd.off, opts) fd.off += n @@ -131,6 +215,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if !fd.seekable { + return 0, syserror.ESPIPE + } fd.mu.Lock() defer fd.mu.Unlock() switch whence { diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index adf43be60..2ec819f86 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool { // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, mnt) + d.touchAtime(mnt) d.dataMu.Lock() if d.haveTarget { target := d.target diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 7598ec6a8..1d5aa82dc 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -18,8 +18,6 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -38,23 +36,15 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -func nowFromContext(ctx context.Context) (int64, bool) { - if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { - return clock.Now().Nanoseconds(), true - } - return 0, false -} - // Preconditions: fs.interop != InteropModeShared. -func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { - if err := mnt.CheckBeginWrite(); err != nil { +func (d *dentry) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { return } - now, ok := nowFromContext(ctx) - if !ok { - mnt.EndWrite() + if err := mnt.CheckBeginWrite(); err != nil { return } + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) d.metadataMu.Unlock() @@ -63,13 +53,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { // Preconditions: fs.interop != InteropModeShared. The caller has successfully // called vfs.Mount.CheckBeginWrite(). -func (d *dentry) touchCMtime(ctx context.Context) { - now, ok := nowFromContext(ctx) - if !ok { - return - } +func (d *dentry) touchCtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime() { + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) d.metadataMu.Unlock() } + +func (d *dentry) touchCMtimeLocked() { + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) +} |