diff options
Diffstat (limited to 'pkg/sentry/fsimpl/gofer')
-rw-r--r-- | pkg/sentry/fsimpl/gofer/BUILD | 55 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/directory.go | 194 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/filesystem.go | 1090 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/gofer.go | 1150 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/handle.go | 135 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/p9file.go | 219 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/pagemath.go | 31 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/regular_file.go | 872 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/special_file.go | 159 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/symlink.go | 47 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/gofer/time.go | 75 |
12 files changed, 0 insertions, 4093 deletions
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD deleted file mode 100644 index 4ba76a1e8..000000000 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ /dev/null @@ -1,55 +0,0 @@ -load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -licenses(["notice"]) - -go_template_instance( - name = "dentry_list", - out = "dentry_list.go", - package = "gofer", - prefix = "dentry", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*dentry", - "Linker": "*dentry", - }, -) - -go_library( - name = "gofer", - srcs = [ - "dentry_list.go", - "directory.go", - "filesystem.go", - "gofer.go", - "handle.go", - "handle_unsafe.go", - "p9file.go", - "pagemath.go", - "regular_file.go", - "special_file.go", - "symlink.go", - "time.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/fd", - "//pkg/fspath", - "//pkg/log", - "//pkg/p9", - "//pkg/safemem", - "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/usage", - "//pkg/sentry/vfs", - "//pkg/syserror", - "//pkg/unet", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go deleted file mode 100644 index 5dbfc6250..000000000 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -func (d *dentry) isDir() bool { - return d.fileType() == linux.S_IFDIR -} - -// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != -// InteropModeShared. -func (d *dentry) cacheNegativeChildLocked(name string) { - if d.negativeChildren == nil { - d.negativeChildren = make(map[string]struct{}) - } - d.negativeChildren[name] = struct{}{} -} - -type directoryFD struct { - fileDescription - vfs.DirectoryFileDescriptionDefaultImpl - - mu sync.Mutex - off int64 - dirents []vfs.Dirent -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - fd.mu.Lock() - defer fd.mu.Unlock() - - if fd.dirents == nil { - ds, err := fd.dentry().getDirents(ctx) - if err != nil { - return err - } - fd.dirents = ds - } - - for fd.off < int64(len(fd.dirents)) { - if err := cb.Handle(fd.dirents[fd.off]); err != nil { - return err - } - fd.off++ - } - return nil -} - -// Preconditions: d.isDir(). There exists at least one directoryFD representing d. -func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { - // 9P2000.L's readdir does not specify behavior in the presence of - // concurrent mutation of an iterated directory, so implementations may - // duplicate or omit entries in this case, which violates POSIX semantics. - // Thus we read all directory entries while holding d.dirMu to exclude - // directory mutations. (Note that it is impossible for the client to - // exclude concurrent mutation from other remote filesystem users. Since - // there is no way to detect if the server has incorrectly omitted - // directory entries, we simply assume that the server is well-behaved - // under InteropModeShared.) This is inconsistent with Linux (which appears - // to assume that directory fids have the correct semantics, and translates - // struct file_operations::readdir calls directly to readdir RPCs), but is - // consistent with VFS1. - // - // NOTE(b/135560623): In particular, some gofer implementations may not - // retain state between calls to Readdir, so may not provide a coherent - // directory stream across in the presence of mutation. - - d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() - d.dirMu.Lock() - defer d.dirMu.Unlock() - if d.dirents != nil { - return d.dirents, nil - } - - // It's not clear if 9P2000.L's readdir is expected to return "." and "..", - // so we generate them here. - parent := d.vfsd.ParentOrSelf().Impl().(*dentry) - dirents := []vfs.Dirent{ - { - Name: ".", - Type: linux.DT_DIR, - Ino: d.ino, - NextOff: 1, - }, - { - Name: "..", - Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), - Ino: parent.ino, - NextOff: 2, - }, - } - off := uint64(0) - const count = 64 * 1024 // for consistency with the vfs1 client - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if !d.handleReadable { - // This should not be possible because a readable handle should have - // been opened when the calling directoryFD was opened. - panic("gofer.dentry.getDirents called without a readable handle") - } - for { - p9ds, err := d.handle.file.readdir(ctx, off, count) - if err != nil { - return nil, err - } - if len(p9ds) == 0 { - // Cache dirents for future directoryFDs if permitted. - if d.fs.opts.interop != InteropModeShared { - d.dirents = dirents - } - return dirents, nil - } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { - continue - } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: p9d.QID.Path, - NextOff: int64(len(dirents) + 1), - } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG - } - dirents = append(dirents, dirent) - } - off = p9ds[len(p9ds)-1].Offset - } -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.mu.Lock() - defer fd.mu.Unlock() - - switch whence { - case linux.SEEK_SET: - if offset < 0 { - return 0, syserror.EINVAL - } - if offset == 0 { - // Ensure that the next call to fd.IterDirents() calls - // fd.dentry().getDirents(). - fd.dirents = nil - } - fd.off = offset - return fd.off, nil - case linux.SEEK_CUR: - offset += fd.off - if offset < 0 { - return 0, syserror.EINVAL - } - // Don't clear fd.dirents in this case, even if offset == 0. - fd.off = offset - return fd.off, nil - default: - return 0, syserror.EINVAL - } -} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go deleted file mode 100644 index 5cfb0dc4c..000000000 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ /dev/null @@ -1,1090 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current dentries and special files. - fs.syncMu.Lock() - ds := make([]*dentry, 0, len(fs.dentries)) - for d := range fs.dentries { - ds = append(ds, d) - } - sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) - for sffd := range fs.specialFileFDs { - sffds = append(sffds, sffd) - } - fs.syncMu.Unlock() - - // Return the first error we encounter, but sync everything we can - // regardless. - var retErr error - - // Sync regular files. - for _, d := range ds { - if !d.TryIncRef() { - continue - } - err := d.syncSharedHandle(ctx) - d.DecRef() - if err != nil && retErr == nil { - retErr = err - } - } - - // Sync special files, which may be writable but do not use dentry shared - // handles (so they won't be synced by the above). - for _, sffd := range sffds { - if !sffd.vfsfd.TryIncRef() { - continue - } - err := sffd.Sync(ctx) - sffd.vfsfd.DecRef() - if err != nil && retErr == nil { - retErr = err - } - } - - return retErr -} - -// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's -// encoding of strings, which uses 2 bytes for the length prefix. -const maxFilenameLen = (1 << 16) - 1 - -// dentrySlicePool is a pool of *[]*dentry used to store dentries for which -// dentry.checkCachingLocked() must be called. The pool holds pointers to -// slices because Go lacks generics, so sync.Pool operates on interface{}, so -// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy -// of the slice header on the heap. -var dentrySlicePool = sync.Pool{ - New: func() interface{} { - ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity - return &ds - }, -} - -func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { - if ds == nil { - ds = dentrySlicePool.Get().(*[]*dentry) - } - *ds = append(*ds, d) - return ds -} - -// Preconditions: ds != nil. -func putDentrySlice(ds *[]*dentry) { - // Allow dentries to be GC'd. - for i := range *ds { - (*ds)[i] = nil - } - *ds = (*ds)[:0] - dentrySlicePool.Put(ds) -} - -// stepLocked resolves rp.Component() to an existing file, starting from the -// given directory. -// -// Dentries which may become cached as a result of the traversal are appended -// to *ds. -// -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached -// metadata must be up to date. -func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { - if !d.isDir() { - return nil, syserror.ENOTDIR - } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } -afterSymlink: - name := rp.Component() - if name == "." { - rp.Advance() - return d, nil - } - if name == ".." { - parentVFSD, err := rp.ResolveParent(&d.vfsd) - if err != nil { - return nil, err - } - parent := parentVFSD.Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // We must assume that parentVFSD is correct, because if d has been - // moved elsewhere in the remote filesystem so that its parent has - // changed, we have no way of determining its new parent's location - // in the filesystem. Get updated metadata for parentVFSD. - _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) - if err != nil { - return nil, err - } - parent.updateFromP9Attrs(attrMask, &attr) - } - rp.Advance() - return parent, nil - } - childVFSD, err := rp.ResolveChild(&d.vfsd, name) - if err != nil { - return nil, err - } - // FIXME(jamieliu): Linux performs revalidation before mount lookup - // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), - // __follow_mount_rcu()). - child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) - if err != nil { - return nil, err - } - if child == nil { - return nil, syserror.ENOENT - } - if child.isSymlink() && rp.ShouldFollowSymlink() { - target, err := child.readlink(ctx, rp.Mount()) - if err != nil { - return nil, err - } - if err := rp.HandleSymlink(target); err != nil { - return nil, err - } - goto afterSymlink // don't check the current directory again - } - rp.Advance() - return child, nil -} - -// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) -// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be -// nil) to verify that the returned child (or lack thereof) is correct. If no file -// exists at name, revalidateChildLocked returns (nil, nil). -// -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. -// parent.isDir(). name is not "." or "..". -// -// Postconditions: If revalidateChildLocked returns a non-nil dentry, its -// cached metadata is up to date. -func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { - if childVFSD != nil && fs.opts.interop != InteropModeShared { - // We have a cached dentry that is assumed to be correct. - return childVFSD.Impl().(*dentry), nil - } - // We either don't have a cached dentry or need to verify that it's still - // correct, either of which requires a remote lookup. Check if this name is - // valid before performing the lookup. - if len(name) > maxFilenameLen { - return nil, syserror.ENAMETOOLONG - } - // Check if we've already cached this lookup with a negative result. - if _, ok := parent.negativeChildren[name]; ok { - return nil, nil - } - // Perform the remote lookup. - qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) - if err != nil && err != syserror.ENOENT { - return nil, err - } - if childVFSD != nil { - child := childVFSD.Impl().(*dentry) - if !file.isNil() && qid.Path == child.ino { - // The file at this path hasn't changed. Just update cached - // metadata. - file.close(ctx) - child.updateFromP9Attrs(attrMask, &attr) - return child, nil - } - // The file at this path has changed or no longer exists. Remove - // the stale dentry from the tree, and re-evaluate its caching - // status (i.e. if it has 0 references, drop it). - vfsObj.ForceDeleteDentry(childVFSD) - *ds = appendDentry(*ds, child) - childVFSD = nil - } - if file.isNil() { - // No file exists at this path now. Cache the negative lookup if - // allowed. - if fs.opts.interop != InteropModeShared { - parent.cacheNegativeChildLocked(name) - } - return nil, nil - } - // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) - if err != nil { - file.close(ctx) - return nil, err - } - parent.IncRef() // reference held by child on its parent - parent.vfsd.InsertChild(&child.vfsd, name) - // For now, child has 0 references, so our caller should call - // child.checkCachingLocked(). - *ds = appendDentry(*ds, child) - return child, nil -} - -// walkParentDirLocked resolves all but the last path component of rp to an -// existing directory, starting from the given directory (which is usually -// rp.Start().Impl().(*dentry)). It does not check that the returned directory -// is searchable by the provider of rp. -// -// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == -// InteropModeShared, then d's cached metadata must be up to date. -func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { - for !rp.Final() { - d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) - d.dirMu.Unlock() - if err != nil { - return nil, err - } - d = next - } - if !d.isDir() { - return nil, syserror.ENOTDIR - } - return d, nil -} - -// resolveLocked resolves rp to an existing file. -// -// Preconditions: fs.renameMu must be locked. -func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { - d := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // Get updated metadata for rp.Start() as required by fs.stepLocked(). - if err := d.updateFromGetattr(ctx); err != nil { - return nil, err - } - } - for !rp.Done() { - d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) - d.dirMu.Unlock() - if err != nil { - return nil, err - } - d = next - } - if rp.MustBeDir() && !d.isDir() { - return nil, syserror.ENOTDIR - } - return d, nil -} - -// doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. -// -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // Get updated metadata for start as required by - // fs.walkParentDirLocked(). - if err := start.updateFromGetattr(ctx); err != nil { - return err - } - } - parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) - if err != nil { - return err - } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return err - } - if parent.isDeleted() { - return syserror.ENOENT - } - name := rp.Component() - if name == "." || name == ".." { - return syserror.EEXIST - } - if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - parent.dirMu.Lock() - defer parent.dirMu.Unlock() - if fs.opts.interop == InteropModeShared { - // The existence of a dentry at name would be inconclusive because the - // file it represents may have been deleted from the remote filesystem, - // so we would need to make an RPC to revalidate the dentry. Just - // attempt the file creation RPC instead. If a file does exist, the RPC - // will fail with EEXIST like we would have. If the RPC succeeds, and a - // stale dentry exists, the dentry will fail revalidation next time - // it's used. - return create(parent, name) - } - if parent.vfsd.Child(name) != nil { - return syserror.EEXIST - } - // No cached dentry exists; however, there might still be an existing file - // at name. As above, we attempt the file creation RPC anyway. - if err := create(parent, name); err != nil { - return err - } - parent.touchCMtime(ctx) - delete(parent.negativeChildren, name) - parent.dirents = nil - return nil -} - -// Preconditions: !rp.Done(). -func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // Get updated metadata for start as required by - // fs.walkParentDirLocked(). - if err := start.updateFromGetattr(ctx); err != nil { - return err - } - } - parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) - if err != nil { - return err - } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - - name := rp.Component() - if dir { - if name == "." { - return syserror.EINVAL - } - if name == ".." { - return syserror.ENOTEMPTY - } - } else { - if name == "." || name == ".." { - return syserror.EISDIR - } - } - vfsObj := rp.VirtualFilesystem() - mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() - parent.dirMu.Lock() - defer parent.dirMu.Unlock() - childVFSD := parent.vfsd.Child(name) - var child *dentry - // We only need a dentry representing the file at name if it can be a mount - // point. If childVFSD is nil, then it can't be a mount point. If childVFSD - // is non-nil but stale, the actual file can't be a mount point either; we - // detect this case by just speculatively calling PrepareDeleteDentry and - // only revalidating the dentry if that fails (indicating that the existing - // dentry is a mount point). - if childVFSD != nil { - child = childVFSD.Impl().(*dentry) - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) - if err != nil { - return err - } - if child != nil { - childVFSD = &child.vfsd - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - return err - } - } else { - childVFSD = nil - } - } - } else if _, ok := parent.negativeChildren[name]; ok { - return syserror.ENOENT - } - flags := uint32(0) - if dir { - if child != nil && !child.isDir() { - return syserror.ENOTDIR - } - flags = linux.AT_REMOVEDIR - } else { - if child != nil && child.isDir() { - return syserror.EISDIR - } - if rp.MustBeDir() { - return syserror.ENOTDIR - } - } - err = parent.file.unlinkAt(ctx, name, flags) - if err != nil { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) - } - return err - } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime(ctx) - parent.cacheNegativeChildLocked(name) - parent.dirents = nil - } - if child != nil { - child.setDeleted() - vfsObj.CommitDeleteDentry(childVFSD) - ds = appendDentry(ds, child) - } - return nil -} - -// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls -// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for -// writing. -// -// ds is a pointer-to-pointer since defer evaluates its arguments immediately, -// but dentry slices are allocated lazily, and it's much easier to say "defer -// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { -// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { - fs.renameMu.RUnlock() - if *ds == nil { - return - } - if len(**ds) != 0 { - fs.renameMu.Lock() - for _, d := range **ds { - d.checkCachingLocked() - } - fs.renameMu.Unlock() - } - putDentrySlice(*ds) -} - -func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { - if *ds == nil { - fs.renameMu.Unlock() - return - } - for _, d := range **ds { - d.checkCachingLocked() - } - fs.renameMu.Unlock() - putDentrySlice(*ds) -} - -// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return nil, err - } - if opts.CheckSearchable { - if !d.isDir() { - return nil, syserror.ENOTDIR - } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } - } - d.IncRef() - return &d.vfsd, nil -} - -// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. -func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // Get updated metadata for start as required by - // fs.walkParentDirLocked(). - if err := start.updateFromGetattr(ctx); err != nil { - return nil, err - } - } - d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) - if err != nil { - return nil, err - } - d.IncRef() - return &d.vfsd, nil -} - -// LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { - if rp.Mount() != vd.Mount() { - return syserror.EXDEV - } - // 9P2000.L supports hard links, but we don't. - return syserror.EPERM - }) -} - -// MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() - _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err - }) -} - -// MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() - _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err - }) -} - -// OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - // Reject O_TMPFILE, which is not supported; supporting it correctly in the - // presence of other remote filesystem users requires remote filesystem - // support, and it isn't clear that there's any way to implement this in - // 9P. - if opts.Flags&linux.O_TMPFILE != 0 { - return nil, syserror.EOPNOTSUPP - } - mayCreate := opts.Flags&linux.O_CREAT != 0 - mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) - - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - - start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // Get updated metadata for start as required by fs.stepLocked(). - if err := start.updateFromGetattr(ctx); err != nil { - return nil, err - } - } - if rp.Done() { - return start.openLocked(ctx, rp, &opts) - } - -afterTrailingSymlink: - parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) - if err != nil { - return nil, err - } - // Check for search permission in the parent directory. - if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } - // Determine whether or not we need to create a file. - parent.dirMu.Lock() - child, err := fs.stepLocked(ctx, rp, parent, &ds) - if err == syserror.ENOENT && mayCreate { - fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) - parent.dirMu.Unlock() - return fd, err - } - if err != nil { - parent.dirMu.Unlock() - return nil, err - } - // Open existing child or follow symlink. - parent.dirMu.Unlock() - if mustCreate { - return nil, syserror.EEXIST - } - if child.isSymlink() && rp.ShouldFollowSymlink() { - target, err := child.readlink(ctx, rp.Mount()) - if err != nil { - return nil, err - } - if err := rp.HandleSymlink(target); err != nil { - return nil, err - } - start = parent - goto afterTrailingSymlink - } - return child.openLocked(ctx, rp, &opts) -} - -// Preconditions: fs.renameMu must be locked. -func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts) - if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { - return nil, err - } - mnt := rp.Mount() - filetype := d.fileType() - switch { - case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { - return nil, err - } - fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case filetype == linux.S_IFDIR: - // Can't open directories with O_CREAT. - if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR - } - // Can't open directories writably. - if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR - } - if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL - } - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { - return nil, err - } - fd := &directoryFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil - case filetype == linux.S_IFLNK: - // Can't open symlinks without O_PATH (which is unimplemented). - return nil, syserror.ELOOP - default: - if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL - } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) - if err != nil { - return nil, err - } - fd := &specialFileFD{ - handle: h, - } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - h.close(ctx) - return nil, err - } - return &fd.vfsfd, nil - } -} - -// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. -func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { - return nil, err - } - if d.isDeleted() { - return nil, syserror.ENOENT - } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - defer mnt.EndWrite() - - // 9P2000.L's lcreate takes a fid representing the parent directory, and - // converts it into an open fid representing the created file, so we need - // to duplicate the directory fid first. - _, dirfile, err := d.file.walk(ctx, nil) - if err != nil { - return nil, err - } - creds := rp.Credentials() - name := rp.Component() - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - if err != nil { - dirfile.close(ctx) - return nil, err - } - // Then we need to walk to the file we just created to get a non-open fid - // representing it, and to get its metadata. This must use d.file since, as - // explained above, dirfile was invalidated by dirfile.Create(). - walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) - if err != nil { - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, err - } - // Sanity-check that we walked to the file we created. - if createQID.Path != walkQID.Path { - // Probably due to concurrent remote filesystem mutation? - ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, syserror.EAGAIN - } - - // Construct the new dentry. - child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) - if err != nil { - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, err - } - // Incorporate the fid that was opened by lcreate. - useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD - if useRegularFileFD { - child.handleMu.Lock() - child.handle.file = openFile - if fdobj != nil { - child.handle.fd = int32(fdobj.Release()) - } - child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) - child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) - child.handleMu.Unlock() - } - // Take a reference on the new dentry to be held by the new file - // description. (This reference also means that the new dentry is not - // eligible for caching yet, so we don't need to append to a dentry slice.) - child.refs = 1 - // Insert the dentry into the tree. - d.IncRef() // reference held by child on its parent d - d.vfsd.InsertChild(&child.vfsd, name) - if d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) - delete(d.negativeChildren, name) - d.dirents = nil - } - - // Finally, construct a file description representing the created file. - var childVFSFD *vfs.FileDescription - mnt.IncRef() - if useRegularFileFD { - fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { - return nil, err - } - childVFSFD = &fd.vfsfd - } else { - fd := &specialFileFD{ - handle: handle{ - file: openFile, - fd: -1, - }, - } - if fdobj != nil { - fd.handle.fd = int32(fdobj.Release()) - } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - fd.handle.close(ctx) - return nil, err - } - childVFSFD = &fd.vfsfd - } - return childVFSFD, nil -} - -// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return "", err - } - if !d.isSymlink() { - return "", syserror.EINVAL - } - return d.readlink(ctx, rp.Mount()) -} - -// RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if opts.Flags != 0 { - // Requires 9P support. - return syserror.EINVAL - } - - var ds *[]*dentry - fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckCaching(&ds) - newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) - if err != nil { - return err - } - newName := rp.Component() - if newName == "." || newName == ".." { - return syserror.EBUSY - } - mnt := rp.Mount() - if mnt != oldParentVD.Mount() { - return syserror.EXDEV - } - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - - oldParent := oldParentVD.Dentry().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - if err := oldParent.updateFromGetattr(ctx); err != nil { - return err - } - } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return err - } - vfsObj := rp.VirtualFilesystem() - // We need a dentry representing the renamed file since, if it's a - // directory, we need to check for write permission on it. - oldParent.dirMu.Lock() - defer oldParent.dirMu.Unlock() - renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) - if err != nil { - return err - } - if renamed == nil { - return syserror.ENOENT - } - if renamed.isDir() { - if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { - return syserror.EINVAL - } - if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { - return err - } - } - } else { - if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR - } - } - - if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return err - } - newParent.dirMu.Lock() - defer newParent.dirMu.Unlock() - } - if newParent.isDeleted() { - return syserror.ENOENT - } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - // This is similar to unlinkAt, except: - // - // - We revalidate the replaced dentry unconditionally for simplicity. - // - // - If rp.MustBeDir(), then we need a dentry representing the replaced - // file regardless to confirm that it's a directory. - if replacedVFSD != nil || rp.MustBeDir() { - replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) - if err != nil { - return err - } - if replaced != nil { - if replaced.isDir() { - if !renamed.isDir() { - return syserror.EISDIR - } - } else { - if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR - } - } - replacedVFSD = &replaced.vfsd - } else { - replacedVFSD = nil - } - } - - if oldParent == newParent && oldName == newName { - return nil - } - mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() - if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { - return err - } - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { - vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) - return err - } - if fs.opts.interop != InteropModeShared { - oldParent.cacheNegativeChildLocked(oldName) - oldParent.dirents = nil - delete(newParent.negativeChildren, newName) - newParent.dirents = nil - } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) - return nil -} - -// RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - return fs.unlinkAt(ctx, rp, true /* dir */) -} - -// SetStatAt implements vfs.FilesystemImpl.SetStatAt. -func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return err - } - return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) -} - -// StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return linux.Statx{}, err - } - // Since walking updates metadata for all traversed dentries under - // InteropModeShared, including the returned one, we can return cached - // metadata here regardless of fs.opts.interop. - var stat linux.Statx - d.statTo(&stat) - return stat, nil -} - -// StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return linux.Statfs{}, err - } - fsstat, err := d.file.statFS(ctx) - if err != nil { - return linux.Statfs{}, err - } - nameLen := uint64(fsstat.NameLength) - if nameLen > maxFilenameLen { - nameLen = maxFilenameLen - } - return linux.Statfs{ - // This is primarily for distinguishing a gofer file system in - // tests. Testing is important, so instead of defining - // something completely random, use a standard value. - Type: linux.V9FS_MAGIC, - BlockSize: int64(fsstat.BlockSize), - Blocks: fsstat.Blocks, - BlocksFree: fsstat.BlocksFree, - BlocksAvailable: fsstat.BlocksAvailable, - Files: fsstat.Files, - FilesFree: fsstat.FilesFree, - NameLength: nameLen, - }, nil -} - -// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. -func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() - _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err - }) -} - -// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - return fs.unlinkAt(ctx, rp, false /* dir */) -} - -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return nil, err - } - return d.listxattr(ctx) -} - -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return "", err - } - return d.getxattr(ctx, name) -} - -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return err - } - return d.setxattr(ctx, &opts) -} - -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - var ds *[]*dentry - fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) - d, err := fs.resolveLocked(ctx, rp, &ds) - if err != nil { - return err - } - return d.removexattr(ctx, name) -} - -// PrependPath implements vfs.FilesystemImpl.PrependPath. -func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { - fs.renameMu.RLock() - defer fs.renameMu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) -} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go deleted file mode 100644 index c4a8f0b38..000000000 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ /dev/null @@ -1,1150 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package gofer provides a filesystem implementation that is backed by a 9p -// server, interchangably referred to as "gofers" throughout this package. -// -// Lock order: -// regularFileFD/directoryFD.mu -// filesystem.renameMu -// dentry.dirMu -// filesystem.syncMu -// dentry.metadataMu -// *** "memmap.Mappable locks" below this point -// dentry.mapsMu -// *** "memmap.Mappable locks taken by Translate" below this point -// dentry.handleMu -// dentry.dataMu -// -// Locking dentry.dirMu in multiple dentries requires holding -// filesystem.renameMu for writing. -package gofer - -import ( - "fmt" - "strconv" - "sync" - "sync/atomic" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/unet" - "gvisor.dev/gvisor/pkg/usermem" -) - -// Name is the default filesystem name. -const Name = "9p" - -// FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} - -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - vfsfs vfs.Filesystem - - // mfp is used to allocate memory that caches regular file contents. mfp is - // immutable. - mfp pgalloc.MemoryFileProvider - - // Immutable options. - opts filesystemOptions - - // client is the client used by this filesystem. client is immutable. - client *p9.Client - - // uid and gid are the effective KUID and KGID of the filesystem's creator, - // and are used as the owner and group for files that don't specify one. - // uid and gid are immutable. - uid auth.KUID - gid auth.KGID - - // renameMu serves two purposes: - // - // - It synchronizes path resolution with renaming initiated by this - // client. - // - // - It is held by path resolution to ensure that reachable dentries remain - // valid. A dentry is reachable by path resolution if it has a non-zero - // reference count (such that it is usable as vfs.ResolvingPath.Start() or - // is reachable from its children), or if it is a child dentry (such that - // it is reachable from its parent). - renameMu sync.RWMutex - - // cachedDentries contains all dentries with 0 references. (Due to race - // conditions, it may also contain dentries with non-zero references.) - // cachedDentriesLen is the number of dentries in cachedDentries. These - // fields are protected by renameMu. - cachedDentries dentryList - cachedDentriesLen uint64 - - // dentries contains all dentries in this filesystem. specialFileFDs - // contains all open specialFileFDs. These fields are protected by syncMu. - syncMu sync.Mutex - dentries map[*dentry]struct{} - specialFileFDs map[*specialFileFD]struct{} -} - -type filesystemOptions struct { - // "Standard" 9P options. - fd int - aname string - interop InteropMode // derived from the "cache" mount option - msize uint32 - version string - - // maxCachedDentries is the maximum number of dentries with 0 references - // retained by the client. - maxCachedDentries uint64 - - // If forcePageCache is true, host FDs may not be used for application - // memory mappings even if available; instead, the client must perform its - // own caching of regular file pages. This is primarily useful for testing. - forcePageCache bool - - // If limitHostFDTranslation is true, apply maxFillRange() constraints to - // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This - // makes memory accounting behavior more consistent between cases where - // host FDs are / are not available, but may increase the frequency of - // sentry-handled page faults on files for which a host FD is available. - limitHostFDTranslation bool - - // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote - // filesystem may not be coherent with writable host FDs opened later, so - // mappings of the former must be replaced by mappings of the latter. This - // is usually only the case when the remote filesystem is an overlayfs - // mount on Linux < 4.19. - overlayfsStaleRead bool - - // If regularFilesUseSpecialFileFD is true, application FDs representing - // regular files will use distinct file handles for each FD, in the same - // way that application FDs representing "special files" such as sockets - // do. Note that this disables client caching and mmap for regular files. - regularFilesUseSpecialFileFD bool -} - -// InteropMode controls the client's interaction with other remote filesystem -// users. -type InteropMode uint32 - -const ( - // InteropModeExclusive is appropriate when the filesystem client is the - // only user of the remote filesystem. - // - // - The client may cache arbitrary filesystem state (file data, metadata, - // filesystem structure, etc.). - // - // - Client changes to filesystem state may be sent to the remote - // filesystem asynchronously, except when server permission checks are - // necessary. - // - // - File timestamps are based on client clocks. This ensures that users of - // the client observe timestamps that are coherent with their own clocks - // and consistent with Linux's semantics. However, since it is not always - // possible for clients to set arbitrary atimes and mtimes, and never - // possible for clients to set arbitrary ctimes, file timestamp changes are - // stored in the client only and never sent to the remote filesystem. - InteropModeExclusive InteropMode = iota - - // InteropModeWritethrough is appropriate when there are read-only users of - // the remote filesystem that expect to observe changes made by the - // filesystem client. - // - // - The client may cache arbitrary filesystem state. - // - // - Client changes to filesystem state must be sent to the remote - // filesystem synchronously. - // - // - File timestamps are based on client clocks. As a corollary, access - // timestamp changes from other remote filesystem users will not be visible - // to the client. - InteropModeWritethrough - - // InteropModeShared is appropriate when there are users of the remote - // filesystem that may mutate its state other than the client. - // - // - The client must verify cached filesystem state before using it. - // - // - Client changes to filesystem state must be sent to the remote - // filesystem synchronously. - // - // - File timestamps are based on server clocks. This is necessary to - // ensure that timestamp changes are synchronized between remote filesystem - // users. - // - // Note that the correctness of InteropModeShared depends on the server - // correctly implementing 9P fids (i.e. each fid immutably represents a - // single filesystem object), even in the presence of remote filesystem - // mutations from other users. If this is violated, the behavior of the - // client is undefined. - InteropModeShared -) - -// GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - mfp := pgalloc.MemoryFileProviderFromContext(ctx) - if mfp == nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") - return nil, nil, syserror.EINVAL - } - - mopts := vfs.GenericParseMountOptions(opts.Data) - var fsopts filesystemOptions - - // Check that the transport is "fd". - trans, ok := mopts["trans"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") - return nil, nil, syserror.EINVAL - } - delete(mopts, "trans") - if trans != "fd" { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) - return nil, nil, syserror.EINVAL - } - - // Check that read and write FDs are provided and identical. - rfdstr, ok := mopts["rfdno"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>") - return nil, nil, syserror.EINVAL - } - delete(mopts, "rfdno") - rfd, err := strconv.Atoi(rfdstr) - if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) - return nil, nil, syserror.EINVAL - } - wfdstr, ok := mopts["wfdno"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>") - return nil, nil, syserror.EINVAL - } - delete(mopts, "wfdno") - wfd, err := strconv.Atoi(wfdstr) - if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) - return nil, nil, syserror.EINVAL - } - if rfd != wfd { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) - return nil, nil, syserror.EINVAL - } - fsopts.fd = rfd - - // Get the attach name. - fsopts.aname = "/" - if aname, ok := mopts["aname"]; ok { - delete(mopts, "aname") - fsopts.aname = aname - } - - // Parse the cache policy. For historical reasons, this defaults to the - // least generally-applicable option, InteropModeExclusive. - fsopts.interop = InteropModeExclusive - if cache, ok := mopts["cache"]; ok { - delete(mopts, "cache") - switch cache { - case "fscache": - fsopts.interop = InteropModeExclusive - case "fscache_writethrough": - fsopts.interop = InteropModeWritethrough - case "none": - fsopts.regularFilesUseSpecialFileFD = true - fallthrough - case "remote_revalidating": - fsopts.interop = InteropModeShared - default: - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache) - return nil, nil, syserror.EINVAL - } - } - - // Parse the 9P message size. - fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M - if msizestr, ok := mopts["msize"]; ok { - delete(mopts, "msize") - msize, err := strconv.ParseUint(msizestr, 10, 32) - if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr) - return nil, nil, syserror.EINVAL - } - fsopts.msize = uint32(msize) - } - - // Parse the 9P protocol version. - fsopts.version = p9.HighestVersionString() - if version, ok := mopts["version"]; ok { - delete(mopts, "version") - fsopts.version = version - } - - // Parse the dentry cache limit. - fsopts.maxCachedDentries = 1000 - if str, ok := mopts["dentry_cache_limit"]; ok { - delete(mopts, "dentry_cache_limit") - maxCachedDentries, err := strconv.ParseUint(str, 10, 64) - if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) - return nil, nil, syserror.EINVAL - } - fsopts.maxCachedDentries = maxCachedDentries - } - - // Handle simple flags. - if _, ok := mopts["force_page_cache"]; ok { - delete(mopts, "force_page_cache") - fsopts.forcePageCache = true - } - if _, ok := mopts["limit_host_fd_translation"]; ok { - delete(mopts, "limit_host_fd_translation") - fsopts.limitHostFDTranslation = true - } - if _, ok := mopts["overlayfs_stale_read"]; ok { - delete(mopts, "overlayfs_stale_read") - fsopts.overlayfsStaleRead = true - } - // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying - // "cache=none". - - // Check for unparsed options. - if len(mopts) != 0 { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) - return nil, nil, syserror.EINVAL - } - - // Establish a connection with the server. - conn, err := unet.NewSocket(fsopts.fd) - if err != nil { - return nil, nil, err - } - - // Perform version negotiation with the server. - ctx.UninterruptibleSleepStart(false) - client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - conn.Close() - return nil, nil, err - } - // Ownership of conn has been transferred to client. - - // Perform attach to obtain the filesystem root. - ctx.UninterruptibleSleepStart(false) - attached, err := client.Attach(fsopts.aname) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - client.Close() - return nil, nil, err - } - attachFile := p9file{attached} - qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) - if err != nil { - attachFile.close(ctx) - client.Close() - return nil, nil, err - } - - // Construct the filesystem object. - fs := &filesystem{ - mfp: mfp, - opts: fsopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, - client: client, - dentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), - } - fs.vfsfs.Init(vfsObj, fs) - - // Construct the root dentry. - root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) - if err != nil { - attachFile.close(ctx) - fs.vfsfs.DecRef() - return nil, nil, err - } - // Set the root's reference count to 2. One reference is returned to the - // caller, and the other is deliberately leaked to prevent the root from - // being "cached" and subsequently evicted. Its resources will still be - // cleaned up by fs.Release(). - root.refs = 2 - - return &fs.vfsfs, &root.vfsd, nil -} - -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { - ctx := context.Background() - mf := fs.mfp.MemoryFile() - - fs.syncMu.Lock() - for d := range fs.dentries { - d.handleMu.Lock() - d.dataMu.Lock() - if d.handleWritable { - // Write dirty cached data to the remote file. - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) - } - // TODO(jamieliu): Do we need to flushf/fsync d? - } - // Discard cached pages. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - d.dataMu.Unlock() - // Close the host fd if one exists. - if d.handle.fd >= 0 { - syscall.Close(int(d.handle.fd)) - d.handle.fd = -1 - } - d.handleMu.Unlock() - } - // There can't be any specialFileFDs still using fs, since each such - // FileDescription would hold a reference on a Mount holding a reference on - // fs. - fs.syncMu.Unlock() - - // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() -} - -// dentry implements vfs.DentryImpl. -type dentry struct { - vfsd vfs.Dentry - - // refs is the reference count. Each dentry holds a reference on its - // parent, even if disowned. refs is accessed using atomic memory - // operations. - refs int64 - - // fs is the owning filesystem. fs is immutable. - fs *filesystem - - // We don't support hard links, so each dentry maps 1:1 to an inode. - - // file is the unopened p9.File that backs this dentry. file is immutable. - file p9file - - // If deleted is non-zero, the file represented by this dentry has been - // deleted. deleted is accessed using atomic memory operations. - deleted uint32 - - // If cached is true, dentryEntry links dentry into - // filesystem.cachedDentries. cached and dentryEntry are protected by - // filesystem.renameMu. - cached bool - dentryEntry - - dirMu sync.Mutex - - // If this dentry represents a directory, and InteropModeShared is not in - // effect, negativeChildren is a set of child names in this directory that - // are known not to exist. negativeChildren is protected by dirMu. - negativeChildren map[string]struct{} - - // If this dentry represents a directory, InteropModeShared is not in - // effect, and dirents is not nil, it is a cache of all entries in the - // directory, in the order they were returned by the server. dirents is - // protected by dirMu. - dirents []vfs.Dirent - - // Cached metadata; protected by metadataMu and accessed using atomic - // memory operations unless otherwise specified. - metadataMu sync.Mutex - ino uint64 // immutable - mode uint32 // type is immutable, perms are mutable - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - blockSize uint32 // 0 if unknown - // Timestamps, all nsecs from the Unix epoch. - atime int64 - mtime int64 - ctime int64 - btime int64 - // File size, protected by both metadataMu and dataMu (i.e. both must be - // locked to mutate it). - size uint64 - - mapsMu sync.Mutex - - // If this dentry represents a regular file, mappings tracks mappings of - // the file into memmap.MappingSpaces. mappings is protected by mapsMu. - mappings memmap.MappingSet - - // If this dentry represents a regular file or directory: - // - // - handle is the I/O handle used by all regularFileFDs/directoryFDs - // representing this dentry. - // - // - handleReadable is true if handle is readable. - // - // - handleWritable is true if handle is writable. - // - // Invariants: - // - // - If handleReadable == handleWritable == false, then handle.file == nil - // (i.e. there is no open handle). Conversely, if handleReadable || - // handleWritable == true, then handle.file != nil (i.e. there is an open - // handle). - // - // - handleReadable and handleWritable cannot transition from true to false - // (i.e. handles may not be downgraded). - // - // These fields are protected by handleMu. - handleMu sync.RWMutex - handle handle - handleReadable bool - handleWritable bool - - dataMu sync.RWMutex - - // If this dentry represents a regular file that is client-cached, cache - // maps offsets into the cached file to offsets into - // filesystem.mfp.MemoryFile() that store the file's data. cache is - // protected by dataMu. - cache fsutil.FileRangeSet - - // If this dentry represents a regular file that is client-cached, dirty - // tracks dirty segments in cache. dirty is protected by dataMu. - dirty fsutil.DirtySet - - // pf implements platform.File for mappings of handle.fd. - pf dentryPlatformFile - - // If this dentry represents a symbolic link, InteropModeShared is not in - // effect, and haveTarget is true, target is the symlink target. haveTarget - // and target are protected by dataMu. - haveTarget bool - target string -} - -// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the -// gofer client. -func dentryAttrMask() p9.AttrMask { - return p9.AttrMask{ - Mode: true, - UID: true, - GID: true, - ATime: true, - MTime: true, - CTime: true, - Size: true, - BTime: true, - } -} - -// newDentry creates a new dentry representing the given file. The dentry -// initially has no references, but is not cached; it is the caller's -// responsibility to set the dentry's reference count and/or call -// dentry.checkCachingLocked() as appropriate. -func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { - if !mask.Mode { - ctx.Warningf("can't create gofer.dentry without file type") - return nil, syserror.EIO - } - if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { - ctx.Warningf("can't create regular file gofer.dentry without file size") - return nil, syserror.EIO - } - - d := &dentry{ - fs: fs, - file: file, - ino: qid.Path, - mode: uint32(attr.Mode), - uid: uint32(fs.uid), - gid: uint32(fs.gid), - blockSize: usermem.PageSize, - handle: handle{ - fd: -1, - }, - } - d.pf.dentry = d - if mask.UID { - d.uid = uint32(attr.UID) - } - if mask.GID { - d.gid = uint32(attr.GID) - } - if mask.Size { - d.size = attr.Size - } - if attr.BlockSize != 0 { - d.blockSize = uint32(attr.BlockSize) - } - if mask.ATime { - d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds) - } - if mask.MTime { - d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds) - } - if mask.CTime { - d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds) - } - if mask.BTime { - d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) - } - d.vfsd.Init(d) - - fs.syncMu.Lock() - fs.dentries[d] = struct{}{} - fs.syncMu.Unlock() - return d, nil -} - -// updateFromP9Attrs is called to update d's metadata after an update from the -// remote filesystem. -func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { - d.metadataMu.Lock() - if mask.Mode { - if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { - d.metadataMu.Unlock() - panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) - } - atomic.StoreUint32(&d.mode, uint32(attr.Mode)) - } - if mask.UID { - atomic.StoreUint32(&d.uid, uint32(attr.UID)) - } - if mask.GID { - atomic.StoreUint32(&d.gid, uint32(attr.GID)) - } - // There is no P9_GETATTR_* bit for I/O block size. - if attr.BlockSize != 0 { - atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) - } - if mask.ATime { - atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) - } - if mask.MTime { - atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) - } - if mask.CTime { - atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)) - } - if mask.BTime { - atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) - } - if mask.Size { - d.dataMu.Lock() - atomic.StoreUint64(&d.size, attr.Size) - d.dataMu.Unlock() - } - d.metadataMu.Unlock() -} - -func (d *dentry) updateFromGetattr(ctx context.Context) error { - // Use d.handle.file, which represents a 9P fid that has been opened, in - // preference to d.file, which represents a 9P fid that has not. This may - // be significantly more efficient in some implementations. - var ( - file p9file - handleMuRLocked bool - ) - d.handleMu.RLock() - if !d.handle.file.isNil() { - file = d.handle.file - handleMuRLocked = true - } else { - file = d.file - d.handleMu.RUnlock() - } - _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) - if handleMuRLocked { - d.handleMu.RUnlock() - } - if err != nil { - return err - } - d.updateFromP9Attrs(attrMask, &attr) - return nil -} - -func (d *dentry) fileType() uint32 { - return atomic.LoadUint32(&d.mode) & linux.S_IFMT -} - -func (d *dentry) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME - stat.Blksize = atomic.LoadUint32(&d.blockSize) - stat.Nlink = 1 - if d.isDir() { - stat.Nlink = 2 - } - stat.UID = atomic.LoadUint32(&d.uid) - stat.GID = atomic.LoadUint32(&d.gid) - stat.Mode = uint16(atomic.LoadUint32(&d.mode)) - stat.Ino = d.ino - stat.Size = atomic.LoadUint64(&d.size) - // This is consistent with regularFileFD.Seek(), which treats regular files - // as having no holes. - stat.Blocks = (stat.Size + 511) / 512 - stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) - stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) - stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) - stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(jamieliu): device number -} - -func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { - if stat.Mask == 0 { - return nil - } - if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { - return syserror.EPERM - } - if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { - return err - } - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - setLocalAtime := false - setLocalMtime := false - if d.fs.opts.interop != InteropModeShared { - // Timestamp updates will be handled locally. - setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 - setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 - stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW - } - } - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - if stat.Mask != 0 { - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - return err - } - } - if d.fs.opts.interop == InteropModeShared { - // There's no point to updating d's metadata in this case since it'll - // be overwritten by revalidation before the next time it's used - // anyway. (InteropModeShared inhibits client caching of regular file - // data, so there's no cache to truncate either.) - return nil - } - now, haveNow := nowFromContext(ctx) - if !haveNow { - ctx.Warningf("gofer.dentry.setStat: current time not available") - } - if stat.Mask&linux.STATX_MODE != 0 { - atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) - } - if stat.Mask&linux.STATX_UID != 0 { - atomic.StoreUint32(&d.uid, stat.UID) - } - if stat.Mask&linux.STATX_GID != 0 { - atomic.StoreUint32(&d.gid, stat.GID) - } - if setLocalAtime { - if stat.Atime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.atime, now) - } - } else { - atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) - } - } - if setLocalMtime { - if stat.Mtime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.mtime, now) - } - } else { - atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) - } - } - if haveNow { - atomic.StoreInt64(&d.ctime, now) - } - if stat.Mask&linux.STATX_SIZE != 0 { - d.dataMu.Lock() - oldSize := d.size - d.size = stat.Size - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings - // below. This allows concurrent calls to Read/Translate/etc. These - // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, - // so we can't race with Write or another truncate.) - d.dataMu.Unlock() - if d.size < oldSize { - oldpgend := pageRoundUp(oldSize) - newpgend := pageRoundUp(d.size) - if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ - // Compare Linux's mm/truncate.c:truncate_setsize() => - // truncate_pagecache() => - // mm/memory.c:unmap_mapping_range(evencows=1). - InvalidatePrivate: true, - }) - d.mapsMu.Unlock() - } - // We are now guaranteed that there are no translations of - // truncated pages, and can remove them from the cache. Since - // truncated pages have been removed from the remote file, they - // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) - d.dataMu.Unlock() - } - } - return nil -} - -func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) -} - -// IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef() { - // d.refs may be 0 if d.fs.renameMu is locked, which serializes against - // d.checkCachingLocked(). - atomic.AddInt64(&d.refs, 1) -} - -// TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&d.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { - return true - } - } -} - -// DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { - d.fs.renameMu.Lock() - d.checkCachingLocked() - d.fs.renameMu.Unlock() - } else if refs < 0 { - panic("gofer.dentry.DecRef() called without holding a reference") - } -} - -// checkCachingLocked should be called after d's reference count becomes 0 or it -// becomes disowned. -// -// Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkCachingLocked() { - // Dentries with a non-zero reference count must be retained. (The only way - // to obtain a reference on a dentry with zero references is via path - // resolution, which requires renameMu, so if d.refs is zero then it will - // remain zero while we hold renameMu for writing.) - if atomic.LoadInt64(&d.refs) != 0 { - if d.cached { - d.fs.cachedDentries.Remove(d) - d.fs.cachedDentriesLen-- - d.cached = false - } - return - } - // Non-child dentries with zero references are no longer reachable by path - // resolution and should be dropped immediately. - if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { - if d.cached { - d.fs.cachedDentries.Remove(d) - d.fs.cachedDentriesLen-- - d.cached = false - } - d.destroyLocked() - return - } - // If d is already cached, just move it to the front of the LRU. - if d.cached { - d.fs.cachedDentries.Remove(d) - d.fs.cachedDentries.PushFront(d) - return - } - // Cache the dentry, then evict the least recently used cached dentry if - // the cache becomes over-full. - d.fs.cachedDentries.PushFront(d) - d.fs.cachedDentriesLen++ - d.cached = true - if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { - victim := d.fs.cachedDentries.Back() - d.fs.cachedDentries.Remove(victim) - d.fs.cachedDentriesLen-- - victim.cached = false - // victim.refs may have become non-zero from an earlier path - // resolution since it was inserted into fs.cachedDentries; see - // dentry.incRefLocked(). Either way, we brought - // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so - // we don't loop. - if atomic.LoadInt64(&victim.refs) == 0 { - if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { - victimParent := victimParentVFSD.Impl().(*dentry) - victimParent.dirMu.Lock() - if !victim.vfsd.IsDisowned() { - // victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount - // points. - d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) - // We're only deleting the dentry, not the file it - // represents, so we don't need to update - // victimParent.dirents etc. - } - victimParent.dirMu.Unlock() - } - victim.destroyLocked() - } - } -} - -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is -// not a child dentry. -func (d *dentry) destroyLocked() { - ctx := context.Background() - d.handleMu.Lock() - if !d.handle.file.isNil() { - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() - // Write dirty pages back to the remote filesystem. - if d.handleWritable { - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) - } - } - // Discard cached data. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - d.handle.close(ctx) - } - d.handleMu.Unlock() - d.file.close(ctx) - // Remove d from the set of all dentries. - d.fs.syncMu.Lock() - delete(d.fs.dentries, d) - d.fs.syncMu.Unlock() - // Drop the reference held by d on its parent. - if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { - parent := parentVFSD.Impl().(*dentry) - // This is parent.DecRef() without recursive locking of d.fs.renameMu. - if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { - parent.checkCachingLocked() - } else if refs < 0 { - panic("gofer.dentry.DecRef() called without holding a reference") - } - } -} - -func (d *dentry) isDeleted() bool { - return atomic.LoadUint32(&d.deleted) != 0 -} - -func (d *dentry) setDeleted() { - atomic.StoreUint32(&d.deleted, 1) -} - -func (d *dentry) listxattr(ctx context.Context) ([]string, error) { - return nil, syserror.ENOTSUP -} - -func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { - // TODO(jamieliu): add vfs.GetxattrOptions.Size - return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) -} - -func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { - return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) -} - -func (d *dentry) removexattr(ctx context.Context, name string) error { - return syserror.ENOTSUP -} - -// Preconditions: d.isRegularFile() || d.isDirectory(). -func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { - // O_TRUNC unconditionally requires us to obtain a new handle (opened with - // O_TRUNC). - if !trunc { - d.handleMu.RLock() - if (!read || d.handleReadable) && (!write || d.handleWritable) { - // The current handle is sufficient. - d.handleMu.RUnlock() - return nil - } - d.handleMu.RUnlock() - } - - haveOldFD := false - d.handleMu.Lock() - if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { - // Get a new handle. - wantReadable := d.handleReadable || read - wantWritable := d.handleWritable || write - h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) - if err != nil { - d.handleMu.Unlock() - return err - } - if !d.handle.file.isNil() { - // Check that old and new handles are compatible: If the old handle - // includes a host file descriptor but the new one does not, or - // vice versa, old and new memory mappings may be incoherent. - haveOldFD = d.handle.fd >= 0 - haveNewFD := h.fd >= 0 - if haveOldFD != haveNewFD { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) - h.close(ctx) - return syserror.EIO - } - if haveOldFD { - // We may have raced with callers of d.pf.FD() that are now - // using the old file descriptor, preventing us from safely - // closing it. We could handle this by invalidating existing - // memmap.Translations, but this is expensive. Instead, use - // dup3 to make the old file descriptor refer to the new file - // description, then close the new file descriptor (which is no - // longer needed). Racing callers may use the old or new file - // description, but this doesn't matter since they refer to the - // same file (unless d.fs.opts.overlayfsStaleRead is true, - // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) - h.close(ctx) - return err - } - syscall.Close(int(h.fd)) - h.fd = d.handle.fd - if d.fs.opts.overlayfsStaleRead { - // Replace sentry mappings of the old FD with mappings of - // the new FD, since the two are not necessarily coherent. - if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) - h.close(ctx) - return err - } - } - // Clunk the old fid before making the new handle visible (by - // unlocking d.handleMu). - d.handle.file.close(ctx) - } - } - // Switch to the new handle. - d.handle = h - d.handleReadable = wantReadable - d.handleWritable = wantWritable - } - d.handleMu.Unlock() - - if d.fs.opts.overlayfsStaleRead && haveOldFD { - // Invalidate application mappings that may be using the old FD; they - // will be replaced with mappings using the new FD after future calls - // to d.Translate(). This requires holding d.mapsMu, which precedes - // d.handleMu in the lock order. - d.mapsMu.Lock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) - d.mapsMu.Unlock() - } - - return nil -} - -// fileDescription is embedded by gofer implementations of -// vfs.FileDescriptionImpl. -type fileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl -} - -func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) -} - -func (fd *fileDescription) dentry() *dentry { - return fd.vfsfd.Dentry().Impl().(*dentry) -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - d := fd.dentry() - if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { - // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if - // available? - if err := d.updateFromGetattr(ctx); err != nil { - return linux.Statx{}, err - } - } - var stat linux.Statx - d.statTo(&stat) - return stat, nil -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) -} - -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { - return fd.dentry().listxattr(ctx) -} - -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { - return fd.dentry().getxattr(ctx, name) -} - -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, &opts) -} - -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, name) -} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go deleted file mode 100644 index cfe66f797..000000000 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/safemem" -) - -// handle represents a remote "open file descriptor", consisting of an opened -// fid (p9.File) and optionally a host file descriptor. -type handle struct { - file p9file - fd int32 // -1 if unavailable -} - -// Preconditions: read || write. -func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { - _, newfile, err := file.walk(ctx, nil) - if err != nil { - return handle{fd: -1}, err - } - var flags p9.OpenFlags - switch { - case read && !write: - flags = p9.ReadOnly - case !read && write: - flags = p9.WriteOnly - case read && write: - flags = p9.ReadWrite - } - if trunc { - flags |= p9.OpenTruncate - } - fdobj, _, _, err := newfile.open(ctx, flags) - if err != nil { - newfile.close(ctx) - return handle{fd: -1}, err - } - fd := int32(-1) - if fdobj != nil { - fd = int32(fdobj.Release()) - } - return handle{ - file: newfile, - fd: fd, - }, nil -} - -func (h *handle) close(ctx context.Context) { - h.file.close(ctx) - h.file = p9file{} - if h.fd >= 0 { - syscall.Close(int(h.fd)) - h.fd = -1 - } -} - -func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { - if dsts.IsEmpty() { - return 0, nil - } - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - n, err := hostPreadv(h.fd, dsts, int64(offset)) - ctx.UninterruptibleSleepFinish(false) - return n, err - } - if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { - n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) - return uint64(n), err - } - // Buffer the read since p9.File.ReadAt() takes []byte. - buf := make([]byte, dsts.NumBytes()) - n, err := h.file.readAt(ctx, buf, offset) - if n == 0 { - return 0, err - } - if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { - return cp, cperr - } - return uint64(n), err -} - -func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { - if srcs.IsEmpty() { - return 0, nil - } - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - n, err := hostPwritev(h.fd, srcs, int64(offset)) - ctx.UninterruptibleSleepFinish(false) - return n, err - } - if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { - n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) - return uint64(n), err - } - // Buffer the write since p9.File.WriteAt() takes []byte. - buf := make([]byte, srcs.NumBytes()) - cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) - if cp == 0 { - return 0, cperr - } - n, err := h.file.writeAt(ctx, buf[:cp], offset) - if err != nil { - return uint64(n), err - } - return cp, cperr -} - -func (h *handle) sync(ctx context.Context) error { - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - err := syscall.Fsync(int(h.fd)) - ctx.UninterruptibleSleepFinish(false) - return err - } - return h.file.fsync(ctx) -} diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go deleted file mode 100644 index 19560ab26..000000000 --- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safemem" -) - -// Preconditions: !dsts.IsEmpty(). -func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if dsts.NumBlocks() == 1 { - // Use pread() instead of preadv() to avoid iovec allocation and - // copying. - dst := dsts.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(dsts) - n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} - -// Preconditions: !srcs.IsEmpty(). -func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if srcs.NumBlocks() == 1 { - // Use pwrite() instead of pwritev() to avoid iovec allocation and - // copying. - src := srcs.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(srcs) - n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go deleted file mode 100644 index 755ac2985..000000000 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/syserror" -) - -// p9file is a wrapper around p9.File that provides methods that are -// Context-aware. -type p9file struct { - file p9.File -} - -func (f p9file) isNil() bool { - return f.file == nil -} - -func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) { - ctx.UninterruptibleSleepStart(false) - qids, newfile, err := f.file.Walk(names) - ctx.UninterruptibleSleepFinish(false) - return qids, p9file{newfile}, err -} - -func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) { - ctx.UninterruptibleSleepStart(false) - qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names) - ctx.UninterruptibleSleepFinish(false) - return qids, p9file{newfile}, attrMask, attr, err -} - -// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single -// path component and returns a single qid. -func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { - ctx.UninterruptibleSleepStart(false) - qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err - } - if len(qids) != 1 { - ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) - if newfile != nil { - p9file{newfile}.close(ctx) - } - return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO - } - return qids[0], p9file{newfile}, attrMask, attr, nil -} - -func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { - ctx.UninterruptibleSleepStart(false) - fsstat, err := f.file.StatFS() - ctx.UninterruptibleSleepFinish(false) - return fsstat, err -} - -func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - ctx.UninterruptibleSleepStart(false) - qid, attrMask, attr, err := f.file.GetAttr(req) - ctx.UninterruptibleSleepFinish(false) - return qid, attrMask, attr, err -} - -func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.SetAttr(valid, attr) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { - ctx.UninterruptibleSleepStart(false) - val, err := f.file.GetXattr(name, size) - ctx.UninterruptibleSleepFinish(false) - return val, err -} - -func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.SetXattr(name, value, flags) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.Allocate(mode, offset, length) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) close(ctx context.Context) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.Close() - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - ctx.UninterruptibleSleepStart(false) - fdobj, qid, iounit, err := f.file.Open(flags) - ctx.UninterruptibleSleepFinish(false) - return fdobj, qid, iounit, err -} - -func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { - ctx.UninterruptibleSleepStart(false) - n, err := f.file.ReadAt(p, offset) - ctx.UninterruptibleSleepFinish(false) - return n, err -} - -func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { - ctx.UninterruptibleSleepStart(false) - n, err := f.file.WriteAt(p, offset) - ctx.UninterruptibleSleepFinish(false) - return n, err -} - -func (f p9file) fsync(ctx context.Context) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.FSync() - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { - ctx.UninterruptibleSleepStart(false) - fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) - ctx.UninterruptibleSleepFinish(false) - return fdobj, p9file{newfile}, qid, iounit, err -} - -func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { - ctx.UninterruptibleSleepStart(false) - qid, err := f.file.Mkdir(name, permissions, uid, gid) - ctx.UninterruptibleSleepFinish(false) - return qid, err -} - -func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { - ctx.UninterruptibleSleepStart(false) - qid, err := f.file.Symlink(oldName, newName, uid, gid) - ctx.UninterruptibleSleepFinish(false) - return qid, err -} - -func (f p9file) link(ctx context.Context, target p9file, newName string) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.Link(target.file, newName) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { - ctx.UninterruptibleSleepStart(false) - qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) - ctx.UninterruptibleSleepFinish(false) - return qid, err -} - -func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.Rename(newDir.file, newName) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.UnlinkAt(name, flags) - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { - ctx.UninterruptibleSleepStart(false) - dirents, err := f.file.Readdir(offset, count) - ctx.UninterruptibleSleepFinish(false) - return dirents, err -} - -func (f p9file) readlink(ctx context.Context) (string, error) { - ctx.UninterruptibleSleepStart(false) - target, err := f.file.Readlink() - ctx.UninterruptibleSleepFinish(false) - return target, err -} - -func (f p9file) flush(ctx context.Context) error { - ctx.UninterruptibleSleepStart(false) - err := f.file.Flush() - ctx.UninterruptibleSleepFinish(false) - return err -} - -func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { - ctx.UninterruptibleSleepStart(false) - fdobj, err := f.file.Connect(flags) - ctx.UninterruptibleSleepFinish(false) - return fdobj, err -} diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go deleted file mode 100644 index 847cb0784..000000000 --- a/pkg/sentry/fsimpl/gofer/pagemath.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// This are equivalent to usermem.Addr.RoundDown/Up, but without the -// potentially truncating conversion to usermem.Addr. This is necessary because -// there is no way to define generic "PageRoundDown/Up" functions in Go. - -func pageRoundDown(x uint64) uint64 { - return x &^ (usermem.PageSize - 1) -} - -func pageRoundUp(x uint64) uint64 { - return pageRoundDown(x + usermem.PageSize - 1) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go deleted file mode 100644 index e95209661..000000000 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ /dev/null @@ -1,872 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "fmt" - "io" - "math" - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -func (d *dentry) isRegularFile() bool { - return d.fileType() == linux.S_IFREG -} - -type regularFileFD struct { - fileDescription - - // off is the file offset. off is protected by mu. - mu sync.Mutex - off int64 -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { -} - -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *regularFileFD) OnClose(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil - } - // Skip flushing if writes may be buffered by the client, since (as with - // the VFS1 client) we don't flush buffered writes on close anyway. - d := fd.dentry() - if d.fs.opts.interop == InteropModeExclusive { - return nil - } - d.handleMu.RLock() - defer d.handleMu.RUnlock() - return d.handle.file.flush(ctx) -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - // Check for reading at EOF before calling into MM (but not under - // InteropModeShared, which makes d.size unreliable). - d := fd.dentry() - if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { - return 0, io.EOF - } - - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Lock d.metadataMu for the rest of the read to prevent d.size from - // changing. - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - // Write dirty cached pages that will be touched by the read back to - // the remote file. - if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { - return 0, err - } - } - - rw := getDentryReadWriter(ctx, d, offset) - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Require the read to go to the remote file. - rw.direct = true - } - n, err := dst.CopyOutFrom(ctx, rw) - putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { - // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(ctx, fd.vfsfd.Mount()) - } - return n, err -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.off += n - fd.mu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - if d.fs.opts.interop != InteropModeShared { - // Compare Linux's mm/filemap.c:__generic_file_write_iter() => - // file_update_time(). This is d.touchCMtime(), but without locking - // d.metadataMu (recursively). - if now, ok := nowFromContext(ctx); ok { - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - } - } - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Write dirty cached pages that will be touched by the write back to - // the remote file. - if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err - } - // Remove touched pages from the cache. - pgstart := pageRoundDown(uint64(offset)) - pgend := pageRoundUp(uint64(offset + src.NumBytes())) - if pgend < pgstart { - return 0, syserror.EINVAL - } - mr := memmap.MappableRange{pgstart, pgend} - var freed []platform.FileRange - d.dataMu.Lock() - cseg := d.cache.LowerBoundSegment(mr.Start) - for cseg.Ok() && cseg.Start() < mr.End { - cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) - cseg = d.cache.Remove(cseg).NextSegment() - } - d.dataMu.Unlock() - // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() - // Finally free pages removed from the cache. - mf := d.fs.mfp.MemoryFile() - for _, freedFR := range freed { - mf.DecRef(freedFR) - } - } - rw := getDentryReadWriter(ctx, d, offset) - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Require the write to go to the remote file. - rw.direct = true - } - n, err := src.CopyInTo(ctx, rw) - putDentryReadWriter(rw) - if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { - // Write dirty cached pages touched by the write back to the remote - // file. - if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err - } - // Request the remote filesystem to sync the remote file. - if err := d.handle.file.fsync(ctx); err != nil { - return 0, err - } - } - return n, err -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n - fd.mu.Unlock() - return n, err -} - -type dentryReadWriter struct { - ctx context.Context - d *dentry - off uint64 - direct bool -} - -var dentryReadWriterPool = sync.Pool{ - New: func() interface{} { - return &dentryReadWriter{} - }, -} - -func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { - rw := dentryReadWriterPool.Get().(*dentryReadWriter) - rw.ctx = ctx - rw.d = d - rw.off = uint64(offset) - rw.direct = false - return rw -} - -func putDentryReadWriter(rw *dentryReadWriter) { - rw.ctx = nil - rw.d = nil - dentryReadWriterPool.Put(rw) -} - -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - if dsts.IsEmpty() { - return 0, nil - } - - // If we have a mmappable host FD (which must be used here to ensure - // coherence with memory-mapped I/O), or if InteropModeShared is in effect - // (which prevents us from caching file contents and makes dentry.size - // unreliable), or if the file was opened O_DIRECT, read directly from - // dentry.handle without locking dentry.dataMu. - rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) - rw.d.handleMu.RUnlock() - rw.off += n - return n, err - } - - // Otherwise read from/through the cache. - mf := rw.d.fs.mfp.MemoryFile() - fillCache := mf.ShouldCacheEvictable() - var dataMuUnlock func() - if fillCache { - rw.d.dataMu.Lock() - dataMuUnlock = rw.d.dataMu.Unlock - } else { - rw.d.dataMu.RLock() - dataMuUnlock = rw.d.dataMu.RUnlock - } - - // Compute the range to read (limited by file size and overflow-checked). - if rw.off >= rw.d.size { - dataMuUnlock() - rw.d.handleMu.RUnlock() - return 0, io.EOF - } - end := rw.d.size - if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { - end = rend - } - - var done uint64 - seg, gap := rw.d.cache.Find(rw.off) - for rw.off < end { - mr := memmap.MappableRange{rw.off, end} - switch { - case seg.Ok(): - // Get internal mappings from the cache. - ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) - if err != nil { - dataMuUnlock() - rw.d.handleMu.RUnlock() - return done, err - } - - // Copy from internal mappings. - n, err := safemem.CopySeq(dsts, ims) - done += n - rw.off += n - dsts = dsts.DropFirst64(n) - if err != nil { - dataMuUnlock() - rw.d.handleMu.RUnlock() - return done, err - } - - // Continue. - seg, gap = seg.NextNonEmpty() - - case gap.Ok(): - gapMR := gap.Range().Intersect(mr) - if fillCache { - // Read into the cache, then re-enter the loop to read from the - // cache. - reqMR := memmap.MappableRange{ - Start: pageRoundDown(gapMR.Start), - End: pageRoundUp(gapMR.End), - } - optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) - mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) - seg, gap = rw.d.cache.Find(rw.off) - if !seg.Ok() { - dataMuUnlock() - rw.d.handleMu.RUnlock() - return done, err - } - // err might have occurred in part of gap.Range() outside - // gapMR. Forget about it for now; if the error matters and - // persists, we'll run into it again in a later iteration of - // this loop. - } else { - // Read directly from the file. - gapDsts := dsts.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) - done += n - rw.off += n - dsts = dsts.DropFirst64(n) - // Partial reads are fine. But we must stop reading. - if n != gapDsts.NumBytes() || err != nil { - dataMuUnlock() - rw.d.handleMu.RUnlock() - return done, err - } - - // Continue. - seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} - } - } - } - dataMuUnlock() - rw.d.handleMu.RUnlock() - return done, nil -} - -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -// -// Preconditions: rw.d.metadataMu must be locked. -func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - if srcs.IsEmpty() { - return 0, nil - } - - // If we have a mmappable host FD (which must be used here to ensure - // coherence with memory-mapped I/O), or if InteropModeShared is in effect - // (which prevents us from caching file contents), or if the file was - // opened with O_DIRECT, write directly to dentry.handle without locking - // dentry.dataMu. - rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) - rw.off += n - rw.d.dataMu.Lock() - if rw.off > rw.d.size { - atomic.StoreUint64(&rw.d.size, rw.off) - // The remote file's size will implicitly be extended to the correct - // value when we write back to it. - } - rw.d.dataMu.Unlock() - rw.d.handleMu.RUnlock() - return n, err - } - - // Otherwise write to/through the cache. - mf := rw.d.fs.mfp.MemoryFile() - rw.d.dataMu.Lock() - - // Compute the range to write (overflow-checked). - start := rw.off - end := rw.off + srcs.NumBytes() - if end <= rw.off { - end = math.MaxInt64 - } - - var ( - done uint64 - retErr error - ) - seg, gap := rw.d.cache.Find(rw.off) - for rw.off < end { - mr := memmap.MappableRange{rw.off, end} - switch { - case seg.Ok(): - // Get internal mappings from the cache. - segMR := seg.Range().Intersect(mr) - ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) - if err != nil { - retErr = err - goto exitLoop - } - - // Copy to internal mappings. - n, err := safemem.CopySeq(ims, srcs) - done += n - rw.off += n - srcs = srcs.DropFirst64(n) - rw.d.dirty.MarkDirty(segMR) - if err != nil { - retErr = err - goto exitLoop - } - - // Continue. - seg, gap = seg.NextNonEmpty() - - case gap.Ok(): - // Write directly to the file. At present, we never fill the cache - // when writing, since doing so can convert small writes into - // inefficient read-modify-write cycles, and we have no mechanism - // for detecting or avoiding this. - gapMR := gap.Range().Intersect(mr) - gapSrcs := srcs.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) - done += n - rw.off += n - srcs = srcs.DropFirst64(n) - // Partial writes are fine. But we must stop writing. - if n != gapSrcs.NumBytes() || err != nil { - retErr = err - goto exitLoop - } - - // Continue. - seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} - } - } -exitLoop: - if rw.off > rw.d.size { - atomic.StoreUint64(&rw.d.size, rw.off) - // The remote file's size will implicitly be extended to the correct - // value when we write back to it. - } - // If InteropModeWritethrough is in effect, flush written data back to the - // remote filesystem. - if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { - if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ - Start: start, - End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { - // We have no idea how many bytes were actually flushed. - rw.off = start - done = 0 - retErr = err - } - } - rw.d.dataMu.Unlock() - rw.d.handleMu.RUnlock() - return done, retErr -} - -func (d *dentry) writeback(ctx context.Context, offset, size int64) error { - if size == 0 { - return nil - } - d.handleMu.RLock() - defer d.handleMu.RUnlock() - d.dataMu.Lock() - defer d.dataMu.Unlock() - // Compute the range of valid bytes (overflow-checked). - if uint64(offset) >= d.size { - return nil - } - end := int64(d.size) - if rend := offset + size; rend > offset && rend < end { - end = rend - } - return fsutil.SyncDirty(ctx, memmap.MappableRange{ - Start: uint64(offset), - End: uint64(end), - }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.mu.Lock() - defer fd.mu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as specified. - case linux.SEEK_CUR: - offset += fd.off - case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: - // Ensure file size is up to date. - d := fd.dentry() - if fd.filesystem().opts.interop == InteropModeShared { - if err := d.updateFromGetattr(ctx); err != nil { - return 0, err - } - } - size := int64(atomic.LoadUint64(&d.size)) - // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous - // block of data. - switch whence { - case linux.SEEK_END: - offset += size - case linux.SEEK_DATA: - if offset > size { - return 0, syserror.ENXIO - } - // Use offset as specified. - case linux.SEEK_HOLE: - if offset > size { - return 0, syserror.ENXIO - } - offset = size - } - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) -} - -func (d *dentry) syncSharedHandle(ctx context.Context) error { - d.handleMu.RLock() - if !d.handleWritable { - d.handleMu.RUnlock() - return nil - } - d.dataMu.Lock() - // Write dirty cached data to the remote file. - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) - d.dataMu.Unlock() - if err == nil { - // Sync the remote file. - err = d.handle.sync(ctx) - } - d.handleMu.RUnlock() - return err -} - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - d := fd.dentry() - switch d.fs.opts.interop { - case InteropModeExclusive: - // Any mapping is fine. - case InteropModeWritethrough: - // Shared writable mappings require a host FD, since otherwise we can't - // synchronously flush memory-mapped writes to the remote file. - if opts.Private || !opts.MaxPerms.Write { - break - } - fallthrough - case InteropModeShared: - // All mappings require a host FD to be coherent with other filesystem - // users. - if d.fs.opts.forcePageCache { - // Whether or not we have a host FD, we're not allowed to use it. - return syserror.ENODEV - } - d.handleMu.RLock() - haveFD := d.handle.fd >= 0 - d.handleMu.RUnlock() - if !haveFD { - return syserror.ENODEV - } - default: - panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) - } - // After this point, d may be used as a memmap.Mappable. - d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) - return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) -} - -func (d *dentry) mayCachePages() bool { - if d.fs.opts.interop == InteropModeShared { - return false - } - if d.fs.opts.forcePageCache { - return true - } - d.handleMu.RLock() - haveFD := d.handle.fd >= 0 - d.handleMu.RUnlock() - return haveFD -} - -// AddMapping implements memmap.Mappable.AddMapping. -func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { - d.mapsMu.Lock() - mapped := d.mappings.AddMapping(ms, ar, offset, writable) - // Do this unconditionally since whether we have a host FD can change - // across save/restore. - for _, r := range mapped { - d.pf.hostFileMapper.IncRefOn(r) - } - if d.mayCachePages() { - // d.Evict() will refuse to evict memory-mapped pages, so tell the - // MemoryFile to not bother trying. - mf := d.fs.mfp.MemoryFile() - for _, r := range mapped { - mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) - } - } - d.mapsMu.Unlock() - return nil -} - -// RemoveMapping implements memmap.Mappable.RemoveMapping. -func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { - d.mapsMu.Lock() - unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) - for _, r := range unmapped { - d.pf.hostFileMapper.DecRefOn(r) - } - if d.mayCachePages() { - // Pages that are no longer referenced by any application memory - // mappings are now considered unused; allow MemoryFile to evict them - // when necessary. - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() - for _, r := range unmapped { - // Since these pages are no longer mapped, they are no longer - // concurrently dirtyable by a writable memory mapping. - d.dirty.AllowClean(r) - mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) - } - d.dataMu.Unlock() - } - d.mapsMu.Unlock() -} - -// CopyMapping implements memmap.Mappable.CopyMapping. -func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { - return d.AddMapping(ctx, ms, dstAR, offset, writable) -} - -// Translate implements memmap.Mappable.Translate. -func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - d.handleMu.RLock() - if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { - d.handleMu.RUnlock() - mr := optional - if d.fs.opts.limitHostFDTranslation { - mr = maxFillRange(required, optional) - } - return []memmap.Translation{ - { - Source: mr, - File: &d.pf, - Offset: mr.Start, - Perms: usermem.AnyAccess, - }, - }, nil - } - - d.dataMu.Lock() - - // Constrain translations to d.size (rounded up) to prevent translation to - // pages that may be concurrently truncated. - pgend := pageRoundUp(d.size) - var beyondEOF bool - if required.End > pgend { - if required.Start >= pgend { - d.dataMu.Unlock() - d.handleMu.RUnlock() - return nil, &memmap.BusError{io.EOF} - } - beyondEOF = true - required.End = pgend - } - if optional.End > pgend { - optional.End = pgend - } - - mf := d.fs.mfp.MemoryFile() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) - - var ts []memmap.Translation - var translatedEnd uint64 - for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { - segMR := seg.Range().Intersect(optional) - // TODO(jamieliu): Make Translations writable even if writability is - // not required if already kept-dirty by another writable translation. - perms := usermem.AccessType{ - Read: true, - Execute: true, - } - if at.Write { - // From this point forward, this memory can be dirtied through the - // mapping at any time. - d.dirty.KeepDirty(segMR) - perms.Write = true - } - ts = append(ts, memmap.Translation{ - Source: segMR, - File: mf, - Offset: seg.FileRangeOf(segMR).Start, - Perms: perms, - }) - translatedEnd = segMR.End - } - - d.dataMu.Unlock() - d.handleMu.RUnlock() - - // Don't return the error returned by c.cache.Fill if it occurred outside - // of required. - if translatedEnd < required.End && cerr != nil { - return ts, &memmap.BusError{cerr} - } - if beyondEOF { - return ts, &memmap.BusError{io.EOF} - } - return ts, nil -} - -func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { - const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily - if required.Length() >= maxReadahead { - return required - } - if optional.Length() <= maxReadahead { - return optional - } - optional.Start = required.Start - if optional.Length() <= maxReadahead { - return optional - } - optional.End = optional.Start + maxReadahead - return optional -} - -// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is - // mapped) can change across save/restore, so invalidate all translations - // unconditionally. - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) - - // Write the cache's contents back to the remote file so that if we have a - // host fd after restore, the remote file's contents are coherent. - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() - defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - return err - } - - // Discard the cache so that it's not stored in saved state. This is safe - // because per InvalidateUnsavable invariants, no new translations can have - // been returned after we invalidated all existing translations above. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - - return nil -} - -// Evict implements pgalloc.EvictableMemoryUser.Evict. -func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.dataMu.Lock() - defer d.dataMu.Unlock() - - mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mfp.MemoryFile() - // Only allow pages that are no longer memory-mapped to be evicted. - for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { - mgapMR := mgap.Range().Intersect(mr) - if mgapMR.Length() == 0 { - continue - } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) - } - d.cache.Drop(mgapMR, mf) - d.dirty.KeepClean(mgapMR) - } -} - -// dentryPlatformFile implements platform.File. It exists solely because dentry -// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. -// -// dentryPlatformFile is only used when a host FD representing the remote file -// is available (i.e. dentry.handle.fd >= 0), and that FD is used for -// application memory mappings (i.e. !filesystem.opts.forcePageCache). -type dentryPlatformFile struct { - *dentry - - // fdRefs counts references on platform.File offsets. fdRefs is protected - // by dentry.dataMu. - fdRefs fsutil.FrameRefSet - - // If this dentry represents a regular file, and handle.fd >= 0, - // hostFileMapper caches mappings of handle.fd. - hostFileMapper fsutil.HostFileMapper - - // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. - hostFileMapperInitOnce sync.Once -} - -// IncRef implements platform.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { - d.dataMu.Lock() - seg, gap := d.fdRefs.Find(fr.Start) - for { - switch { - case seg.Ok() && seg.Start() < fr.End: - seg = d.fdRefs.Isolate(seg, fr) - seg.SetValue(seg.Value() + 1) - seg, gap = seg.NextNonEmpty() - case gap.Ok() && gap.Start() < fr.End: - newRange := gap.Range().Intersect(fr) - usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) - seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() - default: - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - return - } - } -} - -// DecRef implements platform.File.DecRef. -func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { - d.dataMu.Lock() - seg := d.fdRefs.FindSegment(fr.Start) - - for seg.Ok() && seg.Start() < fr.End { - seg = d.fdRefs.Isolate(seg, fr) - if old := seg.Value(); old == 1 { - usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) - seg = d.fdRefs.Remove(seg).NextSegment() - } else { - seg.SetValue(old - 1) - seg = seg.NextSegment() - } - } - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - -} - -// MapInternal implements platform.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - d.handleMu.RLock() - bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) - d.handleMu.RUnlock() - return bs, err -} - -// FD implements platform.File.FD. -func (d *dentryPlatformFile) FD() int { - d.handleMu.RLock() - fd := d.handle.fd - d.handleMu.RUnlock() - return int(fd) -} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go deleted file mode 100644 index 08c691c47..000000000 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// specialFileFD implements vfs.FileDescriptionImpl for files other than -// regular files, directories, and symlinks: pipes, sockets, etc. It is also -// used for regular files when filesystemOptions.specialRegularFiles is in -// effect. specialFileFD differs from regularFileFD by using per-FD handles -// instead of shared per-dentry handles, and never buffering I/O. -type specialFileFD struct { - fileDescription - - // handle is immutable. - handle handle - - // off is the file offset. off is protected by mu. (POSIX 2.9.7 only - // requires operations using the file offset to be atomic for regular files - // and symlinks; however, since specialFileFD may be used for regular - // files, we apply this atomicity unconditionally.) - mu sync.Mutex - off int64 -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *specialFileFD) Release() { - fd.handle.close(context.Background()) - fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) - fs.syncMu.Lock() - delete(fs.specialFileFDs, fd) - fs.syncMu.Unlock() -} - -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *specialFileFD) OnClose(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil - } - return fd.handle.file.flush(ctx) -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - // Going through dst.CopyOutFrom() holds MM locks around file operations of - // unknown duration. For regularFileFD, doing so is necessary to support - // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't - // hold here since specialFileFD doesn't client-cache data. Just buffer the - // read instead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, fd.vfsfd.Mount()) - } - buf := make([]byte, dst.NumBytes()) - n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) - if n == 0 { - return 0, err - } - if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { - return int64(cp), cperr - } - return int64(n), err -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.off += n - fd.mu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - // Do a buffered write. See rationale in PRead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) - } - buf := make([]byte, src.NumBytes()) - // Don't do partial writes if we get a partial read from src. - if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, err - } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) - return int64(n), err -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n - fd.mu.Unlock() - return n, err -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.mu.Lock() - defer fd.mu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as given. - case linux.SEEK_CUR: - offset += fd.off - default: - // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not - // clear that file size is even meaningful for these files. - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *specialFileFD) Sync(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil - } - return fd.handle.sync(ctx) -} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go deleted file mode 100644 index adf43be60..000000000 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -func (d *dentry) isSymlink() bool { - return d.fileType() == linux.S_IFLNK -} - -// Precondition: d.isSymlink(). -func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { - if d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, mnt) - d.dataMu.Lock() - if d.haveTarget { - target := d.target - d.dataMu.Unlock() - return target, nil - } - } - target, err := d.file.readlink(ctx) - if d.fs.opts.interop != InteropModeShared { - if err == nil { - d.haveTarget = true - d.target = target - } - d.dataMu.Unlock() - } - return target, err -} diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go deleted file mode 100644 index 7598ec6a8..000000000 --- a/pkg/sentry/fsimpl/gofer/time.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -func dentryTimestampFromP9(s, ns uint64) int64 { - return int64(s*1e9 + ns) -} - -func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { - return ts.Sec*1e9 + int64(ts.Nsec) -} - -func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { - return linux.StatxTimestamp{ - Sec: ns / 1e9, - Nsec: uint32(ns % 1e9), - } -} - -func nowFromContext(ctx context.Context) (int64, bool) { - if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { - return clock.Now().Nanoseconds(), true - } - return 0, false -} - -// Preconditions: fs.interop != InteropModeShared. -func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { - if err := mnt.CheckBeginWrite(); err != nil { - return - } - now, ok := nowFromContext(ctx) - if !ok { - mnt.EndWrite() - return - } - d.metadataMu.Lock() - atomic.StoreInt64(&d.atime, now) - d.metadataMu.Unlock() - mnt.EndWrite() -} - -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). -func (d *dentry) touchCMtime(ctx context.Context) { - now, ok := nowFromContext(ctx) - if !ok { - return - } - d.metadataMu.Lock() - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - d.metadataMu.Unlock() -} |