summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fsimpl/gofer
diff options
context:
space:
mode:
authorIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
committerIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
commitac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/sentry/fsimpl/gofer
parent352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
Merge branch 'master' into ip-forwarding
- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
Diffstat (limited to 'pkg/sentry/fsimpl/gofer')
-rw-r--r--pkg/sentry/fsimpl/gofer/BUILD90
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go306
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go1550
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go1708
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer_test.go67
-rw-r--r--pkg/sentry/fsimpl/gofer/handle.go130
-rw-r--r--pkg/sentry/fsimpl/gofer/host_named_pipe.go97
-rw-r--r--pkg/sentry/fsimpl/gofer/p9file.go233
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go944
-rw-r--r--pkg/sentry/fsimpl/gofer/socket.go146
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go292
-rw-r--r--pkg/sentry/fsimpl/gofer/symlink.go47
-rw-r--r--pkg/sentry/fsimpl/gofer/time.go82
13 files changed, 5692 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
new file mode 100644
index 000000000..16787116f
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -0,0 +1,90 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "gofer",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dentry",
+ "Linker": "*dentry",
+ },
+)
+
+go_template_instance(
+ name = "fstree",
+ out = "fstree.go",
+ package = "gofer",
+ prefix = "generic",
+ template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+ types = {
+ "Dentry": "dentry",
+ },
+)
+
+go_library(
+ name = "gofer",
+ srcs = [
+ "dentry_list.go",
+ "directory.go",
+ "filesystem.go",
+ "fstree.go",
+ "gofer.go",
+ "handle.go",
+ "host_named_pipe.go",
+ "p9file.go",
+ "regular_file.go",
+ "socket.go",
+ "special_file.go",
+ "symlink.go",
+ "time.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fd",
+ "//pkg/fdnotifier",
+ "//pkg/fspath",
+ "//pkg/log",
+ "//pkg/p9",
+ "//pkg/safemem",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/fsimpl/host",
+ "//pkg/sentry/hostfd",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/socket/control",
+ "//pkg/sentry/socket/unix",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/vfs",
+ "//pkg/syserr",
+ "//pkg/syserror",
+ "//pkg/unet",
+ "//pkg/usermem",
+ "//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "gofer_test",
+ srcs = ["gofer_test.go"],
+ library = ":gofer",
+ deps = [
+ "//pkg/p9",
+ "//pkg/sentry/contexttest",
+ "//pkg/sentry/pgalloc",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
new file mode 100644
index 000000000..2a8011eb4
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -0,0 +1,306 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isDir() bool {
+ return d.fileType() == linux.S_IFDIR
+}
+
+// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
+// d.isDir(). child must be a newly-created dentry that has never had a parent.
+func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
+ d.IncRef() // reference held by child on its parent
+ child.parent = d
+ child.name = name
+ if d.children == nil {
+ d.children = make(map[string]*dentry)
+ }
+ d.children[name] = child
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) cacheNegativeLookupLocked(name string) {
+ // Don't cache negative lookups if InteropModeShared is in effect (since
+ // this makes remote lookup unavoidable), or if d.isSynthetic() (in which
+ // case the only files in the directory are those for which a dentry exists
+ // in d.children). Instead, just delete any previously-cached dentry.
+ if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
+ delete(d.children, name)
+ return
+ }
+ if d.children == nil {
+ d.children = make(map[string]*dentry)
+ }
+ d.children[name] = nil
+}
+
+type createSyntheticOpts struct {
+ name string
+ mode linux.FileMode
+ kuid auth.KUID
+ kgid auth.KGID
+
+ // The endpoint for a synthetic socket. endpoint should be nil if the file
+ // being created is not a socket.
+ endpoint transport.BoundEndpoint
+
+ // pipe should be nil if the file being created is not a pipe.
+ pipe *pipe.VFSPipe
+}
+
+// createSyntheticChildLocked creates a synthetic file with the given name
+// in d.
+//
+// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
+// a child with the given name.
+func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
+ d2 := &dentry{
+ refs: 1, // held by d
+ fs: d.fs,
+ ino: d.fs.nextSyntheticIno(),
+ mode: uint32(opts.mode),
+ uid: uint32(opts.kuid),
+ gid: uint32(opts.kgid),
+ blockSize: usermem.PageSize, // arbitrary
+ hostFD: -1,
+ nlink: uint32(2),
+ }
+ switch opts.mode.FileType() {
+ case linux.S_IFDIR:
+ // Nothing else needs to be done.
+ case linux.S_IFSOCK:
+ d2.endpoint = opts.endpoint
+ case linux.S_IFIFO:
+ d2.pipe = opts.pipe
+ default:
+ panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
+ }
+ d2.pf.dentry = d2
+ d2.vfsd.Init(d2)
+
+ d.cacheNewChildLocked(d2, opts.name)
+ d.syntheticChildren++
+}
+
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ mu sync.Mutex
+ off int64
+ dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release(context.Context) {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ d := fd.dentry()
+ if fd.dirents == nil {
+ ds, err := d.getDirents(ctx)
+ if err != nil {
+ return err
+ }
+ fd.dirents = ds
+ }
+
+ d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+ if d.cachedMetadataAuthoritative() {
+ d.touchAtime(fd.vfsfd.Mount())
+ }
+
+ for fd.off < int64(len(fd.dirents)) {
+ if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+ return err
+ }
+ fd.off++
+ }
+ return nil
+}
+
+// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+ // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
+ // presence of concurrent mutation of an iterated directory, so
+ // implementations may duplicate or omit entries in this case, which
+ // violates POSIX semantics. Thus we read all directory entries while
+ // holding d.dirMu to exclude directory mutations. (Note that it is
+ // impossible for the client to exclude concurrent mutation from other
+ // remote filesystem users. Since there is no way to detect if the server
+ // has incorrectly omitted directory entries, we simply assume that the
+ // server is well-behaved under InteropModeShared.) This is inconsistent
+ // with Linux (which appears to assume that directory fids have the correct
+ // semantics, and translates struct file_operations::readdir calls directly
+ // to readdir RPCs), but is consistent with VFS1.
+
+ // filesystem.renameMu is needed for d.parent, and must be locked before
+ // dentry.dirMu.
+ d.fs.renameMu.RLock()
+ defer d.fs.renameMu.RUnlock()
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ if d.dirents != nil {
+ return d.dirents, nil
+ }
+
+ // It's not clear if 9P2000.L's readdir is expected to return "." and "..",
+ // so we generate them here.
+ parent := genericParentOrSelf(d)
+ dirents := []vfs.Dirent{
+ {
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: uint64(d.ino),
+ NextOff: 1,
+ },
+ {
+ Name: "..",
+ Type: uint8(atomic.LoadUint32(&parent.mode) >> 12),
+ Ino: uint64(parent.ino),
+ NextOff: 2,
+ },
+ }
+ var realChildren map[string]struct{}
+ if !d.isSynthetic() {
+ if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
+ // Record the set of children d actually has so that we don't emit
+ // duplicate entries for synthetic children.
+ realChildren = make(map[string]struct{})
+ }
+ off := uint64(0)
+ const count = 64 * 1024 // for consistency with the vfs1 client
+ d.handleMu.RLock()
+ if d.readFile.isNil() {
+ // This should not be possible because a readable handle should
+ // have been opened when the calling directoryFD was opened.
+ d.handleMu.RUnlock()
+ panic("gofer.dentry.getDirents called without a readable handle")
+ }
+ for {
+ p9ds, err := d.readFile.readdir(ctx, off, count)
+ if err != nil {
+ d.handleMu.RUnlock()
+ return nil, err
+ }
+ if len(p9ds) == 0 {
+ d.handleMu.RUnlock()
+ break
+ }
+ for _, p9d := range p9ds {
+ if p9d.Name == "." || p9d.Name == ".." {
+ continue
+ }
+ dirent := vfs.Dirent{
+ Name: p9d.Name,
+ Ino: uint64(inoFromPath(p9d.QID.Path)),
+ NextOff: int64(len(dirents) + 1),
+ }
+ // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+ // DMSOCKET.
+ switch p9d.Type {
+ case p9.TypeSymlink:
+ dirent.Type = linux.DT_LNK
+ case p9.TypeDir:
+ dirent.Type = linux.DT_DIR
+ default:
+ dirent.Type = linux.DT_REG
+ }
+ dirents = append(dirents, dirent)
+ if realChildren != nil {
+ realChildren[p9d.Name] = struct{}{}
+ }
+ }
+ off = p9ds[len(p9ds)-1].Offset
+ }
+ }
+ // Emit entries for synthetic children.
+ if d.syntheticChildren != 0 {
+ for _, child := range d.children {
+ if child == nil || !child.isSynthetic() {
+ continue
+ }
+ if _, ok := realChildren[child.name]; ok {
+ continue
+ }
+ dirents = append(dirents, vfs.Dirent{
+ Name: child.name,
+ Type: uint8(atomic.LoadUint32(&child.mode) >> 12),
+ Ino: uint64(child.ino),
+ NextOff: int64(len(dirents) + 1),
+ })
+ }
+ }
+ // Cache dirents for future directoryFDs if permitted.
+ if d.cachedMetadataAuthoritative() {
+ d.dirents = dirents
+ }
+ return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset == 0 {
+ // Ensure that the next call to fd.IterDirents() calls
+ // fd.dentry().getDirents().
+ fd.dirents = nil
+ }
+ fd.off = offset
+ return fd.off, nil
+ case linux.SEEK_CUR:
+ offset += fd.off
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ // Don't clear fd.dirents in this case, even if offset == 0.
+ fd.off = offset
+ return fd.off, nil
+ default:
+ return 0, syserror.EINVAL
+ }
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+ return fd.dentry().syncRemoteFile(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
new file mode 100644
index 000000000..a3903db33
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -0,0 +1,1550 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // Snapshot current syncable dentries and special files.
+ fs.syncMu.Lock()
+ ds := make([]*dentry, 0, len(fs.syncableDentries))
+ for d := range fs.syncableDentries {
+ d.IncRef()
+ ds = append(ds, d)
+ }
+ sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
+ for sffd := range fs.specialFileFDs {
+ sffd.vfsfd.IncRef()
+ sffds = append(sffds, sffd)
+ }
+ fs.syncMu.Unlock()
+
+ // Return the first error we encounter, but sync everything we can
+ // regardless.
+ var retErr error
+
+ // Sync regular files.
+ for _, d := range ds {
+ err := d.syncCachedFile(ctx)
+ d.DecRef(ctx)
+ if err != nil && retErr == nil {
+ retErr = err
+ }
+ }
+
+ // Sync special files, which may be writable but do not use dentry shared
+ // handles (so they won't be synced by the above).
+ for _, sffd := range sffds {
+ err := sffd.Sync(ctx)
+ sffd.vfsfd.DecRef(ctx)
+ if err != nil && retErr == nil {
+ retErr = err
+ }
+ }
+
+ return retErr
+}
+
+// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
+// encoding of strings, which uses 2 bytes for the length prefix.
+const maxFilenameLen = (1 << 16) - 1
+
+// dentrySlicePool is a pool of *[]*dentry used to store dentries for which
+// dentry.checkCachingLocked() must be called. The pool holds pointers to
+// slices because Go lacks generics, so sync.Pool operates on interface{}, so
+// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy
+// of the slice header on the heap.
+var dentrySlicePool = sync.Pool{
+ New: func() interface{} {
+ ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+ return &ds
+ },
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+ if ds == nil {
+ ds = dentrySlicePool.Get().(*[]*dentry)
+ }
+ *ds = append(*ds, d)
+ return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+ // Allow dentries to be GC'd.
+ for i := range *ds {
+ (*ds)[i] = nil
+ }
+ *ds = (*ds)[:0]
+ dentrySlicePool.Put(ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may become cached as a result of the traversal are appended
+// to *ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
+// must be up to date.
+//
+// Postconditions: The returned dentry's cached metadata is up to date.
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ // We must assume that d.parent is correct, because if d has been moved
+ // elsewhere in the remote filesystem so that its parent has changed,
+ // we have no way of determining its new parent's location in the
+ // filesystem.
+ //
+ // Call rp.CheckMount() before updating d.parent's metadata, since if
+ // we traverse to another mount then d.parent's metadata is irrelevant.
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ if d != d.parent && !d.cachedMetadataAuthoritative() {
+ if err := d.parent.updateFromGetattr(ctx); err != nil {
+ return nil, err
+ }
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if child == nil {
+ return nil, syserror.ENOENT
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx, rp.Mount())
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// getChildLocked returns a dentry representing the child of parent with the
+// given name. If no such child exists, getChildLocked returns (nil, nil).
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// parent.isDir(). name is not "." or "..".
+//
+// Postconditions: If getChildLocked returns a non-nil dentry, its cached
+// metadata is up to date.
+func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if len(name) > maxFilenameLen {
+ return nil, syserror.ENAMETOOLONG
+ }
+ child, ok := parent.children[name]
+ if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
+ // Whether child is nil or not, it is cached information that is
+ // assumed to be correct.
+ return child, nil
+ }
+ // We either don't have cached information or need to verify that it's
+ // still correct, either of which requires a remote lookup. Check if this
+ // name is valid before performing the lookup.
+ return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
+}
+
+// Preconditions: As for getChildLocked. !parent.isSynthetic().
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+ if child != nil {
+ // Need to lock child.metadataMu because we might be updating child
+ // metadata. We need to hold the lock *before* getting metadata from the
+ // server and release it after updating local metadata.
+ child.metadataMu.Lock()
+ }
+ qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+ if err != nil && err != syserror.ENOENT {
+ if child != nil {
+ child.metadataMu.Unlock()
+ }
+ return nil, err
+ }
+ if child != nil {
+ if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+ // The file at this path hasn't changed. Just update cached metadata.
+ file.close(ctx)
+ child.updateFromP9AttrsLocked(attrMask, &attr)
+ child.metadataMu.Unlock()
+ return child, nil
+ }
+ child.metadataMu.Unlock()
+ if file.isNil() && child.isSynthetic() {
+ // We have a synthetic file, and no remote file has arisen to
+ // replace it.
+ return child, nil
+ }
+ // The file at this path has changed or no longer exists. Mark the
+ // dentry invalidated, and re-evaluate its caching status (i.e. if it
+ // has 0 references, drop it). Wait to update parent.children until we
+ // know what to replace the existing dentry with (i.e. one of the
+ // returns below), to avoid a redundant map access.
+ vfsObj.InvalidateDentry(ctx, &child.vfsd)
+ if child.isSynthetic() {
+ // Normally we don't mark invalidated dentries as deleted since
+ // they may still exist (but at a different path), and also for
+ // consistency with Linux. However, synthetic files are guaranteed
+ // to become unreachable if their dentries are invalidated, so
+ // treat their invalidation as deletion.
+ child.setDeleted()
+ parent.syntheticChildren--
+ child.decRefLocked()
+ parent.dirents = nil
+ }
+ *ds = appendDentry(*ds, child)
+ }
+ if file.isNil() {
+ // No file exists at this path now. Cache the negative lookup if
+ // allowed.
+ parent.cacheNegativeLookupLocked(name)
+ return nil, nil
+ }
+ // Create a new dentry representing the file.
+ child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
+ if err != nil {
+ file.close(ctx)
+ delete(parent.children, name)
+ return nil, err
+ }
+ parent.cacheNewChildLocked(child, name)
+ // For now, child has 0 references, so our caller should call
+ // child.checkCachingLocked().
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If
+// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
+// date.
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ if !d.cachedMetadataAuthoritative() {
+ // Get updated metadata for rp.Start() as required by fs.stepLocked().
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return nil, err
+ }
+ }
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// createInRemoteDir (if the parent directory is a real remote directory) or
+// createInSyntheticDir (if the parent directory is synthetic) to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ if !start.cachedMetadataAuthoritative() {
+ // Get updated metadata for start as required by
+ // fs.walkParentDirLocked().
+ if err := start.updateFromGetattr(ctx); err != nil {
+ return err
+ }
+ }
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EEXIST
+ }
+ if len(name) > maxFilenameLen {
+ return syserror.ENAMETOOLONG
+ }
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+ if parent.isDeleted() {
+ return syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+ if parent.isSynthetic() {
+ if child := parent.children[name]; child != nil {
+ return syserror.EEXIST
+ }
+ if createInSyntheticDir == nil {
+ return syserror.EPERM
+ }
+ if err := createInSyntheticDir(parent, name); err != nil {
+ return err
+ }
+ parent.touchCMtime()
+ parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+ return nil
+ }
+ if fs.opts.interop == InteropModeShared {
+ if child := parent.children[name]; child != nil && child.isSynthetic() {
+ return syserror.EEXIST
+ }
+ // The existence of a non-synthetic dentry at name would be inconclusive
+ // because the file it represents may have been deleted from the remote
+ // filesystem, so we would need to make an RPC to revalidate the dentry.
+ // Just attempt the file creation RPC instead. If a file does exist, the
+ // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+ // stale dentry exists, the dentry will fail revalidation next time it's
+ // used.
+ if err := createInRemoteDir(parent, name); err != nil {
+ return err
+ }
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+ return nil
+ }
+ if child := parent.children[name]; child != nil {
+ return syserror.EEXIST
+ }
+ // No cached dentry exists; however, there might still be an existing file
+ // at name. As above, we attempt the file creation RPC anyway.
+ if err := createInRemoteDir(parent, name); err != nil {
+ return err
+ }
+ if child, ok := parent.children[name]; ok && child == nil {
+ // Delete the now-stale negative dentry.
+ delete(parent.children, name)
+ }
+ parent.touchCMtime()
+ parent.dirents = nil
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+ return nil
+}
+
+// Preconditions: !rp.Done().
+func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ if !start.cachedMetadataAuthoritative() {
+ // Get updated metadata for start as required by
+ // fs.walkParentDirLocked().
+ if err := start.updateFromGetattr(ctx); err != nil {
+ return err
+ }
+ }
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return err
+ }
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+
+ name := rp.Component()
+ if dir {
+ if name == "." {
+ return syserror.EINVAL
+ }
+ if name == ".." {
+ return syserror.ENOTEMPTY
+ }
+ } else {
+ if name == "." || name == ".." {
+ return syserror.EISDIR
+ }
+ }
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ parent.dirMu.Lock()
+ defer parent.dirMu.Unlock()
+
+ child, ok := parent.children[name]
+ if ok && child == nil {
+ return syserror.ENOENT
+ }
+
+ sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+ if sticky {
+ if !ok {
+ // If the sticky bit is set, we need to retrieve the child to determine
+ // whether removing it is allowed.
+ child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err != nil {
+ return err
+ }
+ } else if child != nil && !child.cachedMetadataAuthoritative() {
+ // Make sure the dentry representing the file at name is up to date
+ // before examining its metadata.
+ child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+ if err != nil {
+ return err
+ }
+ }
+ if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+ return err
+ }
+ }
+
+ // If a child dentry exists, prepare to delete it. This should fail if it is
+ // a mount point. We detect mount points by speculatively calling
+ // PrepareDeleteDentry, which fails if child is a mount point. However, we
+ // may need to revalidate the file in this case to make sure that it has not
+ // been deleted or replaced on the remote fs, in which case the mount point
+ // will have disappeared. If calling PrepareDeleteDentry fails again on the
+ // up-to-date dentry, we can be sure that it is a mount point.
+ //
+ // Also note that if child is nil, then it can't be a mount point.
+ if child != nil {
+ // Hold child.dirMu so we can check child.children and
+ // child.syntheticChildren. We don't access these fields until a bit later,
+ // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+ // create an inconsistent lock ordering between dentry.dirMu and
+ // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+ // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+ // PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock
+ // child.dirMu before calling PrepareDeleteDentry.
+ child.dirMu.Lock()
+ defer child.dirMu.Unlock()
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ // We can skip revalidation in several cases:
+ // - We are not in InteropModeShared
+ // - The parent directory is synthetic, in which case the child must also
+ // be synthetic
+ // - We already updated the child during the sticky bit check above
+ if parent.cachedMetadataAuthoritative() || sticky {
+ return err
+ }
+ child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+ if err != nil {
+ return err
+ }
+ if child != nil {
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+ return err
+ }
+ }
+ }
+ }
+ flags := uint32(0)
+ // If a dentry exists, use it for best-effort checks on its deletability.
+ if dir {
+ if child != nil {
+ // child must be an empty directory.
+ if child.syntheticChildren != 0 {
+ // This is definitely not an empty directory, irrespective of
+ // fs.opts.interop.
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTEMPTY
+ }
+ // If InteropModeShared is in effect and the first call to
+ // PrepareDeleteDentry above succeeded, then child wasn't
+ // revalidated (so we can't expect its file type to be correct) and
+ // individually revalidating its children (to confirm that they
+ // still exist) would be a waste of time.
+ if child.cachedMetadataAuthoritative() {
+ if !child.isDir() {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTDIR
+ }
+ for _, grandchild := range child.children {
+ if grandchild != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTEMPTY
+ }
+ }
+ }
+ }
+ flags = linux.AT_REMOVEDIR
+ } else {
+ // child must be a non-directory file.
+ if child != nil && child.isDir() {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.EISDIR
+ }
+ if rp.MustBeDir() {
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return syserror.ENOTDIR
+ }
+ }
+ if parent.isSynthetic() {
+ if child == nil {
+ return syserror.ENOENT
+ }
+ } else if child == nil || !child.isSynthetic() {
+ err = parent.file.unlinkAt(ctx, name, flags)
+ if err != nil {
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return err
+ }
+ }
+
+ // Generate inotify events for rmdir or unlink.
+ if dir {
+ parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+ } else {
+ var cw *vfs.Watches
+ if child != nil {
+ cw = &child.watches
+ }
+ vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
+ }
+
+ if child != nil {
+ vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+ child.setDeleted()
+ if child.isSynthetic() {
+ parent.syntheticChildren--
+ child.decRefLocked()
+ }
+ ds = appendDentry(ds, child)
+ }
+ parent.cacheNegativeLookupLocked(name)
+ if parent.cachedMetadataAuthoritative() {
+ parent.dirents = nil
+ parent.touchCMtime()
+ if dir {
+ parent.decLinks()
+ }
+ }
+ return nil
+}
+
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
+ fs.renameMu.RUnlock()
+ if *ds == nil {
+ return
+ }
+ if len(**ds) != 0 {
+ fs.renameMu.Lock()
+ for _, d := range **ds {
+ d.checkCachingLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ }
+ putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
+ if *ds == nil {
+ fs.renameMu.Unlock()
+ return
+ }
+ for _, d := range **ds {
+ d.checkCachingLocked(ctx)
+ }
+ fs.renameMu.Unlock()
+ putDentrySlice(*ds)
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return err
+ }
+ return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ start := rp.Start().Impl().(*dentry)
+ if !start.cachedMetadataAuthoritative() {
+ // Get updated metadata for start as required by
+ // fs.walkParentDirLocked().
+ if err := start.updateFromGetattr(ctx); err != nil {
+ return nil, err
+ }
+ }
+ d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+ if rp.Mount() != vd.Mount() {
+ return syserror.EXDEV
+ }
+ d := vd.Dentry().Impl().(*dentry)
+ if d.isDir() {
+ return syserror.EPERM
+ }
+ gid := auth.KGID(atomic.LoadUint32(&d.gid))
+ uid := auth.KUID(atomic.LoadUint32(&d.uid))
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
+ return err
+ }
+ if d.nlink == 0 {
+ return syserror.ENOENT
+ }
+ if d.nlink == math.MaxUint32 {
+ return syserror.EMLINK
+ }
+ if err := parent.file.link(ctx, d.file, childName); err != nil {
+ return err
+ }
+
+ // Success!
+ atomic.AddUint32(&d.nlink, 1)
+ return nil
+ }, nil)
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ creds := rp.Credentials()
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+ if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+ if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+ return err
+ }
+ ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: linux.S_IFDIR | opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ })
+ }
+ if fs.opts.interop != InteropModeShared {
+ parent.incLinks()
+ }
+ return nil
+ }, func(parent *dentry, name string) error {
+ if !opts.ForSyntheticMountpoint {
+ // Can't create non-synthetic files in synthetic directories.
+ return syserror.EPERM
+ }
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: linux.S_IFDIR | opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ })
+ parent.incLinks()
+ return nil
+ })
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ creds := rp.Credentials()
+ _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+ // If the gofer does not allow creating a socket or pipe, create a
+ // synthetic one, i.e. one that is kept entirely in memory.
+ if err == syserror.EPERM {
+ switch opts.Mode.FileType() {
+ case linux.S_IFSOCK:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ endpoint: opts.Endpoint,
+ })
+ return nil
+ case linux.S_IFIFO:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+ })
+ return nil
+ }
+ }
+ return err
+ }, nil)
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Reject O_TMPFILE, which is not supported; supporting it correctly in the
+ // presence of other remote filesystem users requires remote filesystem
+ // support, and it isn't clear that there's any way to implement this in
+ // 9P.
+ if opts.Flags&linux.O_TMPFILE != 0 {
+ return nil, syserror.EOPNOTSUPP
+ }
+ mayCreate := opts.Flags&linux.O_CREAT != 0
+ mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if !start.cachedMetadataAuthoritative() {
+ // Get updated metadata for start as required by fs.stepLocked().
+ if err := start.updateFromGetattr(ctx); err != nil {
+ return nil, err
+ }
+ }
+ if rp.Done() {
+ // Reject attempts to open mount root directory with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ if err == syserror.ENOENT && mayCreate {
+ if parent.isSynthetic() {
+ parent.dirMu.Unlock()
+ return nil, syserror.EPERM
+ }
+ fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
+ parent.dirMu.Unlock()
+ return fd, err
+ }
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ // Open existing child or follow symlink.
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx, rp.Mount())
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ if rp.MustBeDir() && !child.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+
+ trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+ if trunc {
+ // Lock metadataMu *while* we open a regular file with O_TRUNC because
+ // open(2) will change the file size on server.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ }
+
+ var vfd *vfs.FileDescription
+ var err error
+ mnt := rp.Mount()
+ switch d.fileType() {
+ case linux.S_IFREG:
+ if !d.fs.opts.regularFilesUseSpecialFileFD {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
+ return nil, err
+ }
+ fd := &regularFileFD{}
+ fd.LockFD.Init(&d.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ AllowDirectIO: true,
+ }); err != nil {
+ return nil, err
+ }
+ vfd = &fd.vfsfd
+ }
+ case linux.S_IFDIR:
+ // Can't open directories with O_CREAT.
+ if opts.Flags&linux.O_CREAT != 0 {
+ return nil, syserror.EISDIR
+ }
+ // Can't open directories writably.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ if !d.isSynthetic() {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+ return nil, err
+ }
+ }
+ fd := &directoryFD{}
+ fd.LockFD.Init(&d.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+ case linux.S_IFLNK:
+ // Can't open symlinks without O_PATH (which is unimplemented).
+ return nil, syserror.ELOOP
+ case linux.S_IFSOCK:
+ if d.isSynthetic() {
+ return nil, syserror.ENXIO
+ }
+ if d.fs.iopts.OpenSocketsByConnecting {
+ return d.connectSocketLocked(ctx, opts)
+ }
+ case linux.S_IFIFO:
+ if d.isSynthetic() {
+ return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+ }
+ }
+
+ if vfd == nil {
+ if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+ return nil, err
+ }
+ }
+
+ if trunc {
+ // If no errors occured so far then update file size in memory. This
+ // step is required even if !d.cachedMetadataAuthoritative() because
+ // d.mappings has to be updated.
+ // d.metadataMu has already been acquired if trunc == true.
+ d.updateFileSizeLocked(0)
+
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ }
+ return vfd, err
+}
+
+func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ fdObj, err := d.file.connect(ctx, p9.AnonymousSocket)
+ if err != nil {
+ return nil, err
+ }
+ fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{
+ HaveFlags: true,
+ Flags: opts.Flags,
+ })
+ if err != nil {
+ fdObj.Close()
+ return nil, err
+ }
+ fdObj.Release()
+ return fd, nil
+}
+
+func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ // We assume that the server silently inserts O_NONBLOCK in the open flags
+ // for all named pipes (because all existing gofers do this).
+ //
+ // NOTE(b/133875563): This makes named pipe opens racy, because the
+ // mechanisms for translating nonblocking to blocking opens can only detect
+ // the instantaneous presence of a peer holding the other end of the pipe
+ // open, not whether the pipe was *previously* opened by a peer that has
+ // since closed its end.
+ isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+ h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+ if err != nil {
+ if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+ // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+ // with ENXIO if opening the same named pipe with O_WRONLY would
+ // block because there are no readers of the pipe.
+ if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+ return nil, err
+ }
+ goto retry
+ }
+ return nil, err
+ }
+ if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+ if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+ h.close(ctx)
+ return nil, err
+ }
+ }
+ fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+ if err != nil {
+ h.close(ctx)
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+// !d.isSynthetic().
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ if d.isDeleted() {
+ return nil, syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer mnt.EndWrite()
+
+ // 9P2000.L's lcreate takes a fid representing the parent directory, and
+ // converts it into an open fid representing the created file, so we need
+ // to duplicate the directory fid first.
+ _, dirfile, err := d.file.walk(ctx, nil)
+ if err != nil {
+ return nil, err
+ }
+ creds := rp.Credentials()
+ name := rp.Component()
+ // We only want the access mode for creating the file.
+ createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask
+ fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+ if err != nil {
+ dirfile.close(ctx)
+ return nil, err
+ }
+ // Then we need to walk to the file we just created to get a non-open fid
+ // representing it, and to get its metadata. This must use d.file since, as
+ // explained above, dirfile was invalidated by dirfile.Create().
+ _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+ if err != nil {
+ openFile.close(ctx)
+ if fdobj != nil {
+ fdobj.Close()
+ }
+ return nil, err
+ }
+
+ // Construct the new dentry.
+ child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+ if err != nil {
+ nonOpenFile.close(ctx)
+ openFile.close(ctx)
+ if fdobj != nil {
+ fdobj.Close()
+ }
+ return nil, err
+ }
+ *ds = appendDentry(*ds, child)
+ // Incorporate the fid that was opened by lcreate.
+ useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
+ if useRegularFileFD {
+ child.handleMu.Lock()
+ if vfs.MayReadFileWithOpenFlags(opts.Flags) {
+ child.readFile = openFile
+ if fdobj != nil {
+ child.hostFD = int32(fdobj.Release())
+ }
+ } else if fdobj != nil {
+ // Can't use fdobj if it's not readable.
+ fdobj.Close()
+ }
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
+ child.writeFile = openFile
+ }
+ child.handleMu.Unlock()
+ }
+ // Insert the dentry into the tree.
+ d.cacheNewChildLocked(child, name)
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtime()
+ d.dirents = nil
+ }
+
+ // Finally, construct a file description representing the created file.
+ var childVFSFD *vfs.FileDescription
+ if useRegularFileFD {
+ fd := &regularFileFD{}
+ fd.LockFD.Init(&child.locks)
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+ AllowDirectIO: true,
+ }); err != nil {
+ return nil, err
+ }
+ childVFSFD = &fd.vfsfd
+ } else {
+ h := handle{
+ file: openFile,
+ fd: -1,
+ }
+ if fdobj != nil {
+ h.fd = int32(fdobj.Release())
+ }
+ fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
+ if err != nil {
+ h.close(ctx)
+ return nil, err
+ }
+ childVFSFD = &fd.vfsfd
+ }
+ d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
+ return childVFSFD, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ if !d.isSymlink() {
+ return "", syserror.EINVAL
+ }
+ return d.readlink(ctx, rp.Mount())
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ if opts.Flags != 0 {
+ // Requires 9P support.
+ return syserror.EINVAL
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.Lock()
+ defer fs.renameMuUnlockAndCheckCaching(ctx, &ds)
+ newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+ if err != nil {
+ return err
+ }
+ newName := rp.Component()
+ if newName == "." || newName == ".." {
+ return syserror.EBUSY
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ if !oldParent.cachedMetadataAuthoritative() {
+ if err := oldParent.updateFromGetattr(ctx); err != nil {
+ return err
+ }
+ }
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ vfsObj := rp.VirtualFilesystem()
+ // We need a dentry representing the renamed file since, if it's a
+ // directory, we need to check for write permission on it.
+ oldParent.dirMu.Lock()
+ defer oldParent.dirMu.Unlock()
+ renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
+ if err != nil {
+ return err
+ }
+ if renamed == nil {
+ return syserror.ENOENT
+ }
+ if err := oldParent.mayDelete(creds, renamed); err != nil {
+ return err
+ }
+ if renamed.isDir() {
+ if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if oldParent != newParent {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ newParent.dirMu.Lock()
+ defer newParent.dirMu.Unlock()
+ }
+ if newParent.isDeleted() {
+ return syserror.ENOENT
+ }
+ replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
+ if err != nil {
+ return err
+ }
+ var replacedVFSD *vfs.Dentry
+ if replaced != nil {
+ replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ }
+
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+ return err
+ }
+
+ // Update the remote filesystem.
+ if !renamed.isSynthetic() {
+ if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ } else if replaced != nil && !replaced.isSynthetic() {
+ // We are replacing an existing real file with a synthetic one, so we
+ // need to unlink the former.
+ flags := uint32(0)
+ if replaced.isDir() {
+ flags = linux.AT_REMOVEDIR
+ }
+ if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+
+ // Update the dentry tree.
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+ if replaced != nil {
+ replaced.setDeleted()
+ if replaced.isSynthetic() {
+ newParent.syntheticChildren--
+ replaced.decRefLocked()
+ }
+ ds = appendDentry(ds, replaced)
+ }
+ oldParent.cacheNegativeLookupLocked(oldName)
+ // We don't use newParent.cacheNewChildLocked() since we don't want to mess
+ // with reference counts and queue oldParent for checkCachingLocked if the
+ // parent isn't actually changing.
+ if oldParent != newParent {
+ ds = appendDentry(ds, oldParent)
+ newParent.IncRef()
+ if renamed.isSynthetic() {
+ oldParent.syntheticChildren--
+ newParent.syntheticChildren++
+ }
+ }
+ renamed.parent = newParent
+ renamed.name = newName
+ if newParent.children == nil {
+ newParent.children = make(map[string]*dentry)
+ }
+ newParent.children[newName] = renamed
+
+ // Update metadata.
+ if renamed.cachedMetadataAuthoritative() {
+ renamed.touchCtime()
+ }
+ if oldParent.cachedMetadataAuthoritative() {
+ oldParent.dirents = nil
+ oldParent.touchCMtime()
+ if renamed.isDir() {
+ oldParent.decLinks()
+ }
+ }
+ if newParent.cachedMetadataAuthoritative() {
+ newParent.dirents = nil
+ newParent.touchCMtime()
+ if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+ // Increase the link count if we did not replace another directory.
+ newParent.incLinks()
+ }
+ }
+ vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+ return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ return fs.unlinkAt(ctx, rp, true /* dir */)
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ // Since walking updates metadata for all traversed dentries under
+ // InteropModeShared, including the returned one, we can return cached
+ // metadata here regardless of fs.opts.interop.
+ var stat linux.Statx
+ d.statTo(&stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ // If d is synthetic, invoke statfs on the first ancestor of d that isn't.
+ for d.isSynthetic() {
+ d = d.parent
+ }
+ fsstat, err := d.file.statFS(ctx)
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ nameLen := uint64(fsstat.NameLength)
+ if nameLen > maxFilenameLen {
+ nameLen = maxFilenameLen
+ }
+ return linux.Statfs{
+ // This is primarily for distinguishing a gofer file system in
+ // tests. Testing is important, so instead of defining
+ // something completely random, use a standard value.
+ Type: linux.V9FS_MAGIC,
+ BlockSize: int64(fsstat.BlockSize),
+ Blocks: fsstat.Blocks,
+ BlocksFree: fsstat.BlocksFree,
+ BlocksAvailable: fsstat.BlocksAvailable,
+ Files: fsstat.Files,
+ FilesFree: fsstat.FilesFree,
+ NameLength: nameLen,
+ }, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ creds := rp.Credentials()
+ _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+ return err
+ }, nil)
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ return fs.unlinkAt(ctx, rp, false /* dir */)
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ if d.isSocket() {
+ if !d.isSynthetic() {
+ d.IncRef()
+ return &endpoint{
+ dentry: d,
+ file: d.file.file,
+ path: opts.Addr,
+ }, nil
+ }
+ return d.endpoint, nil
+ }
+ return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return nil, err
+ }
+ return d.listxattr(ctx, rp.Credentials(), size)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ return "", err
+ }
+ return d.getxattr(ctx, rp.Credentials(), &opts)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ d, err := fs.resolveLocked(ctx, rp, &ds)
+ if err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ return err
+ }
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
+
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+ return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
new file mode 100644
index 000000000..63e589859
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -0,0 +1,1708 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gofer provides a filesystem implementation that is backed by a 9p
+// server, interchangably referred to as "gofers" throughout this package.
+//
+// Lock order:
+// regularFileFD/directoryFD.mu
+// filesystem.renameMu
+// dentry.dirMu
+// filesystem.syncMu
+// dentry.metadataMu
+// *** "memmap.Mappable locks" below this point
+// dentry.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
+// dentry.handleMu
+// dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires that either ancestor
+// dentries are locked before descendant dentries, or that filesystem.renameMu
+// is locked for writing.
+package gofer
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/unet"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Name is the default filesystem name.
+const Name = "9p"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // mfp is used to allocate memory that caches regular file contents. mfp is
+ // immutable.
+ mfp pgalloc.MemoryFileProvider
+
+ // Immutable options.
+ opts filesystemOptions
+ iopts InternalFilesystemOptions
+
+ // client is the client used by this filesystem. client is immutable.
+ client *p9.Client
+
+ // clock is a realtime clock used to set timestamps in file operations.
+ clock ktime.Clock
+
+ // devMinor is the filesystem's minor device number. devMinor is immutable.
+ devMinor uint32
+
+ // renameMu serves two purposes:
+ //
+ // - It synchronizes path resolution with renaming initiated by this
+ // client.
+ //
+ // - It is held by path resolution to ensure that reachable dentries remain
+ // valid. A dentry is reachable by path resolution if it has a non-zero
+ // reference count (such that it is usable as vfs.ResolvingPath.Start() or
+ // is reachable from its children), or if it is a child dentry (such that
+ // it is reachable from its parent).
+ renameMu sync.RWMutex
+
+ // cachedDentries contains all dentries with 0 references. (Due to race
+ // conditions, it may also contain dentries with non-zero references.)
+ // cachedDentriesLen is the number of dentries in cachedDentries. These
+ // fields are protected by renameMu.
+ cachedDentries dentryList
+ cachedDentriesLen uint64
+
+ // syncableDentries contains all dentries in this filesystem for which
+ // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
+ // These fields are protected by syncMu.
+ syncMu sync.Mutex
+ syncableDentries map[*dentry]struct{}
+ specialFileFDs map[*specialFileFD]struct{}
+
+ // syntheticSeq stores a counter to used to generate unique inodeNumber for
+ // synthetic dentries.
+ syntheticSeq uint64
+}
+
+// inodeNumber represents inode number reported in Dirent.Ino. For regular
+// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
+// have have their inodeNumber generated sequentially, with the MSB reserved to
+// prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve MSB for synthetic mounts.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+ if path&syntheticInoMask != 0 {
+ log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+ }
+ return inodeNumber(path &^ syntheticInoMask)
+}
+
+type filesystemOptions struct {
+ // "Standard" 9P options.
+ fd int
+ aname string
+ interop InteropMode // derived from the "cache" mount option
+ dfltuid auth.KUID
+ dfltgid auth.KGID
+ msize uint32
+ version string
+
+ // maxCachedDentries is the maximum number of dentries with 0 references
+ // retained by the client.
+ maxCachedDentries uint64
+
+ // If forcePageCache is true, host FDs may not be used for application
+ // memory mappings even if available; instead, the client must perform its
+ // own caching of regular file pages. This is primarily useful for testing.
+ forcePageCache bool
+
+ // If limitHostFDTranslation is true, apply maxFillRange() constraints to
+ // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
+ // makes memory accounting behavior more consistent between cases where
+ // host FDs are / are not available, but may increase the frequency of
+ // sentry-handled page faults on files for which a host FD is available.
+ limitHostFDTranslation bool
+
+ // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
+ // filesystem may not be coherent with writable host FDs opened later, so
+ // all uses of the former must be replaced by uses of the latter. This is
+ // usually only the case when the remote filesystem is a Linux overlayfs
+ // mount. (Prior to Linux 4.18, patch series centered on commit
+ // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
+ // incoherent between pre-copy-up and post-copy-up FDs; after that patch
+ // series, only memory mappings are incoherent.)
+ overlayfsStaleRead bool
+
+ // If regularFilesUseSpecialFileFD is true, application FDs representing
+ // regular files will use distinct file handles for each FD, in the same
+ // way that application FDs representing "special files" such as sockets
+ // do. Note that this disables client caching and mmap for regular files.
+ regularFilesUseSpecialFileFD bool
+}
+
+// InteropMode controls the client's interaction with other remote filesystem
+// users.
+type InteropMode uint32
+
+const (
+ // InteropModeExclusive is appropriate when the filesystem client is the
+ // only user of the remote filesystem.
+ //
+ // - The client may cache arbitrary filesystem state (file data, metadata,
+ // filesystem structure, etc.).
+ //
+ // - Client changes to filesystem state may be sent to the remote
+ // filesystem asynchronously, except when server permission checks are
+ // necessary.
+ //
+ // - File timestamps are based on client clocks. This ensures that users of
+ // the client observe timestamps that are coherent with their own clocks
+ // and consistent with Linux's semantics (in particular, it is not always
+ // possible for clients to set arbitrary atimes and mtimes depending on the
+ // remote filesystem implementation, and never possible for clients to set
+ // arbitrary ctimes.) If a dentry containing a client-defined atime or
+ // mtime is evicted from cache, client timestamps will be sent to the
+ // remote filesystem on a best-effort basis to attempt to ensure that
+ // timestamps will be preserved when another dentry representing the same
+ // file is instantiated.
+ InteropModeExclusive InteropMode = iota
+
+ // InteropModeWritethrough is appropriate when there are read-only users of
+ // the remote filesystem that expect to observe changes made by the
+ // filesystem client.
+ //
+ // - The client may cache arbitrary filesystem state.
+ //
+ // - Client changes to filesystem state must be sent to the remote
+ // filesystem synchronously.
+ //
+ // - File timestamps are based on client clocks. As a corollary, access
+ // timestamp changes from other remote filesystem users will not be visible
+ // to the client.
+ InteropModeWritethrough
+
+ // InteropModeShared is appropriate when there are users of the remote
+ // filesystem that may mutate its state other than the client.
+ //
+ // - The client must verify ("revalidate") cached filesystem state before
+ // using it.
+ //
+ // - Client changes to filesystem state must be sent to the remote
+ // filesystem synchronously.
+ //
+ // - File timestamps are based on server clocks. This is necessary to
+ // ensure that timestamp changes are synchronized between remote filesystem
+ // users.
+ //
+ // Note that the correctness of InteropModeShared depends on the server
+ // correctly implementing 9P fids (i.e. each fid immutably represents a
+ // single filesystem object), even in the presence of remote filesystem
+ // mutations from other users. If this is violated, the behavior of the
+ // client is undefined.
+ InteropModeShared
+)
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+type InternalFilesystemOptions struct {
+ // If LeakConnection is true, do not close the connection to the server
+ // when the Filesystem is released. This is necessary for deployments in
+ // which servers can handle only a single client and report failure if that
+ // client disconnects.
+ LeakConnection bool
+
+ // If OpenSocketsByConnecting is true, silently translate attempts to open
+ // files identifying as sockets to connect RPCs.
+ OpenSocketsByConnecting bool
+}
+
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+ // uint32(-2) doesn't work in Go.
+ _V9FS_DEFUID = auth.KUID(4294967294)
+ _V9FS_DEFGID = auth.KGID(4294967294)
+)
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+ if mfp == nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
+ return nil, nil, syserror.EINVAL
+ }
+
+ mopts := vfs.GenericParseMountOptions(opts.Data)
+ var fsopts filesystemOptions
+
+ // Check that the transport is "fd".
+ trans, ok := mopts["trans"]
+ if !ok {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "trans")
+ if trans != "fd" {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Check that read and write FDs are provided and identical.
+ rfdstr, ok := mopts["rfdno"]
+ if !ok {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "rfdno")
+ rfd, err := strconv.Atoi(rfdstr)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
+ return nil, nil, syserror.EINVAL
+ }
+ wfdstr, ok := mopts["wfdno"]
+ if !ok {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
+ return nil, nil, syserror.EINVAL
+ }
+ delete(mopts, "wfdno")
+ wfd, err := strconv.Atoi(wfdstr)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
+ return nil, nil, syserror.EINVAL
+ }
+ if rfd != wfd {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.fd = rfd
+
+ // Get the attach name.
+ fsopts.aname = "/"
+ if aname, ok := mopts["aname"]; ok {
+ delete(mopts, "aname")
+ fsopts.aname = aname
+ }
+
+ // Parse the cache policy. For historical reasons, this defaults to the
+ // least generally-applicable option, InteropModeExclusive.
+ fsopts.interop = InteropModeExclusive
+ if cache, ok := mopts["cache"]; ok {
+ delete(mopts, "cache")
+ switch cache {
+ case "fscache":
+ fsopts.interop = InteropModeExclusive
+ case "fscache_writethrough":
+ fsopts.interop = InteropModeWritethrough
+ case "none":
+ fsopts.regularFilesUseSpecialFileFD = true
+ fallthrough
+ case "remote_revalidating":
+ fsopts.interop = InteropModeShared
+ default:
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
+ return nil, nil, syserror.EINVAL
+ }
+ }
+
+ // Parse the default UID and GID.
+ fsopts.dfltuid = _V9FS_DEFUID
+ if dfltuidstr, ok := mopts["dfltuid"]; ok {
+ delete(mopts, "dfltuid")
+ dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ // In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+ // in the caller's user namespace, but goferfs isn't
+ // application-mountable.
+ fsopts.dfltuid = auth.KUID(dfltuid)
+ }
+ fsopts.dfltgid = _V9FS_DEFGID
+ if dfltgidstr, ok := mopts["dfltgid"]; ok {
+ delete(mopts, "dfltgid")
+ dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.dfltgid = auth.KGID(dfltgid)
+ }
+
+ // Parse the 9P message size.
+ fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
+ if msizestr, ok := mopts["msize"]; ok {
+ delete(mopts, "msize")
+ msize, err := strconv.ParseUint(msizestr, 10, 32)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.msize = uint32(msize)
+ }
+
+ // Parse the 9P protocol version.
+ fsopts.version = p9.HighestVersionString()
+ if version, ok := mopts["version"]; ok {
+ delete(mopts, "version")
+ fsopts.version = version
+ }
+
+ // Parse the dentry cache limit.
+ fsopts.maxCachedDentries = 1000
+ if str, ok := mopts["dentry_cache_limit"]; ok {
+ delete(mopts, "dentry_cache_limit")
+ maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+ return nil, nil, syserror.EINVAL
+ }
+ fsopts.maxCachedDentries = maxCachedDentries
+ }
+
+ // Handle simple flags.
+ if _, ok := mopts["force_page_cache"]; ok {
+ delete(mopts, "force_page_cache")
+ fsopts.forcePageCache = true
+ }
+ if _, ok := mopts["limit_host_fd_translation"]; ok {
+ delete(mopts, "limit_host_fd_translation")
+ fsopts.limitHostFDTranslation = true
+ }
+ if _, ok := mopts["overlayfs_stale_read"]; ok {
+ delete(mopts, "overlayfs_stale_read")
+ fsopts.overlayfsStaleRead = true
+ }
+ // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
+ // "cache=none".
+
+ // Check for unparsed options.
+ if len(mopts) != 0 {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Handle internal options.
+ iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+ if opts.InternalData != nil && !ok {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
+ return nil, nil, syserror.EINVAL
+ }
+ // If !ok, iopts being the zero value is correct.
+
+ // Establish a connection with the server.
+ conn, err := unet.NewSocket(fsopts.fd)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ // Perform version negotiation with the server.
+ ctx.UninterruptibleSleepStart(false)
+ client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ conn.Close()
+ return nil, nil, err
+ }
+ // Ownership of conn has been transferred to client.
+
+ // Perform attach to obtain the filesystem root.
+ ctx.UninterruptibleSleepStart(false)
+ attached, err := client.Attach(fsopts.aname)
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ client.Close()
+ return nil, nil, err
+ }
+ attachFile := p9file{attached}
+ qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+ if err != nil {
+ attachFile.close(ctx)
+ client.Close()
+ return nil, nil, err
+ }
+
+ // Construct the filesystem object.
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ attachFile.close(ctx)
+ client.Close()
+ return nil, nil, err
+ }
+ fs := &filesystem{
+ mfp: mfp,
+ opts: fsopts,
+ iopts: iopts,
+ client: client,
+ clock: ktime.RealtimeClockFromContext(ctx),
+ devMinor: devMinor,
+ syncableDentries: make(map[*dentry]struct{}),
+ specialFileFDs: make(map[*specialFileFD]struct{}),
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
+ if err != nil {
+ attachFile.close(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
+ }
+ // Set the root's reference count to 2. One reference is returned to the
+ // caller, and the other is deliberately leaked to prevent the root from
+ // being "cached" and subsequently evicted. Its resources will still be
+ // cleaned up by fs.Release().
+ root.refs = 2
+
+ return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ mf := fs.mfp.MemoryFile()
+
+ fs.syncMu.Lock()
+ for d := range fs.syncableDentries {
+ d.handleMu.Lock()
+ d.dataMu.Lock()
+ if h := d.writeHandleLocked(); h.isOpen() {
+ // Write dirty cached data to the remote file.
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
+ log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
+ }
+ // TODO(jamieliu): Do we need to flushf/fsync d?
+ }
+ // Discard cached pages.
+ d.cache.DropAll(mf)
+ d.dirty.RemoveAll()
+ d.dataMu.Unlock()
+ // Close the host fd if one exists.
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
+ }
+ d.handleMu.Unlock()
+ }
+ // There can't be any specialFileFDs still using fs, since each such
+ // FileDescription would hold a reference on a Mount holding a reference on
+ // fs.
+ fs.syncMu.Unlock()
+
+ if !fs.iopts.LeakConnection {
+ // Close the connection to the server. This implicitly clunks all fids.
+ fs.client.Close()
+ }
+
+ fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+ vfsd vfs.Dentry
+
+ // refs is the reference count. Each dentry holds a reference on its
+ // parent, even if disowned. An additional reference is held on all
+ // synthetic dentries until they are unlinked or invalidated. When refs
+ // reaches 0, the dentry may be added to the cache or destroyed. If refs ==
+ // -1, the dentry has already been destroyed. refs is accessed using atomic
+ // memory operations.
+ refs int64
+
+ // fs is the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // parent is this dentry's parent directory. Each dentry holds a reference
+ // on its parent. If this dentry is a filesystem root, parent is nil.
+ // parent is protected by filesystem.renameMu.
+ parent *dentry
+
+ // name is the name of this dentry in its parent. If this dentry is a
+ // filesystem root, name is the empty string. name is protected by
+ // filesystem.renameMu.
+ name string
+
+ // file is the unopened p9.File that backs this dentry. file is immutable.
+ //
+ // If file.isNil(), this dentry represents a synthetic file, i.e. a file
+ // that does not exist on the remote filesystem. As of this writing, the
+ // only files that can be synthetic are sockets, pipes, and directories.
+ file p9file
+
+ // If deleted is non-zero, the file represented by this dentry has been
+ // deleted. deleted is accessed using atomic memory operations.
+ deleted uint32
+
+ // If cached is true, dentryEntry links dentry into
+ // filesystem.cachedDentries. cached and dentryEntry are protected by
+ // filesystem.renameMu.
+ cached bool
+ dentryEntry
+
+ dirMu sync.Mutex
+
+ // If this dentry represents a directory, children contains:
+ //
+ // - Mappings of child filenames to dentries representing those children.
+ //
+ // - Mappings of child filenames that are known not to exist to nil
+ // dentries (only if InteropModeShared is not in effect and the directory
+ // is not synthetic).
+ //
+ // children is protected by dirMu.
+ children map[string]*dentry
+
+ // If this dentry represents a directory, syntheticChildren is the number
+ // of child dentries for which dentry.isSynthetic() == true.
+ // syntheticChildren is protected by dirMu.
+ syntheticChildren int
+
+ // If this dentry represents a directory,
+ // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
+ // is a cache of all entries in the directory, in the order they were
+ // returned by the server. dirents is protected by dirMu.
+ dirents []vfs.Dirent
+
+ // Cached metadata; protected by metadataMu.
+ // To access:
+ // - In situations where consistency is not required (like stat), these
+ // can be accessed using atomic operations only (without locking).
+ // - Lock metadataMu and can access without atomic operations.
+ // To mutate:
+ // - Lock metadataMu and use atomic operations to update because we might
+ // have atomic readers that don't hold the lock.
+ metadataMu sync.Mutex
+ ino inodeNumber // immutable
+ mode uint32 // type is immutable, perms are mutable
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ blockSize uint32 // 0 if unknown
+ // Timestamps, all nsecs from the Unix epoch.
+ atime int64
+ mtime int64
+ ctime int64
+ btime int64
+ // File size, which differs from other metadata in two ways:
+ //
+ // - We make a best-effort attempt to keep it up to date even if
+ // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+ //
+ // - size is protected by both metadataMu and dataMu (i.e. both must be
+ // locked to mutate it; locking either is sufficient to access it).
+ size uint64
+ // If this dentry does not represent a synthetic file, deleted is 0, and
+ // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+ // remote file's timestamps, which should be updated when this dentry is
+ // evicted.
+ atimeDirty uint32
+ mtimeDirty uint32
+
+ // nlink counts the number of hard links to this dentry. It's updated and
+ // accessed using atomic operations. It's not protected by metadataMu like the
+ // other metadata fields.
+ nlink uint32
+
+ mapsMu sync.Mutex
+
+ // If this dentry represents a regular file, mappings tracks mappings of
+ // the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+ mappings memmap.MappingSet
+
+ // - If this dentry represents a regular file or directory, readFile is the
+ // p9.File used for reads by all regularFileFDs/directoryFDs representing
+ // this dentry.
+ //
+ // - If this dentry represents a regular file, writeFile is the p9.File
+ // used for writes by all regularFileFDs representing this dentry.
+ //
+ // - If this dentry represents a regular file, hostFD is the host FD used
+ // for memory mappings and I/O (when applicable) in preference to readFile
+ // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
+ // also be writable. If hostFD is -1, no such host FD is available.
+ //
+ // These fields are protected by handleMu.
+ //
+ // readFile and writeFile may or may not represent the same p9.File. Once
+ // either p9.File transitions from closed (isNil() == true) to open
+ // (isNil() == false), it may be mutated with handleMu locked, but cannot
+ // be closed until the dentry is destroyed.
+ handleMu sync.RWMutex
+ readFile p9file
+ writeFile p9file
+ hostFD int32
+
+ dataMu sync.RWMutex
+
+ // If this dentry represents a regular file that is client-cached, cache
+ // maps offsets into the cached file to offsets into
+ // filesystem.mfp.MemoryFile() that store the file's data. cache is
+ // protected by dataMu.
+ cache fsutil.FileRangeSet
+
+ // If this dentry represents a regular file that is client-cached, dirty
+ // tracks dirty segments in cache. dirty is protected by dataMu.
+ dirty fsutil.DirtySet
+
+ // pf implements platform.File for mappings of hostFD.
+ pf dentryPlatformFile
+
+ // If this dentry represents a symbolic link, InteropModeShared is not in
+ // effect, and haveTarget is true, target is the symlink target. haveTarget
+ // and target are protected by dataMu.
+ haveTarget bool
+ target string
+
+ // If this dentry represents a synthetic socket file, endpoint is the
+ // transport endpoint bound to this file.
+ endpoint transport.BoundEndpoint
+
+ // If this dentry represents a synthetic named pipe, pipe is the pipe
+ // endpoint bound to this file.
+ pipe *pipe.VFSPipe
+
+ locks vfs.FileLocks
+
+ // Inotify watches for this dentry.
+ watches vfs.Watches
+}
+
+// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
+// gofer client.
+func dentryAttrMask() p9.AttrMask {
+ return p9.AttrMask{
+ Mode: true,
+ UID: true,
+ GID: true,
+ ATime: true,
+ MTime: true,
+ CTime: true,
+ Size: true,
+ BTime: true,
+ }
+}
+
+// newDentry creates a new dentry representing the given file. The dentry
+// initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.checkCachingLocked() as appropriate.
+//
+// Preconditions: !file.isNil().
+func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
+ if !mask.Mode {
+ ctx.Warningf("can't create gofer.dentry without file type")
+ return nil, syserror.EIO
+ }
+ if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
+ ctx.Warningf("can't create regular file gofer.dentry without file size")
+ return nil, syserror.EIO
+ }
+
+ d := &dentry{
+ fs: fs,
+ file: file,
+ ino: inoFromPath(qid.Path),
+ mode: uint32(attr.Mode),
+ uid: uint32(fs.opts.dfltuid),
+ gid: uint32(fs.opts.dfltgid),
+ blockSize: usermem.PageSize,
+ hostFD: -1,
+ }
+ d.pf.dentry = d
+ if mask.UID {
+ d.uid = dentryUIDFromP9UID(attr.UID)
+ }
+ if mask.GID {
+ d.gid = dentryGIDFromP9GID(attr.GID)
+ }
+ if mask.Size {
+ d.size = attr.Size
+ }
+ if attr.BlockSize != 0 {
+ d.blockSize = uint32(attr.BlockSize)
+ }
+ if mask.ATime {
+ d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+ }
+ if mask.MTime {
+ d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+ }
+ if mask.CTime {
+ d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+ }
+ if mask.BTime {
+ d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+ }
+ if mask.NLink {
+ d.nlink = uint32(attr.NLink)
+ }
+ d.vfsd.Init(d)
+
+ fs.syncMu.Lock()
+ fs.syncableDentries[d] = struct{}{}
+ fs.syncMu.Unlock()
+ return d, nil
+}
+
+func (d *dentry) isSynthetic() bool {
+ return d.file.isNil()
+}
+
+func (d *dentry) cachedMetadataAuthoritative() bool {
+ return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
+}
+
+// updateFromP9Attrs is called to update d's metadata after an update from the
+// remote filesystem.
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
+ if mask.Mode {
+ if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
+ d.metadataMu.Unlock()
+ panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
+ }
+ atomic.StoreUint32(&d.mode, uint32(attr.Mode))
+ }
+ if mask.UID {
+ atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
+ }
+ if mask.GID {
+ atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
+ }
+ // There is no P9_GETATTR_* bit for I/O block size.
+ if attr.BlockSize != 0 {
+ atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
+ }
+ // Don't override newer client-defined timestamps with old server-defined
+ // ones.
+ if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
+ atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
+ }
+ if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
+ atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
+ }
+ if mask.CTime {
+ atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
+ }
+ if mask.BTime {
+ atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
+ }
+ if mask.NLink {
+ atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+ }
+ if mask.Size {
+ d.updateFileSizeLocked(attr.Size)
+ }
+}
+
+// Preconditions: !d.isSynthetic()
+func (d *dentry) updateFromGetattr(ctx context.Context) error {
+ // Use d.readFile or d.writeFile, which represent 9P fids that have been
+ // opened, in preference to d.file, which represents a 9P fid that has not.
+ // This may be significantly more efficient in some implementations. Prefer
+ // d.writeFile over d.readFile since some filesystem implementations may
+ // update a writable handle's metadata after writes to that handle, without
+ // making metadata updates immediately visible to read-only handles
+ // representing the same file.
+ var (
+ file p9file
+ handleMuRLocked bool
+ )
+ // d.metadataMu must be locked *before* we getAttr so that we do not end up
+ // updating stale attributes in d.updateFromP9AttrsLocked().
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ d.handleMu.RLock()
+ if !d.writeFile.isNil() {
+ file = d.writeFile
+ handleMuRLocked = true
+ } else if !d.readFile.isNil() {
+ file = d.readFile
+ handleMuRLocked = true
+ } else {
+ file = d.file
+ d.handleMu.RUnlock()
+ }
+ _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
+ if handleMuRLocked {
+ d.handleMu.RUnlock()
+ }
+ if err != nil {
+ return err
+ }
+ d.updateFromP9AttrsLocked(attrMask, &attr)
+ return nil
+}
+
+func (d *dentry) fileType() uint32 {
+ return atomic.LoadUint32(&d.mode) & linux.S_IFMT
+}
+
+func (d *dentry) statTo(stat *linux.Statx) {
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
+ stat.Blksize = atomic.LoadUint32(&d.blockSize)
+ stat.Nlink = atomic.LoadUint32(&d.nlink)
+ if stat.Nlink == 0 {
+ // The remote filesystem doesn't support link count; just make
+ // something up. This is consistent with Linux, where
+ // fs/inode.c:inode_init_always() initializes link count to 1, and
+ // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+ // it's not provided by the remote filesystem.
+ stat.Nlink = 1
+ }
+ stat.UID = atomic.LoadUint32(&d.uid)
+ stat.GID = atomic.LoadUint32(&d.gid)
+ stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+ stat.Ino = uint64(d.ino)
+ stat.Size = atomic.LoadUint64(&d.size)
+ // This is consistent with regularFileFD.Seek(), which treats regular files
+ // as having no holes.
+ stat.Blocks = (stat.Size + 511) / 512
+ stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
+ stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
+ stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
+ stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+ stat.DevMajor = linux.UNNAMED_MAJOR
+ stat.DevMinor = d.fs.devMinor
+}
+
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+ stat := &opts.Stat
+ if stat.Mask == 0 {
+ return nil
+ }
+ if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
+ return syserror.EPERM
+ }
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+ return err
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // Reject attempts to truncate files other than regular files, since
+ // filesystem implementations may return the wrong errno.
+ switch mode.FileType() {
+ case linux.S_IFREG:
+ // ok
+ case linux.S_IFDIR:
+ return syserror.EISDIR
+ default:
+ return syserror.EINVAL
+ }
+ }
+
+ var now int64
+ if d.cachedMetadataAuthoritative() {
+ // Truncate updates mtime.
+ if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
+ stat.Mask |= linux.STATX_MTIME
+ stat.Mtime = linux.StatxTimestamp{
+ Nsec: linux.UTIME_NOW,
+ }
+ }
+
+ // Use client clocks for timestamps.
+ now = d.fs.clock.Now().Nanoseconds()
+ if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
+ stat.Atime = statxTimestampFromDentry(now)
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
+ stat.Mtime = statxTimestampFromDentry(now)
+ }
+ }
+
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ if !d.isSynthetic() {
+ if stat.Mask != 0 {
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ Permissions: stat.Mask&linux.STATX_MODE != 0,
+ UID: stat.Mask&linux.STATX_UID != 0,
+ GID: stat.Mask&linux.STATX_GID != 0,
+ Size: stat.Mask&linux.STATX_SIZE != 0,
+ ATime: stat.Mask&linux.STATX_ATIME != 0,
+ MTime: stat.Mask&linux.STATX_MTIME != 0,
+ ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
+ MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
+ }, p9.SetAttr{
+ Permissions: p9.FileMode(stat.Mode),
+ UID: p9.UID(stat.UID),
+ GID: p9.GID(stat.GID),
+ Size: stat.Size,
+ ATimeSeconds: uint64(stat.Atime.Sec),
+ ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+ MTimeSeconds: uint64(stat.Mtime.Sec),
+ MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+ }); err != nil {
+ return err
+ }
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // d.size should be kept up to date, and privatized
+ // copy-on-write mappings of truncated pages need to be
+ // invalidated, even if InteropModeShared is in effect.
+ d.updateFileSizeLocked(stat.Size)
+ }
+ }
+ if d.fs.opts.interop == InteropModeShared {
+ // There's no point to updating d's metadata in this case since
+ // it'll be overwritten by revalidation before the next time it's
+ // used anyway. (InteropModeShared inhibits client caching of
+ // regular file data, so there's no cache to truncate either.)
+ return nil
+ }
+ }
+ if stat.Mask&linux.STATX_MODE != 0 {
+ atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
+ }
+ if stat.Mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&d.uid, stat.UID)
+ }
+ if stat.Mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&d.gid, stat.GID)
+ }
+ // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
+ // if d.cachedMetadataAuthoritative() then we converted stat.Atime and
+ // stat.Mtime to client-local timestamps above, and if
+ // !d.cachedMetadataAuthoritative() then we returned after calling
+ // d.file.setAttr(). For the same reason, now must have been initialized.
+ if stat.Mask&linux.STATX_ATIME != 0 {
+ atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+ atomic.StoreUint32(&d.atimeDirty, 0)
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 {
+ atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+ atomic.StoreUint32(&d.mtimeDirty, 0)
+ }
+ atomic.StoreInt64(&d.ctime, now)
+ return nil
+}
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateFileSizeLocked(newSize uint64) {
+ d.dataMu.Lock()
+ oldSize := d.size
+ atomic.StoreUint64(&d.size, newSize)
+ // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+ // below. This allows concurrent calls to Read/Translate/etc. These
+ // functions synchronize with truncation by refusing to use cache
+ // contents beyond the new d.size. (We are still holding d.metadataMu,
+ // so we can't race with Write or another truncate.)
+ d.dataMu.Unlock()
+ if d.size < oldSize {
+ oldpgend, _ := usermem.PageRoundUp(oldSize)
+ newpgend, _ := usermem.PageRoundUp(d.size)
+ if oldpgend != newpgend {
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+ // Compare Linux's mm/truncate.c:truncate_setsize() =>
+ // truncate_pagecache() =>
+ // mm/memory.c:unmap_mapping_range(evencows=1).
+ InvalidatePrivate: true,
+ })
+ d.mapsMu.Unlock()
+ }
+ // We are now guaranteed that there are no translations of
+ // truncated pages, and can remove them from the cache. Since
+ // truncated pages have been removed from the remote file, they
+ // should be dropped without being written back.
+ d.dataMu.Lock()
+ d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+ d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
+ d.dataMu.Unlock()
+ }
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+ return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
+func dentryUIDFromP9UID(uid p9.UID) uint32 {
+ if !uid.Ok() {
+ return uint32(auth.OverflowUID)
+ }
+ return uint32(uid)
+}
+
+func dentryGIDFromP9GID(gid p9.GID) uint32 {
+ if !gid.Ok() {
+ return uint32(auth.OverflowGID)
+ }
+ return uint32(gid)
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+ // d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+ // d.checkCachingLocked().
+ atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&d.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+ if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+ d.fs.renameMu.Lock()
+ d.checkCachingLocked(ctx)
+ d.fs.renameMu.Unlock()
+ } else if refs < 0 {
+ panic("gofer.dentry.DecRef() called without holding a reference")
+ }
+}
+
+// decRefLocked decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefLocked() {
+ if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
+ panic("gofer.dentry.decRefLocked() called without holding a reference")
+ }
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+ if d.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ d.fs.renameMu.RLock()
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
+ }
+ d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
+ d.fs.renameMu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+ if atomic.LoadInt64(&d.refs) == 0 {
+ d.fs.renameMu.Lock()
+ d.checkCachingLocked(ctx)
+ d.fs.renameMu.Unlock()
+ }
+}
+
+// checkCachingLocked should be called after d's reference count becomes 0 or it
+// becomes disowned.
+//
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
+// Preconditions: d.fs.renameMu must be locked for writing; it may be
+// temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context) {
+ // Dentries with a non-zero reference count must be retained. (The only way
+ // to obtain a reference on a dentry with zero references is via path
+ // resolution, which requires renameMu, so if d.refs is zero then it will
+ // remain zero while we hold renameMu for writing.)
+ refs := atomic.LoadInt64(&d.refs)
+ if refs > 0 {
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.cached = false
+ }
+ return
+ }
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ return
+ }
+ // Deleted and invalidated dentries with zero references are no longer
+ // reachable by path resolution and should be dropped immediately.
+ if d.vfsd.IsDead() {
+ if d.isDeleted() {
+ d.watches.HandleDeletion(ctx)
+ }
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.cached = false
+ }
+ d.destroyLocked(ctx)
+ return
+ }
+ // If d still has inotify watches and it is not deleted or invalidated, we
+ // cannot cache it and allow it to be evicted. Otherwise, we will lose its
+ // watches, even if a new dentry is created for the same file in the future.
+ // Note that the size of d.watches cannot concurrently transition from zero
+ // to non-zero, because adding a watch requires holding a reference on d.
+ if d.watches.Size() > 0 {
+ return
+ }
+ // If d is already cached, just move it to the front of the LRU.
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentries.PushFront(d)
+ return
+ }
+ // Cache the dentry, then evict the least recently used cached dentry if
+ // the cache becomes over-full.
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cachedDentriesLen++
+ d.cached = true
+ if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
+ victim := d.fs.cachedDentries.Back()
+ d.fs.cachedDentries.Remove(victim)
+ d.fs.cachedDentriesLen--
+ victim.cached = false
+ // victim.refs may have become non-zero from an earlier path resolution
+ // since it was inserted into fs.cachedDentries.
+ if atomic.LoadInt64(&victim.refs) == 0 {
+ if victim.parent != nil {
+ victim.parent.dirMu.Lock()
+ if !victim.vfsd.IsDead() {
+ // Note that victim can't be a mount point (in any mount
+ // namespace), since VFS holds references on mount points.
+ d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+ delete(victim.parent.children, victim.name)
+ // We're only deleting the dentry, not the file it
+ // represents, so we don't need to update
+ // victimParent.dirents etc.
+ }
+ victim.parent.dirMu.Unlock()
+ }
+ victim.destroyLocked(ctx)
+ }
+ // Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+ // back down to fs.opts.maxCachedDentries, so we don't loop.
+ }
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * d.refs == 0.
+// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
+// from its former parent dentry.
+func (d *dentry) destroyLocked(ctx context.Context) {
+ switch atomic.LoadInt64(&d.refs) {
+ case 0:
+ // Mark the dentry destroyed.
+ atomic.StoreInt64(&d.refs, -1)
+ case -1:
+ panic("dentry.destroyLocked() called on already destroyed dentry")
+ default:
+ panic("dentry.destroyLocked() called with references on the dentry")
+ }
+
+ // Allow the following to proceed without renameMu locked to improve
+ // scalability.
+ d.fs.renameMu.Unlock()
+
+ mf := d.fs.mfp.MemoryFile()
+ d.handleMu.Lock()
+ d.dataMu.Lock()
+ if h := d.writeHandleLocked(); h.isOpen() {
+ // Write dirty pages back to the remote filesystem.
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
+ }
+ }
+ // Discard cached data.
+ if !d.cache.IsEmpty() {
+ mf.MarkAllUnevictable(d)
+ d.cache.DropAll(mf)
+ d.dirty.RemoveAll()
+ }
+ d.dataMu.Unlock()
+ // Clunk open fids and close open host FDs.
+ if !d.readFile.isNil() {
+ d.readFile.close(ctx)
+ }
+ if !d.writeFile.isNil() && d.readFile != d.writeFile {
+ d.writeFile.close(ctx)
+ }
+ d.readFile = p9file{}
+ d.writeFile = p9file{}
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
+ }
+ d.handleMu.Unlock()
+
+ if !d.file.isNil() {
+ if !d.isDeleted() {
+ // Write dirty timestamps back to the remote filesystem.
+ atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
+ mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
+ if atimeDirty || mtimeDirty {
+ atime := atomic.LoadInt64(&d.atime)
+ mtime := atomic.LoadInt64(&d.mtime)
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ ATime: atimeDirty,
+ ATimeNotSystemTime: atimeDirty,
+ MTime: mtimeDirty,
+ MTimeNotSystemTime: mtimeDirty,
+ }, p9.SetAttr{
+ ATimeSeconds: uint64(atime / 1e9),
+ ATimeNanoSeconds: uint64(atime % 1e9),
+ MTimeSeconds: uint64(mtime / 1e9),
+ MTimeNanoSeconds: uint64(mtime % 1e9),
+ }); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
+ }
+ }
+ }
+ d.file.close(ctx)
+ d.file = p9file{}
+ // Remove d from the set of syncable dentries.
+ d.fs.syncMu.Lock()
+ delete(d.fs.syncableDentries, d)
+ d.fs.syncMu.Unlock()
+ }
+
+ d.fs.renameMu.Lock()
+
+ // Drop the reference held by d on its parent without recursively locking
+ // d.fs.renameMu.
+ if d.parent != nil {
+ if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+ d.parent.checkCachingLocked(ctx)
+ } else if refs < 0 {
+ panic("gofer.dentry.DecRef() called without holding a reference")
+ }
+ }
+}
+
+func (d *dentry) isDeleted() bool {
+ return atomic.LoadUint32(&d.deleted) != 0
+}
+
+func (d *dentry) setDeleted() {
+ atomic.StoreUint32(&d.deleted, 1)
+}
+
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+ if d.file.isNil() || !d.userXattrSupported() {
+ return nil, nil
+ }
+ xattrMap, err := d.file.listXattr(ctx, size)
+ if err != nil {
+ return nil, err
+ }
+ xattrs := make([]string, 0, len(xattrMap))
+ for x := range xattrMap {
+ if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+ xattrs = append(xattrs, x)
+ }
+ }
+ return xattrs, nil
+}
+
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+ if d.file.isNil() {
+ return "", syserror.ENODATA
+ }
+ if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+ return "", err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return "", syserror.EOPNOTSUPP
+ }
+ if !d.userXattrSupported() {
+ return "", syserror.ENODATA
+ }
+ return d.file.getXattr(ctx, opts.Name, opts.Size)
+}
+
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+ if d.file.isNil() {
+ return syserror.EPERM
+ }
+ if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ if !d.userXattrSupported() {
+ return syserror.EPERM
+ }
+ return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
+}
+
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+ if d.file.isNil() {
+ return syserror.EPERM
+ }
+ if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ if !d.userXattrSupported() {
+ return syserror.EPERM
+ }
+ return d.file.removeXattr(ctx, name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+ filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+ return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
+
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
+ // O_TRUNC unconditionally requires us to obtain a new handle (opened with
+ // O_TRUNC).
+ if !trunc {
+ d.handleMu.RLock()
+ if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+ // Current handles are sufficient.
+ d.handleMu.RUnlock()
+ return nil
+ }
+ d.handleMu.RUnlock()
+ }
+
+ fdToClose := int32(-1)
+ invalidateTranslations := false
+ d.handleMu.Lock()
+ if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+ // Get a new handle. If this file has been opened for both reading and
+ // writing, try to get a single handle that is usable for both:
+ //
+ // - Writable memory mappings of a host FD require that the host FD is
+ // opened for both reading and writing.
+ //
+ // - NOTE(b/141991141): Some filesystems may not ensure coherence
+ // between multiple handles for the same file.
+ openReadable := !d.readFile.isNil() || read
+ openWritable := !d.writeFile.isNil() || write
+ h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ if err == syserror.EACCES && (openReadable != read || openWritable != write) {
+ // It may not be possible to use a single handle for both
+ // reading and writing, since permissions on the file may have
+ // changed to e.g. disallow reading after previously being
+ // opened for reading. In this case, we have no choice but to
+ // use separate handles for reading and writing.
+ ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
+ openReadable = read
+ openWritable = write
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
+ if err != nil {
+ d.handleMu.Unlock()
+ return err
+ }
+
+ if d.hostFD < 0 && openReadable && h.fd >= 0 {
+ // We have no existing FD; use the new FD for at least reading.
+ d.hostFD = h.fd
+ } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
+ // We have an existing read-only FD, but the file has just been
+ // opened for writing, so we need to start supporting writable memory
+ // mappings. This may race with callers of d.pf.FD() using the existing
+ // FD, so in most cases we need to delay closing the old FD until after
+ // invalidating memmap.Translations that might have observed it.
+ if !openReadable || h.fd < 0 {
+ // We don't have a read/write FD, so we have no FD that can be
+ // used to create writable memory mappings. Switch to using the
+ // internal page cache.
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = -1
+ } else if d.fs.opts.overlayfsStaleRead {
+ // We do have a read/write FD, but it may not be coherent with
+ // the existing read-only FD, so we must switch to mappings of
+ // the new FD in both the application and sentry.
+ if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+ h.close(ctx)
+ return err
+ }
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = h.fd
+ } else {
+ // We do have a read/write FD. To avoid invalidating existing
+ // memmap.Translations (which is expensive), use dup3 to make
+ // the old file descriptor refer to the new file description,
+ // then close the new file descriptor (which is no longer
+ // needed). Racing callers of d.pf.FD() may use the old or new
+ // file description, but this doesn't matter since they refer
+ // to the same file, and any racing mappings must be read-only.
+ if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
+ oldHostFD := d.hostFD
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
+ h.close(ctx)
+ return err
+ }
+ fdToClose = h.fd
+ }
+ } else {
+ // h.fd is not useful.
+ fdToClose = h.fd
+ }
+
+ // Switch to new fids.
+ var oldReadFile p9file
+ if openReadable {
+ oldReadFile = d.readFile
+ d.readFile = h.file
+ }
+ var oldWriteFile p9file
+ if openWritable {
+ oldWriteFile = d.writeFile
+ d.writeFile = h.file
+ }
+ // NOTE(b/141991141): Clunk old fids before making new fids visible (by
+ // unlocking d.handleMu).
+ if !oldReadFile.isNil() {
+ oldReadFile.close(ctx)
+ }
+ if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+ oldWriteFile.close(ctx)
+ }
+ }
+ d.handleMu.Unlock()
+
+ if invalidateTranslations {
+ // Invalidate application mappings that may be using an old FD; they
+ // will be replaced with mappings using the new FD after future calls
+ // to d.Translate(). This requires holding d.mapsMu, which precedes
+ // d.handleMu in the lock order.
+ d.mapsMu.Lock()
+ d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+ d.mapsMu.Unlock()
+ }
+ if fdToClose >= 0 {
+ syscall.Close(int(fdToClose))
+ }
+
+ return nil
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) readHandleLocked() handle {
+ return handle{
+ file: d.readFile,
+ fd: d.hostFD,
+ }
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) writeHandleLocked() handle {
+ return handle{
+ file: d.writeFile,
+ fd: d.hostFD,
+ }
+}
+
+func (d *dentry) syncRemoteFile(ctx context.Context) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.syncRemoteFileLocked(ctx)
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if d.hostFD >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(d.hostFD))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ if !d.writeFile.isNil() {
+ return d.writeFile.fsync(ctx)
+ }
+ if !d.readFile.isNil() {
+ return d.readFile.fsync(ctx)
+ }
+ return nil
+}
+
+// incLinks increments link count.
+func (d *dentry) incLinks() {
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
+ }
+ atomic.AddUint32(&d.nlink, 1)
+}
+
+// decLinks decrements link count.
+func (d *dentry) decLinks() {
+ if atomic.LoadUint32(&d.nlink) == 0 {
+ // The remote filesystem doesn't support link count.
+ return
+ }
+ atomic.AddUint32(&d.nlink, ^uint32(0))
+}
+
+// fileDescription is embedded by gofer implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.LockFD
+
+ lockLogging sync.Once
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ d := fd.dentry()
+ const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+ if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+ // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
+ // available?
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return linux.Statx{}, err
+ }
+ }
+ var stat linux.Statx
+ d.statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+ return err
+ }
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+ }
+ return nil
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+ return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+ d := fd.dentry()
+ if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+ d := fd.dentry()
+ if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+ d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+ return nil
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("File lock using gofer file handled internally.")
+ })
+ return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ fd.lockLogging.Do(func() {
+ log.Infof("Range lock using gofer file handled internally.")
+ })
+ return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..bfe75dfe4
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "sync/atomic"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+ ctx := contexttest.Context(t)
+ fs := filesystem{
+ mfp: pgalloc.MemoryFileProviderFromContext(ctx),
+ syncableDentries: make(map[*dentry]struct{}),
+ opts: filesystemOptions{
+ // Test relies on no dentry being held in the cache.
+ maxCachedDentries: 0,
+ },
+ }
+
+ attr := &p9.Attr{
+ Mode: p9.ModeRegular,
+ }
+ mask := p9.AttrMask{
+ Mode: true,
+ Size: true,
+ }
+ parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+ if err != nil {
+ t.Fatalf("fs.newDentry(): %v", err)
+ }
+
+ child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+ if err != nil {
+ t.Fatalf("fs.newDentry(): %v", err)
+ }
+ parent.cacheNewChildLocked(child, "child")
+
+ fs.renameMu.Lock()
+ defer fs.renameMu.Unlock()
+ child.checkCachingLocked(ctx)
+ if got := atomic.LoadInt64(&child.refs); got != -1 {
+ t.Fatalf("child.refs=%d, want: -1", got)
+ }
+ // Parent will also be destroyed when child reference is removed.
+ if got := atomic.LoadInt64(&parent.refs); got != -1 {
+ t.Fatalf("parent.refs=%d, want: -1", got)
+ }
+ child.checkCachingLocked(ctx)
+ child.checkCachingLocked(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
new file mode 100644
index 000000000..104157512
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/hostfd"
+)
+
+// handle represents a remote "open file descriptor", consisting of an opened
+// fid (p9.File) and optionally a host file descriptor.
+type handle struct {
+ file p9file
+ fd int32 // -1 if unavailable
+}
+
+// Preconditions: read || write.
+func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) {
+ _, newfile, err := file.walk(ctx, nil)
+ if err != nil {
+ return handle{fd: -1}, err
+ }
+ var flags p9.OpenFlags
+ switch {
+ case read && !write:
+ flags = p9.ReadOnly
+ case !read && write:
+ flags = p9.WriteOnly
+ case read && write:
+ flags = p9.ReadWrite
+ }
+ if trunc {
+ flags |= p9.OpenTruncate
+ }
+ fdobj, _, _, err := newfile.open(ctx, flags)
+ if err != nil {
+ newfile.close(ctx)
+ return handle{fd: -1}, err
+ }
+ fd := int32(-1)
+ if fdobj != nil {
+ fd = int32(fdobj.Release())
+ }
+ return handle{
+ file: newfile,
+ fd: fd,
+ }, nil
+}
+
+func (h *handle) isOpen() bool {
+ return !h.file.isNil()
+}
+
+func (h *handle) close(ctx context.Context) {
+ h.file.close(ctx)
+ h.file = p9file{}
+ if h.fd >= 0 {
+ syscall.Close(int(h.fd))
+ h.fd = -1
+ }
+}
+
+func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+ if dsts.IsEmpty() {
+ return 0, nil
+ }
+ if h.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */)
+ ctx.UninterruptibleSleepFinish(false)
+ return n, err
+ }
+ if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() {
+ n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
+ return uint64(n), err
+ }
+ // Buffer the read since p9.File.ReadAt() takes []byte.
+ buf := make([]byte, dsts.NumBytes())
+ n, err := h.file.readAt(ctx, buf, offset)
+ if n == 0 {
+ return 0, err
+ }
+ if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
+ return cp, cperr
+ }
+ return uint64(n), err
+}
+
+func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+ if srcs.IsEmpty() {
+ return 0, nil
+ }
+ if h.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */)
+ ctx.UninterruptibleSleepFinish(false)
+ return n, err
+ }
+ if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() {
+ n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
+ return uint64(n), err
+ }
+ // Buffer the write since p9.File.WriteAt() takes []byte.
+ buf := make([]byte, srcs.NumBytes())
+ cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs)
+ if cp == 0 {
+ return 0, cperr
+ }
+ n, err := h.file.writeAt(ctx, buf[:cp], offset)
+ if err != nil {
+ return uint64(n), err
+ }
+ return cp, cperr
+}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "fmt"
+ "sync"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+ tempPipeMu sync.Mutex
+ tempPipeReadFD int
+ tempPipeWriteFD int
+ tempPipeBuf [1]byte
+)
+
+func init() {
+ var pipeFDs [2]int
+ if err := unix.Pipe(pipeFDs[:]); err != nil {
+ panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+ }
+ tempPipeReadFD = pipeFDs[0]
+ tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+ for {
+ ok, err := nonblockingPipeHasWriter(fd)
+ if err != nil {
+ return err
+ }
+ if ok {
+ return nil
+ }
+ if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+ return err
+ }
+ }
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+ tempPipeMu.Lock()
+ defer tempPipeMu.Unlock()
+ // Copy 1 byte from fd into the temporary pipe.
+ n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+ if err == syserror.EAGAIN {
+ // The pipe represented by fd is empty, but has a writer.
+ return true, nil
+ }
+ if err != nil {
+ return false, err
+ }
+ if n == 0 {
+ // The pipe represented by fd is empty and has no writer.
+ return false, nil
+ }
+ // The pipe represented by fd is non-empty, so it either has, or has
+ // previously had, a writer. Remove the byte copied to the temporary pipe
+ // before returning.
+ if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+ panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+ }
+ return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+ t := time.NewTimer(100 * time.Millisecond)
+ defer t.Stop()
+ cancel := ctx.SleepStart()
+ select {
+ case <-t.C:
+ ctx.SleepFinish(true)
+ return nil
+ case <-cancel:
+ ctx.SleepFinish(false)
+ return syserror.ErrInterrupted
+ }
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
new file mode 100644
index 000000000..87f0b877f
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -0,0 +1,233 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// p9file is a wrapper around p9.File that provides methods that are
+// Context-aware.
+type p9file struct {
+ file p9.File
+}
+
+func (f p9file) isNil() bool {
+ return f.file == nil
+}
+
+func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qids, newfile, err := f.file.Walk(names)
+ ctx.UninterruptibleSleepFinish(false)
+ return qids, p9file{newfile}, err
+}
+
+func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names)
+ ctx.UninterruptibleSleepFinish(false)
+ return qids, p9file{newfile}, attrMask, attr, err
+}
+
+// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single
+// path component and returns a single qid.
+func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name})
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err
+ }
+ if len(qids) != 1 {
+ ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids)
+ if newfile != nil {
+ p9file{newfile}.close(ctx)
+ }
+ return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+ }
+ return qids[0], p9file{newfile}, attrMask, attr, nil
+}
+
+func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) {
+ ctx.UninterruptibleSleepStart(false)
+ fsstat, err := f.file.StatFS()
+ ctx.UninterruptibleSleepFinish(false)
+ return fsstat, err
+}
+
+func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qid, attrMask, attr, err := f.file.GetAttr(req)
+ ctx.UninterruptibleSleepFinish(false)
+ return qid, attrMask, attr, err
+}
+
+func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.SetAttr(valid, attr)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+ ctx.UninterruptibleSleepStart(false)
+ xattrs, err := f.file.ListXattr(size)
+ ctx.UninterruptibleSleepFinish(false)
+ return xattrs, err
+}
+
+func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+ ctx.UninterruptibleSleepStart(false)
+ val, err := f.file.GetXattr(name, size)
+ ctx.UninterruptibleSleepFinish(false)
+ return val, err
+}
+
+func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.SetXattr(name, value, flags)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.RemoveXattr(name)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.Allocate(mode, offset, length)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) close(ctx context.Context) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.Close()
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+ ctx.UninterruptibleSleepStart(false)
+ fdobj, qid, iounit, err := f.file.Open(flags)
+ ctx.UninterruptibleSleepFinish(false)
+ return fdobj, qid, iounit, err
+}
+
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+ ctx.UninterruptibleSleepStart(false)
+ n, err := f.file.ReadAt(p, offset)
+ ctx.UninterruptibleSleepFinish(false)
+ return n, err
+}
+
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+ ctx.UninterruptibleSleepStart(false)
+ n, err := f.file.WriteAt(p, offset)
+ ctx.UninterruptibleSleepFinish(false)
+ return n, err
+}
+
+func (f p9file) fsync(ctx context.Context) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.FSync()
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) {
+ ctx.UninterruptibleSleepStart(false)
+ fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid)
+ ctx.UninterruptibleSleepFinish(false)
+ return fdobj, p9file{newfile}, qid, iounit, err
+}
+
+func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qid, err := f.file.Mkdir(name, permissions, uid, gid)
+ ctx.UninterruptibleSleepFinish(false)
+ return qid, err
+}
+
+func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qid, err := f.file.Symlink(oldName, newName, uid, gid)
+ ctx.UninterruptibleSleepFinish(false)
+ return qid, err
+}
+
+func (f p9file) link(ctx context.Context, target p9file, newName string) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.Link(target.file, newName)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ ctx.UninterruptibleSleepStart(false)
+ qid, err := f.file.Mknod(name, mode, major, minor, uid, gid)
+ ctx.UninterruptibleSleepFinish(false)
+ return qid, err
+}
+
+func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.Rename(newDir.file, newName)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.UnlinkAt(name, flags)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) {
+ ctx.UninterruptibleSleepStart(false)
+ dirents, err := f.file.Readdir(offset, count)
+ ctx.UninterruptibleSleepFinish(false)
+ return dirents, err
+}
+
+func (f p9file) readlink(ctx context.Context) (string, error) {
+ ctx.UninterruptibleSleepStart(false)
+ target, err := f.file.Readlink()
+ ctx.UninterruptibleSleepFinish(false)
+ return target, err
+}
+
+func (f p9file) flush(ctx context.Context) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.Flush()
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
+func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) {
+ ctx.UninterruptibleSleepStart(false)
+ fdobj, err := f.file.Connect(flags)
+ ctx.UninterruptibleSleepFinish(false)
+ return fdobj, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
new file mode 100644
index 000000000..7e1cbf065
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -0,0 +1,944 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isRegularFile() bool {
+ return d.fileType() == linux.S_IFREG
+}
+
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset. off is protected by mu.
+ mu sync.Mutex
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(context.Context) {
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+ if !fd.vfsfd.IsWritable() {
+ return nil
+ }
+ // Skip flushing if writes may be buffered by the client, since (as with
+ // the VFS1 client) we don't flush buffered writes on close anyway.
+ d := fd.dentry()
+ if d.fs.opts.interop == InteropModeExclusive {
+ return nil
+ }
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ if d.writeFile.isNil() {
+ return nil
+ }
+ return d.writeFile.flush(ctx)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ d := fd.dentry()
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a noop.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ d.handleMu.RLock()
+ err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ d.handleMu.RUnlock()
+ if err != nil {
+ return err
+ }
+ d.dataMu.Lock()
+ atomic.StoreUint64(&d.size, size)
+ d.dataMu.Unlock()
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ // Check for reading at EOF before calling into MM (but not under
+ // InteropModeShared, which makes d.size unreliable).
+ d := fd.dentry()
+ if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
+ return 0, io.EOF
+ }
+
+ if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ // Lock d.metadataMu for the rest of the read to prevent d.size from
+ // changing.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ // Write dirty cached pages that will be touched by the read back to
+ // the remote file.
+ if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
+ return 0, err
+ }
+ }
+
+ rw := getDentryReadWriter(ctx, d, offset)
+ if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ // Require the read to go to the remote file.
+ rw.direct = true
+ }
+ n, err := dst.CopyOutFrom(ctx, rw)
+ putDentryReadWriter(rw)
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ d.touchAtime(fd.vfsfd.Mount())
+ }
+ return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+ if offset < 0 {
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ d := fd.dentry()
+ // If the fd was opened with O_APPEND, make sure the file size is updated.
+ // There is a possible race here if size is modified externally after
+ // metadata cache is updated.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the fd was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
+ }
+ limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+ if err != nil {
+ return 0, offset, err
+ }
+ src = src.TakeFirst64(limit)
+
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
+ // file_update_time(). This is d.touchCMtime(), but without locking
+ // d.metadataMu (recursively).
+ d.touchCMtimeLocked()
+ }
+
+ rw := getDentryReadWriter(ctx, d, offset)
+ defer putDentryReadWriter(rw)
+
+ if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ if err := fd.writeCache(ctx, d, offset, src); err != nil {
+ return 0, offset, err
+ }
+
+ // Require the write to go to the remote file.
+ rw.direct = true
+ }
+
+ n, err := src.CopyInTo(ctx, rw)
+ if err != nil {
+ return n, offset + n, err
+ }
+ if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+ // Note that if any of the following fail, then we can't guarantee that
+ // any data was actually written with the semantics of O_DSYNC or
+ // O_SYNC, so we return zero bytes written. Compare Linux's
+ // mm/filemap.c:generic_file_write_iter() =>
+ // include/linux/fs.h:generic_write_sync().
+ //
+ // Write dirty cached pages touched by the write back to the remote
+ // file.
+ if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+ return 0, offset, err
+ }
+ // Request the remote filesystem to sync the remote file.
+ if err := d.syncRemoteFile(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+ return n, offset + n, nil
+}
+
+func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
+ // Write dirty cached pages that will be touched by the write back to
+ // the remote file.
+ if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+ return err
+ }
+
+ // Remove touched pages from the cache.
+ pgstart := usermem.PageRoundDown(uint64(offset))
+ pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+ if !ok {
+ return syserror.EINVAL
+ }
+ mr := memmap.MappableRange{pgstart, pgend}
+ var freed []memmap.FileRange
+
+ d.dataMu.Lock()
+ cseg := d.cache.LowerBoundSegment(mr.Start)
+ for cseg.Ok() && cseg.Start() < mr.End {
+ cseg = d.cache.Isolate(cseg, mr)
+ freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+ cseg = d.cache.Remove(cseg).NextSegment()
+ }
+ d.dataMu.Unlock()
+
+ // Invalidate mappings of removed pages.
+ d.mapsMu.Lock()
+ d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+ d.mapsMu.Unlock()
+
+ // Finally free pages removed from the cache.
+ mf := d.fs.mfp.MemoryFile()
+ for _, freedFR := range freed {
+ mf.DecRef(freedFR)
+ }
+ return nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
+ fd.mu.Unlock()
+ return n, err
+}
+
+type dentryReadWriter struct {
+ ctx context.Context
+ d *dentry
+ off uint64
+ direct bool
+}
+
+var dentryReadWriterPool = sync.Pool{
+ New: func() interface{} {
+ return &dentryReadWriter{}
+ },
+}
+
+func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
+ rw := dentryReadWriterPool.Get().(*dentryReadWriter)
+ rw.ctx = ctx
+ rw.d = d
+ rw.off = uint64(offset)
+ rw.direct = false
+ return rw
+}
+
+func putDentryReadWriter(rw *dentryReadWriter) {
+ rw.ctx = nil
+ rw.d = nil
+ dentryReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ if dsts.IsEmpty() {
+ return 0, nil
+ }
+
+ // If we have a mmappable host FD (which must be used here to ensure
+ // coherence with memory-mapped I/O), or if InteropModeShared is in effect
+ // (which prevents us from caching file contents and makes dentry.size
+ // unreliable), or if the file was opened O_DIRECT, read directly from
+ // dentry.readHandleLocked() without locking dentry.dataMu.
+ rw.d.handleMu.RLock()
+ h := rw.d.readHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
+ rw.d.handleMu.RUnlock()
+ rw.off += n
+ return n, err
+ }
+
+ // Otherwise read from/through the cache.
+ mf := rw.d.fs.mfp.MemoryFile()
+ fillCache := mf.ShouldCacheEvictable()
+ var dataMuUnlock func()
+ if fillCache {
+ rw.d.dataMu.Lock()
+ dataMuUnlock = rw.d.dataMu.Unlock
+ } else {
+ rw.d.dataMu.RLock()
+ dataMuUnlock = rw.d.dataMu.RUnlock
+ }
+
+ // Compute the range to read (limited by file size and overflow-checked).
+ if rw.off >= rw.d.size {
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return 0, io.EOF
+ }
+ end := rw.d.size
+ if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+ end = rend
+ }
+
+ var done uint64
+ seg, gap := rw.d.cache.Find(rw.off)
+ for rw.off < end {
+ mr := memmap.MappableRange{rw.off, end}
+ switch {
+ case seg.Ok():
+ // Get internal mappings from the cache.
+ ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+ if err != nil {
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return done, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, ims)
+ done += n
+ rw.off += n
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ gapMR := gap.Range().Intersect(mr)
+ if fillCache {
+ // Read into the cache, then re-enter the loop to read from the
+ // cache.
+ gapEnd, _ := usermem.PageRoundUp(gapMR.End)
+ reqMR := memmap.MappableRange{
+ Start: usermem.PageRoundDown(gapMR.Start),
+ End: gapEnd,
+ }
+ optMR := gap.Range()
+ err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
+ mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
+ seg, gap = rw.d.cache.Find(rw.off)
+ if !seg.Ok() {
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return done, err
+ }
+ // err might have occurred in part of gap.Range() outside
+ // gapMR. Forget about it for now; if the error matters and
+ // persists, we'll run into it again in a later iteration of
+ // this loop.
+ } else {
+ // Read directly from the file.
+ gapDsts := dsts.TakeFirst64(gapMR.Length())
+ n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+ done += n
+ rw.off += n
+ dsts = dsts.DropFirst64(n)
+ // Partial reads are fine. But we must stop reading.
+ if n != gapDsts.NumBytes() || err != nil {
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+ }
+ }
+ }
+ dataMuUnlock()
+ rw.d.handleMu.RUnlock()
+ return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: rw.d.metadataMu must be locked.
+func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ if srcs.IsEmpty() {
+ return 0, nil
+ }
+
+ // If we have a mmappable host FD (which must be used here to ensure
+ // coherence with memory-mapped I/O), or if InteropModeShared is in effect
+ // (which prevents us from caching file contents), or if the file was
+ // opened with O_DIRECT, write directly to dentry.writeHandleLocked()
+ // without locking dentry.dataMu.
+ rw.d.handleMu.RLock()
+ h := rw.d.writeHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ rw.off += n
+ rw.d.dataMu.Lock()
+ if rw.off > rw.d.size {
+ atomic.StoreUint64(&rw.d.size, rw.off)
+ // The remote file's size will implicitly be extended to the correct
+ // value when we write back to it.
+ }
+ rw.d.dataMu.Unlock()
+ rw.d.handleMu.RUnlock()
+ return n, err
+ }
+
+ // Otherwise write to/through the cache.
+ mf := rw.d.fs.mfp.MemoryFile()
+ rw.d.dataMu.Lock()
+
+ // Compute the range to write (overflow-checked).
+ start := rw.off
+ end := rw.off + srcs.NumBytes()
+ if end <= rw.off {
+ end = math.MaxInt64
+ }
+
+ var (
+ done uint64
+ retErr error
+ )
+ seg, gap := rw.d.cache.Find(rw.off)
+ for rw.off < end {
+ mr := memmap.MappableRange{rw.off, end}
+ switch {
+ case seg.Ok():
+ // Get internal mappings from the cache.
+ segMR := seg.Range().Intersect(mr)
+ ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Copy to internal mappings.
+ n, err := safemem.CopySeq(ims, srcs)
+ done += n
+ rw.off += n
+ srcs = srcs.DropFirst64(n)
+ rw.d.dirty.MarkDirty(segMR)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Write directly to the file. At present, we never fill the cache
+ // when writing, since doing so can convert small writes into
+ // inefficient read-modify-write cycles, and we have no mechanism
+ // for detecting or avoiding this.
+ gapMR := gap.Range().Intersect(mr)
+ gapSrcs := srcs.TakeFirst64(gapMR.Length())
+ n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+ done += n
+ rw.off += n
+ srcs = srcs.DropFirst64(n)
+ // Partial writes are fine. But we must stop writing.
+ if n != gapSrcs.NumBytes() || err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+ }
+ }
+exitLoop:
+ if rw.off > rw.d.size {
+ atomic.StoreUint64(&rw.d.size, rw.off)
+ // The remote file's size will implicitly be extended to the correct
+ // value when we write back to it.
+ }
+ // If InteropModeWritethrough is in effect, flush written data back to the
+ // remote filesystem.
+ if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
+ if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
+ Start: start,
+ End: rw.off,
+ }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
+ // We have no idea how many bytes were actually flushed.
+ rw.off = start
+ done = 0
+ retErr = err
+ }
+ }
+ rw.d.dataMu.Unlock()
+ rw.d.handleMu.RUnlock()
+ return done, retErr
+}
+
+func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
+ if size == 0 {
+ return nil
+ }
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ // Compute the range of valid bytes (overflow-checked).
+ if uint64(offset) >= d.size {
+ return nil
+ }
+ end := int64(d.size)
+ if rend := offset + size; rend > offset && rend < end {
+ end = rend
+ }
+ return fsutil.SyncDirty(ctx, memmap.MappableRange{
+ Start: uint64(offset),
+ End: uint64(end),
+ }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
+ }
+ fd.off = newOffset
+ return newOffset, nil
+}
+
+// Calculate the new offset for a seek operation on a regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as specified.
+ case linux.SEEK_CUR:
+ offset += fdOffset
+ case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
+ // Ensure file size is up to date.
+ if !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, err
+ }
+ }
+ size := int64(atomic.LoadUint64(&d.size))
+ // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
+ // block of data.
+ switch whence {
+ case linux.SEEK_END:
+ offset += size
+ case linux.SEEK_DATA:
+ if offset > size {
+ return 0, syserror.ENXIO
+ }
+ // Use offset as specified.
+ case linux.SEEK_HOLE:
+ if offset > size {
+ return 0, syserror.ENXIO
+ }
+ offset = size
+ }
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ return fd.dentry().syncCachedFile(ctx)
+}
+
+func (d *dentry) syncCachedFile(ctx context.Context) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+
+ if h := d.writeHandleLocked(); h.isOpen() {
+ d.dataMu.Lock()
+ // Write dirty cached data to the remote file.
+ err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+ d.dataMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ return d.syncRemoteFileLocked(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ d := fd.dentry()
+ switch d.fs.opts.interop {
+ case InteropModeExclusive:
+ // Any mapping is fine.
+ case InteropModeWritethrough:
+ // Shared writable mappings require a host FD, since otherwise we can't
+ // synchronously flush memory-mapped writes to the remote file.
+ if opts.Private || !opts.MaxPerms.Write {
+ break
+ }
+ fallthrough
+ case InteropModeShared:
+ // All mappings require a host FD to be coherent with other filesystem
+ // users.
+ if d.fs.opts.forcePageCache {
+ // Whether or not we have a host FD, we're not allowed to use it.
+ return syserror.ENODEV
+ }
+ d.handleMu.RLock()
+ haveFD := d.hostFD >= 0
+ d.handleMu.RUnlock()
+ if !haveFD {
+ return syserror.ENODEV
+ }
+ default:
+ panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
+ }
+ // After this point, d may be used as a memmap.Mappable.
+ d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
+ return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
+}
+
+func (d *dentry) mayCachePages() bool {
+ if d.fs.opts.interop == InteropModeShared {
+ return false
+ }
+ if d.fs.opts.forcePageCache {
+ return true
+ }
+ d.handleMu.RLock()
+ haveFD := d.hostFD >= 0
+ d.handleMu.RUnlock()
+ return haveFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ mapped := d.mappings.AddMapping(ms, ar, offset, writable)
+ // Do this unconditionally since whether we have a host FD can change
+ // across save/restore.
+ for _, r := range mapped {
+ d.pf.hostFileMapper.IncRefOn(r)
+ }
+ if d.mayCachePages() {
+ // d.Evict() will refuse to evict memory-mapped pages, so tell the
+ // MemoryFile to not bother trying.
+ mf := d.fs.mfp.MemoryFile()
+ for _, r := range mapped {
+ mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
+ }
+ }
+ d.mapsMu.Unlock()
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+ d.mapsMu.Lock()
+ unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
+ for _, r := range unmapped {
+ d.pf.hostFileMapper.DecRefOn(r)
+ }
+ if d.mayCachePages() {
+ // Pages that are no longer referenced by any application memory
+ // mappings are now considered unused; allow MemoryFile to evict them
+ // when necessary.
+ mf := d.fs.mfp.MemoryFile()
+ d.dataMu.Lock()
+ for _, r := range unmapped {
+ // Since these pages are no longer mapped, they are no longer
+ // concurrently dirtyable by a writable memory mapping.
+ d.dirty.AllowClean(r)
+ mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
+ }
+ d.dataMu.Unlock()
+ }
+ d.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+ return d.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ d.handleMu.RLock()
+ if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
+ d.handleMu.RUnlock()
+ mr := optional
+ if d.fs.opts.limitHostFDTranslation {
+ mr = maxFillRange(required, optional)
+ }
+ return []memmap.Translation{
+ {
+ Source: mr,
+ File: &d.pf,
+ Offset: mr.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, nil
+ }
+
+ d.dataMu.Lock()
+
+ // Constrain translations to d.size (rounded up) to prevent translation to
+ // pages that may be concurrently truncated.
+ pgend, _ := usermem.PageRoundUp(d.size)
+ var beyondEOF bool
+ if required.End > pgend {
+ if required.Start >= pgend {
+ d.dataMu.Unlock()
+ d.handleMu.RUnlock()
+ return nil, &memmap.BusError{io.EOF}
+ }
+ beyondEOF = true
+ required.End = pgend
+ }
+ if optional.End > pgend {
+ optional.End = pgend
+ }
+
+ mf := d.fs.mfp.MemoryFile()
+ h := d.readHandleLocked()
+ cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
+
+ var ts []memmap.Translation
+ var translatedEnd uint64
+ for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+ segMR := seg.Range().Intersect(optional)
+ // TODO(jamieliu): Make Translations writable even if writability is
+ // not required if already kept-dirty by another writable translation.
+ perms := usermem.AccessType{
+ Read: true,
+ Execute: true,
+ }
+ if at.Write {
+ // From this point forward, this memory can be dirtied through the
+ // mapping at any time.
+ d.dirty.KeepDirty(segMR)
+ perms.Write = true
+ }
+ ts = append(ts, memmap.Translation{
+ Source: segMR,
+ File: mf,
+ Offset: seg.FileRangeOf(segMR).Start,
+ Perms: perms,
+ })
+ translatedEnd = segMR.End
+ }
+
+ d.dataMu.Unlock()
+ d.handleMu.RUnlock()
+
+ // Don't return the error returned by c.cache.Fill if it occurred outside
+ // of required.
+ if translatedEnd < required.End && cerr != nil {
+ return ts, &memmap.BusError{cerr}
+ }
+ if beyondEOF {
+ return ts, &memmap.BusError{io.EOF}
+ }
+ return ts, nil
+}
+
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+ const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+ if required.Length() >= maxReadahead {
+ return required
+ }
+ if optional.Length() <= maxReadahead {
+ return optional
+ }
+ optional.Start = required.Start
+ if optional.Length() <= maxReadahead {
+ return optional
+ }
+ optional.End = optional.Start + maxReadahead
+ return optional
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+ // Whether we have a host fd (and consequently what memmap.File is
+ // mapped) can change across save/restore, so invalidate all translations
+ // unconditionally.
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Write the cache's contents back to the remote file so that if we have a
+ // host fd after restore, the remote file's contents are coherent.
+ mf := d.fs.mfp.MemoryFile()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ return err
+ }
+
+ // Discard the cache so that it's not stored in saved state. This is safe
+ // because per InvalidateUnsavable invariants, no new translations can have
+ // been returned after we invalidated all existing translations above.
+ d.cache.DropAll(mf)
+ d.dirty.RemoveAll()
+
+ return nil
+}
+
+// Evict implements pgalloc.EvictableMemoryUser.Evict.
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+ mr := memmap.MappableRange{er.Start, er.End}
+ mf := d.fs.mfp.MemoryFile()
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+
+ // Only allow pages that are no longer memory-mapped to be evicted.
+ for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+ mgapMR := mgap.Range().Intersect(mr)
+ if mgapMR.Length() == 0 {
+ continue
+ }
+ if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+ }
+ d.cache.Drop(mgapMR, mf)
+ d.dirty.KeepClean(mgapMR)
+ }
+}
+
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
+//
+// dentryPlatformFile is only used when a host FD representing the remote file
+// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// memory mappings (i.e. !filesystem.opts.forcePageCache).
+type dentryPlatformFile struct {
+ *dentry
+
+ // fdRefs counts references on memmap.File offsets. fdRefs is protected
+ // by dentry.dataMu.
+ fdRefs fsutil.FrameRefSet
+
+ // If this dentry represents a regular file, and dentry.hostFD >= 0,
+ // hostFileMapper caches mappings of dentry.hostFD.
+ hostFileMapper fsutil.HostFileMapper
+
+ // hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
+ hostFileMapperInitOnce sync.Once
+}
+
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
+ d.dataMu.Lock()
+ d.fdRefs.IncRefAndAccount(fr)
+ d.dataMu.Unlock()
+}
+
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
+ d.dataMu.Lock()
+ d.fdRefs.DecRefAndAccount(fr)
+ d.dataMu.Unlock()
+}
+
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
+}
+
+// FD implements memmap.File.FD.
+func (d *dentryPlatformFile) FD() int {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return int(d.hostFD)
+}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
new file mode 100644
index 000000000..85d2bee72
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -0,0 +1,146 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isSocket() bool {
+ return d.fileType() == linux.S_IFSOCK
+}
+
+// endpoint is a Gofer-backed transport.BoundEndpoint.
+//
+// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
+// is called and either BoundEndpoint.BidirectionalConnect or
+// BoundEndpoint.UnidirectionalConnect is called.
+type endpoint struct {
+ // dentry is the filesystem dentry which produced this endpoint.
+ dentry *dentry
+
+ // file is the p9 file that contains a single unopened fid.
+ file p9.File
+
+ // path is the sentry path where this endpoint is bound.
+ path string
+}
+
+func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
+ switch t {
+ case linux.SOCK_STREAM:
+ return p9.StreamSocket, true
+ case linux.SOCK_SEQPACKET:
+ return p9.SeqpacketSocket, true
+ case linux.SOCK_DGRAM:
+ return p9.DgramSocket, true
+ }
+ return 0, false
+}
+
+// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
+func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
+ cf, ok := sockTypeToP9(ce.Type())
+ if !ok {
+ return syserr.ErrConnectionRefused
+ }
+
+ // No lock ordering required as only the ConnectingEndpoint has a mutex.
+ ce.Lock()
+
+ // Check connecting state.
+ if ce.Connected() {
+ ce.Unlock()
+ return syserr.ErrAlreadyConnected
+ }
+ if ce.Listening() {
+ ce.Unlock()
+ return syserr.ErrInvalidEndpointState
+ }
+
+ c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue())
+ if err != nil {
+ ce.Unlock()
+ return err
+ }
+
+ returnConnect(c, c)
+ ce.Unlock()
+ if err := c.Init(); err != nil {
+ return syserr.FromError(err)
+ }
+
+ return nil
+}
+
+// UnidirectionalConnect implements
+// transport.BoundEndpoint.UnidirectionalConnect.
+func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) {
+ c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{})
+ if err != nil {
+ return nil, err
+ }
+
+ if err := c.Init(); err != nil {
+ return nil, syserr.FromError(err)
+ }
+
+ // We don't need the receiver.
+ c.CloseRecv()
+ c.Release(ctx)
+
+ return c, nil
+}
+
+func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
+ hostFile, err := e.file.Connect(flags)
+ if err != nil {
+ return nil, syserr.ErrConnectionRefused
+ }
+ // Dup the fd so that the new endpoint can manage its lifetime.
+ hostFD, err := syscall.Dup(hostFile.FD())
+ if err != nil {
+ log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err)
+ return nil, syserr.FromError(err)
+ }
+ // After duplicating, we no longer need hostFile.
+ hostFile.Close()
+
+ c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
+ if serr != nil {
+ log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+ return nil, serr
+ }
+ return c, nil
+}
+
+// Release implements transport.BoundEndpoint.Release.
+func (e *endpoint) Release(ctx context.Context) {
+ e.dentry.DecRef(ctx)
+}
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+ return false
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
new file mode 100644
index 000000000..a6368fdd0
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -0,0 +1,292 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+type specialFileFD struct {
+ fileDescription
+
+ // handle is used for file I/O. handle is immutable.
+ handle handle
+
+ // seekable is true if this file description represents a file for which
+ // file offset is significant, i.e. a regular file. seekable is immutable.
+ seekable bool
+
+ // haveQueue is true if this file description represents a file for which
+ // queue may send I/O readiness events. haveQueue is immutable.
+ haveQueue bool
+ queue waiter.Queue
+
+ // If seekable is true, off is the file offset. off is protected by mu.
+ mu sync.Mutex
+ off int64
+}
+
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+ ftype := d.fileType()
+ seekable := ftype == linux.S_IFREG
+ haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
+ fd := &specialFileFD{
+ handle: h,
+ seekable: seekable,
+ haveQueue: haveQueue,
+ }
+ fd.LockFD.Init(locks)
+ if haveQueue {
+ if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+ return nil, err
+ }
+ }
+ if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: !seekable,
+ DenyPWrite: !seekable,
+ }); err != nil {
+ if haveQueue {
+ fdnotifier.RemoveFD(h.fd)
+ }
+ return nil, err
+ }
+ return fd, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *specialFileFD) Release(ctx context.Context) {
+ if fd.haveQueue {
+ fdnotifier.RemoveFD(fd.handle.fd)
+ }
+ fd.handle.close(ctx)
+ fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+ fs.syncMu.Lock()
+ delete(fs.specialFileFDs, fd)
+ fs.syncMu.Unlock()
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *specialFileFD) OnClose(ctx context.Context) error {
+ if !fd.vfsfd.IsWritable() {
+ return nil
+ }
+ return fd.handle.file.flush(ctx)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if fd.haveQueue {
+ return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+ }
+ return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ if fd.haveQueue {
+ fd.queue.EventRegister(e, mask)
+ fdnotifier.UpdateFD(fd.handle.fd)
+ return
+ }
+ fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+ if fd.haveQueue {
+ fd.queue.EventUnregister(e)
+ fdnotifier.UpdateFD(fd.handle.fd)
+ return
+ }
+ fd.fileDescription.EventUnregister(e)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if fd.seekable && offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ // Going through dst.CopyOutFrom() holds MM locks around file operations of
+ // unknown duration. For regularFileFD, doing so is necessary to support
+ // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
+ // hold here since specialFileFD doesn't client-cache data. Just buffer the
+ // read instead.
+ if d := fd.dentry(); d.cachedMetadataAuthoritative() {
+ d.touchAtime(fd.vfsfd.Mount())
+ }
+ buf := make([]byte, dst.NumBytes())
+ n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ if err == syserror.EAGAIN {
+ err = syserror.ErrWouldBlock
+ }
+ if n == 0 {
+ return 0, err
+ }
+ if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
+ return int64(cp), cperr
+ }
+ return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ if !fd.seekable {
+ return fd.PRead(ctx, dst, -1, opts)
+ }
+
+ fd.mu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+ if fd.seekable && offset < 0 {
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ d := fd.dentry()
+ // If the regular file fd was opened with O_APPEND, make sure the file size
+ // is updated. There is a possible race here if size is modified externally
+ // after metadata cache is updated.
+ if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if err := d.updateFromGetattr(ctx); err != nil {
+ return 0, offset, err
+ }
+ }
+
+ if fd.seekable {
+ // We need to hold the metadataMu *while* writing to a regular file.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Set offset to file size if the regular file was opened with O_APPEND.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Holding d.metadataMu is sufficient for reading d.size.
+ offset = int64(d.size)
+ }
+ limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+ if err != nil {
+ return 0, offset, err
+ }
+ src = src.TakeFirst64(limit)
+ }
+
+ // Do a buffered write. See rationale in PRead.
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtime()
+ }
+ buf := make([]byte, src.NumBytes())
+ // Don't do partial writes if we get a partial read from src.
+ if _, err := src.CopyIn(ctx, buf); err != nil {
+ return 0, offset, err
+ }
+ n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ if err == syserror.EAGAIN {
+ err = syserror.ErrWouldBlock
+ }
+ finalOff = offset
+ // Update file size for regular files.
+ if fd.seekable {
+ finalOff += int64(n)
+ // d.metadataMu is already locked at this point.
+ if uint64(finalOff) > d.size {
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ atomic.StoreUint64(&d.size, uint64(finalOff))
+ }
+ }
+ return int64(n), finalOff, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ if !fd.seekable {
+ return fd.PWrite(ctx, src, -1, opts)
+ }
+
+ fd.mu.Lock()
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if !fd.seekable {
+ return 0, syserror.ESPIPE
+ }
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+ if err != nil {
+ return 0, err
+ }
+ fd.off = newOffset
+ return newOffset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *specialFileFD) Sync(ctx context.Context) error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if fd.handle.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(fd.handle.fd))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ return fd.handle.file.fsync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
new file mode 100644
index 000000000..2ec819f86
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func (d *dentry) isSymlink() bool {
+ return d.fileType() == linux.S_IFLNK
+}
+
+// Precondition: d.isSymlink().
+func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+ if d.fs.opts.interop != InteropModeShared {
+ d.touchAtime(mnt)
+ d.dataMu.Lock()
+ if d.haveTarget {
+ target := d.target
+ d.dataMu.Unlock()
+ return target, nil
+ }
+ }
+ target, err := d.file.readlink(ctx)
+ if d.fs.opts.interop != InteropModeShared {
+ if err == nil {
+ d.haveTarget = true
+ d.target = target
+ }
+ d.dataMu.Unlock()
+ }
+ return target, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
new file mode 100644
index 000000000..2cb8191b9
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func dentryTimestampFromP9(s, ns uint64) int64 {
+ return int64(s*1e9 + ns)
+}
+
+func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
+ return ts.Sec*1e9 + int64(ts.Nsec)
+}
+
+func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
+ return linux.StatxTimestamp{
+ Sec: ns / 1e9,
+ Nsec: uint32(ns % 1e9),
+ }
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtime(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime {
+ return
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return
+ }
+ now := d.fs.clock.Now().Nanoseconds()
+ d.metadataMu.Lock()
+ atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
+ d.metadataMu.Unlock()
+ mnt.EndWrite()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCtime() {
+ now := d.fs.clock.Now().Nanoseconds()
+ d.metadataMu.Lock()
+ atomic.StoreInt64(&d.ctime, now)
+ d.metadataMu.Unlock()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime() {
+ now := d.fs.clock.Now().Nanoseconds()
+ d.metadataMu.Lock()
+ atomic.StoreInt64(&d.mtime, now)
+ atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
+ d.metadataMu.Unlock()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// locked d.metadataMu.
+func (d *dentry) touchCMtimeLocked() {
+ now := d.fs.clock.Now().Nanoseconds()
+ atomic.StoreInt64(&d.mtime, now)
+ atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
+}