// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package gofer provides a filesystem implementation that is backed by a 9p
// server, interchangably referred to as "gofers" throughout this package.
//
// Lock order:
//   regularFileFD/directoryFD.mu
//     filesystem.renameMu
//       dentry.dirMu
//         filesystem.syncMu
//         dentry.metadataMu
//           *** "memmap.Mappable locks" below this point
//           dentry.mapsMu
//             *** "memmap.Mappable locks taken by Translate" below this point
//             dentry.handleMu
//               dentry.dataMu
//           filesystem.inoMu
//   specialFileFD.mu
//     specialFileFD.bufMu
//
// Locking dentry.dirMu in multiple dentries requires that either ancestor
// dentries are locked before descendant dentries, or that filesystem.renameMu
// is locked for writing.
package gofer

import (
	"fmt"
	"strconv"
	"strings"
	"sync/atomic"
	"syscall"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/unet"
	"gvisor.dev/gvisor/pkg/usermem"
)

// Name is the default filesystem name.
const Name = "9p"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that caches regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// Immutable options.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the client used by this filesystem. client is immutable.
	client *p9.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	// - It synchronizes path resolution with renaming initiated by this
	// client.
	//
	// - It is held by path resolution to ensure that reachable dentries remain
	// valid. A dentry is reachable by path resolution if it has a non-zero
	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
	// is reachable from its children), or if it is a child dentry (such that
	// it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These
	// fields are protected by renameMu.
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries map[*dentry]struct{}
	specialFileFDs   map[*specialFileFD]struct{}

	// inoByQIDPath maps previously-observed QID.Paths to inode numbers
	// assigned to those paths. inoByQIDPath is not preserved across
	// checkpoint/restore because QIDs may be reused between different gofer
	// processes, so QIDs may be repeated for different files across
	// checkpoint/restore. inoByQIDPath is protected by inoMu.
	inoMu        sync.Mutex        `state:"nosave"`
	inoByQIDPath map[uint64]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW
}

// +stateify savable
type filesystemOptions struct {
	// "Standard" 9P options.
	fd      int
	aname   string
	interop InteropMode // derived from the "cache" mount option
	dfltuid auth.KUID
	dfltgid auth.KGID
	msize   uint32
	version string

	// maxCachedDentries is the maximum number of dentries with 0 references
	// retained by the client.
	maxCachedDentries uint64

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching and mmap for regular files.
	regularFilesUseSpecialFileFD bool
}

// InteropMode controls the client's interaction with other remote filesystem
// users.
//
// +stateify savable
type InteropMode uint32

const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem.
	//
	// - The client may cache arbitrary filesystem state (file data, metadata,
	// filesystem structure, etc.).
	//
	// - Client changes to filesystem state may be sent to the remote
	// filesystem asynchronously, except when server permission checks are
	// necessary.
	//
	// - File timestamps are based on client clocks. This ensures that users of
	// the client observe timestamps that are coherent with their own clocks
	// and consistent with Linux's semantics (in particular, it is not always
	// possible for clients to set arbitrary atimes and mtimes depending on the
	// remote filesystem implementation, and never possible for clients to set
	// arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client.
	//
	// - The client may cache arbitrary filesystem state.
	//
	// - Client changes to filesystem state must be sent to the remote
	// filesystem synchronously.
	//
	// - File timestamps are based on client clocks. As a corollary, access
	// timestamp changes from other remote filesystem users will not be visible
	// to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client.
	//
	// - The client must verify ("revalidate") cached filesystem state before
	// using it.
	//
	// - Client changes to filesystem state must be sent to the remote
	// filesystem synchronously.
	//
	// - File timestamps are based on server clocks. This is necessary to
	// ensure that timestamp changes are synchronized between remote filesystem
	// users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)

// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID string

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}

// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively.
const (
	// uint32(-2) doesn't work in Go.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
		return nil, nil, syserror.EINVAL
	}

	mopts := vfs.GenericParseMountOptions(opts.Data)
	var fsopts filesystemOptions

	fd, err := getFDFromMountOptionsMap(ctx, mopts)
	if err != nil {
		return nil, nil, err
	}
	fsopts.fd = fd

	// Get the attach name.
	fsopts.aname = "/"
	if aname, ok := mopts["aname"]; ok {
		delete(mopts, "aname")
		fsopts.aname = aname
	}

	// Parse the cache policy. For historical reasons, this defaults to the
	// least generally-applicable option, InteropModeExclusive.
	fsopts.interop = InteropModeExclusive
	if cache, ok := mopts["cache"]; ok {
		delete(mopts, "cache")
		switch cache {
		case "fscache":
			fsopts.interop = InteropModeExclusive
		case "fscache_writethrough":
			fsopts.interop = InteropModeWritethrough
		case "none":
			fsopts.regularFilesUseSpecialFileFD = true
			fallthrough
		case "remote_revalidating":
			fsopts.interop = InteropModeShared
		default:
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
			return nil, nil, syserror.EINVAL
		}
	}

	// Parse the default UID and GID.
	fsopts.dfltuid = _V9FS_DEFUID
	if dfltuidstr, ok := mopts["dfltuid"]; ok {
		delete(mopts, "dfltuid")
		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
			return nil, nil, syserror.EINVAL
		}
		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
		// in the caller's user namespace, but goferfs isn't
		// application-mountable.
		fsopts.dfltuid = auth.KUID(dfltuid)
	}
	fsopts.dfltgid = _V9FS_DEFGID
	if dfltgidstr, ok := mopts["dfltgid"]; ok {
		delete(mopts, "dfltgid")
		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
			return nil, nil, syserror.EINVAL
		}
		fsopts.dfltgid = auth.KGID(dfltgid)
	}

	// Parse the 9P message size.
	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
	if msizestr, ok := mopts["msize"]; ok {
		delete(mopts, "msize")
		msize, err := strconv.ParseUint(msizestr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
			return nil, nil, syserror.EINVAL
		}
		fsopts.msize = uint32(msize)
	}

	// Parse the 9P protocol version.
	fsopts.version = p9.HighestVersionString()
	if version, ok := mopts["version"]; ok {
		delete(mopts, "version")
		fsopts.version = version
	}

	// Parse the dentry cache limit.
	fsopts.maxCachedDentries = 1000
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit")
		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
			return nil, nil, syserror.EINVAL
		}
		fsopts.maxCachedDentries = maxCachedDentries
	}

	// Handle simple flags.
	if _, ok := mopts["force_page_cache"]; ok {
		delete(mopts, "force_page_cache")
		fsopts.forcePageCache = true
	}
	if _, ok := mopts["limit_host_fd_translation"]; ok {
		delete(mopts, "limit_host_fd_translation")
		fsopts.limitHostFDTranslation = true
	}
	if _, ok := mopts["overlayfs_stale_read"]; ok {
		delete(mopts, "overlayfs_stale_read")
		fsopts.overlayfsStaleRead = true
	}
	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
	// "cache=none".

	// Check for unparsed options.
	if len(mopts) != 0 {
		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
		return nil, nil, syserror.EINVAL
	}

	// Handle internal options.
	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
	if opts.InternalData != nil && !ok {
		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
		return nil, nil, syserror.EINVAL
	}
	// If !ok, iopts being the zero value is correct.

	// Construct the filesystem object.
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, nil, err
	}
	fs := &filesystem{
		mfp:              mfp,
		opts:             fsopts,
		iopts:            iopts,
		clock:            ktime.RealtimeClockFromContext(ctx),
		devMinor:         devMinor,
		syncableDentries: make(map[*dentry]struct{}),
		specialFileFDs:   make(map[*specialFileFD]struct{}),
		inoByQIDPath:     make(map[uint64]uint64),
	}
	fs.vfsfs.Init(vfsObj, &fstype, fs)

	// Connect to the server.
	if err := fs.dial(ctx); err != nil {
		return nil, nil, err
	}

	// Perform attach to obtain the filesystem root.
	ctx.UninterruptibleSleepStart(false)
	attached, err := fs.client.Attach(fsopts.aname)
	ctx.UninterruptibleSleepFinish(false)
	if err != nil {
		fs.vfsfs.DecRef(ctx)
		return nil, nil, err
	}
	attachFile := p9file{attached}
	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
	if err != nil {
		attachFile.close(ctx)
		fs.vfsfs.DecRef(ctx)
		return nil, nil, err
	}

	// Construct the root dentry.
	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
	if err != nil {
		attachFile.close(ctx)
		fs.vfsfs.DecRef(ctx)
		return nil, nil, err
	}
	// Set the root's reference count to 2. One reference is returned to the
	// caller, and the other is deliberately leaked to prevent the root from
	// being "cached" and subsequently evicted. Its resources will still be
	// cleaned up by fs.Release().
	root.refs = 2
	fs.root = root

	return &fs.vfsfs, &root.vfsd, nil
}

func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
	// Check that the transport is "fd".
	trans, ok := mopts["trans"]
	if !ok || trans != "fd" {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as 'trans=fd'")
		return -1, syserror.EINVAL
	}
	delete(mopts, "trans")

	// Check that read and write FDs are provided and identical.
	rfdstr, ok := mopts["rfdno"]
	if !ok {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as 'rfdno=<file descriptor>'")
		return -1, syserror.EINVAL
	}
	delete(mopts, "rfdno")
	rfd, err := strconv.Atoi(rfdstr)
	if err != nil {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: rfdno=%s", rfdstr)
		return -1, syserror.EINVAL
	}
	wfdstr, ok := mopts["wfdno"]
	if !ok {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as 'wfdno=<file descriptor>'")
		return -1, syserror.EINVAL
	}
	delete(mopts, "wfdno")
	wfd, err := strconv.Atoi(wfdstr)
	if err != nil {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: wfdno=%s", wfdstr)
		return -1, syserror.EINVAL
	}
	if rfd != wfd {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
		return -1, syserror.EINVAL
	}
	return rfd, nil
}

// Preconditions: fs.client == nil.
func (fs *filesystem) dial(ctx context.Context) error {
	// Establish a connection with the server.
	conn, err := unet.NewSocket(fs.opts.fd)
	if err != nil {
		return err
	}

	// Perform version negotiation with the server.
	ctx.UninterruptibleSleepStart(false)
	client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version)
	ctx.UninterruptibleSleepFinish(false)
	if err != nil {
		conn.Close()
		return err
	}
	// Ownership of conn has been transferred to client.

	fs.client = client
	return nil
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	mf := fs.mfp.MemoryFile()

	fs.syncMu.Lock()
	for d := range fs.syncableDentries {
		d.handleMu.Lock()
		d.dataMu.Lock()
		if h := d.writeHandleLocked(); h.isOpen() {
			// Write dirty cached data to the remote file.
			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
			}
			// TODO(jamieliu): Do we need to flushf/fsync d?
		}
		// Discard cached pages.
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
		d.dataMu.Unlock()
		// Close the host fd if one exists.
		if d.hostFD >= 0 {
			syscall.Close(int(d.hostFD))
			d.hostFD = -1
		}
		d.handleMu.Unlock()
	}
	// There can't be any specialFileFDs still using fs, since each such
	// FileDescription would hold a reference on a Mount holding a reference on
	// fs.
	fs.syncMu.Unlock()

	if !fs.iopts.LeakConnection {
		// Close the connection to the server. This implicitly clunks all fids.
		fs.client.Close()
	}

	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}

// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

	// refs is the reference count. Each dentry holds a reference on its
	// parent, even if disowned. An additional reference is held on all
	// synthetic dentries until they are unlinked or invalidated. When refs
	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
	// -1, the dentry has already been destroyed. refs is accessed using atomic
	// memory operations.
	refs int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// parent is this dentry's parent directory. Each dentry holds a reference
	// on its parent. If this dentry is a filesystem root, parent is nil.
	// parent is protected by filesystem.renameMu.
	parent *dentry

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.renameMu.
	name string

	// qidPath is the p9.QID.Path for this file. qidPath is immutable.
	qidPath uint64

	// file is the unopened p9.File that backs this dentry. file is immutable.
	//
	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
	// that does not exist on the remote filesystem. As of this writing, the
	// only files that can be synthetic are sockets, pipes, and directories.
	file p9file `state:"nosave"`

	// If deleted is non-zero, the file represented by this dentry has been
	// deleted. deleted is accessed using atomic memory operations.
	deleted uint32

	// If cached is true, dentryEntry links dentry into
	// filesystem.cachedDentries. cached and dentryEntry are protected by
	// filesystem.renameMu.
	cached bool
	dentryEntry

	dirMu sync.Mutex `state:"nosave"`

	// If this dentry represents a directory, children contains:
	//
	// - Mappings of child filenames to dentries representing those children.
	//
	// - Mappings of child filenames that are known not to exist to nil
	// dentries (only if InteropModeShared is not in effect and the directory
	// is not synthetic).
	//
	// children is protected by dirMu.
	children map[string]*dentry

	// If this dentry represents a directory, syntheticChildren is the number
	// of child dentries for which dentry.isSynthetic() == true.
	// syntheticChildren is protected by dirMu.
	syntheticChildren int

	// If this dentry represents a directory,
	// dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
	// is a cache of all entries in the directory, in the order they were
	// returned by the server. dirents is protected by dirMu.
	dirents []vfs.Dirent

	// Cached metadata; protected by metadataMu.
	// To access:
	//   - In situations where consistency is not required (like stat), these
	//     can be accessed using atomic operations only (without locking).
	//   - Lock metadataMu and can access without atomic operations.
	// To mutate:
	//   - Lock metadataMu and use atomic operations to update because we might
	//     have atomic readers that don't hold the lock.
	metadataMu sync.Mutex `state:"nosave"`
	ino        uint64     // immutable
	mode       uint32     // type is immutable, perms are mutable
	uid        uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
	gid        uint32     // auth.KGID, but ...
	blockSize  uint32     // 0 if unknown
	// Timestamps, all nsecs from the Unix epoch.
	atime int64
	mtime int64
	ctime int64
	btime int64
	// File size, which differs from other metadata in two ways:
	//
	// - We make a best-effort attempt to keep it up to date even if
	// !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
	//
	// - size is protected by both metadataMu and dataMu (i.e. both must be
	// locked to mutate it; locking either is sufficient to access it).
	size uint64
	// If this dentry does not represent a synthetic file, deleted is 0, and
	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
	// remote file's timestamps, which should be updated when this dentry is
	// evicted.
	atimeDirty uint32
	mtimeDirty uint32

	// nlink counts the number of hard links to this dentry. It's updated and
	// accessed using atomic operations. It's not protected by metadataMu like the
	// other metadata fields.
	nlink uint32

	mapsMu sync.Mutex `state:"nosave"`

	// If this dentry represents a regular file, mappings tracks mappings of
	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
	mappings memmap.MappingSet

	// - If this dentry represents a regular file or directory, readFile is the
	// p9.File used for reads by all regularFileFDs/directoryFDs representing
	// this dentry.
	//
	// - If this dentry represents a regular file, writeFile is the p9.File
	// used for writes by all regularFileFDs representing this dentry.
	//
	// - If this dentry represents a regular file, hostFD is the host FD used
	// for memory mappings and I/O (when applicable) in preference to readFile
	// and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
	// also be writable. If hostFD is -1, no such host FD is available.
	//
	// These fields are protected by handleMu.
	//
	// readFile and writeFile may or may not represent the same p9.File. Once
	// either p9.File transitions from closed (isNil() == true) to open
	// (isNil() == false), it may be mutated with handleMu locked, but cannot
	// be closed until the dentry is destroyed.
	handleMu  sync.RWMutex `state:"nosave"`
	readFile  p9file       `state:"nosave"`
	writeFile p9file       `state:"nosave"`
	hostFD    int32        `state:"nosave"`

	dataMu sync.RWMutex `state:"nosave"`

	// If this dentry represents a regular file that is client-cached, cache
	// maps offsets into the cached file to offsets into
	// filesystem.mfp.MemoryFile() that store the file's data. cache is
	// protected by dataMu.
	cache fsutil.FileRangeSet

	// If this dentry represents a regular file that is client-cached, dirty
	// tracks dirty segments in cache. dirty is protected by dataMu.
	dirty fsutil.DirtySet

	// pf implements platform.File for mappings of hostFD.
	pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section fo vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches
}

// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
// gofer client.
func dentryAttrMask() p9.AttrMask {
	return p9.AttrMask{
		Mode:  true,
		UID:   true,
		GID:   true,
		ATime: true,
		MTime: true,
		CTime: true,
		Size:  true,
		BTime: true,
	}
}

// newDentry creates a new dentry representing the given file. The dentry
// initially has no references, but is not cached; it is the caller's
// responsibility to set the dentry's reference count and/or call
// dentry.checkCachingLocked() as appropriate.
//
// Preconditions: !file.isNil().
func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
	if !mask.Mode {
		ctx.Warningf("can't create gofer.dentry without file type")
		return nil, syserror.EIO
	}
	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
		ctx.Warningf("can't create regular file gofer.dentry without file size")
		return nil, syserror.EIO
	}

	d := &dentry{
		fs:        fs,
		qidPath:   qid.Path,
		file:      file,
		ino:       fs.inoFromQIDPath(qid.Path),
		mode:      uint32(attr.Mode),
		uid:       uint32(fs.opts.dfltuid),
		gid:       uint32(fs.opts.dfltgid),
		blockSize: usermem.PageSize,
		hostFD:    -1,
	}
	d.pf.dentry = d
	if mask.UID {
		d.uid = dentryUIDFromP9UID(attr.UID)
	}
	if mask.GID {
		d.gid = dentryGIDFromP9GID(attr.GID)
	}
	if mask.Size {
		d.size = attr.Size
	}
	if attr.BlockSize != 0 {
		d.blockSize = uint32(attr.BlockSize)
	}
	if mask.ATime {
		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
	}
	if mask.MTime {
		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
	}
	if mask.CTime {
		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
	}
	if mask.BTime {
		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
	}
	if mask.NLink {
		d.nlink = uint32(attr.NLink)
	}
	d.vfsd.Init(d)

	fs.syncMu.Lock()
	fs.syncableDentries[d] = struct{}{}
	fs.syncMu.Unlock()
	return d, nil
}

func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
	fs.inoMu.Lock()
	defer fs.inoMu.Unlock()
	if ino, ok := fs.inoByQIDPath[qidPath]; ok {
		return ino
	}
	ino := fs.nextIno()
	fs.inoByQIDPath[qidPath] = ino
	return ino
}

func (fs *filesystem) nextIno() uint64 {
	return atomic.AddUint64(&fs.lastIno, 1)
}

func (d *dentry) isSynthetic() bool {
	return d.file.isNil()
}

func (d *dentry) cachedMetadataAuthoritative() bool {
	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
}

// updateFromP9Attrs is called to update d's metadata after an update from the
// remote filesystem.
// Precondition: d.metadataMu must be locked.
func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
	if mask.Mode {
		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
			d.metadataMu.Unlock()
			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
		}
		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
	}
	if mask.UID {
		atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
	}
	if mask.GID {
		atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
	}
	// There is no P9_GETATTR_* bit for I/O block size.
	if attr.BlockSize != 0 {
		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
	}
	// Don't override newer client-defined timestamps with old server-defined
	// ones.
	if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
	}
	if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
	}
	if mask.CTime {
		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
	}
	if mask.BTime {
		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
	}
	if mask.NLink {
		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
	}
	if mask.Size {
		d.updateSizeLocked(attr.Size)
	}
}

// Preconditions: !d.isSynthetic().
func (d *dentry) updateFromGetattr(ctx context.Context) error {
	// Use d.readFile or d.writeFile, which represent 9P fids that have been
	// opened, in preference to d.file, which represents a 9P fid that has not.
	// This may be significantly more efficient in some implementations. Prefer
	// d.writeFile over d.readFile since some filesystem implementations may
	// update a writable handle's metadata after writes to that handle, without
	// making metadata updates immediately visible to read-only handles
	// representing the same file.
	var (
		file            p9file
		handleMuRLocked bool
	)
	// d.metadataMu must be locked *before* we getAttr so that we do not end up
	// updating stale attributes in d.updateFromP9AttrsLocked().
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	d.handleMu.RLock()
	if !d.writeFile.isNil() {
		file = d.writeFile
		handleMuRLocked = true
	} else if !d.readFile.isNil() {
		file = d.readFile
		handleMuRLocked = true
	} else {
		file = d.file
		d.handleMu.RUnlock()
	}
	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
	if handleMuRLocked {
		d.handleMu.RUnlock()
	}
	if err != nil {
		return err
	}
	d.updateFromP9AttrsLocked(attrMask, &attr)
	return nil
}

func (d *dentry) fileType() uint32 {
	return atomic.LoadUint32(&d.mode) & linux.S_IFMT
}

func (d *dentry) statTo(stat *linux.Statx) {
	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
	stat.Blksize = atomic.LoadUint32(&d.blockSize)
	stat.Nlink = atomic.LoadUint32(&d.nlink)
	if stat.Nlink == 0 {
		// The remote filesystem doesn't support link count; just make
		// something up. This is consistent with Linux, where
		// fs/inode.c:inode_init_always() initializes link count to 1, and
		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
		// it's not provided by the remote filesystem.
		stat.Nlink = 1
	}
	stat.UID = atomic.LoadUint32(&d.uid)
	stat.GID = atomic.LoadUint32(&d.gid)
	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
	stat.Ino = uint64(d.ino)
	stat.Size = atomic.LoadUint64(&d.size)
	// This is consistent with regularFileFD.Seek(), which treats regular files
	// as having no holes.
	stat.Blocks = (stat.Size + 511) / 512
	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime))
	stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime))
	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime))
	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime))
	stat.DevMajor = linux.UNNAMED_MAJOR
	stat.DevMinor = d.fs.devMinor
}

func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
		return syserror.EPERM
	}
	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
		return err
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	if stat.Mask&linux.STATX_SIZE != 0 {
		// Reject attempts to truncate files other than regular files, since
		// filesystem implementations may return the wrong errno.
		switch mode.FileType() {
		case linux.S_IFREG:
			// ok
		case linux.S_IFDIR:
			return syserror.EISDIR
		default:
			return syserror.EINVAL
		}
	}

	var now int64
	if d.cachedMetadataAuthoritative() {
		// Truncate updates mtime.
		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
			stat.Mask |= linux.STATX_MTIME
			stat.Mtime = linux.StatxTimestamp{
				Nsec: linux.UTIME_NOW,
			}
		}

		// Use client clocks for timestamps.
		now = d.fs.clock.Now().Nanoseconds()
		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
			stat.Atime = linux.NsecToStatxTimestamp(now)
		}
		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
			stat.Mtime = linux.NsecToStatxTimestamp(now)
		}
	}

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	if !d.isSynthetic() {
		if stat.Mask != 0 {
			if err := d.file.setAttr(ctx, p9.SetAttrMask{
				Permissions:        stat.Mask&linux.STATX_MODE != 0,
				UID:                stat.Mask&linux.STATX_UID != 0,
				GID:                stat.Mask&linux.STATX_GID != 0,
				Size:               stat.Mask&linux.STATX_SIZE != 0,
				ATime:              stat.Mask&linux.STATX_ATIME != 0,
				MTime:              stat.Mask&linux.STATX_MTIME != 0,
				ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
				MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
			}, p9.SetAttr{
				Permissions:      p9.FileMode(stat.Mode),
				UID:              p9.UID(stat.UID),
				GID:              p9.GID(stat.GID),
				Size:             stat.Size,
				ATimeSeconds:     uint64(stat.Atime.Sec),
				ATimeNanoSeconds: uint64(stat.Atime.Nsec),
				MTimeSeconds:     uint64(stat.Mtime.Sec),
				MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
			}); err != nil {
				return err
			}
			if stat.Mask&linux.STATX_SIZE != 0 {
				// d.size should be kept up to date, and privatized
				// copy-on-write mappings of truncated pages need to be
				// invalidated, even if InteropModeShared is in effect.
				d.updateSizeLocked(stat.Size)
			}
		}
		if d.fs.opts.interop == InteropModeShared {
			// There's no point to updating d's metadata in this case since
			// it'll be overwritten by revalidation before the next time it's
			// used anyway. (InteropModeShared inhibits client caching of
			// regular file data, so there's no cache to truncate either.)
			return nil
		}
	}
	if stat.Mask&linux.STATX_MODE != 0 {
		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
	}
	if stat.Mask&linux.STATX_UID != 0 {
		atomic.StoreUint32(&d.uid, stat.UID)
	}
	if stat.Mask&linux.STATX_GID != 0 {
		atomic.StoreUint32(&d.gid, stat.GID)
	}
	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
	// stat.Mtime to client-local timestamps above, and if
	// !d.cachedMetadataAuthoritative() then we returned after calling
	// d.file.setAttr(). For the same reason, now must have been initialized.
	if stat.Mask&linux.STATX_ATIME != 0 {
		atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
		atomic.StoreUint32(&d.atimeDirty, 0)
	}
	if stat.Mask&linux.STATX_MTIME != 0 {
		atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
		atomic.StoreUint32(&d.mtimeDirty, 0)
	}
	atomic.StoreInt64(&d.ctime, now)
	return nil
}

// doAllocate performs an allocate operation on d. Note that d.metadataMu will
// be held when allocate is called.
func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// Allocating a smaller size is a noop.
	size := offset + length
	if d.cachedMetadataAuthoritative() && size <= d.size {
		return nil
	}

	err := allocate()
	if err != nil {
		return err
	}
	d.updateSizeLocked(size)
	if d.cachedMetadataAuthoritative() {
		d.touchCMtimeLocked()
	}
	return nil
}

// Preconditions: d.metadataMu must be locked.
func (d *dentry) updateSizeLocked(newSize uint64) {
	d.dataMu.Lock()
	oldSize := d.size
	atomic.StoreUint64(&d.size, newSize)
	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
	// below. This allows concurrent calls to Read/Translate/etc. These
	// functions synchronize with truncation by refusing to use cache
	// contents beyond the new d.size. (We are still holding d.metadataMu,
	// so we can't race with Write or another truncate.)
	d.dataMu.Unlock()
	if d.size < oldSize {
		oldpgend, _ := usermem.PageRoundUp(oldSize)
		newpgend, _ := usermem.PageRoundUp(d.size)
		if oldpgend != newpgend {
			d.mapsMu.Lock()
			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
				// Compare Linux's mm/truncate.c:truncate_setsize() =>
				// truncate_pagecache() =>
				// mm/memory.c:unmap_mapping_range(evencows=1).
				InvalidatePrivate: true,
			})
			d.mapsMu.Unlock()
		}
		// We are now guaranteed that there are no translations of
		// truncated pages, and can remove them from the cache. Since
		// truncated pages have been removed from the remote file, they
		// should be dropped without being written back.
		d.dataMu.Lock()
		d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
		d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
		d.dataMu.Unlock()
	}
}

func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}

func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
	// We only support xattrs prefixed with "user." (see b/148380782). Currently,
	// there is no need to expose any other xattrs through a gofer.
	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
		return syserror.EOPNOTSUPP
	}
	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
		return err
	}
	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
}

func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
}

func dentryUIDFromP9UID(uid p9.UID) uint32 {
	if !uid.Ok() {
		return uint32(auth.OverflowUID)
	}
	return uint32(uid)
}

func dentryGIDFromP9GID(gid p9.GID) uint32 {
	if !gid.Ok() {
		return uint32(auth.OverflowGID)
	}
	return uint32(gid)
}

// IncRef implements vfs.DentryImpl.IncRef.
func (d *dentry) IncRef() {
	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
	// d.checkCachingLocked().
	atomic.AddInt64(&d.refs, 1)
}

// TryIncRef implements vfs.DentryImpl.TryIncRef.
func (d *dentry) TryIncRef() bool {
	for {
		refs := atomic.LoadInt64(&d.refs)
		if refs <= 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
			return true
		}
	}
}

// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
		d.fs.renameMu.Lock()
		d.checkCachingLocked(ctx)
		d.fs.renameMu.Unlock()
	} else if refs < 0 {
		panic("gofer.dentry.DecRef() called without holding a reference")
	}
}

// decRefLocked decrements d's reference count without calling
// d.checkCachingLocked, even if d's reference count reaches 0; callers are
// responsible for ensuring that d.checkCachingLocked will be called later.
func (d *dentry) decRefLocked() {
	if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
		panic("gofer.dentry.decRefLocked() called without holding a reference")
	}
}

// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
	if d.isDir() {
		events |= linux.IN_ISDIR
	}

	d.fs.renameMu.RLock()
	// The ordering below is important, Linux always notifies the parent first.
	if d.parent != nil {
		d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
	}
	d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
	d.fs.renameMu.RUnlock()
}

// Watches implements vfs.DentryImpl.Watches.
func (d *dentry) Watches() *vfs.Watches {
	return &d.watches
}

// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
//
// If no watches are left on this dentry and it has no references, cache it.
func (d *dentry) OnZeroWatches(ctx context.Context) {
	if atomic.LoadInt64(&d.refs) == 0 {
		d.fs.renameMu.Lock()
		d.checkCachingLocked(ctx)
		d.fs.renameMu.Unlock()
	}
}

// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
// It may be called on a destroyed dentry. For example,
// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
// for the same dentry when the dentry is visited more than once in the same
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
// Preconditions: d.fs.renameMu must be locked for writing; it may be
// temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context) {
	// Dentries with a non-zero reference count must be retained. (The only way
	// to obtain a reference on a dentry with zero references is via path
	// resolution, which requires renameMu, so if d.refs is zero then it will
	// remain zero while we hold renameMu for writing.)
	refs := atomic.LoadInt64(&d.refs)
	if refs > 0 {
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		return
	}
	if refs == -1 {
		// Dentry has already been destroyed.
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		d.destroyLocked(ctx)
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, we
	// cannot cache it and allow it to be evicted. Otherwise, we will lose its
	// watches, even if a new dentry is created for the same file in the future.
	// Note that the size of d.watches cannot concurrently transition from zero
	// to non-zero, because adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		return
	}
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentries.PushFront(d)
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.cachedDentries.PushFront(d)
	d.fs.cachedDentriesLen++
	d.cached = true
	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
		d.fs.evictCachedDentryLocked(ctx)
		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
		// back down to fs.opts.maxCachedDentries, so we don't loop.
	}
}

// Preconditions:
// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
// * fs.cachedDentriesLen != 0.
func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
	victim := fs.cachedDentries.Back()
	fs.cachedDentries.Remove(victim)
	fs.cachedDentriesLen--
	victim.cached = false
	// victim.refs may have become non-zero from an earlier path resolution
	// since it was inserted into fs.cachedDentries.
	if atomic.LoadInt64(&victim.refs) == 0 {
		if victim.parent != nil {
			victim.parent.dirMu.Lock()
			if !victim.vfsd.IsDead() {
				// Note that victim can't be a mount point (in any mount
				// namespace), since VFS holds references on mount points.
				fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
				delete(victim.parent.children, victim.name)
				// We're only deleting the dentry, not the file it
				// represents, so we don't need to update
				// victimParent.dirents etc.
			}
			victim.parent.dirMu.Unlock()
		}
		victim.destroyLocked(ctx)
	}
}

// destroyLocked destroys the dentry.
//
// Preconditions:
// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
// * d.refs == 0.
// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
//   from its former parent dentry.
func (d *dentry) destroyLocked(ctx context.Context) {
	switch atomic.LoadInt64(&d.refs) {
	case 0:
		// Mark the dentry destroyed.
		atomic.StoreInt64(&d.refs, -1)
	case -1:
		panic("dentry.destroyLocked() called on already destroyed dentry")
	default:
		panic("dentry.destroyLocked() called with references on the dentry")
	}

	// Allow the following to proceed without renameMu locked to improve
	// scalability.
	d.fs.renameMu.Unlock()

	mf := d.fs.mfp.MemoryFile()
	d.handleMu.Lock()
	d.dataMu.Lock()
	if h := d.writeHandleLocked(); h.isOpen() {
		// Write dirty pages back to the remote filesystem.
		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
		}
	}
	// Discard cached data.
	if !d.cache.IsEmpty() {
		mf.MarkAllUnevictable(d)
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
	}
	d.dataMu.Unlock()
	// Clunk open fids and close open host FDs.
	if !d.readFile.isNil() {
		d.readFile.close(ctx)
	}
	if !d.writeFile.isNil() && d.readFile != d.writeFile {
		d.writeFile.close(ctx)
	}
	d.readFile = p9file{}
	d.writeFile = p9file{}
	if d.hostFD >= 0 {
		syscall.Close(int(d.hostFD))
		d.hostFD = -1
	}
	d.handleMu.Unlock()

	if !d.file.isNil() {
		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
		// i.e. client and server timestamps may differ (because e.g. a client
		// write was serviced by the page cache, and only written back to the
		// remote file later). Ideally, we'd write client timestamps back to
		// the remote filesystem so that timestamps for a new dentry
		// instantiated for the same file would remain coherent. Unfortunately,
		// this turns out to be too expensive in many cases, so for now we
		// don't do this.
		if err := d.file.close(ctx); err != nil {
			log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
		}
		d.file = p9file{}

		// Remove d from the set of syncable dentries.
		d.fs.syncMu.Lock()
		delete(d.fs.syncableDentries, d)
		d.fs.syncMu.Unlock()
	}

	d.fs.renameMu.Lock()

	// Drop the reference held by d on its parent without recursively locking
	// d.fs.renameMu.
	if d.parent != nil {
		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
			d.parent.checkCachingLocked(ctx)
		} else if refs < 0 {
			panic("gofer.dentry.DecRef() called without holding a reference")
		}
	}
}

func (d *dentry) isDeleted() bool {
	return atomic.LoadUint32(&d.deleted) != 0
}

func (d *dentry) setDeleted() {
	atomic.StoreUint32(&d.deleted, 1)
}

func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
	if d.file.isNil() || !d.userXattrSupported() {
		return nil, nil
	}
	xattrMap, err := d.file.listXattr(ctx, size)
	if err != nil {
		return nil, err
	}
	xattrs := make([]string, 0, len(xattrMap))
	for x := range xattrMap {
		// We only support xattrs in the user.* namespace.
		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
			xattrs = append(xattrs, x)
		}
	}
	return xattrs, nil
}

func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
	if d.file.isNil() {
		return "", syserror.ENODATA
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
		return "", err
	}
	return d.file.getXattr(ctx, opts.Name, opts.Size)
}

func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
	if d.file.isNil() {
		return syserror.EPERM
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
		return err
	}
	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}

func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
	if d.file.isNil() {
		return syserror.EPERM
	}
	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
		return err
	}
	return d.file.removeXattr(ctx, name)
}

// Extended attributes in the user.* namespace are only supported for regular
// files and directories.
func (d *dentry) userXattrSupported() bool {
	filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
}

// Preconditions:
// * !d.isSynthetic().
// * d.isRegularFile() || d.isDir().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		d.handleMu.RLock()
		if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
			// Current handles are sufficient.
			d.handleMu.RUnlock()
			return nil
		}
		d.handleMu.RUnlock()
	}

	fdToClose := int32(-1)
	invalidateTranslations := false
	d.handleMu.Lock()
	if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
		// Get a new handle. If this file has been opened for both reading and
		// writing, try to get a single handle that is usable for both:
		//
		// - Writable memory mappings of a host FD require that the host FD is
		// opened for both reading and writing.
		//
		// - NOTE(b/141991141): Some filesystems may not ensure coherence
		// between multiple handles for the same file.
		openReadable := !d.readFile.isNil() || read
		openWritable := !d.writeFile.isNil() || write
		h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
		if err == syserror.EACCES && (openReadable != read || openWritable != write) {
			// It may not be possible to use a single handle for both
			// reading and writing, since permissions on the file may have
			// changed to e.g. disallow reading after previously being
			// opened for reading. In this case, we have no choice but to
			// use separate handles for reading and writing.
			ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
			openReadable = read
			openWritable = write
			h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
		}
		if err != nil {
			d.handleMu.Unlock()
			return err
		}

		if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
			// We have no existing FD, and the new FD meets the requirements
			// for d.hostFD, so start using it.
			d.hostFD = h.fd
		} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
			// We have an existing read-only FD, but the file has just been
			// opened for writing, so we need to start supporting writable memory
			// mappings. This may race with callers of d.pf.FD() using the existing
			// FD, so in most cases we need to delay closing the old FD until after
			// invalidating memmap.Translations that might have observed it.
			if !openReadable || h.fd < 0 {
				// We don't have a read/write FD, so we have no FD that can be
				// used to create writable memory mappings. Switch to using the
				// internal page cache.
				invalidateTranslations = true
				fdToClose = d.hostFD
				d.hostFD = -1
			} else if d.fs.opts.overlayfsStaleRead {
				// We do have a read/write FD, but it may not be coherent with
				// the existing read-only FD, so we must switch to mappings of
				// the new FD in both the application and sentry.
				if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
					d.handleMu.Unlock()
					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
					h.close(ctx)
					return err
				}
				invalidateTranslations = true
				fdToClose = d.hostFD
				d.hostFD = h.fd
			} else {
				// We do have a read/write FD. To avoid invalidating existing
				// memmap.Translations (which is expensive), use dup3 to make
				// the old file descriptor refer to the new file description,
				// then close the new file descriptor (which is no longer
				// needed). Racing callers of d.pf.FD() may use the old or new
				// file description, but this doesn't matter since they refer
				// to the same file, and any racing mappings must be read-only.
				if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
					oldHostFD := d.hostFD
					d.handleMu.Unlock()
					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
					h.close(ctx)
					return err
				}
				fdToClose = h.fd
			}
		} else {
			// h.fd is not useful.
			fdToClose = h.fd
		}

		// Switch to new fids.
		var oldReadFile p9file
		if openReadable {
			oldReadFile = d.readFile
			d.readFile = h.file
		}
		var oldWriteFile p9file
		if openWritable {
			oldWriteFile = d.writeFile
			d.writeFile = h.file
		}
		// NOTE(b/141991141): Clunk old fids before making new fids visible (by
		// unlocking d.handleMu).
		if !oldReadFile.isNil() {
			oldReadFile.close(ctx)
		}
		if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
			oldWriteFile.close(ctx)
		}
	}
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	if fdToClose >= 0 {
		syscall.Close(int(fdToClose))
	}

	return nil
}

// Preconditions: d.handleMu must be locked.
func (d *dentry) readHandleLocked() handle {
	return handle{
		file: d.readFile,
		fd:   d.hostFD,
	}
}

// Preconditions: d.handleMu must be locked.
func (d *dentry) writeHandleLocked() handle {
	return handle{
		file: d.writeFile,
		fd:   d.hostFD,
	}
}

func (d *dentry) syncRemoteFile(ctx context.Context) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.syncRemoteFileLocked(ctx)
}

// Preconditions: d.handleMu must be locked.
func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
	// If we have a host FD, fsyncing it is likely to be faster than an fsync
	// RPC.
	if d.hostFD >= 0 {
		ctx.UninterruptibleSleepStart(false)
		err := syscall.Fsync(int(d.hostFD))
		ctx.UninterruptibleSleepFinish(false)
		return err
	}
	if !d.writeFile.isNil() {
		return d.writeFile.fsync(ctx)
	}
	if !d.readFile.isNil() {
		return d.readFile.fsync(ctx)
	}
	return nil
}

func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	if h.isOpen() {
		// Write back dirty pages to the remote file.
		d.dataMu.Lock()
		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
		d.dataMu.Unlock()
		if err != nil {
			return err
		}
	}
	if err := d.syncRemoteFileLocked(ctx); err != nil {
		if !forFilesystemSync {
			return err
		}
		// Only return err if we can reasonably have expected sync to succeed
		// (d is a regular file and was opened for writing).
		if d.isRegularFile() && h.isOpen() {
			return err
		}
		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
	}
	return nil
}

// incLinks increments link count.
func (d *dentry) incLinks() {
	if atomic.LoadUint32(&d.nlink) == 0 {
		// The remote filesystem doesn't support link count.
		return
	}
	atomic.AddUint32(&d.nlink, 1)
}

// decLinks decrements link count.
func (d *dentry) decLinks() {
	if atomic.LoadUint32(&d.nlink) == 0 {
		// The remote filesystem doesn't support link count.
		return
	}
	atomic.AddUint32(&d.nlink, ^uint32(0))
}

// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl.
//
// +stateify savable
type fileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	lockLogging sync.Once `state:"nosave"`
}

func (fd *fileDescription) filesystem() *filesystem {
	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}

func (fd *fileDescription) dentry() *dentry {
	return fd.vfsfd.Dentry().Impl().(*dentry)
}

// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	d := fd.dentry()
	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
		// available?
		if err := d.updateFromGetattr(ctx); err != nil {
			return linux.Statx{}, err
		}
	}
	var stat linux.Statx
	d.statTo(&stat)
	return stat, nil
}

// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
		return err
	}
	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
		fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
	}
	return nil
}

// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
	return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
}

// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}

// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
	d := fd.dentry()
	if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
		return err
	}
	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}

// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
	d := fd.dentry()
	if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
		return err
	}
	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}

// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
	fd.lockLogging.Do(func() {
		log.Infof("File lock using gofer file handled internally.")
	})
	return fd.LockFD.LockBSD(ctx, uid, t, block)
}

// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
	fd.lockLogging.Do(func() {
		log.Infof("Range lock using gofer file handled internally.")
	})
	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
}

// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
}