summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/dirent.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs/dirent.go')
-rw-r--r--pkg/sentry/fs/dirent.go1605
1 files changed, 1605 insertions, 0 deletions
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
new file mode 100644
index 000000000..a75c7ea7e
--- /dev/null
+++ b/pkg/sentry/fs/dirent.go
@@ -0,0 +1,1605 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+ "fmt"
+ "path"
+ "sort"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+)
+
+type globalDirentMap struct {
+ mu sync.Mutex
+ dirents map[*Dirent]struct{}
+}
+
+func (g *globalDirentMap) add(d *Dirent) {
+ g.mu.Lock()
+ g.dirents[d] = struct{}{}
+ g.mu.Unlock()
+}
+
+func (g *globalDirentMap) remove(d *Dirent) {
+ g.mu.Lock()
+ delete(g.dirents, d)
+ g.mu.Unlock()
+}
+
+// allDirents keeps track of all Dirents that need to be considered in
+// Save/Restore for inode mappings.
+//
+// Because inodes do not hold paths, but inodes for external file systems map
+// to an external path, every user-visible Dirent is stored in this map and
+// iterated through upon save to keep inode ID -> restore path mappings.
+var allDirents = globalDirentMap{
+ dirents: map[*Dirent]struct{}{},
+}
+
+// renameMu protects the parent of *all* Dirents. (See explanation in
+// lockForRename.)
+//
+// See fs.go for lock ordering.
+var renameMu sync.RWMutex
+
+// Dirent holds an Inode in memory.
+//
+// A Dirent may be negative or positive:
+//
+// A negative Dirent contains a nil Inode and indicates that a path does not exist. This
+// is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains
+// cached until a create operation replaces it with a positive Dirent. A negative Dirent
+// always has one reference owned by its parent and takes _no_ reference on its parent. This
+// ensures that its parent can be unhashed regardless of negative children.
+//
+// A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain
+// references to it. A positive Dirent always takes a reference on its parent.
+//
+// A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent).
+//
+// Dirents currently do not attempt to free entries that lack application references under
+// memory pressure.
+type Dirent struct {
+ // AtomicRefCount is our reference count.
+ refs.AtomicRefCount
+
+ // userVisible indicates whether the Dirent is visible to the user or
+ // not. Only user-visible Dirents should save inode mappings in
+ // save/restore, as only they hold the real path to the underlying
+ // inode.
+ //
+ // See newDirent and Dirent.afterLoad.
+ userVisible bool
+
+ // Inode is the underlying file object.
+ //
+ // Inode is exported currently to assist in implementing overlay Inodes (where a
+ // Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with
+ // another Inode). This is normally done before the Dirent is parented (there are
+ // no external references to it).
+ //
+ // Other objects in the VFS may take a reference to this Inode but only while holding
+ // a reference to this Dirent.
+ Inode *Inode
+
+ // name is the name (i.e. basename) of this entry.
+ //
+ // N.B. name is protected by parent.mu, not this node's mu!
+ name string
+
+ // parent is the parent directory.
+ //
+ // We hold a hard reference to the parent.
+ //
+ // parent is protected by renameMu.
+ parent *Dirent
+
+ // deleted may be set atomically when removed.
+ deleted int32 `state:"nosave"`
+
+ // frozen indicates this entry can't walk to unknown nodes.
+ frozen bool
+
+ // mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
+ mounted bool
+
+ // direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches
+ // and their contents are not saved.
+ direntEntry `state:"nosave"`
+
+ // dirMu is a read-write mutex that protects caching decisions made by directory operations.
+ // Lock ordering: dirMu must be taken before mu (see below). Details:
+ //
+ // dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename.
+ //
+ // Creation and Removal operations must be synchronized with Walk to prevent stale negative
+ // caching. Note that this requirement is not specific to a _Dirent_ doing negative caching.
+ // The following race exists at any level of the VFS:
+ //
+ // For an object D that represents a directory, containing a cache of non-existent paths,
+ // protected by D.cacheMu:
+ //
+ // T1: T2:
+ // D.lookup(name)
+ // --> ENOENT
+ // D.create(name)
+ // --> success
+ // D.cacheMu.Lock
+ // delete(D.cache, name)
+ // D.cacheMu.Unlock
+ // D.cacheMu.Lock
+ // D.cache[name] = true
+ // D.cacheMu.Unlock
+ //
+ // D.lookup(name)
+ // D.cacheMu.Lock
+ // if D.cache[name] {
+ // --> ENOENT (wrong)
+ // }
+ // D.cacheMu.Lock
+ //
+ // Correct:
+ //
+ // T1: T2:
+ // D.cacheMu.Lock
+ // D.lookup(name)
+ // --> ENOENT
+ // D.cache[name] = true
+ // D.cacheMu.Unlock
+ // D.cacheMu.Lock
+ // D.create(name)
+ // --> success
+ // delete(D.cache, name)
+ // D.cacheMu.Unlock
+ //
+ // D.cacheMu.Lock
+ // D.lookup(name)
+ // --> EXISTS (right)
+ // D.cacheMu.Unlock
+ //
+ // Note that the above "correct" solution causes too much lock contention: all lookups are
+ // synchronized with each other. This is a problem because lookups are involved in any VFS
+ // path operation.
+ //
+ // A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect
+ // concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map
+ // in general.
+ //
+ // This allows for concurrent Walks to be executed in order to pipeline lookups. For instance
+ // for a hot directory /a/b, threads T1, T2, T3 will only block on each other update the
+ // children map of /a/b when their individual lookups complete.
+ //
+ // T1: T2: T3:
+ // stat(/a/b/c) stat(/a/b/d) stat(/a/b/e)
+ dirMu sync.RWMutex `state:"nosave"`
+
+ // mu protects the below fields. Lock ordering: mu must be taken after dirMu.
+ mu sync.Mutex `state:"nosave"`
+
+ // children are cached via weak references.
+ children map[string]*refs.WeakRef
+}
+
+// NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller
+// holds the only reference to the Dirent. Parents may call hashChild to parent this Dirent.
+func NewDirent(inode *Inode, name string) *Dirent {
+ d := newDirent(inode, name)
+ allDirents.add(d)
+ d.userVisible = true
+ return d
+}
+
+// NewTransientDirent creates a transient Dirent that shouldn't actually be
+// visible to users.
+func NewTransientDirent(inode *Inode) *Dirent {
+ return newDirent(inode, "transient")
+}
+
+func newDirent(inode *Inode, name string) *Dirent {
+ // The Dirent needs to maintain one reference to MountSource.
+ if inode != nil {
+ inode.MountSource.IncDirentRefs()
+ }
+ return &Dirent{
+ Inode: inode,
+ name: name,
+ children: make(map[string]*refs.WeakRef),
+ }
+}
+
+// NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent.
+func NewNegativeDirent(name string) *Dirent {
+ return newDirent(nil, name)
+}
+
+// IsRoot returns true if d is a root Dirent.
+func (d *Dirent) IsRoot() bool {
+ return d.parent == nil
+}
+
+// IsNegative returns true if d represents a path that does not exist.
+func (d *Dirent) IsNegative() bool {
+ return d.Inode == nil
+}
+
+// hashChild will hash child into the children list of its new parent d, carrying over
+// any "frozen" state from d.
+//
+// Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must
+// validate the returned unhashed weak reference. Common cases:
+//
+// * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented).
+// * Create: hashing a positive Dirent unhashes a negative Dirent.
+// * Lookup: hashing any Dirent should not unhash any other Dirent.
+//
+// Preconditions:
+// * d.mu must be held.
+// * child must be a root Dirent.
+func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
+ if !child.IsRoot() {
+ panic("hashChild must be a root Dirent")
+ }
+
+ // Assign parentage.
+ child.parent = d
+
+ // Avoid letting negative Dirents take a reference on their parent; these Dirents
+ // don't have a role outside of the Dirent cache and should not keep their parent
+ // indefinitely pinned.
+ if !child.IsNegative() {
+ // Positive dirents must take a reference on their parent.
+ d.IncRef()
+ }
+
+ // Carry over parent's frozen state.
+ child.frozen = d.frozen
+
+ return d.hashChildParentSet(child)
+}
+
+// hashChildParentSet will rehash child into the children list of its parent d.
+//
+// Assumes that child.parent = d already.
+func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) {
+ if child.parent != d {
+ panic("hashChildParentSet assumes the child already belongs to the parent")
+ }
+
+ // Save any replaced child so our caller can validate it.
+ old, ok := d.children[child.name]
+
+ // Hash the child.
+ d.children[child.name] = refs.NewWeakRef(child, nil)
+
+ // Return any replaced child.
+ return old, ok
+}
+
+// SyncAll iterates through mount points under d and writes back their buffered
+// modifications to filesystems.
+func (d *Dirent) SyncAll(ctx context.Context) {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ // For negative Dirents there is nothing to sync. By definition these are
+ // leaves (there is nothing left to traverse).
+ if d.IsNegative() {
+ return
+ }
+
+ // There is nothing to sync for a read-only filesystem.
+ if !d.Inode.MountSource.Flags.ReadOnly {
+ // FIXME: This should be a mount traversal, not a
+ // Dirent traversal, because some Inodes that need to be synced
+ // may no longer be reachable by name (after sys_unlink).
+ //
+ // Write out metadata, dirty page cached pages, and sync disk/remote
+ // caches.
+ d.Inode.WriteOut(ctx)
+ }
+
+ // Continue iterating through other mounted filesystems.
+ for _, w := range d.children {
+ if child := w.Get(); child != nil {
+ child.(*Dirent).SyncAll(ctx)
+ child.DecRef()
+ }
+ }
+}
+
+// FullName returns the fully-qualified name and a boolean value representing
+// whether this Dirent was a descendant of root.
+// If the root argument is nil it is assumed to be the root of the Dirent tree.
+func (d *Dirent) FullName(root *Dirent) (string, bool) {
+ renameMu.RLock()
+ defer renameMu.RUnlock()
+ return d.fullName(root)
+}
+
+// fullName returns the fully-qualified name and a boolean value representing
+// if the root node was reachable from this Dirent.
+func (d *Dirent) fullName(root *Dirent) (string, bool) {
+ if d == root {
+ return "/", true
+ }
+
+ if d.IsRoot() {
+ if root != nil {
+ // We reached the top of the Dirent tree but did not encounter
+ // the given root. Return false for reachable so the caller
+ // can handle this situation accordingly.
+ return d.name, false
+ }
+ return d.name, true
+ }
+
+ // Traverse up to parent.
+ d.parent.mu.Lock()
+ name := d.name
+ d.parent.mu.Unlock()
+ parentName, reachable := d.parent.fullName(root)
+ s := path.Join(parentName, name)
+ if atomic.LoadInt32(&d.deleted) != 0 {
+ return s + " (deleted)", reachable
+ }
+ return s, reachable
+}
+
+func (d *Dirent) freeze() {
+ if d.frozen {
+ // Already frozen.
+ return
+ }
+ d.frozen = true
+
+ // Take a reference when freezing.
+ for _, w := range d.children {
+ if child := w.Get(); child != nil {
+ // NOTE: We would normally drop the reference here. But
+ // instead we're hanging on to it.
+ ch := child.(*Dirent)
+ ch.Freeze()
+ }
+ }
+
+ // Drop all expired weak references.
+ d.flush()
+}
+
+// Freeze prevents this dirent from walking to more nodes. Freeze is applied
+// recursively to all children.
+//
+// If this particular Dirent represents a Virtual node, then Walks and Creates
+// may proceed as before.
+//
+// Freeze can only be called before the application starts running, otherwise
+// the root it might be out of sync with the application root if modified by
+// sys_chroot.
+func (d *Dirent) Freeze() {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ d.freeze()
+}
+
+// descendantOf returns true if the receiver dirent is equal to, or a
+// descendant of, the argument dirent.
+//
+// d.mu must be held.
+func (d *Dirent) descendantOf(p *Dirent) bool {
+ if d == p {
+ return true
+ }
+ if d.IsRoot() {
+ return false
+ }
+ return d.parent.descendantOf(p)
+}
+
+// walk walks to path name starting at the dirent, and will not traverse above
+// root Dirent.
+//
+// If walkMayUnlock is true then walk can unlock d.mu to execute a slow
+// Inode.Lookup, otherwise walk will keep d.mu locked.
+//
+// Preconditions:
+// - d.mu must be held.
+// - name must must not contain "/"s.
+func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
+ if !IsDir(d.Inode.StableAttr) {
+ return nil, syscall.ENOTDIR
+ }
+ if name == "" || name == "." {
+ d.IncRef()
+ return d, nil
+ } else if name == ".." {
+ renameMu.RLock()
+ // Respect the chroot. Note that in Linux there is no check to enforce
+ // that d is a descendant of root.
+ if d == root {
+ d.IncRef()
+ renameMu.RUnlock()
+ return d, nil
+ }
+ // Are we already at the root? Then ".." is ".".
+ if d.IsRoot() {
+ d.IncRef()
+ renameMu.RUnlock()
+ return d, nil
+ }
+ d.parent.IncRef()
+ renameMu.RUnlock()
+ return d.parent, nil
+ }
+
+ if w, ok := d.children[name]; ok {
+ // Try to resolve the weak reference to a hard reference.
+ if child := w.Get(); child != nil {
+ cd := child.(*Dirent)
+
+ // Is this a negative Dirent?
+ if cd.IsNegative() {
+ // Don't leak a reference; this doesn't matter as much for negative Dirents,
+ // which don't hold a hard reference on their parent (their parent holds a
+ // hard reference on them, and they contain virtually no state). But this is
+ // good house-keeping.
+ child.DecRef()
+ return nil, syscall.ENOENT
+ }
+
+ // Do we need to revalidate this child?
+ //
+ // We never allow the file system to revalidate mounts, that could cause them
+ // to unexpectedly drop out before umount.
+ if cd.mounted || !cd.Inode.MountSource.Revalidate(cd) {
+ // Good to go. This is the fast-path.
+ return cd, nil
+ }
+
+ // If we're revalidating a child, we must ensure all inotify watches release
+ // their pins on the child. Inotify doesn't properly support filesystems that
+ // revalidate dirents (since watches are lost on revalidation), but if we fail
+ // to unpin the watches child will never be GCed.
+ cd.Inode.Watches.Unpin(cd)
+
+ // This child needs to be revalidated, fallthrough to unhash it. Make sure
+ // to not leak a reference from Get().
+ //
+ // Note that previous lookups may still have a reference to this stale child;
+ // this can't be helped, but we can ensure that *new* lookups are up-to-date.
+ child.DecRef()
+ }
+
+ // Either our weak reference expired or we need to revalidate it. Unhash child first, we're
+ // about to replace it.
+ delete(d.children, name)
+ w.Drop()
+ }
+
+ // Are we allowed to do the lookup?
+ if d.frozen && !d.Inode.IsVirtual() {
+ return nil, syscall.ENOENT
+ }
+
+ // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be expensive,
+ // if possible release the lock and re-acquire it.
+ if walkMayUnlock {
+ d.mu.Unlock()
+ }
+ c, err := d.Inode.Lookup(ctx, name)
+ if walkMayUnlock {
+ d.mu.Lock()
+ }
+ // No dice.
+ if err != nil {
+ return nil, err
+ }
+
+ // Sanity check c, its name must be consistent.
+ if c.name != name {
+ panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name))
+ }
+
+ // Now that we have the lock again, check if we raced.
+ if w, ok := d.children[name]; ok {
+ // Someone else looked up or created a child at name before us.
+ if child := w.Get(); child != nil {
+ cd := child.(*Dirent)
+
+ // There are active references to the existing child, prefer it to the one we
+ // retrieved from Lookup. Likely the Lookup happened very close to the insertion
+ // of child, so considering one stale over the other is fairly arbitrary.
+ c.DecRef()
+
+ // The child that was installed could be negative.
+ if cd.IsNegative() {
+ // If so, don't leak a reference and short circuit.
+ child.DecRef()
+ return nil, syscall.ENOENT
+ }
+
+ // We make the judgement call that if c raced with cd they are close enough to have
+ // the same staleness, so we don't attempt to revalidate cd. In Linux revalidations
+ // can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this.
+ return cd, nil
+ }
+
+ // Weak reference expired. We went through a full cycle of create/destroy in the time
+ // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
+ // we looked up.
+ delete(d.children, name)
+ w.Drop()
+ }
+
+ // Give the looked up child a parent. We cannot kick out entries, since we just checked above
+ // that there is nothing at name in d's children list.
+ if _, kicked := d.hashChild(c); kicked {
+ // Yell loudly.
+ panic(fmt.Sprintf("hashed child %q over existing child", c.name))
+ }
+
+ // Is this a negative Dirent?
+ if c.IsNegative() {
+ // Don't drop a reference on the negative Dirent, it was just installed and this is the
+ // only reference we'll ever get. d owns the reference.
+ return nil, syscall.ENOENT
+ }
+
+ // Return the positive Dirent.
+ return c, nil
+}
+
+// Walk walks to a new dirent, and will not walk higher than the given root
+// Dirent, which must not be nil.
+func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) {
+ if root == nil {
+ panic("Dirent.Walk: root must not be nil")
+ }
+
+ d.dirMu.RLock()
+ d.mu.Lock()
+ child, err := d.walk(ctx, root, name, true /* may unlock */)
+ d.mu.Unlock()
+ d.dirMu.RUnlock()
+
+ return child, err
+}
+
+// exists returns true if name exists in relation to d.
+//
+// Preconditions: d.mu must be held.
+func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
+ child, err := d.walk(ctx, root, name, true /* may unlock */)
+ if err != nil {
+ // Child may not exist.
+ return false
+ }
+ // Child exists.
+ child.DecRef()
+ return true
+}
+
+// lockDirectory should be called for any operation that changes this `d`s
+// children (creating or removing them).
+func (d *Dirent) lockDirectory() func() {
+ if d.Inode.overlay != nil {
+ // overlay copyUp may need to look at Dirent parents, and hence
+ // may need renameMu.
+ renameMu.RLock()
+ d.dirMu.Lock()
+ d.mu.Lock()
+ return func() {
+ d.mu.Unlock()
+ d.dirMu.Unlock()
+ renameMu.RUnlock()
+ }
+ }
+
+ d.dirMu.Lock()
+ d.mu.Lock()
+ return func() {
+ d.mu.Unlock()
+ d.dirMu.Unlock()
+ }
+}
+
+// Create creates a new regular file in this directory.
+func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) {
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ // Does something already exist?
+ if d.exists(ctx, root, name) {
+ return nil, syscall.EEXIST
+ }
+
+ // Are we frozen?
+ if d.frozen && !d.Inode.IsVirtual() {
+ return nil, syscall.ENOENT
+ }
+
+ // Try the create. We need to trust the file system to return EEXIST (or something
+ // that will translate to EEXIST) if name already exists.
+ file, err := d.Inode.Create(ctx, d, name, flags, perms)
+ if err != nil {
+ return nil, err
+ }
+ child := file.Dirent
+
+ // Sanity check c, its name must be consistent.
+ if child.name != name {
+ panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name))
+ }
+
+ // File systems cannot return a negative Dirent on Create, that makes no sense.
+ if child.IsNegative() {
+ panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name))
+ }
+
+ // Hash the child into its parent. We can only kick out a Dirent if it is negative
+ // (we are replacing something that does not exist with something that now does).
+ if w, kicked := d.hashChild(child); kicked {
+ if old := w.Get(); old != nil {
+ if !old.(*Dirent).IsNegative() {
+ panic(fmt.Sprintf("hashed child %q over a positive child", child.name))
+ }
+ // Don't leak a reference.
+ old.DecRef()
+
+ // Drop d's reference.
+ old.DecRef()
+ }
+
+ // Finally drop the useless weak reference on the floor.
+ w.Drop()
+ }
+
+ d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
+
+ // Allow the file system to take extra references on c.
+ child.maybeExtendReference()
+
+ // Return the reference and the new file. When the last reference to
+ // the file is dropped, file.Dirent may no longer be cached.
+ return file, nil
+}
+
+// genericCreate executes create if name does not exist. Removes a negative Dirent at name if
+// create succeeds.
+//
+// Preconditions: d.mu must be held.
+func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error {
+ // Does something already exist?
+ if d.exists(ctx, root, name) {
+ return syscall.EEXIST
+ }
+
+ // Are we frozen?
+ if d.frozen && !d.Inode.IsVirtual() {
+ return syscall.ENOENT
+ }
+
+ // Execute the create operation.
+ if err := create(); err != nil {
+ return err
+ }
+
+ // Remove any negative Dirent. We've already asserted above with d.exists
+ // that the only thing remaining here can be a negative Dirent.
+ if w, ok := d.children[name]; ok {
+ // Same as Create.
+ if old := w.Get(); old != nil {
+ if !old.(*Dirent).IsNegative() {
+ panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name))
+ }
+ // Don't leak a reference.
+ old.DecRef()
+
+ // Drop d's reference.
+ old.DecRef()
+ }
+
+ // Unhash the negative Dirent, name needs to exist now.
+ delete(d.children, name)
+
+ // Finally drop the useless weak reference on the floor.
+ w.Drop()
+ }
+
+ return nil
+}
+
+// CreateLink creates a new link in this directory.
+func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error {
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ return d.genericCreate(ctx, root, newname, func() error {
+ if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil {
+ return err
+ }
+ d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0)
+ return nil
+ })
+}
+
+// CreateHardLink creates a new hard link in this directory.
+func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error {
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ // Make sure that target does not span filesystems.
+ if d.Inode.MountSource != target.Inode.MountSource {
+ return syscall.EXDEV
+ }
+
+ return d.genericCreate(ctx, root, name, func() error {
+ if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil {
+ return err
+ }
+ target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change.
+ d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
+ return nil
+ })
+}
+
+// CreateDirectory creates a new directory under this dirent.
+func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ return d.genericCreate(ctx, root, name, func() error {
+ if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil {
+ return err
+ }
+ d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0)
+ return nil
+ })
+}
+
+// Bind satisfies the InodeOperations interface; otherwise same as GetFile.
+func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, socket unix.BoundEndpoint, perms FilePermissions) error {
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ err := d.genericCreate(ctx, root, name, func() error {
+ if err := d.Inode.Bind(ctx, name, socket, perms); err != nil {
+ return err
+ }
+ d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
+ return nil
+ })
+ if err == syscall.EEXIST {
+ return syscall.EADDRINUSE
+ }
+ return err
+}
+
+// CreateFifo creates a new named pipe under this dirent.
+func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ return d.genericCreate(ctx, root, name, func() error {
+ if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil {
+ return err
+ }
+ d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
+ return nil
+ })
+}
+
+// getDotAttrs returns the DentAttrs corresponding to "." and ".." directories.
+func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) {
+ // Get '.'.
+ sattr := d.Inode.StableAttr
+ dot := DentAttr{
+ Type: sattr.Type,
+ InodeID: sattr.InodeID,
+ }
+
+ // Get '..'.
+ if !d.IsRoot() && d.descendantOf(root) {
+ // Dirent is a descendant of the root. Get its parent's attrs.
+ psattr := d.parent.Inode.StableAttr
+ dotdot := DentAttr{
+ Type: psattr.Type,
+ InodeID: psattr.InodeID,
+ }
+ return dot, dotdot
+ }
+ // Dirent is either root or not a descendant of the root. ".." is the
+ // same as ".".
+ return dot, dot
+}
+
+// readdirFrozen returns readdir results based solely on the frozen children.
+func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
+ // Collect attrs for "." and "..".
+ attrs := make(map[string]DentAttr)
+ names := []string{".", ".."}
+ attrs["."], attrs[".."] = d.getDotAttrs(root)
+
+ // Get info from all children.
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ for name, w := range d.children {
+ if child := w.Get(); child != nil {
+ defer child.DecRef()
+
+ // Skip negative children.
+ if child.(*Dirent).IsNegative() {
+ continue
+ }
+
+ sattr := child.(*Dirent).Inode.StableAttr
+ attrs[name] = DentAttr{
+ Type: sattr.Type,
+ InodeID: sattr.InodeID,
+ }
+ names = append(names, name)
+ }
+ }
+
+ sort.Strings(names)
+
+ if int(offset) >= len(names) {
+ return offset, nil
+ }
+ names = names[int(offset):]
+ for _, name := range names {
+ if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
+ return offset, err
+ }
+ offset++
+ }
+ return offset, nil
+}
+
+// DirIterator is an open directory containing directory entries that can be read.
+type DirIterator interface {
+ // IterateDir emits directory entries by calling dirCtx.EmitDir, beginning
+ // with the entry at offset and returning the next directory offset.
+ //
+ // Entries for "." and ".." must *not* be included.
+ //
+ // If the offset returned is the same as the argument offset, then
+ // nothing has been serialized. This is equivalent to reaching EOF.
+ // In this case serializer.Written() should return 0.
+ //
+ // The order of entries to emit must be consistent between Readdir
+ // calls, and must start with the given offset.
+ //
+ // The caller must ensure that this operation is permitted.
+ IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error)
+}
+
+// DirentReaddir serializes the directory entries of d including "." and "..".
+//
+// Arguments:
+//
+// * d: the Dirent of the directory being read; required to provide "." and "..".
+// * it: the directory iterator; which represents an open directory handle.
+// * root: fs root; if d is equal to the root, then '..' will refer to d.
+// * ctx: context provided to file systems in order to select and serialize entries.
+// * offset: the current directory offset.
+//
+// Returns the offset of the *next* element which was not serialized.
+func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
+ offset, err := direntReaddir(ctx, d, it, root, dirCtx, offset)
+ // Serializing any directory entries at all means success.
+ if dirCtx.Serializer.Written() > 0 {
+ return offset, nil
+ }
+ return offset, err
+}
+
+func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
+ if root == nil {
+ panic("Dirent.Readdir: root must not be nil")
+ }
+ if dirCtx.Serializer == nil {
+ panic("Dirent.Readdir: serializer must not be nil")
+ }
+ if d.frozen {
+ return d.readdirFrozen(root, offset, dirCtx)
+ }
+
+ // Check that this is actually a directory before emitting anything.
+ // Once we have written entries for "." and "..", future errors from
+ // IterateDir will be hidden.
+ if !IsDir(d.Inode.StableAttr) {
+ return 0, syserror.ENOTDIR
+ }
+
+ // Collect attrs for "." and "..".
+ dot, dotdot := d.getDotAttrs(root)
+
+ // Emit "." and ".." if the offset is low enough.
+ if offset == 0 {
+ // Serialize ".".
+ if err := dirCtx.DirEmit(".", dot); err != nil {
+ return offset, err
+ }
+ offset++
+ }
+ if offset == 1 {
+ // Serialize "..".
+ if err := dirCtx.DirEmit("..", dotdot); err != nil {
+ return offset, err
+ }
+ offset++
+ }
+
+ // it.IterateDir should be passed an offset that does not include the
+ // initial dot elements. We will add them back later.
+ offset -= 2
+ newOffset, err := it.IterateDir(ctx, dirCtx, int(offset))
+ if int64(newOffset) < offset {
+ panic(fmt.Sprintf("node.Readdir returned offset %v less that input offset %v", offset, newOffset))
+ }
+ // Add the initial nodes back to the offset count.
+ newOffset += 2
+ return int64(newOffset), err
+}
+
+// flush flushes all weak references recursively, and removes any cached
+// references to children.
+//
+// Preconditions: d.mu must be held.
+func (d *Dirent) flush() {
+ expired := make(map[string]*refs.WeakRef)
+ for n, w := range d.children {
+ // Call flush recursively on each child before removing our
+ // reference on it, and removing the cache's reference.
+ if child := w.Get(); child != nil {
+ cd := child.(*Dirent)
+
+ if !cd.IsNegative() {
+ // Flush the child.
+ cd.mu.Lock()
+ cd.flush()
+ cd.mu.Unlock()
+
+ // Allow the file system to drop extra references on child.
+ cd.dropExtendedReference()
+ }
+
+ // Don't leak a reference.
+ child.DecRef()
+ }
+ // Check if the child dirent is closed, and mark it as expired if it is.
+ // We must call w.Get() again here, since the child could have been closed
+ // by the calls to flush() and cache.Remove() in the above if-block.
+ if child := w.Get(); child != nil {
+ child.DecRef()
+ } else {
+ expired[n] = w
+ }
+ }
+
+ // Remove expired entries.
+ for n, w := range expired {
+ delete(d.children, n)
+ w.Drop()
+ }
+}
+
+// Busy indicates whether this Dirent is a mount point or root dirent, or has
+// active positive children.
+//
+// This is expensive, since it flushes the children cache.
+//
+// TODO: Fix this busy-ness check.
+func (d *Dirent) Busy() bool {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ if d.mounted || d.parent == nil {
+ return true
+ }
+
+ // Flush any cached references to children that are doomed.
+ d.flush()
+
+ // Count positive children.
+ var nonNegative int
+ for _, w := range d.children {
+ if child := w.Get(); child != nil {
+ if !child.(*Dirent).IsNegative() {
+ nonNegative++
+ }
+ child.DecRef()
+ }
+ }
+ return nonNegative > 0
+}
+
+// mount mounts a new dirent with the given inode over d.
+//
+// Precondition: must be called with mm.withMountLocked held on `d`.
+func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) {
+ // Did we race with deletion?
+ if atomic.LoadInt32(&d.deleted) != 0 {
+ return nil, syserror.ENOENT
+ }
+
+ // Refuse to mount a symlink.
+ //
+ // See Linux equivalent in fs/namespace.c:do_add_mount.
+ if IsSymlink(inode.StableAttr) {
+ return nil, syserror.EINVAL
+ }
+
+ // Are we frozen?
+ if d.parent.frozen && !d.parent.Inode.IsVirtual() {
+ return nil, syserror.ENOENT
+ }
+
+ // Dirent that'll replace d.
+ //
+ // Note that NewDirent returns with one reference taken; the reference
+ // is donated to the caller as the mount reference.
+ replacement := NewDirent(inode, d.name)
+ replacement.mounted = true
+
+ weakRef, ok := d.parent.hashChild(replacement)
+ if !ok {
+ panic("mount must mount over an existing dirent")
+ }
+ weakRef.Drop()
+
+ // Note that even though `d` is now hidden, it still holds a reference
+ // to its parent.
+ return replacement, nil
+}
+
+// unmount unmounts `d` and replaces it with the last Dirent that was in its
+// place, supplied by the MountNamespace as `replacement`.
+//
+// Precondition: must be called with mm.withMountLocked held on `d`.
+func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
+ // Did we race with deletion?
+ if atomic.LoadInt32(&d.deleted) != 0 {
+ return syserror.ENOENT
+ }
+
+ // Are we frozen?
+ if d.parent.frozen && !d.parent.Inode.IsVirtual() {
+ return syserror.ENOENT
+ }
+
+ // Remount our former child in its place.
+ //
+ // As replacement used to be our child, it must already have the right
+ // parent.
+ weakRef, ok := d.parent.hashChildParentSet(replacement)
+ if !ok {
+ panic("mount must mount over an existing dirent")
+ }
+ weakRef.Drop()
+
+ // d is not reachable anymore, and hence not mounted anymore.
+ d.mounted = false
+
+ // Drop mount reference.
+ d.DecRef()
+ return nil
+}
+
+// Remove removes the given file or symlink. The root dirent is used to
+// resolve name, and must not be nil.
+func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error {
+ // Check the root.
+ if root == nil {
+ panic("Dirent.Remove: root must not be nil")
+ }
+
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ // Are we frozen?
+ if d.frozen && !d.Inode.IsVirtual() {
+ return syscall.ENOENT
+ }
+
+ // Try to walk to the node.
+ child, err := d.walk(ctx, root, name, false /* may unlock */)
+ if err != nil {
+ // Child does not exist.
+ return err
+ }
+ defer child.DecRef()
+
+ // Remove cannot remove directories.
+ if IsDir(child.Inode.StableAttr) {
+ return syscall.EISDIR
+ }
+
+ // Remove cannot remove a mount point.
+ if child.Busy() {
+ return syscall.EBUSY
+ }
+
+ // Try to remove name on the file system.
+ if err := d.Inode.Remove(ctx, d, child); err != nil {
+ return err
+ }
+
+ // Link count changed, this only applies to non-directory nodes.
+ child.Inode.Watches.Notify("", linux.IN_ATTRIB, 0)
+
+ // Mark name as deleted and remove from children.
+ atomic.StoreInt32(&child.deleted, 1)
+ if w, ok := d.children[name]; ok {
+ delete(d.children, name)
+ w.Drop()
+ }
+
+ // Allow the file system to drop extra references on child.
+ child.dropExtendedReference()
+
+ // Finally, let inotify know the child is being unlinked. Drop any extra
+ // refs from inotify to this child dirent. This doesn't necessarily mean the
+ // watches on the underlying inode will be destroyed, since the underlying
+ // inode may have other links. If this was the last link, the events for the
+ // watch removal will be queued by the inode destructor.
+ child.Inode.Watches.MarkUnlinked()
+ child.Inode.Watches.Unpin(child)
+ d.Inode.Watches.Notify(name, linux.IN_DELETE, 0)
+
+ return nil
+}
+
+// RemoveDirectory removes the given directory. The root dirent is used to
+// resolve name, and must not be nil.
+func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error {
+ // Check the root.
+ if root == nil {
+ panic("Dirent.Remove: root must not be nil")
+ }
+
+ unlock := d.lockDirectory()
+ defer unlock()
+
+ // Are we frozen?
+ if d.frozen && !d.Inode.IsVirtual() {
+ return syscall.ENOENT
+ }
+
+ // Check for dots.
+ if name == "." {
+ // Rejected as the last component by rmdir(2).
+ return syscall.EINVAL
+ }
+ if name == ".." {
+ // If d was found, then its parent is not empty.
+ return syscall.ENOTEMPTY
+ }
+
+ // Try to walk to the node.
+ child, err := d.walk(ctx, root, name, false /* may unlock */)
+ if err != nil {
+ // Child does not exist.
+ return err
+ }
+ defer child.DecRef()
+
+ // RemoveDirectory can only remove directories.
+ if !IsDir(child.Inode.StableAttr) {
+ return syscall.ENOTDIR
+ }
+
+ // Remove cannot remove a mount point.
+ if child.Busy() {
+ return syscall.EBUSY
+ }
+
+ // Try to remove name on the file system.
+ if err := d.Inode.Remove(ctx, d, child); err != nil {
+ return err
+ }
+
+ // Mark name as deleted and remove from children.
+ atomic.StoreInt32(&child.deleted, 1)
+ if w, ok := d.children[name]; ok {
+ delete(d.children, name)
+ w.Drop()
+ }
+
+ // Allow the file system to drop extra references on child.
+ child.dropExtendedReference()
+
+ // Finally, let inotify know the child is being unlinked. Drop any extra
+ // refs from inotify to this child dirent.
+ child.Inode.Watches.MarkUnlinked()
+ child.Inode.Watches.Unpin(child)
+ d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0)
+
+ return nil
+}
+
+// destroy closes this node and all children.
+func (d *Dirent) destroy() {
+ if d.IsNegative() {
+ // Nothing to tear-down and no parent references to drop, since a negative
+ // Dirent does not take a references on its parent, has no Inode and no children.
+ return
+ }
+
+ var wg sync.WaitGroup
+ defer wg.Wait()
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ // Drop all weak references.
+ for _, w := range d.children {
+ w.Drop()
+ }
+ d.children = nil
+
+ allDirents.remove(d)
+
+ // Drop our reference to the Inode.
+ d.Inode.DecRef()
+
+ // Allow the Dirent to be GC'ed after this point, since the Inode may still
+ // be referenced after the Dirent is destroyed (for instance by filesystem
+ // internal caches or hard links).
+ d.Inode = nil
+
+ // Drop the reference we have on our parent if we took one. renameMu doesn't need to be
+ // held because d can't be reparented without any references to it left.
+ if d.parent != nil {
+ d.parent.DecRef()
+ }
+}
+
+// IncRef increases the Dirent's refcount as well as its mount's refcount.
+//
+// IncRef implements RefCounter.IncRef.
+func (d *Dirent) IncRef() {
+ if d.Inode != nil {
+ d.Inode.MountSource.IncDirentRefs()
+ }
+ d.AtomicRefCount.IncRef()
+}
+
+// TryIncRef implements RefCounter.TryIncRef.
+func (d *Dirent) TryIncRef() bool {
+ ok := d.AtomicRefCount.TryIncRef()
+ if ok && d.Inode != nil {
+ d.Inode.MountSource.IncDirentRefs()
+ }
+ return ok
+}
+
+// DecRef decreases the Dirent's refcount and drops its reference on its mount.
+//
+// DecRef implements RefCounter.DecRef with destructor d.destroy.
+func (d *Dirent) DecRef() {
+ if d.Inode != nil {
+ // Keep mount around, since DecRef may destroy d.Inode.
+ msrc := d.Inode.MountSource
+ d.DecRefWithDestructor(d.destroy)
+ msrc.DecDirentRefs()
+ } else {
+ d.DecRefWithDestructor(d.destroy)
+ }
+}
+
+// InotifyEvent notifies all watches on the inode for this dirent and its parent
+// of potential events. The events may not actually propagate up to the user,
+// depending on the event masks. InotifyEvent automatically provides the name of
+// the current dirent as the subject of the event as required, and adds the
+// IN_ISDIR flag for dirents that refer to directories.
+func (d *Dirent) InotifyEvent(events, cookie uint32) {
+ // N.B. We don't defer the unlocks because InotifyEvent is in the hot
+ // path of all IO operations, and the defers cost too much for small IO
+ // operations.
+ renameMu.RLock()
+
+ if IsDir(d.Inode.StableAttr) {
+ events |= linux.IN_ISDIR
+ }
+
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ d.parent.Inode.Watches.Notify(d.name, events, cookie)
+ }
+ d.Inode.Watches.Notify("", events, cookie)
+
+ renameMu.RUnlock()
+}
+
+// maybeExtendReference caches a reference on this Dirent if
+// MountSourceOperations.Keep returns true.
+func (d *Dirent) maybeExtendReference() {
+ if msrc := d.Inode.MountSource; msrc.Keep(d) {
+ msrc.fscache.Add(d)
+ }
+}
+
+// dropExtendedReference drops any cached reference held by the
+// MountSource on the dirent.
+func (d *Dirent) dropExtendedReference() {
+ d.Inode.MountSource.fscache.Remove(d)
+}
+
+// lockForRename takes locks on oldParent and newParent as required by Rename
+// and returns a function that will unlock the locks taken. The returned
+// function must be called even if a non-nil error is returned.
+func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) {
+ if oldParent == newParent {
+ oldParent.mu.Lock()
+ return oldParent.mu.Unlock, nil
+ }
+
+ // Renaming between directories is a bit subtle:
+ //
+ // - A concurrent cross-directory Rename may try to lock in the opposite
+ // order; take renameMu to prevent this from happening.
+ //
+ // - If either directory is an ancestor of the other, then a concurrent
+ // Remove may lock the descendant (in DecRef -> closeAll) while holding a
+ // lock on the ancestor; to avoid this, ensure we take locks in the same
+ // ancestor-to-descendant order. (Holding renameMu prevents this
+ // relationship from changing.)
+ renameMu.Lock()
+
+ // First check if newParent is a descendant of oldParent.
+ child := newParent
+ for p := newParent.parent; p != nil; p = p.parent {
+ if p == oldParent {
+ oldParent.mu.Lock()
+ newParent.mu.Lock()
+ var err error
+ if child.name == oldName {
+ // newParent is not just a descendant of oldParent, but
+ // more specifically of oldParent/oldName. That is, we're
+ // trying to rename something into a subdirectory of
+ // itself.
+ err = syscall.EINVAL
+ }
+ return func() {
+ newParent.mu.Unlock()
+ oldParent.mu.Unlock()
+ renameMu.Unlock()
+ }, err
+ }
+ child = p
+ }
+
+ // Otherwise, either oldParent is a descendant of newParent or the two
+ // have no relationship; in either case we can do this:
+ newParent.mu.Lock()
+ oldParent.mu.Lock()
+ return func() {
+ oldParent.mu.Unlock()
+ newParent.mu.Unlock()
+ renameMu.Unlock()
+ }, nil
+}
+
+func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
+ uattr, err := dir.Inode.UnstableAttr(ctx)
+ if err != nil {
+ return syserror.EPERM
+ }
+ if !uattr.Perms.Sticky {
+ return nil
+ }
+
+ creds := auth.CredentialsFromContext(ctx)
+ if uattr.Owner.UID == creds.EffectiveKUID {
+ return nil
+ }
+
+ vuattr, err := victim.Inode.UnstableAttr(ctx)
+ if err != nil {
+ return syserror.EPERM
+ }
+ if vuattr.Owner.UID == creds.EffectiveKUID {
+ return nil
+ }
+ if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) {
+ return nil
+ }
+ return syserror.EPERM
+}
+
+// MayDelete determines whether `name`, a child of `dir`, can be deleted or
+// renamed by `ctx`.
+//
+// Compare Linux kernel fs/namei.c:may_delete.
+func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
+ victim, err := dir.Walk(ctx, root, name)
+ if err != nil {
+ return err
+ }
+ defer victim.DecRef()
+
+ return mayDelete(ctx, dir, victim)
+}
+
+func mayDelete(ctx context.Context, dir *Dirent, victim *Dirent) error {
+ if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+
+ return checkSticky(ctx, dir, victim)
+}
+
+// Rename atomically converts the child of oldParent named oldName to a
+// child of newParent named newName.
+func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error {
+ if root == nil {
+ panic("Rename: root must not be nil")
+ }
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+
+ // Acquire global renameMu lock, and mu locks on oldParent/newParent.
+ unlock, err := lockForRename(oldParent, oldName, newParent, newName)
+ defer unlock()
+ if err != nil {
+ return err
+ }
+
+ // Are we frozen?
+ // TODO: Is this the right errno?
+ if oldParent.frozen && !oldParent.Inode.IsVirtual() {
+ return syscall.ENOENT
+ }
+ if newParent.frozen && !newParent.Inode.IsVirtual() {
+ return syscall.ENOENT
+ }
+
+ // Check constraints on the object being renamed.
+ renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */)
+ if err != nil {
+ return err
+ }
+ defer renamed.DecRef()
+
+ // Make sure we have write permissions on old and new parent.
+ if err := mayDelete(ctx, oldParent, renamed); err != nil {
+ return err
+ }
+ if newParent != oldParent {
+ if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
+ return err
+ }
+ }
+
+ // Source should not be an ancestor of the target.
+ if renamed == newParent {
+ return syscall.EINVAL
+ }
+
+ // Is the thing we're trying to rename busy?
+ if renamed.Busy() {
+ return syscall.EBUSY
+ }
+
+ // Per rename(2): "... EACCES: ... or oldpath is a directory and does not
+ // allow write permission (needed to update the .. entry)."
+ if IsDir(renamed.Inode.StableAttr) {
+ if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil {
+ return err
+ }
+ }
+
+ // Check constraints on the object being replaced, if any.
+ replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */)
+ if err == nil {
+ defer replaced.DecRef()
+
+ // Target should not be an ancestor of source.
+ if replaced == oldParent {
+ // Why is this not EINVAL? See fs/namei.c.
+ return syscall.ENOTEMPTY
+ }
+
+ // Is the thing we're trying to replace busy?
+ if replaced.Busy() {
+ return syscall.EBUSY
+ }
+
+ // Require that a directory is replaced by a directory.
+ oldIsDir := IsDir(renamed.Inode.StableAttr)
+ newIsDir := IsDir(replaced.Inode.StableAttr)
+ if !newIsDir && oldIsDir {
+ return syscall.ENOTDIR
+ }
+ if !oldIsDir && newIsDir {
+ return syscall.EISDIR
+ }
+
+ // Allow the file system to drop extra references on replaced.
+ replaced.dropExtendedReference()
+
+ // NOTE: Keeping a dirent
+ // open across renames is currently broken for multiple
+ // reasons, so we flush all references on the replaced node and
+ // its children.
+ replaced.Inode.Watches.Unpin(replaced)
+ replaced.flush()
+ }
+
+ if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil {
+ return err
+ }
+
+ renamed.name = newName
+ renamed.parent = newParent
+ if oldParent != newParent {
+ // Reparent the reference held by renamed.parent. oldParent.DecRef
+ // can't destroy oldParent (and try to retake its lock) because
+ // Rename's caller must be holding a reference.
+ newParent.IncRef()
+ oldParent.DecRef()
+ }
+ if w, ok := newParent.children[newName]; ok {
+ w.Drop()
+ delete(newParent.children, newName)
+ }
+ if w, ok := oldParent.children[oldName]; ok {
+ w.Drop()
+ delete(oldParent.children, oldName)
+ }
+
+ // Add a weak reference from the new parent. This ensures that the child
+ // can still be found from the new parent if a prior hard reference is
+ // held on renamed.
+ //
+ // This is required for file lock correctness because file locks are per-Dirent
+ // and without maintaining the a cached child (via a weak reference) for renamed,
+ // multiple Dirents can correspond to the same resource (by virtue of the renamed
+ // Dirent being unreachable by its parent and it being looked up).
+ newParent.children[newName] = refs.NewWeakRef(renamed, nil)
+
+ // Queue inotify events for the rename.
+ var ev uint32
+ if IsDir(renamed.Inode.StableAttr) {
+ ev |= linux.IN_ISDIR
+ }
+
+ cookie := uniqueid.InotifyCookie(ctx)
+ oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie)
+ newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie)
+ // Somewhat surprisingly, self move events do not have a cookie.
+ renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0)
+
+ // Allow the file system to drop extra references on renamed.
+ renamed.dropExtendedReference()
+
+ // Same as replaced.flush above.
+ renamed.flush()
+
+ return nil
+}