// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fs import ( "fmt" "strings" "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/third_party/gvsync" ) // The virtual filesystem implements an overlay configuration. For a high-level // description, see README.md. // // Note on whiteouts: // // This implementation does not use the "Docker-style" whiteouts (symlinks with // ".wh." prefix). Instead upper filesystem directories support a set of extended // attributes to encode whiteouts: "trusted.overlay.whiteout.<filename>". This // gives flexibility to persist whiteouts independently of the filesystem layout // while additionally preventing name conflicts with files prefixed with ".wh.". // // Known deficiencies: // // - The device number of two files under the same overlay mount point may be // different. This can happen if a file is found in the lower filesystem (takes // the lower filesystem device) and another file is created in the upper // filesystem (takes the upper filesystem device). This may appear odd but // should not break applications. // // - Registered events on files (i.e. for notification of read/write readiness) // are not copied across copy up. This is fine in the common case of files that // do not block. For files that do block, like pipes and sockets, copy up is not // supported. // // - Hardlinks in a lower filesystem are broken by copy up. For this reason, no // attempt is made to preserve link count across copy up. // // - The maximum length of an extended attribute name is the same as the maximum // length of a file path in Linux (XATTR_NAME_MAX == NAME_MAX). This means that // whiteout attributes, if set directly on the host, are limited additionally by // the extra whiteout prefix length (file paths must be strictly shorter than // NAME_MAX). This is not a problem for in-memory filesystems which don't enforce // XATTR_NAME_MAX. const ( // XattrOverlayPrefix is the prefix for extended attributes that affect // the behavior of an overlay. XattrOverlayPrefix = "trusted.overlay." // XattrOverlayWhiteoutPrefix is the prefix for extended attributes // that indicate that a whiteout exists. XattrOverlayWhiteoutPrefix = XattrOverlayPrefix + "whiteout." ) // XattrOverlayWhiteout returns an extended attribute that indicates a // whiteout exists for name. It is supported by directories that wish to // mask the existence of name. func XattrOverlayWhiteout(name string) string { return XattrOverlayWhiteoutPrefix + name } // isXattrOverlay returns whether the given extended attribute configures the // overlay. func isXattrOverlay(name string) bool { return strings.HasPrefix(name, XattrOverlayPrefix) } // NewOverlayRoot produces the root of an overlay. // // Preconditions: // // - upper and lower must be non-nil. // - upper must not be an overlay. // - lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. // - lower must not require that file objects be revalidated. // - lower must not have dynamic file/directory content. func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsDir(upper.StableAttr) { return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type) } if !IsDir(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is a %v, not a directory", lower.StableAttr.Type) } if upper.overlay != nil { return nil, fmt.Errorf("cannot nest overlay in upper file of another overlay") } msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) if err != nil { msrc.DecRef() return nil, err } return newOverlayInode(ctx, overlay, msrc), nil } // NewOverlayRootFile produces the root of an overlay that points to a file. // // Preconditions: // // - lower must be non-nil. // - lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. Neither it can be a dir. // - lower must not require that file objects be revalidated. // - lower must not have dynamic file/directory content. func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsRegular(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a regular file") } msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, nil, lower, true) if err != nil { msrc.DecRef() return nil, err } return newOverlayInode(ctx, overlay, msrc), nil } // newOverlayInode creates a new Inode for an overlay. func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *Inode { var inode *Inode if o.upper != nil { inode = NewInode(ctx, nil, msrc, o.upper.StableAttr) } else { inode = NewInode(ctx, nil, msrc, o.lower.StableAttr) } inode.overlay = o return inode } // overlayEntry is the overlay metadata of an Inode. It implements Mappable. // // +stateify savable type overlayEntry struct { // lowerExists is true if an Inode exists for this file in the lower // filesystem. If lowerExists is true, then the overlay must create // a whiteout entry when renaming and removing this entry to mask the // lower Inode. // // Note that this is distinct from actually holding onto a non-nil // lower Inode (below). The overlay does not need to keep a lower Inode // around unless it needs to operate on it, but it always needs to know // whether the lower Inode exists to correctly execute a rename or // remove operation. lowerExists bool // lower is an Inode from a lower filesystem. Modifications are // never made on this Inode. lower *Inode // copyMu serializes copy-up for operations above // mm.MemoryManager.mappingMu in the lock order. copyMu sync.RWMutex `state:"nosave"` // mapsMu serializes copy-up for operations between // mm.MemoryManager.mappingMu and mm.MemoryManager.activeMu in the lock // order. mapsMu sync.Mutex `state:"nosave"` // mappings tracks memory mappings of this Mappable so they can be removed // from the lower filesystem Mappable and added to the upper filesystem // Mappable when copy up occurs. It is strictly unnecessary after copy-up. // // mappings is protected by mapsMu. mappings memmap.MappingSet // dataMu serializes copy-up for operations below mm.MemoryManager.activeMu // in the lock order. dataMu sync.RWMutex `state:"nosave"` // upper is an Inode from an upper filesystem. It is non-nil if // the file exists in the upper filesystem. It becomes non-nil // when the Inode that owns this overlayEntry is modified. // // upper is protected by all of copyMu, mapsMu, and dataMu. Holding any of // these locks is sufficient to read upper; holding all three for writing // is required to mutate it. upper *Inode // dirCacheMu protects dirCache. dirCacheMu gvsync.DowngradableRWMutex `state:"nosave"` // dirCache is cache of DentAttrs from upper and lower Inodes. dirCache *SortedDentryMap } // newOverlayEntry returns a new overlayEntry. func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExists bool) (*overlayEntry, error) { if upper == nil && lower == nil { panic("invalid overlayEntry, needs at least one Inode") } if upper != nil && upper.overlay != nil { panic("nested writable layers are not supported") } // Check for supported lower filesystem types. if lower != nil { switch lower.StableAttr.Type { case RegularFile, Directory, Symlink, Socket: default: // We don't support copying up from character devices, // named pipes, or anything weird (like proc files). log.Warningf("%s not supported in lower filesytem", lower.StableAttr.Type) return nil, syserror.EINVAL } } return &overlayEntry{ lowerExists: lowerExists, lower: lower, upper: upper, }, nil } func (o *overlayEntry) release() { // We drop a reference on upper and lower file system Inodes // rather than releasing them, because in-memory filesystems // may hold an extra reference to these Inodes so that they // stay in memory. if o.upper != nil { o.upper.DecRef() } if o.lower != nil { o.lower.DecRef() } } // overlayUpperMountSource gives the upper mount of an overlay mount. // // The caller may not use this MountSource past the lifetime of overlayMountSource and may // not call DecRef on it. func overlayUpperMountSource(overlayMountSource *MountSource) *MountSource { return overlayMountSource.MountSourceOperations.(*overlayMountSourceOperations).upper } // Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked. func (o *overlayEntry) inodeLocked() *Inode { if o.upper != nil { return o.upper } return o.lower } // Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked. func (o *overlayEntry) isMappableLocked() bool { return o.inodeLocked().Mappable() != nil } // markDirectoryDirty marks any cached data dirty for this directory. This is // necessary in order to ensure that this node does not retain stale state // throughout its lifetime across multiple open directory handles. // // Currently this means invalidating any readdir caches. func (o *overlayEntry) markDirectoryDirty() { o.dirCacheMu.Lock() o.dirCache = nil o.dirCacheMu.Unlock() } // AddMapping implements memmap.Mappable.AddMapping. func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { o.mapsMu.Lock() defer o.mapsMu.Unlock() if err := o.inodeLocked().Mappable().AddMapping(ctx, ms, ar, offset, writable); err != nil { return err } o.mappings.AddMapping(ms, ar, offset, writable) return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (o *overlayEntry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { o.mapsMu.Lock() defer o.mapsMu.Unlock() o.inodeLocked().Mappable().RemoveMapping(ctx, ms, ar, offset, writable) o.mappings.RemoveMapping(ms, ar, offset, writable) } // CopyMapping implements memmap.Mappable.CopyMapping. func (o *overlayEntry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { o.mapsMu.Lock() defer o.mapsMu.Unlock() if err := o.inodeLocked().Mappable().CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { return err } o.mappings.AddMapping(ms, dstAR, offset, writable) return nil } // Translate implements memmap.Mappable.Translate. func (o *overlayEntry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { o.dataMu.RLock() defer o.dataMu.RUnlock() return o.inodeLocked().Mappable().Translate(ctx, required, optional, at) } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (o *overlayEntry) InvalidateUnsavable(ctx context.Context) error { o.mapsMu.Lock() defer o.mapsMu.Unlock() return o.inodeLocked().Mappable().InvalidateUnsavable(ctx) }