// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "fmt" "io" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // Lock order (compare the lock order model in mm/mm.go): // // CachingInodeOperations.attrMu ("fs locks") // CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate") // CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate") // CachedFileObject locks // CachingInodeOperations caches the metadata and content of a CachedFileObject. // It implements a subset of InodeOperations. As a utility it can be used to // implement the full set of InodeOperations. Generally it should not be // embedded to avoid unexpected inherited behavior. // // CachingInodeOperations implements Mappable for the CachedFileObject: // // - If CachedFileObject.FD returns a value >= 0 then the file descriptor // will be memory mapped on the host. // // - Otherwise, the contents of CachedFileObject are buffered into memory // managed by the CachingInodeOperations. 
//
// Implementations of FileOperations for a CachedFileObject must read and
// write through CachingInodeOperations using Read and Write respectively.
//
// Implementations of InodeOperations.WriteOut must call Sync to write out
// in-memory modifications of data and metadata to the CachedFileObject.
//
// +stateify savable
type CachingInodeOperations struct {
	// backingFile is a handle to a cached file object.
	backingFile CachedFileObject

	// mfp is used to allocate memory that caches backingFile's contents.
	mfp pgalloc.MemoryFileProvider

	// opts contains options. opts is immutable.
	opts CachingInodeOperationsOptions

	// attrMu protects attr and dirtyAttr.
	attrMu sync.Mutex `state:"nosave"`

	// attr is unstable cached metadata.
	//
	// attr is protected by attrMu. attr.Size is protected by both attrMu and
	// dataMu; reading it requires locking either mutex, while mutating it
	// requires locking both.
	attr fs.UnstableAttr

	// dirtyAttr is metadata that was updated in-place but hasn't yet
	// been successfully written out.
	//
	// dirtyAttr is protected by attrMu.
	dirtyAttr fs.AttrMask

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the cached file object into
	// memmap.MappingSpaces.
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	// dataMu protects cache, dirty and refs, and (together with attrMu)
	// attr.Size.
	dataMu sync.RWMutex `state:"nosave"`

	// cache maps offsets into the cached file to offsets into
	// mfp.MemoryFile() that store the file's data.
	//
	// cache is protected by dataMu.
	cache FileRangeSet

	// dirty tracks dirty segments in cache.
	//
	// dirty is protected by dataMu.
	dirty DirtySet

	// hostFileMapper caches internal mappings of backingFile.FD().
	hostFileMapper *HostFileMapper

	// refs tracks active references to data in the cache.
	//
	// refs is protected by dataMu.
	refs frameRefSet
}

// CachingInodeOperationsOptions configures a CachingInodeOperations.
//
// +stateify savable
type CachingInodeOperationsOptions struct {
	// If ForcePageCache is true, use the sentry page cache even if a host file
	// descriptor is available.
	ForcePageCache bool

	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
	// host file descriptor mappings returned by
	// CachingInodeOperations.Translate().
	LimitHostFDTranslation bool
}

// CachedFileObject is a file that may require caching.
type CachedFileObject interface {
	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
	// starting at offset, and returns the number of bytes read. ReadToBlocksAt
	// may return a partial read without an error.
	ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)

	// WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
	// file, starting at offset, and returns the number of bytes written.
	// WriteFromBlocksAt may return a partial write without an error.
	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)

	// SetMaskedAttributes sets the attributes in attr that are true in
	// mask on the backing file. If the mask contains only ATime or MTime
	// and the CachedFileObject has an FD to the file, then this operation
	// is a noop unless forceSetTimestamps is true. This avoids an extra
	// RPC to the gofer in the open-read/write-close case, when the
	// timestamps on the file will be updated by the host kernel for us.
	//
	// SetMaskedAttributes may be called at any point, regardless of whether
	// the file was opened.
	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error

	// Allocate allows the caller to reserve disk space for the inode.
	// It's equivalent to fallocate(2) with 'mode=0'.
	Allocate(ctx context.Context, offset int64, length int64) error

	// Sync instructs the remote filesystem to sync the file to stable storage.
	Sync(ctx context.Context) error

	// FD returns a host file descriptor. If it is possible for
	// CachingInodeOperations.AddMapping to have ever been called with writable
	// = true, the FD must have been opened O_RDWR; otherwise, it may have been
	// opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped
	// files are readable.) If no host file descriptor is available, FD returns
	// a negative number.
	//
	// For any given CachedFileObject, if FD() ever succeeds (returns a
	// non-negative number), it must always succeed.
	//
	// FD is called iff the file has been memory mapped. This implies that
	// the file was opened (see fs.InodeOperations.GetFile).
	FD() int
}

// NewCachingInodeOperations returns a new CachingInodeOperations backed by
// a CachedFileObject and its initial unstable attributes.
//
// It panics if ctx does not provide a non-nil pgalloc.MemoryFileProvider.
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
	}
	return &CachingInodeOperations{
		backingFile:    backingFile,
		mfp:            mfp,
		opts:           opts,
		attr:           uattr,
		hostFileMapper: NewHostFileMapper(),
	}
}

// Release implements fs.InodeOperations.Release.
//
// Release writes any dirty cached data back to the backing file and then
// drops the whole cache. It panics if the inode still has memory mappings or
// if the writeback fails.
func (c *CachingInodeOperations) Release() {
	// Take mapsMu before dataMu, per the lock order.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	// Something has gone terribly wrong if we're releasing an inode that is
	// still memory-mapped.
	if !c.mappings.IsEmpty() {
		panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
	}

	// Drop any cached pages that are still awaiting MemoryFile eviction. (This
	// means that MemoryFile no longer needs to evict them.)
	mf := c.mfp.MemoryFile()
	mf.MarkAllUnevictable(c)
	// No caller context is available here, so a background context is used
	// for the final writeback.
	if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
	}
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr.
//
// It returns a copy of the cached attributes taken under attrMu; the error is
// always nil.
func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	c.attrMu.Lock()
	attr := c.attr
	c.attrMu.Unlock()
	return attr, nil
}

// SetPermissions implements fs.InodeOperations.SetPermissions.
//
// The permissions are first set on the backing file. Only if that succeeds
// are the cached permissions and status change time updated. It returns false
// if the backing file rejected the change.
func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// The timestamp is sampled before calling into the backing file.
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Perms: true}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}, false); err != nil {
		return false
	}
	c.attr.Perms = perms
	c.touchStatusChangeTimeLocked(now)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner.
//
// Only the IDs for which Ok() returns true are changed. If neither is, SetOwner
// is a no-op. The cached owner and status change time are updated only after
// the backing file accepts the change.
func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	if !owner.UID.Ok() && !owner.GID.Ok() {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{
		UID: owner.UID.Ok(),
		GID: owner.GID.Ok(),
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}, false); err != nil {
		return err
	}
	if owner.UID.Ok() {
		c.attr.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		c.attr.Owner.GID = owner.GID
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	// Nothing to do if both timestamps are to be left alone.
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// Replace requests to use the "system time" with the current time to
	// ensure that cached timestamps remain consistent with the remote
	// filesystem.
	now := ktime.NowFromContext(ctx)
	if ts.ATimeSetSystemTime {
		ts.ATime = now
	}
	if ts.MTimeSetSystemTime {
		ts.MTime = now
	}
	masked := fs.AttrMask{
		AccessTime:       !ts.ATimeOmit,
		ModificationTime: !ts.MTimeOmit,
	}
	// Call SetMaskedAttributes with forceSetTimestamps = true to make sure
	// the timestamp is updated.
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}, true); err != nil {
		return err
	}
	// Mirror the accepted values into the cache.
	if !ts.ATimeOmit {
		c.attr.AccessTime = ts.ATime
	}
	if !ts.MTimeOmit {
		c.attr.ModificationTime = ts.MTime
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
//
// The new size is first set on the backing file. When the file shrinks,
// translations of the truncated pages are invalidated and the corresponding
// cached data is dropped without being written back.
func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.dataMu.Lock()
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Size: true}
	attr := fs.UnstableAttr{Size: size}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil {
		c.dataMu.Unlock()
		return err
	}
	oldSize := c.attr.Size
	c.attr.Size = size
	c.touchModificationAndStatusChangeTimeLocked(now)

	// We drop c.dataMu here so that we can lock c.mapsMu and invalidate
	// mappings below. This allows concurrent calls to Read/Translate/etc.
	// These functions synchronize with an in-progress Truncate by refusing to
	// use cache contents beyond the new c.attr.Size. (We are still holding
	// c.attrMu, so we can't race with Truncate/Write.)
	c.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if size >= oldSize {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		c.mapsMu.Lock()
		c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/truncate.c:truncate_setsize() =>
			// truncate_pagecache() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		c.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them from the cache. Since truncated pages have been
	// removed from the backing file, they should be dropped without being
	// written back.
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	c.cache.Truncate(uint64(size), c.mfp.MemoryFile())
	c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})

	return nil
}

// Allocate implements fs.InodeOperations.Allocate.
//
// Allocate is a no-op when offset+length does not exceed the cached size.
// Otherwise it calls Allocate on the backing file and, on success, grows the
// cached size to offset+length.
//
// NOTE(review): offset+length is not checked for overflow here; presumably
// callers validate the range first - confirm.
func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error {
	newSize := offset + length

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.attrMu.Lock()
	defer c.attrMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	if newSize <= c.attr.Size {
		return nil
	}

	now := ktime.NowFromContext(ctx)
	if err := c.backingFile.Allocate(ctx, offset, length); err != nil {
		return err
	}

	c.attr.Size = newSize
	c.touchModificationAndStatusChangeTimeLocked(now)
	return nil
}

// WriteOut implements fs.InodeOperations.WriteOut.
//
// WriteOut writes dirty cached pages to the backing file, then writes the
// dirty cached attributes (other than size), and finally calls Sync on the
// backing file. The first error encountered is returned.
func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	c.attrMu.Lock()

	// Write dirty pages back.
	c.dataMu.Lock()
	err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt)
	c.dataMu.Unlock()
	if err != nil {
		c.attrMu.Unlock()
		return err
	}

	// SyncDirtyAll above would have grown the file if needed. On shrinks, the
	// backing file is called directly, so size never needs to be updated.
	c.dirtyAttr.Size = false

	// Write out cached attributes.
	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil {
		c.attrMu.Unlock()
		return err
	}
	c.dirtyAttr = fs.AttrMask{}

	// attrMu is released before the fsync below.
	c.attrMu.Unlock()

	// Fsync the remote file.
	return c.backingFile.Sync(ctx)
}

// IncLinks increases the link count and updates the cached modification and
// status change times.
func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links++
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// DecLinks decreases the link count and updates the cached modification and
// status change times.
func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links--
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// TouchAccessTime updates the cached access time in-place to the
// current time. It does not update status change time in-place. See
// mm/filemap.c:do_generic_file_read -> include/linux/fs.h:file_accessed.
//
// TouchAccessTime does nothing if the inode's mount has the NoAtime flag set.
func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
	if inode.MountSource.Flags.NoAtime {
		return
	}

	c.attrMu.Lock()
	c.touchAccessTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchAccessTimeLocked updates the cached access time in-place to now and
// marks it dirty.
//
// Preconditions: c.attrMu must be locked.
func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
	c.attr.AccessTime = now
	c.dirtyAttr.AccessTime = true
}

// TouchModificationAndStatusChangeTime updates the cached modification and
// status change times in-place to the current time.
func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) {
	c.attrMu.Lock()
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchModificationAndStatusChangeTimeLocked updates the cached modification
// and status change times in-place to now and marks both dirty.
//
// Preconditions: c.attrMu must be locked.
func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) { c.attr.ModificationTime = now c.dirtyAttr.ModificationTime = true c.attr.StatusChangeTime = now c.dirtyAttr.StatusChangeTime = true } // TouchStatusChangeTime updates the cached status change time in-place to the // current time. func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) { c.attrMu.Lock() c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } // touchStatusChangeTimeLocked updates the cached status change time // in-place to the current time. // // Preconditions: c.attrMu is locked for writing. func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) { c.attr.StatusChangeTime = now c.dirtyAttr.StatusChangeTime = true } // UpdateUnstable updates the cached unstable attributes. Only non-dirty // attributes are updated. func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) { // All attributes are protected by attrMu. c.attrMu.Lock() if !c.dirtyAttr.Usage { c.attr.Usage = attr.Usage } if !c.dirtyAttr.Perms { c.attr.Perms = attr.Perms } if !c.dirtyAttr.UID { c.attr.Owner.UID = attr.Owner.UID } if !c.dirtyAttr.GID { c.attr.Owner.GID = attr.Owner.GID } if !c.dirtyAttr.AccessTime { c.attr.AccessTime = attr.AccessTime } if !c.dirtyAttr.ModificationTime { c.attr.ModificationTime = attr.ModificationTime } if !c.dirtyAttr.StatusChangeTime { c.attr.StatusChangeTime = attr.StatusChangeTime } if !c.dirtyAttr.Links { c.attr.Links = attr.Links } // Size requires holding attrMu and dataMu. c.dataMu.Lock() if !c.dirtyAttr.Size { c.attr.Size = attr.Size } c.dataMu.Unlock() c.attrMu.Unlock() } // Read reads from frames and otherwise directly from the backing file // into dst starting at offset until dst is full, EOF is reached, or an // error is encountered. // // Read may partially fill dst and return a nil error. 
func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if dst.NumBytes() == 0 { return 0, nil } // Have we reached EOF? We check for this again in // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would // serialize reads) or c.dataMu (which would violate lock ordering), but // check here first (before calling into MM) since reading at EOF is // common: getting a return value of 0 from a read syscall is the only way // to detect EOF. // // TODO(jamieliu): Separate out c.attr.Size and use atomics instead of // c.dataMu. c.dataMu.RLock() size := c.attr.Size c.dataMu.RUnlock() if offset >= size { return 0, io.EOF } n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset}) // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). c.TouchAccessTime(ctx, file.Dirent.Inode) return n, err } // Write writes to frames and otherwise directly to the backing file // from src starting at offset and until src is empty or an error is // encountered. // // If Write partially fills src, a non-nil error is returned. func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { // Hot path. Avoid defers. if src.NumBytes() == 0 { return 0, nil } c.attrMu.Lock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) c.attrMu.Unlock() return n, err } type inodeReadWriter struct { ctx context.Context c *CachingInodeOperations offset int64 } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { mem := rw.c.mfp.MemoryFile() fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable() // Hot path. Avoid defers. 
var unlock func() if fillCache { rw.c.dataMu.Lock() unlock = rw.c.dataMu.Unlock } else { rw.c.dataMu.RLock() unlock = rw.c.dataMu.RUnlock } // Compute the range to read. if rw.offset >= rw.c.attr.Size { unlock() return 0, io.EOF } end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) if end == rw.offset { // dsts.NumBytes() == 0? unlock() return 0, nil } var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} switch { case seg.Ok(): // Get internal mappings from the cache. ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { unlock() return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.offset += int64(n) dsts = dsts.DropFirst64(n) if err != nil { unlock() return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): gapMR := gap.Range().Intersect(mr) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. reqMR := memmap.MappableRange{ Start: uint64(usermem.Addr(gapMR.Start).RoundDown()), End: fs.OffsetPageEnd(int64(gapMR.End)), } optMR := gap.Range() err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt) mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.c.cache.Find(uint64(rw.offset)) if !seg.Ok() { unlock() return done, err } // err might have occurred in part of gap.Range() outside // gapMR. Forget about it for now; if the error matters and // persists, we'll run into it again in a later iteration of // this loop. } else { // Read directly from the backing file. dst := dsts.TakeFirst64(gapMR.Length()) n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start) done += n rw.offset += int64(n) dsts = dsts.DropFirst64(n) // Partial reads are fine. But we must stop reading. 
if n != dst.NumBytes() || err != nil { unlock() return done, err } // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} } default: break } } unlock() return done, nil } // maybeGrowFile grows the file's size if data has been written past the old // size. // // Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked. func (rw *inodeReadWriter) maybeGrowFile() { // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.offset > rw.c.attr.Size { rw.c.attr.Size = rw.offset rw.c.dirtyAttr.Size = true } if rw.offset > rw.c.attr.Usage { // This is incorrect if CachingInodeOperations is caching a sparse // file. (In Linux, keeping inode::i_blocks up to date is the // filesystem's responsibility.) rw.c.attr.Usage = rw.offset rw.c.dirtyAttr.Usage = true } } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.c.attrMu must be locked. func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { // Hot path. Avoid defers. rw.c.dataMu.Lock() // Compute the range to write. end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) if end == rw.offset { // srcs.NumBytes() == 0? rw.c.dataMu.Unlock() return 0, nil } mf := rw.c.mfp.MemoryFile() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} switch { case seg.Ok() && seg.Start() < mr.End: // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) if err != nil { rw.maybeGrowFile() rw.c.dataMu.Unlock() return done, err } // Copy to internal mappings. n, err := safemem.CopySeq(ims, srcs) done += n rw.offset += int64(n) srcs = srcs.DropFirst64(n) rw.c.dirty.MarkDirty(segMR) if err != nil { rw.maybeGrowFile() rw.c.dataMu.Unlock() return done, err } // Continue. 
seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: // Write directly to the backing file. At present, we never fill // the cache when writing, since doing so can convert small writes // into inefficient read-modify-write cycles, and we have no // mechanism for detecting or avoiding this. gapmr := gap.Range().Intersect(mr) src := srcs.TakeFirst64(gapmr.Length()) n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start) done += n rw.offset += int64(n) srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != src.NumBytes() || err != nil { rw.maybeGrowFile() rw.c.dataMu.Unlock() return done, err } // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} default: break } } rw.maybeGrowFile() rw.c.dataMu.Unlock() return done, nil } // useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O // and memory mappings, and false if c.cache may contain data cached from // c.backingFile. func (c *CachingInodeOperations) useHostPageCache() bool { return !c.opts.ForcePageCache && c.backingFile.FD() >= 0 } // AddMapping implements memmap.Mappable.AddMapping. func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. c.mapsMu.Lock() mapped := c.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have c.backingFile.FD() >= 0 // can change across save/restore. for _, r := range mapped { c.hostFileMapper.IncRefOn(r) } if !c.useHostPageCache() { // c.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := c.mfp.MemoryFile() for _, r := range mapped { mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) } } c.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
	// Hot path. Avoid defers.
	c.mapsMu.Lock()
	unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable)
	// Drop the host file mapper references on every range that is no longer
	// mapped.
	for _, r := range unmapped {
		c.hostFileMapper.DecRefOn(r)
	}
	// With the host page cache in use there is no sentry-side cache state to
	// update.
	if c.useHostPageCache() {
		c.mapsMu.Unlock()
		return
	}

	// Pages that are no longer referenced by any application memory mappings
	// are now considered unused; allow MemoryFile to evict them when
	// necessary.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	for _, r := range unmapped {
		// Since these pages are no longer mapped, they are no longer
		// concurrently dirtyable by a writable memory mapping.
		c.dirty.AllowClean(r)
		mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End})
	}
	c.dataMu.Unlock()
	c.mapsMu.Unlock()
}

// CopyMapping implements memmap.Mappable.CopyMapping.
//
// It is handled as an AddMapping of dstAR; srcAR is not used.
func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
	return c.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
//
// When the host page cache is in use, a single translation backed by c itself
// is returned with usermem.AnyAccess permissions. Otherwise the cache is
// filled for the required range and one translation per cached segment,
// backed by the MemoryFile, is returned. A *memmap.BusError is returned when
// the required range extends beyond the page-rounded end of the file, or when
// filling the cache failed before the required range was fully translated.
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	// Hot path. Avoid defer.
	if c.useHostPageCache() {
		mr := optional
		if c.opts.LimitHostFDTranslation {
			mr = maxFillRange(required, optional)
		}
		return []memmap.Translation{
			{
				Source: mr,
				File:   c,
				Offset: mr.Start,
				Perms:  usermem.AnyAccess,
			},
		}, nil
	}

	c.dataMu.Lock()

	// Constrain translations to c.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(c.attr.Size)
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			c.dataMu.Unlock()
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := c.mfp.MemoryFile()
	// The fill error is kept aside; whether it is reported is decided after
	// the translations have been collected.
	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		// TODO(jamieliu): Make Translations writable even if writability is
		// not required if already kept-dirty by another writable translation.
		perms := usermem.AccessType{
			Read:    true,
			Execute: true,
		}
		if at.Write {
			// From this point forward, this memory can be dirtied through the
			// mapping at any time.
			c.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}

	c.dataMu.Unlock()

	// Don't return the error returned by c.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

// maxFillRange returns the range to fill or translate for a request, limiting
// optional readahead to maxReadahead bytes:
//
//   - if required is already at least maxReadahead long, required is returned;
//   - otherwise, if optional is at most maxReadahead long, optional is returned;
//   - otherwise optional is trimmed to start at required.Start and, if still
//     too long, to end maxReadahead bytes after that start.
//
// NOTE(review): this presumably expects optional to contain required; that is
// not checked here - confirm against callers.
func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what platform.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Sync the cache's contents so that if we have a host fd after restore,
	// the remote file's contents are coherent.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()

	return nil
}

// NotifyChangeFD must be called after the file description represented by
// CachedFileObject.FD() changes.
//
// It regenerates the host file mapper's mappings against the new FD and then
// invalidates all application mappings. If regeneration fails, its error is
// returned and nothing is invalidated.
func (c *CachingInodeOperations) NotifyChangeFD() error {
	// Update existing sentry mappings to refer to the new file description.
	if err := c.hostFileMapper.RegenerateMappings(c.backingFile.FD()); err != nil {
		return err
	}

	// Shoot down existing application mappings of the old file description;
	// they will be remapped with the new file description on demand.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.mappings.InvalidateAll(memmap.InvalidateOpts{})

	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
//
// For each part of er that is not memory-mapped, dirty cached data is written
// back and the cached pages are dropped. A writeback failure is logged as a
// warning and the pages are dropped anyway.
func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	// Take mapsMu before dataMu, per the lock order.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	mr := memmap.MappableRange{er.Start, er.End}
	mf := c.mfp.MemoryFile()
	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		c.cache.Drop(mgapMR, mf)
		c.dirty.KeepClean(mgapMR)
	}
}

// IncRef implements platform.File.IncRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
//
// Each part of fr already tracked in c.refs has its count incremented. Each
// untracked part is inserted with a count of 1 and accounted as usage.Mapped.
func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg, gap := c.refs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			// Split the segment at fr's bounds so only the overlapping part
			// is incremented.
			seg = c.refs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			newRange := gap.Range().Intersect(fr)
			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			// All of fr has been processed; this return (not a break) is what
			// terminates the loop.
			c.refs.MergeAdjacent(fr)
			c.dataMu.Unlock()
			return
		}
	}
}

// DecRef implements platform.File.DecRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
//
// Each tracked part of fr has its count decremented. Parts whose count was 1
// are removed from c.refs and un-accounted from usage.Mapped.
//
// NOTE(review): iteration starts from the segment containing fr.Start; if no
// segment contains it, nothing is decremented - confirm this is intended.
func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg := c.refs.FindSegment(fr.Start)

	for seg.Ok() && seg.Start() < fr.End {
		seg = c.refs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			seg = c.refs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	c.refs.MergeAdjacent(fr)
	c.dataMu.Unlock()
}

// MapInternal implements platform.File.MapInternal. This is used when we
// directly map an underlying host fd and CachingInodeOperations is used as the
// platform.File during translation.
func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
	// Delegate to the host file mapper, using the backing file's current host
	// FD and requesting a writable mapping only when write access is asked for.
	hostFD := c.backingFile.FD()
	writable := at.Write
	return c.hostFileMapper.MapInternal(fr, hostFD, writable)
}

// FD implements platform.File.FD by returning the backing file's host file
// descriptor. It is used when an underlying host fd is mapped directly and
// CachingInodeOperations serves as the platform.File during translation.
func (c *CachingInodeOperations) FD() int {
	hostFD := c.backingFile.FD()
	return hostFD
}