// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "io" "math" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } // +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. mu sync.Mutex `state:"nosave"` off int64 } func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { fd := ®ularFileFD{} fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file") } if atomic.LoadInt32(&d.mmapFD) >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() } return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } d := fd.dentry() if d.fs.opts.interop == InteropModeExclusive { // d may have dirty pages that we won't write back now (and wouldn't // have in VFS1), making a flushf RPC ineffective. If this is the case, // skip the flushf. // // Note that it's also possible to have dirty pages under other interop // modes if forcePageCache is in effect; we conservatively assume that // applications have some way of tolerating this and still want the // flushf. d.dataMu.RLock() haveDirtyPages := !d.dirty.IsEmpty() d.dataMu.RUnlock() if haveDirtyPages { return nil } } d.handleMu.RLock() defer d.handleMu.RUnlock() if d.fs.opts.lisaEnabled { if !d.writeFDLisa.Ok() { return nil } return d.writeFDLisa.Flush(ctx) } if d.writeFile.isNil() { return nil } return d.writeFile.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { d.handleMu.RLock() defer d.handleMu.RUnlock() if d.fs.opts.lisaEnabled { return d.writeFDLisa.Allocate(ctx, mode, offset, length) } return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() d := fd.dentry() defer func() { if atomic.LoadInt32(&d.readFD) >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { fsmetric.GoferReads9P.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) } }() if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } var ( n int64 readErr error ) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Lock d.metadataMu for the rest of the read to prevent d.size from // changing. d.metadataMu.Lock() defer d.metadataMu.Unlock() // Write dirty cached pages that will be touched by the read back to // the remote file. if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { return 0, err } rw := getDentryReadWriter(ctx, d, offset) // Require the read to go to the remote file. rw.direct = true n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtimeLocked(fd.vfsfd.Mount()) } } else { rw := getDentryReadWriter(ctx, d, offset) n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtime(fd.vfsfd.Mount()) } } return n, readErr } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset, error. The final // offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() // If the fd was opened with O_APPEND, make sure the file size is updated. // There is a possible race here if size is modified externally after // metadata cache is updated. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { if err := d.refreshSizeLocked(ctx); err != nil { return 0, offset, err } } // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. offset = int64(d.size) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, offset, err } src = src.TakeFirst64(limit) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } rw := getDentryReadWriter(ctx, d, offset) defer putDentryReadWriter(rw) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { if err := fd.writeCache(ctx, d, offset, src); err != nil { return 0, offset, err } // Require the write to go to the remote file. rw.direct = true } n, err := src.CopyInTo(ctx, rw) if err != nil { return n, offset + n, err } if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { // Note that if any of the following fail, then we can't guarantee that // any data was actually written with the semantics of O_DSYNC or // O_SYNC, so we return zero bytes written. Compare Linux's // mm/filemap.c:generic_file_write_iter() => // include/linux/fs.h:generic_write_sync(). // // Write dirty cached pages touched by the write back to the remote // file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return 0, offset, err } // Request the remote filesystem to sync the remote file. if err := d.syncRemoteFile(ctx); err != nil { return 0, offset, err } } // As with Linux, writing clears the setuid and setgid bits. if n > 0 { oldMode := atomic.LoadUint32(&d.mode) // If setuid or setgid were set, update d.mode and propagate // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { atomic.StoreUint32(&d.mode, newMode) if d.fs.opts.lisaEnabled { stat := linux.Statx{Mask: linux.STATX_MODE, Mode: uint16(newMode)} failureMask, failureErr, err := d.controlFDLisa.SetStat(ctx, &stat) if err != nil { return 0, offset, err } if failureMask != 0 { return 0, offset, failureErr } } else { if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { return 0, offset, err } } } } return n, offset + n, nil } func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { // Write dirty cached pages that will be touched by the write back to // the remote file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return err } // Remove touched pages from the cache. pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { return linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() // Invalidate mappings of removed pages. d.mapsMu.Lock() d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) d.mapsMu.Unlock() // Finally free pages removed from the cache. mf := d.fs.mfp.MemoryFile() for _, freedFR := range freed { mf.DecRef(freedFR) } return nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.mu.Unlock() return n, err } type dentryReadWriter struct { ctx context.Context d *dentry off uint64 direct bool } var dentryReadWriterPool = sync.Pool{ New: func() interface{} { return &dentryReadWriter{} }, } func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { rw := dentryReadWriterPool.Get().(*dentryReadWriter) rw.ctx = ctx rw.d = d rw.off = uint64(offset) rw.direct = false return rw } func putDentryReadWriter(rw *dentryReadWriter) { rw.ctx = nil rw.d = nil dentryReadWriterPool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from // dentry.readHandleLocked() without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.readHandleLocked() if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err } // Otherwise read from/through the cache. mf := rw.d.fs.mfp.MemoryFile() fillCache := mf.ShouldCacheEvictable() var dataMuUnlock func() if fillCache { rw.d.dataMu.Lock() dataMuUnlock = rw.d.dataMu.Unlock } else { rw.d.dataMu.RLock() dataMuUnlock = rw.d.dataMu.RUnlock } // Compute the range to read (limited by file size and overflow-checked). if rw.off >= rw.d.size { dataMuUnlock() rw.d.handleMu.RUnlock() return 0, io.EOF } end := rw.d.size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } var done uint64 seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.off += n dsts = dsts.DropFirst64(n) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): gapMR := gap.Range().Intersect(mr) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. gapEnd, _ := hostarch.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ Start: hostarch.PageRoundDown(gapMR.Start), End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // err might have occurred in part of gap.Range() outside gapMR // (in particular, gap.End() might be beyond EOF). Forget about // it for now; if the error matters and persists, we'll run // into it again in a later iteration of this loop. } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) // Partial reads are fine. But we must stop reading. if n != gapDsts.NumBytes() || err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } } dataMuUnlock() rw.d.handleMu.RUnlock() return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.d.metadataMu must be locked. func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was // opened with O_DIRECT, write directly to dentry.writeHandleLocked() // without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.writeHandleLocked() if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size { atomic.StoreUint64(&rw.d.size, rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return n, err } // Otherwise write to/through the cache. mf := rw.d.fs.mfp.MemoryFile() rw.d.dataMu.Lock() // Compute the range to write (overflow-checked). start := rw.off end := rw.off + srcs.NumBytes() if end <= rw.off { end = math.MaxInt64 } var ( done uint64 retErr error ) seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write) if err != nil { retErr = err goto exitLoop } // Copy to internal mappings. n, err := safemem.CopySeq(ims, srcs) done += n rw.off += n srcs = srcs.DropFirst64(n) rw.d.dirty.MarkDirty(segMR) if err != nil { retErr = err goto exitLoop } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Write directly to the file. At present, we never fill the cache // when writing, since doing so can convert small writes into // inefficient read-modify-write cycles, and we have no mechanism // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != gapSrcs.NumBytes() || err != nil { retErr = err goto exitLoop } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } exitLoop: if rw.off > rw.d.size { atomic.StoreUint64(&rw.d.size, rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } // If InteropModeWritethrough is in effect, flush written data back to the // remote filesystem. if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 retErr = err } } rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return done, retErr } func (d *dentry) writeback(ctx context.Context, offset, size int64) error { if size == 0 { return nil } d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). if uint64(offset) >= d.size { return nil } end := int64(d.size) if rend := offset + size; rend > offset && rend < end { end = rend } return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) if err != nil { return 0, err } fd.off = newOffset return newOffset, nil } // Calculate the new offset for a seek operation on a regular file. func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { switch whence { case linux.SEEK_SET: // Use offset as specified. case linux.SEEK_CUR: offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. if !d.cachedMetadataAuthoritative() { if err := d.updateFromGetattr(ctx); err != nil { return 0, err } } size := int64(atomic.LoadUint64(&d.size)) // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous // block of data. switch whence { case linux.SEEK_END: offset += size case linux.SEEK_DATA: if offset > size { return 0, linuxerr.ENXIO } // Use offset as specified. case linux.SEEK_HOLE: if offset > size { return 0, linuxerr.ENXIO } offset = size } default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } return offset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Force sentry page caching at your own risk. if !d.fs.opts.forcePageCache { switch d.fs.opts.interop { case InteropModeExclusive: // Any mapping is fine. case InteropModeWritethrough: // Shared writable mappings require a host FD, since otherwise we // can't synchronously flush memory-mapped writes to the remote // file. if opts.Private || !opts.MaxPerms.Write { break } fallthrough case InteropModeShared: // All mappings require a host FD to be coherent with other // filesystem users. if atomic.LoadInt32(&d.mmapFD) < 0 { return linuxerr.ENODEV } default: panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } } // After this point, d may be used as a memmap.Mappable. d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) opts.SentryOwnedContent = d.fs.opts.forcePageCache return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } func (fs *filesystem) mayCachePagesInMemoryFile() bool { return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared } // AddMapping implements memmap.Mappable.AddMapping. func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d.mapsMu.Lock() mapped := d.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have a host FD can change // across save/restore. for _, r := range mapped { d.pf.hostFileMapper.IncRefOn(r) } if d.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := d.fs.mfp.MemoryFile() for _, r := range mapped { mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) } } d.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d.mapsMu.Lock() unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { d.pf.hostFileMapper.DecRefOn(r) } if d.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. mf := d.fs.mfp.MemoryFile() d.dataMu.Lock() for _, r := range unmapped { // Since these pages are no longer mapped, they are no longer // concurrently dirtyable by a writable memory mapping. d.dirty.AllowClean(r) mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) } d.dataMu.Unlock() } d.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return d.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() if d.mmapFD >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, File: &d.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, }, nil } d.dataMu.Lock() // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. pgend, _ := hostarch.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { d.dataMu.Unlock() d.handleMu.RUnlock() return nil, &memmap.BusError{io.EOF} } beyondEOF = true required.End = pgend } if optional.End > pgend { optional.End = pgend } mf := d.fs.mfp.MemoryFile() h := d.readHandleLocked() cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. perms := hostarch.AccessType{ Read: true, Execute: true, } if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. d.dirty.KeepDirty(segMR) perms.Write = true } ts = append(ts, memmap.Translation{ Source: segMR, File: mf, Offset: seg.FileRangeOf(segMR).Start, Perms: perms, }) translatedEnd = segMR.End } d.dataMu.Unlock() d.handleMu.RUnlock() // Don't return the error returned by c.cache.Fill if it occurred outside // of required. if translatedEnd < required.End && cerr != nil { return ts, &memmap.BusError{cerr} } if beyondEOF { return ts, &memmap.BusError{io.EOF} } return ts, nil } func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily if required.Length() >= maxReadahead { return required } if optional.Length() <= maxReadahead { return optional } optional.Start = required.Start if optional.Length() <= maxReadahead { return optional } optional.End = optional.Start + maxReadahead return optional } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() defer d.mapsMu.Unlock() d.mappings.InvalidateAll(memmap.InvalidateOpts{}) // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. mf := d.fs.mfp.MemoryFile() d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. d.cache.DropAll(mf) d.dirty.RemoveAll() return nil } // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { mr := memmap.MappableRange{er.Start, er.End} mf := d.fs.mfp.MemoryFile() d.mapsMu.Lock() defer d.mapsMu.Unlock() d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Only allow pages that are no longer memory-mapped to be evicted. for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } d.cache.Drop(mgapMR, mf) d.dirty.KeepClean(mgapMR) } } // dentryPlatformFile implements memmap.File. It exists solely because dentry // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.mmapFD >= 0), and that FD is used for application // memory mappings (i.e. !filesystem.opts.forcePageCache). // // +stateify savable type dentryPlatformFile struct { *dentry // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet // If this dentry represents a regular file, and dentry.mmapFD >= 0, // hostFileMapper caches mappings of dentry.mmapFD. hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. hostFileMapperInitOnce sync.Once `state:"nosave"` } // IncRef implements memmap.File.IncRef. func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } // DecRef implements memmap.File.DecRef. func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.hostFileMapper.MapInternal(fr, int(d.mmapFD), at.Write) } // FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() defer d.handleMu.RUnlock() return int(d.mmapFD) }