diff options
Diffstat (limited to 'pkg/sentry/fs/file.go')
-rw-r--r-- | pkg/sentry/fs/file.go | 593 |
1 files changed, 593 insertions, 0 deletions
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go new file mode 100644 index 000000000..ca41520b4 --- /dev/null +++ b/pkg/sentry/fs/file.go @@ -0,0 +1,593 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "math" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +var ( + // RecordWaitTime controls writing metrics for filesystem reads. + // Enabling this comes at a small CPU cost due to performing two + // monotonic clock reads per read call. + // + // Note that this is only performed in the direct read path, and may + // not be consistently applied for other forms of reads, such as + // splice. + RecordWaitTime = false + + reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") + readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") +) + +// IncrementWait increments the given wait time metric, if enabled. +func IncrementWait(m *metric.Uint64Metric, start time.Time) { + if !RecordWaitTime { + return + } + m.IncrementBy(uint64(time.Since(start))) +} + +// FileMaxOffset is the maximum possible file offset. +const FileMaxOffset = math.MaxInt64 + +// File is an open file handle. It is thread-safe. +// +// File provides stronger synchronization guarantees than Linux. Linux +// synchronizes lseek(2), read(2), and write(2) with respect to the file +// offset for regular files and only for those interfaces. See +// fs/read_write.c:fdget_pos, fs.read_write.c:fdput_pos and FMODE_ATOMIC_POS. +// +// In contrast, File synchronizes any operation that could take a long time +// under a single abortable mutex which also synchronizes lseek(2), read(2), +// and write(2). +// +// FIXME(b/38451980): Split synchronization from cancellation. +// +// +stateify savable +type File struct { + refs.AtomicRefCount + + // UniqueID is the globally unique identifier of the File. + UniqueID uint64 + + // Dirent is the Dirent backing this File. This encodes the name + // of the File via Dirent.FullName() as well as its identity via the + // Dirent's Inode. The Dirent is non-nil. + // + // A File holds a reference to this Dirent. Using the returned Dirent is + // only safe as long as a reference on the File is held. The association + // between a File and a Dirent is immutable. + // + // Files that are not parented in a filesystem return a root Dirent + // that holds a reference to their Inode. + // + // The name of the Dirent may reflect parentage if the Dirent is not a + // root Dirent or the identity of the File on a pseudo filesystem (pipefs, + // sockfs, etc). + // + // Multiple Files may hold a reference to the same Dirent. This is the + // common case for Files that are parented and maintain consistency with + // other files via the Dirent cache. + Dirent *Dirent + + // flagsMu protects flags and async below. + flagsMu sync.Mutex `state:"nosave"` + + // flags are the File's flags. Setting or getting flags is fully atomic + // and is not protected by mu (below). + flags FileFlags + + // async handles O_ASYNC notifications. + async FileAsync + + // saving indicates that this file is in the process of being saved. + saving bool `state:"nosave"` + + // mu is dual-purpose: first, to make read(2) and write(2) thread-safe + // in conformity with POSIX, and second, to cancel operations before they + // begin in response to interruptions (i.e. signals). + mu amutex.AbortableMutex `state:"nosave"` + + // FileOperations implements file system specific behavior for this File. + FileOperations FileOperations `state:"wait"` + + // offset is the File's offset. Updating offset is protected by mu but + // can be read atomically via File.Offset() outside of mu. + offset int64 +} + +// NewFile returns a File. It takes a reference on the Dirent and owns the +// lifetime of the FileOperations. Files that do not support reading and +// writing at an arbitrary offset should set flags.Pread and flags.Pwrite +// to false respectively. +func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { + dirent.IncRef() + f := File{ + UniqueID: uniqueid.GlobalFromContext(ctx), + Dirent: dirent, + FileOperations: fops, + flags: flags, + } + f.mu.Init() + f.EnableLeakCheck("fs.File") + return &f +} + +// DecRef destroys the File when it is no longer referenced. +func (f *File) DecRef() { + f.DecRefWithDestructor(func() { + // Drop BSD style locks. + lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) + + // Release resources held by the FileOperations. + f.FileOperations.Release() + + // Release a reference on the Dirent. + f.Dirent.DecRef() + + // Only unregister if we are currently registered. There is nothing + // to register if f.async is nil (this happens when async mode is + // enabled without setting an owner). Also, we unregister during + // save. + f.flagsMu.Lock() + if !f.saving && f.flags.Async && f.async != nil { + f.async.Unregister(f) + } + f.async = nil + f.flagsMu.Unlock() + }) +} + +// Flags atomically loads the File's flags. +func (f *File) Flags() FileFlags { + f.flagsMu.Lock() + flags := f.flags + f.flagsMu.Unlock() + return flags +} + +// SetFlags atomically changes the File's flags to the values contained +// in newFlags. See SettableFileFlags for values that can be set. +func (f *File) SetFlags(newFlags SettableFileFlags) { + f.flagsMu.Lock() + f.flags.Direct = newFlags.Direct + f.flags.NonBlocking = newFlags.NonBlocking + f.flags.Append = newFlags.Append + if f.async != nil { + if newFlags.Async && !f.flags.Async { + f.async.Register(f) + } + if !newFlags.Async && f.flags.Async { + f.async.Unregister(f) + } + } + f.flags.Async = newFlags.Async + f.flagsMu.Unlock() +} + +// Offset atomically loads the File's offset. +func (f *File) Offset() int64 { + return atomic.LoadInt64(&f.offset) +} + +// Readiness implements waiter.Waitable.Readiness. +func (f *File) Readiness(mask waiter.EventMask) waiter.EventMask { + return f.FileOperations.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *File) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.FileOperations.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *File) EventUnregister(e *waiter.Entry) { + f.FileOperations.EventUnregister(e) +} + +// Seek calls f.FileOperations.Seek with f as the File, updating the file +// offset to the value returned by f.FileOperations.Seek if the operation +// is successful. +// +// Returns syserror.ErrInterrupted if seeking was interrupted. +func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer f.mu.Unlock() + + newOffset, err := f.FileOperations.Seek(ctx, f, whence, offset) + if err == nil { + atomic.StoreInt64(&f.offset, newOffset) + } + return newOffset, err +} + +// Readdir reads the directory entries of this File and writes them out +// to the DentrySerializer until entries can no longer be written. If even +// a single directory entry is written then Readdir returns a nil error +// and the directory offset is advanced. +// +// Readdir unconditionally updates the access time on the File's Inode, +// see fs/readdir.c:iterate_dir. +// +// Returns syserror.ErrInterrupted if reading was interrupted. +func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + offset, err := f.FileOperations.Readdir(ctx, f, serializer) + atomic.StoreInt64(&f.offset, offset) + return err +} + +// Readv calls f.FileOperations.Read with f as the File, advancing the file +// offset if f.FileOperations.Read returns bytes read > 0. +// +// Returns syserror.ErrInterrupted if reading was interrupted. +func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { + var start time.Time + if RecordWaitTime { + start = time.Now() + } + if !f.mu.Lock(ctx) { + IncrementWait(readWait, start) + return 0, syserror.ErrInterrupted + } + + reads.Increment() + n, err := f.FileOperations.Read(ctx, f, dst, f.offset) + if n > 0 && !f.flags.NonSeekable { + atomic.AddInt64(&f.offset, n) + } + f.mu.Unlock() + IncrementWait(readWait, start) + return n, err +} + +// Preadv calls f.FileOperations.Read with f as the File. It does not +// advance the file offset. If !f.Flags().Pread, Preadv should not be +// called. +// +// Otherwise same as Readv. +func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var start time.Time + if RecordWaitTime { + start = time.Now() + } + if !f.mu.Lock(ctx) { + IncrementWait(readWait, start) + return 0, syserror.ErrInterrupted + } + + reads.Increment() + n, err := f.FileOperations.Read(ctx, f, dst, offset) + f.mu.Unlock() + IncrementWait(readWait, start) + return n, err +} + +// Writev calls f.FileOperations.Write with f as the File, advancing the +// file offset if f.FileOperations.Write returns bytes written > 0. +// +// Writev positions the write offset at EOF if f.Flags().Append. This is +// unavoidably racy for network file systems. Writev also truncates src +// to avoid overrunning the current file size limit if necessary. +// +// Returns syserror.ErrInterrupted if writing was interrupted. +func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) + // Handle append mode. + if f.Flags().Append { + if err := f.offsetForAppend(ctx, &f.offset); err != nil { + unlockAppendMu() + f.mu.Unlock() + return 0, err + } + } + + // Enforce file limits. + limit, ok := f.checkLimit(ctx, f.offset) + switch { + case ok && limit == 0: + unlockAppendMu() + f.mu.Unlock() + return 0, syserror.ErrExceedsFileSizeLimit + case ok: + src = src.TakeFirst64(limit) + } + + // We must hold the lock during the write. + n, err := f.FileOperations.Write(ctx, f, src, f.offset) + if n >= 0 && !f.flags.NonSeekable { + atomic.StoreInt64(&f.offset, f.offset+n) + } + unlockAppendMu() + f.mu.Unlock() + return n, err +} + +// Pwritev calls f.FileOperations.Write with f as the File. It does not +// advance the file offset. If !f.Flags().Pwritev, Pwritev should not be +// called. +// +// Otherwise same as Writev. +func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "POSIX requires that opening a file with the O_APPEND flag should + // have no effect on the location at which pwrite() writes data. + // However, on Linux, if a file is opened with O_APPEND, pwrite() + // appends data to the end of the file, regardless of the value of + // offset." + unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) + defer unlockAppendMu() + if f.Flags().Append { + if err := f.offsetForAppend(ctx, &offset); err != nil { + return 0, err + } + } + + // Enforce file limits. + limit, ok := f.checkLimit(ctx, offset) + switch { + case ok && limit == 0: + return 0, syserror.ErrExceedsFileSizeLimit + case ok: + src = src.TakeFirst64(limit) + } + + return f.FileOperations.Write(ctx, f, src, offset) +} + +// offsetForAppend atomically sets the given offset to the end of the file. +// +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for +// writing. +func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { + uattr, err := f.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + // This is an odd error, we treat it as evidence that + // something is terribly wrong with the filesystem. + return syserror.EIO + } + + // Update the offset. + atomic.StoreInt64(offset, uattr.Size) + + return nil +} + +// checkLimit checks the offset that the write will be performed at. The +// returned boolean indicates that the write must be limited. The returned +// integer indicates the new maximum write length. +func (f *File) checkLimit(ctx context.Context, offset int64) (int64, bool) { + if IsRegular(f.Dirent.Inode.StableAttr) { + // Enforce size limits. + fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + if fileSizeLimit <= math.MaxInt64 { + if offset >= int64(fileSizeLimit) { + return 0, true + } + return int64(fileSizeLimit) - offset, true + } + } + + return 0, false +} + +// Fsync calls f.FileOperations.Fsync with f as the File. +// +// Returns syserror.ErrInterrupted if syncing was interrupted. +func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.Fsync(ctx, f, start, end, syncType) +} + +// Flush calls f.FileOperations.Flush with f as the File. +// +// Returns syserror.ErrInterrupted if syncing was interrupted. +func (f *File) Flush(ctx context.Context) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.Flush(ctx, f) +} + +// ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File. +// +// Returns syserror.ErrInterrupted if interrupted. +func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.ConfigureMMap(ctx, f, opts) +} + +// UnstableAttr calls f.FileOperations.UnstableAttr with f as the File. +// +// Returns syserror.ErrInterrupted if interrupted. +func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { + if !f.mu.Lock(ctx) { + return UnstableAttr{}, syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.UnstableAttr(ctx, f) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (f *File) MappedName(ctx context.Context) string { + root := RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } + name, _ := f.Dirent.FullName(root) + return name +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (f *File) DeviceID() uint64 { + return f.Dirent.Inode.StableAttr.DeviceID +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (f *File) InodeID() uint64 { + return f.Dirent.Inode.StableAttr.InodeID +} + +// Msync implements memmap.MappingIdentity.Msync. +func (f *File) Msync(ctx context.Context, mr memmap.MappableRange) error { + return f.Fsync(ctx, int64(mr.Start), int64(mr.End-1), SyncData) +} + +// A FileAsync sends signals to its owner when w is ready for IO. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// Async gets the stored FileAsync or creates a new one with the supplied +// function. If the supplied function is nil, no FileAsync is created and the +// current value is returned. +func (f *File) Async(newAsync func() FileAsync) FileAsync { + f.flagsMu.Lock() + defer f.flagsMu.Unlock() + if f.async == nil && newAsync != nil { + f.async = newAsync() + if f.flags.Async { + f.async.Register(f) + } + } + return f.async +} + +// lockedReader implements io.Reader and io.ReaderAt. +// +// Note this reads the underlying file using the file operations directly. It +// is the responsibility of the caller to ensure that locks are appropriately +// held and offsets updated if required. This should be used only by internal +// functions that perform these operations and checks at other times. +type lockedReader struct { + // Ctx is the context for the file reader. + Ctx context.Context + + // File is the file to read from. + File *File + + // Offset is the offset to start at. + // + // This applies only to Read, not ReadAt. + Offset int64 +} + +// Read implements io.Reader.Read. +func (r *lockedReader) Read(buf []byte) (int, error) { + if r.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } + n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset) + r.Offset += n + return int(n), err +} + +// ReadAt implements io.Reader.ReadAt. +func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) { + if r.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } + n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset) + return int(n), err +} + +// lockedWriter implements io.Writer and io.WriterAt. +// +// The same constraints as lockedReader apply; see above. +type lockedWriter struct { + // Ctx is the context for the file writer. + Ctx context.Context + + // File is the file to write to. + File *File + + // Offset is the offset to start at. + // + // This applies only to Write, not WriteAt. + Offset int64 +} + +// Write implements io.Writer.Write. +func (w *lockedWriter) Write(buf []byte) (int, error) { + if w.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } + n, err := w.WriteAt(buf, w.Offset) + w.Offset += int64(n) + return int(n), err +} + +// WriteAt implements io.Writer.WriteAt. +func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { + var ( + written int + err error + ) + // The io.Writer contract requires that Write writes all available + // bytes and does not return short writes. This causes errors with + // io.Copy, since our own Write interface does not have this same + // contract. Enforce that here. + for written < len(buf) { + if w.Ctx.Interrupted() { + return written, syserror.ErrInterrupted + } + var n int64 + n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) + if n > 0 { + written += int(n) + } + if err != nil { + break + } + } + return written, err +} |