summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/file.go
blob: ca41520b4765685d0b3c6f354de4cba569ff20a7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fs

import (
	"math"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/amutex"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

var (
	// RecordWaitTime controls writing metrics for filesystem reads.
	// Enabling this comes at a small CPU cost due to performing two
	// monotonic clock reads per read call.
	//
	// Note that this is only performed in the direct read path, and may
	// not be consistently applied for other forms of reads, such as
	// splice.
	RecordWaitTime = false

	reads    = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.")
	readWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.")
)

// IncrementWait increments the given wait time metric, if enabled.
func IncrementWait(m *metric.Uint64Metric, start time.Time) {
	if !RecordWaitTime {
		return
	}
	m.IncrementBy(uint64(time.Since(start)))
}

// FileMaxOffset is the maximum possible file offset.
const FileMaxOffset = math.MaxInt64

// File is an open file handle. It is thread-safe.
//
// File provides stronger synchronization guarantees than Linux. Linux
// synchronizes lseek(2), read(2), and write(2) with respect to the file
// offset for regular files and only for those interfaces. See
// fs/read_write.c:fdget_pos, fs.read_write.c:fdput_pos and FMODE_ATOMIC_POS.
//
// In contrast, File synchronizes any operation that could take a long time
// under a single abortable mutex which also synchronizes lseek(2), read(2),
// and write(2).
//
// FIXME(b/38451980): Split synchronization from cancellation.
//
// +stateify savable
type File struct {
	refs.AtomicRefCount

	// UniqueID is the globally unique identifier of the File.
	UniqueID uint64

	// Dirent is the Dirent backing this File. This encodes the name
	// of the File via Dirent.FullName() as well as its identity via the
	// Dirent's Inode. The Dirent is non-nil.
	//
	// A File holds a reference to this Dirent. Using the returned Dirent is
	// only safe as long as a reference on the File is held. The association
	// between a File and a Dirent is immutable.
	//
	// Files that are not parented in a filesystem return a root Dirent
	// that holds a reference to their Inode.
	//
	// The name of the Dirent may reflect parentage if the Dirent is not a
	// root Dirent or the identity of the File on a pseudo filesystem (pipefs,
	// sockfs, etc).
	//
	// Multiple Files may hold a reference to the same Dirent. This is the
	// common case for Files that are parented and maintain consistency with
	// other files via the Dirent cache.
	Dirent *Dirent

	// flagsMu protects flags and async below.
	flagsMu sync.Mutex `state:"nosave"`

	// flags are the File's flags. Setting or getting flags is fully atomic
	// and is not protected by mu (below).
	flags FileFlags

	// async handles O_ASYNC notifications.
	async FileAsync

	// saving indicates that this file is in the process of being saved.
	saving bool `state:"nosave"`

	// mu is dual-purpose: first, to make read(2) and write(2) thread-safe
	// in conformity with POSIX, and second, to cancel operations before they
	// begin in response to interruptions (i.e. signals).
	mu amutex.AbortableMutex `state:"nosave"`

	// FileOperations implements file system specific behavior for this File.
	FileOperations FileOperations `state:"wait"`

	// offset is the File's offset. Updating offset is protected by mu but
	// can be read atomically via File.Offset() outside of mu.
	offset int64
}

// NewFile returns a File. It takes a reference on the Dirent and owns the
// lifetime of the FileOperations. Files that do not support reading and
// writing at an arbitrary offset should set flags.Pread and flags.Pwrite
// to false respectively.
func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File {
	dirent.IncRef()
	f := File{
		UniqueID:       uniqueid.GlobalFromContext(ctx),
		Dirent:         dirent,
		FileOperations: fops,
		flags:          flags,
	}
	f.mu.Init()
	f.EnableLeakCheck("fs.File")
	return &f
}

// DecRef destroys the File when it is no longer referenced.
func (f *File) DecRef() {
	f.DecRefWithDestructor(func() {
		// Drop BSD style locks.
		lockRng := lock.LockRange{Start: 0, End: lock.LockEOF}
		f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng)

		// Release resources held by the FileOperations.
		f.FileOperations.Release()

		// Release a reference on the Dirent.
		f.Dirent.DecRef()

		// Only unregister if we are currently registered. There is nothing
		// to register if f.async is nil (this happens when async mode is
		// enabled without setting an owner). Also, we unregister during
		// save.
		f.flagsMu.Lock()
		if !f.saving && f.flags.Async && f.async != nil {
			f.async.Unregister(f)
		}
		f.async = nil
		f.flagsMu.Unlock()
	})
}

// Flags atomically loads the File's flags.
func (f *File) Flags() FileFlags {
	f.flagsMu.Lock()
	flags := f.flags
	f.flagsMu.Unlock()
	return flags
}

// SetFlags atomically changes the File's flags to the values contained
// in newFlags. See SettableFileFlags for values that can be set.
func (f *File) SetFlags(newFlags SettableFileFlags) {
	f.flagsMu.Lock()
	f.flags.Direct = newFlags.Direct
	f.flags.NonBlocking = newFlags.NonBlocking
	f.flags.Append = newFlags.Append
	if f.async != nil {
		if newFlags.Async && !f.flags.Async {
			f.async.Register(f)
		}
		if !newFlags.Async && f.flags.Async {
			f.async.Unregister(f)
		}
	}
	f.flags.Async = newFlags.Async
	f.flagsMu.Unlock()
}

// Offset atomically loads the File's offset.
func (f *File) Offset() int64 {
	return atomic.LoadInt64(&f.offset)
}

// Readiness implements waiter.Waitable.Readiness.
func (f *File) Readiness(mask waiter.EventMask) waiter.EventMask {
	return f.FileOperations.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (f *File) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	f.FileOperations.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (f *File) EventUnregister(e *waiter.Entry) {
	f.FileOperations.EventUnregister(e)
}

// Seek calls f.FileOperations.Seek with f as the File, updating the file
// offset to the value returned by f.FileOperations.Seek if the operation
// is successful.
//
// Returns syserror.ErrInterrupted if seeking was interrupted.
func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) {
	if !f.mu.Lock(ctx) {
		return 0, syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	newOffset, err := f.FileOperations.Seek(ctx, f, whence, offset)
	if err == nil {
		atomic.StoreInt64(&f.offset, newOffset)
	}
	return newOffset, err
}

// Readdir reads the directory entries of this File and writes them out
// to the DentrySerializer until entries can no longer be written. If even
// a single directory entry is written then Readdir returns a nil error
// and the directory offset is advanced.
//
// Readdir unconditionally updates the access time on the File's Inode,
// see fs/readdir.c:iterate_dir.
//
// Returns syserror.ErrInterrupted if reading was interrupted.
func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error {
	if !f.mu.Lock(ctx) {
		return syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	offset, err := f.FileOperations.Readdir(ctx, f, serializer)
	atomic.StoreInt64(&f.offset, offset)
	return err
}

// Readv calls f.FileOperations.Read with f as the File, advancing the file
// offset if f.FileOperations.Read returns bytes read > 0.
//
// Returns syserror.ErrInterrupted if reading was interrupted.
func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) {
	var start time.Time
	if RecordWaitTime {
		start = time.Now()
	}
	if !f.mu.Lock(ctx) {
		IncrementWait(readWait, start)
		return 0, syserror.ErrInterrupted
	}

	reads.Increment()
	n, err := f.FileOperations.Read(ctx, f, dst, f.offset)
	if n > 0 && !f.flags.NonSeekable {
		atomic.AddInt64(&f.offset, n)
	}
	f.mu.Unlock()
	IncrementWait(readWait, start)
	return n, err
}

// Preadv calls f.FileOperations.Read with f as the File. It does not
// advance the file offset. If !f.Flags().Pread, Preadv should not be
// called.
//
// Otherwise same as Readv.
func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
	var start time.Time
	if RecordWaitTime {
		start = time.Now()
	}
	if !f.mu.Lock(ctx) {
		IncrementWait(readWait, start)
		return 0, syserror.ErrInterrupted
	}

	reads.Increment()
	n, err := f.FileOperations.Read(ctx, f, dst, offset)
	f.mu.Unlock()
	IncrementWait(readWait, start)
	return n, err
}

// Writev calls f.FileOperations.Write with f as the File, advancing the
// file offset if f.FileOperations.Write returns bytes written > 0.
//
// Writev positions the write offset at EOF if f.Flags().Append. This is
// unavoidably racy for network file systems. Writev also truncates src
// to avoid overrunning the current file size limit if necessary.
//
// Returns syserror.ErrInterrupted if writing was interrupted.
func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) {
	if !f.mu.Lock(ctx) {
		return 0, syserror.ErrInterrupted
	}
	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
	// Handle append mode.
	if f.Flags().Append {
		if err := f.offsetForAppend(ctx, &f.offset); err != nil {
			unlockAppendMu()
			f.mu.Unlock()
			return 0, err
		}
	}

	// Enforce file limits.
	limit, ok := f.checkLimit(ctx, f.offset)
	switch {
	case ok && limit == 0:
		unlockAppendMu()
		f.mu.Unlock()
		return 0, syserror.ErrExceedsFileSizeLimit
	case ok:
		src = src.TakeFirst64(limit)
	}

	// We must hold the lock during the write.
	n, err := f.FileOperations.Write(ctx, f, src, f.offset)
	if n >= 0 && !f.flags.NonSeekable {
		atomic.StoreInt64(&f.offset, f.offset+n)
	}
	unlockAppendMu()
	f.mu.Unlock()
	return n, err
}

// Pwritev calls f.FileOperations.Write with f as the File. It does not
// advance the file offset. If !f.Flags().Pwritev, Pwritev should not be
// called.
//
// Otherwise same as Writev.
func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	// "POSIX requires that opening a file with the O_APPEND flag should
	// have no effect on the location at which pwrite() writes data.
	// However, on Linux, if a file is opened with O_APPEND,  pwrite()
	// appends data to the end of the file, regardless of the value of
	// offset."
	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
	defer unlockAppendMu()
	if f.Flags().Append {
		if err := f.offsetForAppend(ctx, &offset); err != nil {
			return 0, err
		}
	}

	// Enforce file limits.
	limit, ok := f.checkLimit(ctx, offset)
	switch {
	case ok && limit == 0:
		return 0, syserror.ErrExceedsFileSizeLimit
	case ok:
		src = src.TakeFirst64(limit)
	}

	return f.FileOperations.Write(ctx, f, src, offset)
}

// offsetForAppend atomically sets the given offset to the end of the file.
//
// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
// writing.
func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
	uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
	if err != nil {
		// This is an odd error, we treat it as evidence that
		// something is terribly wrong with the filesystem.
		return syserror.EIO
	}

	// Update the offset.
	atomic.StoreInt64(offset, uattr.Size)

	return nil
}

// checkLimit checks the offset that the write will be performed at. The
// returned boolean indicates that the write must be limited. The returned
// integer indicates the new maximum write length.
func (f *File) checkLimit(ctx context.Context, offset int64) (int64, bool) {
	if IsRegular(f.Dirent.Inode.StableAttr) {
		// Enforce size limits.
		fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
		if fileSizeLimit <= math.MaxInt64 {
			if offset >= int64(fileSizeLimit) {
				return 0, true
			}
			return int64(fileSizeLimit) - offset, true
		}
	}

	return 0, false
}

// Fsync calls f.FileOperations.Fsync with f as the File.
//
// Returns syserror.ErrInterrupted if syncing was interrupted.
func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error {
	if !f.mu.Lock(ctx) {
		return syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	return f.FileOperations.Fsync(ctx, f, start, end, syncType)
}

// Flush calls f.FileOperations.Flush with f as the File.
//
// Returns syserror.ErrInterrupted if syncing was interrupted.
func (f *File) Flush(ctx context.Context) error {
	if !f.mu.Lock(ctx) {
		return syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	return f.FileOperations.Flush(ctx, f)
}

// ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File.
//
// Returns syserror.ErrInterrupted if interrupted.
func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	if !f.mu.Lock(ctx) {
		return syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	return f.FileOperations.ConfigureMMap(ctx, f, opts)
}

// UnstableAttr calls f.FileOperations.UnstableAttr with f as the File.
//
// Returns syserror.ErrInterrupted if interrupted.
func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) {
	if !f.mu.Lock(ctx) {
		return UnstableAttr{}, syserror.ErrInterrupted
	}
	defer f.mu.Unlock()

	return f.FileOperations.UnstableAttr(ctx, f)
}

// MappedName implements memmap.MappingIdentity.MappedName.
func (f *File) MappedName(ctx context.Context) string {
	root := RootFromContext(ctx)
	if root != nil {
		defer root.DecRef()
	}
	name, _ := f.Dirent.FullName(root)
	return name
}

// DeviceID implements memmap.MappingIdentity.DeviceID.
func (f *File) DeviceID() uint64 {
	return f.Dirent.Inode.StableAttr.DeviceID
}

// InodeID implements memmap.MappingIdentity.InodeID.
func (f *File) InodeID() uint64 {
	return f.Dirent.Inode.StableAttr.InodeID
}

// Msync implements memmap.MappingIdentity.Msync.
func (f *File) Msync(ctx context.Context, mr memmap.MappableRange) error {
	return f.Fsync(ctx, int64(mr.Start), int64(mr.End-1), SyncData)
}

// A FileAsync sends signals to its owner when w is ready for IO.
type FileAsync interface {
	Register(w waiter.Waitable)
	Unregister(w waiter.Waitable)
}

// Async gets the stored FileAsync or creates a new one with the supplied
// function. If the supplied function is nil, no FileAsync is created and the
// current value is returned.
func (f *File) Async(newAsync func() FileAsync) FileAsync {
	f.flagsMu.Lock()
	defer f.flagsMu.Unlock()
	if f.async == nil && newAsync != nil {
		f.async = newAsync()
		if f.flags.Async {
			f.async.Register(f)
		}
	}
	return f.async
}

// lockedReader implements io.Reader and io.ReaderAt.
//
// Note this reads the underlying file using the file operations directly. It
// is the responsibility of the caller to ensure that locks are appropriately
// held and offsets updated if required. This should be used only by internal
// functions that perform these operations and checks at other times.
type lockedReader struct {
	// Ctx is the context for the file reader.
	Ctx context.Context

	// File is the file to read from.
	File *File

	// Offset is the offset to start at.
	//
	// This applies only to Read, not ReadAt.
	Offset int64
}

// Read implements io.Reader.Read.
func (r *lockedReader) Read(buf []byte) (int, error) {
	if r.Ctx.Interrupted() {
		return 0, syserror.ErrInterrupted
	}
	n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.Offset)
	r.Offset += n
	return int(n), err
}

// ReadAt implements io.Reader.ReadAt.
func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) {
	if r.Ctx.Interrupted() {
		return 0, syserror.ErrInterrupted
	}
	n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset)
	return int(n), err
}

// lockedWriter implements io.Writer and io.WriterAt.
//
// The same constraints as lockedReader apply; see above.
type lockedWriter struct {
	// Ctx is the context for the file writer.
	Ctx context.Context

	// File is the file to write to.
	File *File

	// Offset is the offset to start at.
	//
	// This applies only to Write, not WriteAt.
	Offset int64
}

// Write implements io.Writer.Write.
func (w *lockedWriter) Write(buf []byte) (int, error) {
	if w.Ctx.Interrupted() {
		return 0, syserror.ErrInterrupted
	}
	n, err := w.WriteAt(buf, w.Offset)
	w.Offset += int64(n)
	return int(n), err
}

// WriteAt implements io.Writer.WriteAt.
func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
	var (
		written int
		err     error
	)
	// The io.Writer contract requires that Write writes all available
	// bytes and does not return short writes. This causes errors with
	// io.Copy, since our own Write interface does not have this same
	// contract. Enforce that here.
	for written < len(buf) {
		if w.Ctx.Interrupted() {
			return written, syserror.ErrInterrupted
		}
		var n int64
		n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written))
		if n > 0 {
			written += int(n)
		}
		if err != nil {
			break
		}
	}
	return written, err
}