diff options
author | Fabricio Voznika <fvoznika@google.com> | 2020-01-28 15:13:46 -0800 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-01-28 15:31:03 -0800 |
commit | 437c986c6a0ed0e1fccfbfb6706f43d2c801c444 (patch) | |
tree | c3ae0e02f85601e59ec58d946fc68cb151c44bbe /pkg | |
parent | 2862b0b1be9ce821e86877802b9608aad3102916 (diff) |
Add vfs.FileDescription to FD table
FD table now holds both VFS1 and VFS2 types and uses the correct
one based on what's set.
Parts of this CL are just initial changes (e.g. sys_read.go,
runsc/main.go) to serve as a template for the remaining changes.
Updates #1487
Updates #1623
PiperOrigin-RevId: 292023223
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/sentry/kernel/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 166 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_test.go | 4 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_unsafe.go | 98 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 31 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 9 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exec.go | 3 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/error.go | 72 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_file.go | 2 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/BUILD | 24 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64.go | 16 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go | 25 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go | 25 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/sys_read.go | 95 |
15 files changed, 483 insertions, 89 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0738946d9..a27628c0a 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -188,6 +188,7 @@ go_library( "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 9460bb235..56b70ce96 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // Note that this is immutable and can only be changed via operations on the // descriptorTable. // +// It contains both VFS1 and VFS2 file types, but only one of them can be set. +// // +stateify savable type descriptor struct { - file *fs.File - flags FDFlags + // TODO(gvisor.dev/issue/1624): Remove fs.File. + file *fs.File + fileVFS2 *vfs.FileDescription + flags FDFlags } // FDTable is used to manage File references and flags. @@ -95,10 +100,11 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ - file: file, - flags: flags, + file: file, + fileVFS2: fileVFS2, + flags: flags, } }) return m @@ -107,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { f.init() // Initialize table. for fd, d := range m { - f.set(fd, d.file, d.flags) - - // Note that we do _not_ need to acquire a extra table - // reference here. The table reference will already be - // accounted for in the file, so we drop the reference taken by - // set above. - d.file.DecRef() + f.setAll(fd, d.file, d.fileVFS2, d.flags) + + // Note that we do _not_ need to acquire a extra table reference here. The + // table reference will already be accounted for in the file, so we drop the + // reference taken by set above. + switch { + case d.file != nil: + d.file.DecRef() + case d.fileVFS2 != nil: + d.fileVFS2.DecRef() + } } } @@ -139,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) { file.DecRef() } +// dropVFS2 drops the table reference. +func (f *FDTable) dropVFS2(file *vfs.FileDescription) { + // TODO(gvisor.dev/issue/1480): Release locks. + // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Drop the table reference. + file.DecRef() +} + // ID returns a unique identifier for this FDTable. func (f *FDTable) ID() uint64 { return f.uid @@ -156,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable { // destroy removes all of the file descriptors from the map. func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { + f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } @@ -175,19 +194,26 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { fd := int32(0) for { - file, flags, ok := f.get(fd) + file, fileVFS2, flags, ok := f.getAll(fd) if !ok { break } - if file != nil { + switch { + case file != nil: if !file.TryIncRef() { continue // Race caught. } - fn(int32(fd), file, flags) + fn(fd, file, nil, flags) file.DecRef() + case fileVFS2 != nil: + if !fileVFS2.TryIncRef() { + continue // Race caught. + } + fn(fd, nil, fileVFS2, flags) + fileVFS2.DecRef() } fd++ } @@ -196,9 +222,21 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { // String is a stringer for FDTable. func (f *FDTable) String() string { var b bytes.Buffer - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - n, _ := file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + switch { + case file != nil: + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + + case fileVFS2 != nil: + fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + // TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work? + name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + if err != nil { + b.WriteString(fmt.Sprintf("<err: %v>\n", err)) + } + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name)) + } }) return b.String() } @@ -262,6 +300,17 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + return f.newFDAt(ctx, fd, file, nil, flags) +} + +// NewFDAtVFS2 sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { + return f.newFDAt(ctx, fd, nil, file, flags) +} + +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return syscall.EBADF @@ -278,7 +327,7 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD } // Install the entry. - f.set(fd, file, flags) + f.setAll(fd, file, fileVFS2, flags) return nil } @@ -330,10 +379,35 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { } } +// GetVFS2 returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.getVFS2(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + // GetFDs returns a list of valid fds. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -344,7 +418,19 @@ func (f *FDTable) GetFDs() []int32 { // they're done using the slice. func (f *FDTable) GetRefs() []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// GetRefsVFS2 returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, f.Size()) + f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -355,10 +441,15 @@ func (f *FDTable) GetRefs() []*fs.File { func (f *FDTable) Fork() *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. - clone.set(fd, file, flags) + switch { + case file != nil: + clone.set(fd, file, flags) + case fileVFS2 != nil: + clone.setVFS2(fd, fileVFS2, flags) + } }) return clone } @@ -366,9 +457,9 @@ func (f *FDTable) Fork() *FDTable { // Remove removes an FD from and returns a non-file iff successful. // // N.B. Callers are required to use DecRef when they are done. -func (f *FDTable) Remove(fd int32) *fs.File { +func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { if fd < 0 { - return nil + return nil, nil } f.mu.Lock() @@ -379,21 +470,26 @@ func (f *FDTable) Remove(fd int32) *fs.File { f.next = fd } - orig, _, _ := f.get(fd) - if orig != nil { - orig.IncRef() // Reference for caller. - f.set(fd, nil, FDFlags{}) // Zap entry. + orig, orig2, _, _ := f.getAll(fd) + + // Add reference for caller. + switch { + case orig != nil: + orig.IncRef() + case orig2 != nil: + orig2.IncRef() } - return orig + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + return orig, orig2 } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { +func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - if cond(file, flags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. if fd < f.next { diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 261b815f2..29f95a2c4 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) { t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) } - ref := fdTable.Remove(1) + ref, _ := fdTable.Remove(1) if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } ref.DecRef() - if ref := fdTable.Remove(1); ref != nil { + if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") } }) diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index e9fdb0917..7fd97dc53 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -19,6 +19,7 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" ) type descriptorTable struct { @@ -41,15 +42,38 @@ func (f *FDTable) init() { // //go:nosplit func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + file, _, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getVFS2 gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) { + _, file, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getAll gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) { slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) if fd >= int32(len(slice)) { - return nil, FDFlags{}, false + return nil, nil, FDFlags{}, false } d := (*descriptor)(atomic.LoadPointer(&slice[fd])) if d == nil { - return nil, FDFlags{}, true + return nil, nil, FDFlags{}, true } - return d.file, d.flags, true + if d.file != nil && d.fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + return d.file, d.fileVFS2, d.flags, true } // set sets an entry. @@ -59,6 +83,30 @@ func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { // // Precondition: mu must be held. func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + f.setAll(fd, file, nil, flags) +} + +// setVFS2 sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { + f.setAll(fd, nil, file, flags) +} + +// setAll sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if file != nil && fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. @@ -71,33 +119,51 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } - // Create the new element. - var d *descriptor - if file != nil { - d = &descriptor{ - file: file, - flags: flags, + var desc *descriptor + if file != nil || fileVFS2 != nil { + desc = &descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, } } // Update the single element. - orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) // Acquire a table reference. - if file != nil && (orig == nil || file != orig.file) { - file.IncRef() + if desc != nil { + switch { + case desc.file != nil: + if orig == nil || desc.file != orig.file { + desc.file.IncRef() + } + case desc.fileVFS2 != nil: + if orig == nil || desc.fileVFS2 != orig.fileVFS2 { + desc.fileVFS2.IncRef() + } + } } // Drop the table reference. - if orig != nil && file != orig.file { - f.drop(orig.file) + if orig != nil { + switch { + case orig.file != nil: + if desc == nil || desc.file != orig.file { + f.drop(orig.file) + } + case orig.fileVFS2 != nil: + if desc == nil || desc.fileVFS2 != orig.fileVFS2 { + f.dropVFS2(orig.fileVFS2) + } + } } // Adjust used. switch { - case orig == nil && file != nil: + case orig == nil && desc != nil: atomic.AddInt32(&f.used, 1) - case orig != nil && file == nil: + case orig != nil && desc == nil: atomic.AddInt32(&f.used, -1) } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 7b90fac5a..dcd6e91c4 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -65,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -435,17 +436,17 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File) error { + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } -// forEachFDPaused applies the given function to each open file descriptor in each -// task. +// forEachFDPaused applies the given function to each open file descriptor in +// each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -453,8 +454,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { - if lastErr := f(file); lastErr != nil && err == nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } }) @@ -463,7 +464,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(file *fs.File) error { + // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -474,12 +476,9 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := file.Dirent.FullName(nil /* root */) - // Wrap this error in ErrSaveRejection - // so that it will trigger a save - // error, rather than a panic. This - // also allows us to distinguish Fsync - // errors from state file errors in - // state.Save. + // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. return fs.ErrSaveRejection{ Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), } @@ -519,7 +518,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -921,7 +920,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } @@ -951,7 +950,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 95adf2778..981e8c7fe 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -743,6 +744,14 @@ func (t *Task) GetFile(fd int32) *fs.File { return f } +// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2. +// +// Precondition: same as FDTable.Get. +func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription { + f, _ := t.fdTable.GetVFS2(fd) + return f +} + // NewFDs is a convenience wrapper for t.FDTable().NewFDs. // // This automatically passes the task as the context. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index fa6528386..8f57a34a6 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -69,6 +69,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -198,7 +199,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 8d6c52850..be16ee686 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -93,6 +93,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 60469549d..64de56ac5 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,20 +32,58 @@ var ( partialResultOnce sync.Once ) +// HandleIOErrorVFS2 handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + fs := f.Mount().Filesystem().VirtualFilesystem() + root := vfs.RootFromContext(t) + name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry()) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + // handleIOError handles special error cases for partial results. For some // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// Returns false if error is unknown. +func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) { switch err { case nil: // Typical successful syscall. - return nil + return true, nil case io.EOF: // EOF is always consumed. If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. - return nil + return true, nil case syserror.ErrExceedsFileSizeLimit: // Ignore partialResult because this error only applies to // normal files, and for those files we cannot accumulate @@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syserror.EFBIG + return true, syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). if partialResult { - return nil + return true, nil } - return intr + return true, intr } if !partialResult { // Typical syscall error. - return err + return true, err } switch err { @@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // read/write. Like ErrWouldBlock, since we have a // partial read/write, we consume the error and return // the partial result. - return nil + return true, nil case syserror.EFAULT: // EFAULT is only shown the user if nothing was // read/written. If we read something (this case), they see // a partial read/write. They will then presumably try again // with an incremented buffer, which will EFAULT with // result == 0. - return nil + return true, nil case syserror.EPIPE: // Writes to a pipe or socket will return EPIPE if the other // side is gone. The partial write is returned. EPIPE will be @@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. - return nil + return true, nil case syserror.ENOSPC: // Similar to EPIPE. Return what we wrote this time, and let // ENOSPC be returned on the next call. - return nil + return true, nil case syserror.ECONNRESET: // For TCP sendfile connections, we may have a reset. But we // should just return n as the result. - return nil + return true, nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. - return nil + return true, nil } switch err.(type) { case kernel.SyscallRestartErrno: // Identical to the EINTR case. - return nil + return true, nil } - // An unknown error is encountered with a partial read/write. - name, _ := f.Dirent.FullName(nil /* ignore chroot */) - log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) - partialResultOnce.Do(partialResultMetric.Increment) - return nil + // Error is unknown and cannot be properly handled. + return false, nil } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index c54735148..421845ebb 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). - file := t.FDTable().Remove(fd) + file, _ := t.FDTable().Remove(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD new file mode 100644 index 000000000..6b8a00b6e --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "vfs2", + srcs = [ + "linux64.go", + "linux64_override_amd64.go", + "linux64_override_arm64.go", + "sys_read.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/kernel", + "//pkg/sentry/syscalls", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go new file mode 100644 index 000000000..19ee36081 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go @@ -0,0 +1,16 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. +package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go new file mode 100644 index 000000000..c134714ee --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[0] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go new file mode 100644 index 000000000..6af5c400f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[63] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go new file mode 100644 index 000000000..b9fb58464 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.IsReadable() { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock { + return n, err + } + + // Register for notifications. + _, ch := waiter.NewChannelEntry(nil) + // file.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + //file.EventUnregister(&w) + + return total, err +} |