diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/kernel/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 166 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_test.go | 4 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_unsafe.go | 98 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 31 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 9 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exec.go | 3 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/error.go | 72 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_file.go | 2 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/BUILD | 24 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64.go | 16 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go | 25 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go | 25 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/vfs2/sys_read.go | 95 |
15 files changed, 483 insertions, 89 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0738946d9..a27628c0a 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -188,6 +188,7 @@ go_library( "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 9460bb235..56b70ce96 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // Note that this is immutable and can only be changed via operations on the // descriptorTable. // +// It contains both VFS1 and VFS2 file types, but only one of them can be set. +// // +stateify savable type descriptor struct { - file *fs.File - flags FDFlags + // TODO(gvisor.dev/issue/1624): Remove fs.File. + file *fs.File + fileVFS2 *vfs.FileDescription + flags FDFlags } // FDTable is used to manage File references and flags. @@ -95,10 +100,11 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ - file: file, - flags: flags, + file: file, + fileVFS2: fileVFS2, + flags: flags, } }) return m @@ -107,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { f.init() // Initialize table. for fd, d := range m { - f.set(fd, d.file, d.flags) - - // Note that we do _not_ need to acquire a extra table - // reference here. The table reference will already be - // accounted for in the file, so we drop the reference taken by - // set above. - d.file.DecRef() + f.setAll(fd, d.file, d.fileVFS2, d.flags) + + // Note that we do _not_ need to acquire a extra table reference here. The + // table reference will already be accounted for in the file, so we drop the + // reference taken by set above. + switch { + case d.file != nil: + d.file.DecRef() + case d.fileVFS2 != nil: + d.fileVFS2.DecRef() + } } } @@ -139,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) { file.DecRef() } +// dropVFS2 drops the table reference. +func (f *FDTable) dropVFS2(file *vfs.FileDescription) { + // TODO(gvisor.dev/issue/1480): Release locks. + // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Drop the table reference. + file.DecRef() +} + // ID returns a unique identifier for this FDTable. func (f *FDTable) ID() uint64 { return f.uid @@ -156,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable { // destroy removes all of the file descriptors from the map. func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, FDFlags) bool { + f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } @@ -175,19 +194,26 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { fd := int32(0) for { - file, flags, ok := f.get(fd) + file, fileVFS2, flags, ok := f.getAll(fd) if !ok { break } - if file != nil { + switch { + case file != nil: if !file.TryIncRef() { continue // Race caught. } - fn(int32(fd), file, flags) + fn(fd, file, nil, flags) file.DecRef() + case fileVFS2 != nil: + if !fileVFS2.TryIncRef() { + continue // Race caught. + } + fn(fd, nil, fileVFS2, flags) + fileVFS2.DecRef() } fd++ } @@ -196,9 +222,21 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) { // String is a stringer for FDTable. func (f *FDTable) String() string { var b bytes.Buffer - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - n, _ := file.Dirent.FullName(nil /* root */) - b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + switch { + case file != nil: + n, _ := file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n)) + + case fileVFS2 != nil: + fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + // TODO(gvisor.dev/issue/1623): We have no context nor root. Will this work? + name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + if err != nil { + b.WriteString(fmt.Sprintf("<err: %v>\n", err)) + } + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name)) + } }) return b.String() } @@ -262,6 +300,17 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + return f.newFDAt(ctx, fd, file, nil, flags) +} + +// NewFDAtVFS2 sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { + return f.newFDAt(ctx, fd, nil, file, flags) +} + +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return syscall.EBADF @@ -278,7 +327,7 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD } // Install the entry. - f.set(fd, file, flags) + f.setAll(fd, file, fileVFS2, flags) return nil } @@ -330,10 +379,35 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { } } +// GetVFS2 returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.getVFS2(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + // GetFDs returns a list of valid fds. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -344,7 +418,19 @@ func (f *FDTable) GetFDs() []int32 { // they're done using the slice. func (f *FDTable) GetRefs() []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, flags FDFlags) { + f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// GetRefsVFS2 returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, f.Size()) + f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -355,10 +441,15 @@ func (f *FDTable) GetRefs() []*fs.File { func (f *FDTable) Fork() *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. - clone.set(fd, file, flags) + switch { + case file != nil: + clone.set(fd, file, flags) + case fileVFS2 != nil: + clone.setVFS2(fd, fileVFS2, flags) + } }) return clone } @@ -366,9 +457,9 @@ func (f *FDTable) Fork() *FDTable { // Remove removes an FD from and returns a non-file iff successful. // // N.B. Callers are required to use DecRef when they are done. -func (f *FDTable) Remove(fd int32) *fs.File { +func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { if fd < 0 { - return nil + return nil, nil } f.mu.Lock() @@ -379,21 +470,26 @@ func (f *FDTable) Remove(fd int32) *fs.File { f.next = fd } - orig, _, _ := f.get(fd) - if orig != nil { - orig.IncRef() // Reference for caller. - f.set(fd, nil, FDFlags{}) // Zap entry. + orig, orig2, _, _ := f.getAll(fd) + + // Add reference for caller. + switch { + case orig != nil: + orig.IncRef() + case orig2 != nil: + orig2.IncRef() } - return orig + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + return orig, orig2 } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) { +func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, flags FDFlags) { - if cond(file, flags) { + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. if fd < f.next { diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 261b815f2..29f95a2c4 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) { t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) } - ref := fdTable.Remove(1) + ref, _ := fdTable.Remove(1) if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } ref.DecRef() - if ref := fdTable.Remove(1); ref != nil { + if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") } }) diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index e9fdb0917..7fd97dc53 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -19,6 +19,7 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" ) type descriptorTable struct { @@ -41,15 +42,38 @@ func (f *FDTable) init() { // //go:nosplit func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + file, _, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getVFS2 gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) { + _, file, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getAll gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) { slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) if fd >= int32(len(slice)) { - return nil, FDFlags{}, false + return nil, nil, FDFlags{}, false } d := (*descriptor)(atomic.LoadPointer(&slice[fd])) if d == nil { - return nil, FDFlags{}, true + return nil, nil, FDFlags{}, true } - return d.file, d.flags, true + if d.file != nil && d.fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + return d.file, d.fileVFS2, d.flags, true } // set sets an entry. @@ -59,6 +83,30 @@ func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { // // Precondition: mu must be held. func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + f.setAll(fd, file, nil, flags) +} + +// setVFS2 sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { + f.setAll(fd, nil, file, flags) +} + +// setAll sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if file != nil && fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. @@ -71,33 +119,51 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } - // Create the new element. - var d *descriptor - if file != nil { - d = &descriptor{ - file: file, - flags: flags, + var desc *descriptor + if file != nil || fileVFS2 != nil { + desc = &descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, } } // Update the single element. - orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d))) + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) // Acquire a table reference. - if file != nil && (orig == nil || file != orig.file) { - file.IncRef() + if desc != nil { + switch { + case desc.file != nil: + if orig == nil || desc.file != orig.file { + desc.file.IncRef() + } + case desc.fileVFS2 != nil: + if orig == nil || desc.fileVFS2 != orig.fileVFS2 { + desc.fileVFS2.IncRef() + } + } } // Drop the table reference. - if orig != nil && file != orig.file { - f.drop(orig.file) + if orig != nil { + switch { + case orig.file != nil: + if desc == nil || desc.file != orig.file { + f.drop(orig.file) + } + case orig.fileVFS2 != nil: + if desc == nil || desc.fileVFS2 != orig.fileVFS2 { + f.dropVFS2(orig.fileVFS2) + } + } } // Adjust used. switch { - case orig == nil && file != nil: + case orig == nil && desc != nil: atomic.AddInt32(&f.used, 1) - case orig != nil && file == nil: + case orig != nil && desc == nil: atomic.AddInt32(&f.used, -1) } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 7b90fac5a..dcd6e91c4 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -65,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -435,17 +436,17 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File) error { + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } -// forEachFDPaused applies the given function to each open file descriptor in each -// task. +// forEachFDPaused applies the given function to each open file descriptor in +// each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { +func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -453,8 +454,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { - if lastErr := f(file); lastErr != nil && err == nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } }) @@ -463,7 +464,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) { } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - return ts.forEachFDPaused(func(file *fs.File) error { + // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -474,12 +476,9 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := file.Dirent.FullName(nil /* root */) - // Wrap this error in ErrSaveRejection - // so that it will trigger a save - // error, rather than a panic. This - // also allows us to distinguish Fsync - // errors from state file errors in - // state.Save. + // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. return fs.ErrSaveRejection{ Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), } @@ -519,7 +518,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -921,7 +920,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } @@ -951,7 +950,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 95adf2778..981e8c7fe 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -743,6 +744,14 @@ func (t *Task) GetFile(fd int32) *fs.File { return f } +// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2. +// +// Precondition: same as FDTable.Get. +func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription { + f, _ := t.fdTable.GetVFS2(fd) + return f +} + // NewFDs is a convenience wrapper for t.FDTable().NewFDs. // // This automatically passes the task as the context. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index fa6528386..8f57a34a6 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -69,6 +69,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -198,7 +199,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 8d6c52850..be16ee686 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -93,6 +93,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 60469549d..64de56ac5 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,20 +32,58 @@ var ( partialResultOnce sync.Once ) +// HandleIOErrorVFS2 handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + fs := f.Mount().Filesystem().VirtualFilesystem() + root := vfs.RootFromContext(t) + name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry()) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + // handleIOError handles special error cases for partial results. For some // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + known, err := handleIOErrorImpl(t, partialResult, err, intr, op) + if err != nil { + return err + } + if !known { + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + } + return nil +} + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// Returns false if error is unknown. +func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) { switch err { case nil: // Typical successful syscall. - return nil + return true, nil case io.EOF: // EOF is always consumed. If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. - return nil + return true, nil case syserror.ErrExceedsFileSizeLimit: // Ignore partialResult because this error only applies to // normal files, and for those files we cannot accumulate @@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) - return syserror.EFBIG + return true, syserror.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). if partialResult { - return nil + return true, nil } - return intr + return true, intr } if !partialResult { // Typical syscall error. - return err + return true, err } switch err { @@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // read/write. Like ErrWouldBlock, since we have a // partial read/write, we consume the error and return // the partial result. - return nil + return true, nil case syserror.EFAULT: // EFAULT is only shown the user if nothing was // read/written. If we read something (this case), they see // a partial read/write. They will then presumably try again // with an incremented buffer, which will EFAULT with // result == 0. - return nil + return true, nil case syserror.EPIPE: // Writes to a pipe or socket will return EPIPE if the other // side is gone. The partial write is returned. EPIPE will be @@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. - return nil + return true, nil case syserror.ENOSPC: // Similar to EPIPE. Return what we wrote this time, and let // ENOSPC be returned on the next call. - return nil + return true, nil case syserror.ECONNRESET: // For TCP sendfile connections, we may have a reset. But we // should just return n as the result. - return nil + return true, nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. - return nil + return true, nil } switch err.(type) { case kernel.SyscallRestartErrno: // Identical to the EINTR case. - return nil + return true, nil } - // An unknown error is encountered with a partial read/write. - name, _ := f.Dirent.FullName(nil /* ignore chroot */) - log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) - partialResultOnce.Do(partialResultMetric.Increment) - return nil + // Error is unknown and cannot be properly handled. + return false, nil } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index c54735148..421845ebb 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). - file := t.FDTable().Remove(fd) + file, _ := t.FDTable().Remove(fd) if file == nil { return 0, nil, syserror.EBADF } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD new file mode 100644 index 000000000..6b8a00b6e --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "vfs2", + srcs = [ + "linux64.go", + "linux64_override_amd64.go", + "linux64_override_arm64.go", + "sys_read.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/kernel", + "//pkg/sentry/syscalls", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go new file mode 100644 index 000000000..19ee36081 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go @@ -0,0 +1,16 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. +package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go new file mode 100644 index 000000000..c134714ee --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[0] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go new file mode 100644 index 000000000..6af5c400f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" +) + +// Override syscall table to add syscalls implementations from this package. +func Override(table map[uintptr]kernel.Syscall) { + table[63] = syscalls.Supported("read", Read) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go new file mode 100644 index 000000000..b9fb58464 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.IsReadable() { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock { + return n, err + } + + // Register for notifications. + _, ch := waiter.NewChannelEntry(nil) + // file.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + //file.EventUnregister(&w) + + return total, err +} |