author     gVisor bot <gvisor-bot@google.com>  2020-01-28 23:44:15 +0000
committer  gVisor bot <gvisor-bot@google.com>  2020-01-28 23:44:15 +0000
commit     5256e3e73cdf999448c930940be8f0b71cf0f8b1 (patch)
tree       65fe40339e619395fd7b331c0e642e5400cacfc1
parent     1afa1ac87c09bd2eea2c61877e2ad78baeee7a8f (diff)
parent     437c986c6a0ed0e1fccfbfb6706f43d2c801c444 (diff)

Merge release-20200115.0-125-g437c986 (automated)
-rw-r--r--  pkg/sentry/kernel/fd_table.go                              166
-rw-r--r--  pkg/sentry/kernel/fd_table_unsafe.go                        98
-rw-r--r--  pkg/sentry/kernel/kernel.go                                 31
-rwxr-xr-x  pkg/sentry/kernel/kernel_state_autogen.go                    2
-rw-r--r--  pkg/sentry/kernel/task.go                                    9
-rw-r--r--  pkg/sentry/kernel/task_exec.go                               3
-rw-r--r--  pkg/sentry/syscalls/linux/error.go                          72
-rw-r--r--  pkg/sentry/syscalls/linux/sys_file.go                        2
-rwxr-xr-x  pkg/sentry/syscalls/linux/vfs2/linux64.go                   16
-rwxr-xr-x  pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go    25
-rwxr-xr-x  pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go    25
-rwxr-xr-x  pkg/sentry/syscalls/linux/vfs2/sys_read.go                  95
-rwxr-xr-x  pkg/sentry/syscalls/linux/vfs2/vfs2_state_autogen.go         4
-rw-r--r--  runsc/boot/config.go                                         3
-rw-r--r--  runsc/boot/loader.go                                         9
15 files changed, 473 insertions(+), 87 deletions(-)
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 9460bb235..56b70ce96 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -27,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) {
// Note that this is immutable and can only be changed via operations on the
// descriptorTable.
//
+// It contains both VFS1 and VFS2 file types, but only one of them can be set.
+//
// +stateify savable
type descriptor struct {
- file *fs.File
- flags FDFlags
+ // TODO(gvisor.dev/issue/1624): Remove fs.File.
+ file *fs.File
+ fileVFS2 *vfs.FileDescription
+ flags FDFlags
}
// FDTable is used to manage File references and flags.
@@ -95,10 +100,11 @@ type FDTable struct {
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
- file: file,
- flags: flags,
+ file: file,
+ fileVFS2: fileVFS2,
+ flags: flags,
}
})
return m
@@ -107,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
f.init() // Initialize table.
for fd, d := range m {
- f.set(fd, d.file, d.flags)
-
- // Note that we do _not_ need to acquire a extra table
- // reference here. The table reference will already be
- // accounted for in the file, so we drop the reference taken by
- // set above.
- d.file.DecRef()
+ f.setAll(fd, d.file, d.fileVFS2, d.flags)
+
+ // Note that we do _not_ need to acquire an extra table reference here. The
+ // table reference will already be accounted for in the file, so we drop the
+ // reference taken by setAll above.
+ switch {
+ case d.file != nil:
+ d.file.DecRef()
+ case d.fileVFS2 != nil:
+ d.fileVFS2.DecRef()
+ }
}
}
@@ -139,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) {
file.DecRef()
}
+// dropVFS2 drops the table reference.
+func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+ // TODO(gvisor.dev/issue/1480): Release locks.
+ // TODO(gvisor.dev/issue/1479): Send inotify events.
+
+ // Drop the table reference.
+ file.DecRef()
+}
+
// ID returns a unique identifier for this FDTable.
func (f *FDTable) ID() uint64 {
return f.uid
@@ -156,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable {
// destroy removes all of the file descriptors from the map.
func (f *FDTable) destroy() {
- f.RemoveIf(func(*fs.File, FDFlags) bool {
+ f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
return true
})
}
@@ -175,19 +194,26 @@ func (f *FDTable) Size() int {
// forEach iterates over all non-nil files.
//
// It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
fd := int32(0)
for {
- file, flags, ok := f.get(fd)
+ file, fileVFS2, flags, ok := f.getAll(fd)
if !ok {
break
}
- if file != nil {
+ switch {
+ case file != nil:
if !file.TryIncRef() {
continue // Race caught.
}
- fn(int32(fd), file, flags)
+ fn(fd, file, nil, flags)
file.DecRef()
+ case fileVFS2 != nil:
+ if !fileVFS2.TryIncRef() {
+ continue // Race caught.
+ }
+ fn(fd, nil, fileVFS2, flags)
+ fileVFS2.DecRef()
}
fd++
}
@@ -196,9 +222,21 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
// String is a stringer for FDTable.
func (f *FDTable) String() string {
var b bytes.Buffer
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
- n, _ := file.Dirent.FullName(nil /* root */)
- b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ switch {
+ case file != nil:
+ n, _ := file.Dirent.FullName(nil /* root */)
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+
+ case fileVFS2 != nil:
+ fs := fileVFS2.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+ // TODO(gvisor.dev/issue/1623): We have neither a context nor a root. Will this work?
+ name, err := fs.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+ if err != nil {
+ b.WriteString(fmt.Sprintf("<err: %v>\n", err))
+ }
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, name))
+ }
})
return b.String()
}
@@ -262,6 +300,17 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+ return f.newFDAt(ctx, fd, file, nil, flags)
+}
+
+// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
+ return f.newFDAt(ctx, fd, nil, file, flags)
+}
+
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
@@ -278,7 +327,7 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD
}
// Install the entry.
- f.set(fd, file, flags)
+ f.setAll(fd, file, fileVFS2, flags)
return nil
}
@@ -330,10 +379,35 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
}
}
+// GetVFS2 returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
+ if fd < 0 {
+ return nil, FDFlags{}
+ }
+
+ for {
+ file, flags, _ := f.getVFS2(fd)
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ // Reference acquired.
+ return file, flags
+ }
+ // No file available.
+ return nil, FDFlags{}
+ }
+}
+
// GetFDs returns a list of valid fds.
func (f *FDTable) GetFDs() []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
@@ -344,7 +418,19 @@ func (f *FDTable) GetFDs() []int32 {
// they're done using the slice.
func (f *FDTable) GetRefs() []*fs.File {
files := make([]*fs.File, 0, f.Size())
- f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ file.IncRef() // Acquire a reference for caller.
+ files = append(files, file)
+ })
+ return files
+}
+
+// GetRefsVFS2 returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
+ files := make([]*vfs.FileDescription, 0, f.Size())
+ f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
@@ -355,10 +441,15 @@ func (f *FDTable) GetRefs() []*fs.File {
func (f *FDTable) Fork() *FDTable {
clone := f.k.NewFDTable()
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- clone.set(fd, file, flags)
+ switch {
+ case file != nil:
+ clone.set(fd, file, flags)
+ case fileVFS2 != nil:
+ clone.setVFS2(fd, fileVFS2, flags)
+ }
})
return clone
}
@@ -366,9 +457,9 @@ func (f *FDTable) Fork() *FDTable {
// Remove removes an FD from the table and returns a non-nil file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) *fs.File {
+func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
- return nil
+ return nil, nil
}
f.mu.Lock()
@@ -379,21 +470,26 @@ func (f *FDTable) Remove(fd int32) *fs.File {
f.next = fd
}
- orig, _, _ := f.get(fd)
- if orig != nil {
- orig.IncRef() // Reference for caller.
- f.set(fd, nil, FDFlags{}) // Zap entry.
+ orig, orig2, _, _ := f.getAll(fd)
+
+ // Add reference for caller.
+ switch {
+ case orig != nil:
+ orig.IncRef()
+ case orig2 != nil:
+ orig2.IncRef()
}
- return orig
+ f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
f.mu.Lock()
defer f.mu.Unlock()
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
- if cond(file, flags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ if cond(file, fileVFS2, flags) {
f.set(fd, nil, FDFlags{}) // Clear from table.
// Update current available position.
if fd < f.next {
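
The fd_table.go changes above let each slot hold either file flavor. A minimal sketch of how a caller consumes the new two-value Remove (hypothetical helper and package name, not part of this commit; the Flush/EBADF handling mirrors Close in sys_file.go below). At most one of the returned pointers is non-nil.

package kernelhelpers // hypothetical

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// closeFD closes fd regardless of which VFS owns it. Remove hands back a
// caller reference for whichever pointer is non-nil; the table's own
// reference was already dropped when the entry was zapped.
func closeFD(t *kernel.Task, fd int32) error {
	file, fileVFS2 := t.FDTable().Remove(fd)
	switch {
	case file != nil:
		defer file.DecRef()
		return file.Flush(t) // close(2) reports flush errors for VFS1 files.
	case fileVFS2 != nil:
		defer fileVFS2.DecRef()
		return nil
	default:
		return syserror.EBADF // Empty slot or negative fd.
	}
}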
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index e9fdb0917..7fd97dc53 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -19,6 +19,7 @@ import (
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
)
type descriptorTable struct {
@@ -41,15 +42,38 @@ func (f *FDTable) init() {
//
//go:nosplit
func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
+ file, _, flags, ok := f.getAll(fd)
+ return file, flags, ok
+}
+
+// getVFS2 gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) {
+ _, file, flags, ok := f.getAll(fd)
+ return file, flags, ok
+}
+
+// getAll gets a file entry.
+//
+// The boolean indicates whether this was in range.
+//
+//go:nosplit
+func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) {
slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
if fd >= int32(len(slice)) {
- return nil, FDFlags{}, false
+ return nil, nil, FDFlags{}, false
}
d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
if d == nil {
- return nil, FDFlags{}, true
+ return nil, nil, FDFlags{}, true
}
- return d.file, d.flags, true
+ if d.file != nil && d.fileVFS2 != nil {
+ panic("VFS1 and VFS2 files set")
+ }
+ return d.file, d.fileVFS2, d.flags, true
}
// set sets an entry.
@@ -59,6 +83,30 @@ func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
//
// Precondition: mu must be held.
func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
+ f.setAll(fd, file, nil, flags)
+}
+
+// setVFS2 sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) {
+ f.setAll(fd, nil, file, flags)
+}
+
+// setAll sets an entry.
+//
+// This handles accounting changes, as well as acquiring and releasing the
+// reference needed by the table iff the file is different.
+//
+// Precondition: mu must be held.
+func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ if file != nil && fileVFS2 != nil {
+ panic("VFS1 and VFS2 files set")
+ }
+
slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
// Grow the table as required.
@@ -71,33 +119,51 @@ func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) {
atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
}
- // Create the new element.
- var d *descriptor
- if file != nil {
- d = &descriptor{
- file: file,
- flags: flags,
+ var desc *descriptor
+ if file != nil || fileVFS2 != nil {
+ desc = &descriptor{
+ file: file,
+ fileVFS2: fileVFS2,
+ flags: flags,
}
}
// Update the single element.
- orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(d)))
+ orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc)))
// Acquire a table reference.
- if file != nil && (orig == nil || file != orig.file) {
- file.IncRef()
+ if desc != nil {
+ switch {
+ case desc.file != nil:
+ if orig == nil || desc.file != orig.file {
+ desc.file.IncRef()
+ }
+ case desc.fileVFS2 != nil:
+ if orig == nil || desc.fileVFS2 != orig.fileVFS2 {
+ desc.fileVFS2.IncRef()
+ }
+ }
}
// Drop the table reference.
- if orig != nil && file != orig.file {
- f.drop(orig.file)
+ if orig != nil {
+ switch {
+ case orig.file != nil:
+ if desc == nil || desc.file != orig.file {
+ f.drop(orig.file)
+ }
+ case orig.fileVFS2 != nil:
+ if desc == nil || desc.fileVFS2 != orig.fileVFS2 {
+ f.dropVFS2(orig.fileVFS2)
+ }
+ }
}
// Adjust used.
switch {
- case orig == nil && file != nil:
+ case orig == nil && desc != nil:
atomic.AddInt32(&f.used, 1)
- case orig != nil && file == nil:
+ case orig != nil && desc == nil:
atomic.AddInt32(&f.used, -1)
}
}
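
setAll's reference handling is the core of the change above. The sketch below (hypothetical installFile, ignoring the atomic slice swap and the VFS1 leg) isolates the accounting rule: the table owns exactly one reference per installed file, so the incoming file is IncRef'd only if it differs from the entry it replaces, and the displaced entry's reference is dropped only if it is actually displaced.

package kernel // sketch only

import "gvisor.dev/gvisor/pkg/sentry/vfs"

// installFile mirrors (but does not reproduce) setAll's accounting for the
// VFS2 leg, without the atomic slice swap.
func installFile(slot **vfs.FileDescription, file *vfs.FileDescription) {
	orig := *slot
	*slot = file
	if file != nil && file != orig {
		file.IncRef() // Acquire the table's reference for the new entry.
	}
	if orig != nil && orig != file {
		orig.DecRef() // Drop the table's reference on the displaced entry.
	}
}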
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7b90fac5a..dcd6e91c4 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -65,6 +65,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/unimpl"
uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/state"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -435,17 +436,17 @@ func (k *Kernel) flushMountSourceRefs() error {
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
- return k.tasks.forEachFDPaused(func(file *fs.File) error {
+ return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
file.Dirent.Inode.MountSource.FlushDirentRefs()
return nil
})
}
-// forEachFDPaused applies the given function to each open file descriptor in each
-// task.
+// forEachFDPaused applies the given function to each open file descriptor in
+// each task.
//
// Precondition: Must be called with the kernel paused.
-func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
+func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) {
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -453,8 +454,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
if t.fdTable == nil {
continue
}
- t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
- if lastErr := f(file); lastErr != nil && err == nil {
+ t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
+ if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
err = lastErr
}
})
@@ -463,7 +464,8 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
}
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
- return ts.forEachFDPaused(func(file *fs.File) error {
+ // TODO(gvisor.dev/issues/1663): Add save support for VFS2.
+ return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
}
@@ -474,12 +476,9 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
if err := fs.SaveFileFsyncError(syncErr); err != nil {
name, _ := file.Dirent.FullName(nil /* root */)
- // Wrap this error in ErrSaveRejection
- // so that it will trigger a save
- // error, rather than a panic. This
- // also allows us to distinguish Fsync
- // errors from state file errors in
- // state.Save.
+ // Wrap this error in ErrSaveRejection so that it will trigger a save
+ // error, rather than a panic. This also allows us to distinguish Fsync
+ // errors from state file errors in state.Save.
return fs.ErrSaveRejection{
Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err),
}
@@ -519,7 +518,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
for t := range ts.Root.tids {
// We can skip locking Task.mu here since the kernel is paused.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
e.UnregisterEpollWaiters()
}
@@ -921,7 +920,7 @@ func (k *Kernel) pauseTimeLocked() {
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
// but ktime.Timer.Pause is idempotent so this is harmless.
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.PauseTimer()
}
@@ -951,7 +950,7 @@ func (k *Kernel) resumeTimeLocked() {
}
}
if t.fdTable != nil {
- t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
+ t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
tfd.ResumeTimer()
}
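
forEachFDPaused now hands visitors both file flavors, exactly one of which is non-nil per call. A hypothetical visitor, sketched as it might sit inside package kernel (countWritableVFS1Files is not part of this commit), following the shape flushWritesToFiles uses: handle VFS1, skip VFS2 until save support exists.

package kernel // sketch only

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// countWritableVFS1Files counts open, writable VFS1 files across all tasks.
func (ts *TaskSet) countWritableVFS1Files() (n int, err error) {
	err = ts.forEachFDPaused(func(file *fs.File, fileVFS2 *vfs.FileDescription) error {
		if fileVFS2 != nil {
			// Skip VFS2 files, as flushWritesToFiles does, until save
			// support exists (gvisor.dev/issues/1663).
			return nil
		}
		if file.Flags().Write {
			n++
		}
		return nil
	})
	return n, err
}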
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index 2c32826c5..0aa106128 100755
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -52,12 +52,14 @@ func (x *descriptor) beforeSave() {}
func (x *descriptor) save(m state.Map) {
x.beforeSave()
m.Save("file", &x.file)
+ m.Save("fileVFS2", &x.fileVFS2)
m.Save("flags", &x.flags)
}
func (x *descriptor) afterLoad() {}
func (x *descriptor) load(m state.Map) {
m.Load("file", &x.file)
+ m.Load("fileVFS2", &x.fileVFS2)
m.Load("flags", &x.flags)
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 95adf2778..981e8c7fe 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -35,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
@@ -743,6 +744,14 @@ func (t *Task) GetFile(fd int32) *fs.File {
return f
}
+// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
+ f, _ := t.fdTable.GetVFS2(fd)
+ return f
+}
+
// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
//
// This automatically passes the task as the context.
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index fa6528386..8f57a34a6 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -69,6 +69,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -198,7 +199,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Unlock()
// Remove FDs with the CloseOnExec flag set.
- t.fdTable.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
return flags.CloseOnExec
})
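
RemoveIf's predicate is widened the same way. A hypothetical use (not in this commit), sketched as if it lived alongside task_exec.go, selecting on the VFS2 side of the pair:

package kernel // sketch only

import (
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// dropVFS2FDs closes every VFS2-backed descriptor in the task's table,
// leaving VFS1 descriptors untouched.
func (t *Task) dropVFS2FDs() {
	t.fdTable.RemoveIf(func(_ *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) bool {
		return fileVFS2 != nil
	})
}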
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 60469549d..64de56ac5 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,20 +32,58 @@ var (
partialResultOnce sync.Once
)
+// HandleIOErrorVFS2 handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// op and f are used only for panics.
+func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ fs := f.Mount().Filesystem().VirtualFilesystem()
+ root := vfs.RootFromContext(t)
+ name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry())
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
// handleIOError handles special error cases for partial results. For some
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error {
+ known, err := handleIOErrorImpl(t, partialResult, err, intr, op)
+ if err != nil {
+ return err
+ }
+ if !known {
+ // An unknown error is encountered with a partial read/write.
+ name, _ := f.Dirent.FullName(nil /* ignore chroot */)
+ log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
+ partialResultOnce.Do(partialResultMetric.Increment)
+ }
+ return nil
+}
+
+// handleIOErrorImpl handles special error cases for partial results. For some
+// errors, we may consume the error and return only the partial read/write.
+//
+// Returns false if the error is unknown.
+func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op string) (bool, error) {
switch err {
case nil:
// Typical successful syscall.
- return nil
+ return true, nil
case io.EOF:
// EOF is always consumed. If this is a partial read/write
// (result != 0), the application will see that, otherwise
// they will see 0.
- return nil
+ return true, nil
case syserror.ErrExceedsFileSizeLimit:
// Ignore partialResult because this error only applies to
// normal files, and for those files we cannot accumulate
@@ -53,20 +92,20 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// Do not consume the error and return it as EFBIG.
// Simultaneously send a SIGXFSZ per setrlimit(2).
t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
- return syserror.EFBIG
+ return true, syserror.EFBIG
case syserror.ErrInterrupted:
// The syscall was interrupted. Return nil if it completed
// partially, otherwise return the error code that the syscall
// needs (to indicate to the kernel what it should do).
if partialResult {
- return nil
+ return true, nil
}
- return intr
+ return true, intr
}
if !partialResult {
// Typical syscall error.
- return err
+ return true, err
}
switch err {
@@ -75,14 +114,14 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// read/write. Like ErrWouldBlock, since we have a
// partial read/write, we consume the error and return
// the partial result.
- return nil
+ return true, nil
case syserror.EFAULT:
// EFAULT is only shown the user if nothing was
// read/written. If we read something (this case), they see
// a partial read/write. They will then presumably try again
// with an incremented buffer, which will EFAULT with
// result == 0.
- return nil
+ return true, nil
case syserror.EPIPE:
// Writes to a pipe or socket will return EPIPE if the other
// side is gone. The partial write is returned. EPIPE will be
@@ -90,32 +129,29 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
//
// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
// also be sent to the application.
- return nil
+ return true, nil
case syserror.ENOSPC:
// Similar to EPIPE. Return what we wrote this time, and let
// ENOSPC be returned on the next call.
- return nil
+ return true, nil
case syserror.ECONNRESET:
// For TCP sendfile connections, we may have a reset. But we
// should just return n as the result.
- return nil
+ return true, nil
case syserror.ErrWouldBlock:
// Syscall would block, but completed a partial read/write.
// This case should only be returned by IssueIO for nonblocking
// files. Since we have a partial read/write, we consume
// ErrWouldBlock, returning the partial result.
- return nil
+ return true, nil
}
switch err.(type) {
case kernel.SyscallRestartErrno:
// Identical to the EINTR case.
- return nil
+ return true, nil
}
- // An unknown error is encountered with a partial read/write.
- name, _ := f.Dirent.FullName(nil /* ignore chroot */)
- log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations)
- partialResultOnce.Do(partialResultMetric.Increment)
- return nil
+ // Error is unknown and cannot be properly handled.
+ return false, nil
}
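
With the error handling split into handleIOErrorImpl plus thin VFS1/VFS2 wrappers, a VFS2 syscall threads its partial result through HandleIOErrorVFS2. A hypothetical write(2) stub (not part of this commit; only read(2) is wired up below) showing the intended call pattern, assuming FileDescription.Write and IOUsage().AccountWriteSyscall behave like their read counterparts.

package vfs2 // hypothetical addition, not in this commit

import (
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// Write sketches how a VFS2 syscall reports a partial result: HandleIOErrorVFS2
// gets the partial flag, the raw error, the restart error, the operation name,
// and the FileDescription for logging unknown errors with a pathname.
func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	addr := args[1].Pointer()
	size := args[2].SizeT()

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	si := int(size)
	if si < 0 {
		return 0, nil, syserror.EINVAL
	}
	src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{AddressSpaceActive: true})
	if err != nil {
		return 0, nil, err
	}

	n, err := file.Write(t, src, vfs.WriteOptions{})
	t.IOUsage().AccountWriteSyscall(n)
	return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file)
}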
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index c54735148..421845ebb 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -767,7 +767,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Note that Remove provides a reference on the file that we may use to
// flush. It is still active until we drop the final reference below
// (and other reference-holding operations complete).
- file := t.FDTable().Remove(fd)
+ file, _ := t.FDTable().Remove(fd)
if file == nil {
return 0, nil, syserror.EBADF
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go
new file mode 100755
index 000000000..19ee36081
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64.go
@@ -0,0 +1,16 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs2 provides syscall implementations that use VFS2.
+package vfs2
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
new file mode 100755
index 000000000..c134714ee
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscall implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+ table[0] = syscalls.Supported("read", Read)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
new file mode 100755
index 000000000..6af5c400f
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls"
+)
+
+// Override syscall table to add syscall implementations from this package.
+func Override(table map[uintptr]kernel.Syscall) {
+ table[63] = syscalls.Supported("read", Read)
+}
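
Both Override variants patch a live syscall table in place. A minimal sketch of how a caller wires that in (hypothetical enableVFS2Syscalls; runsc's loader does essentially this later in the diff when the new VFS2 config flag is set):

package boot // sketch only

import (
	"gvisor.dev/gvisor/pkg/abi"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
)

// enableVFS2Syscalls patches the VFS2 entries over the registered Linux
// syscall table for the host architecture before any task runs.
func enableVFS2Syscalls() {
	if st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host); ok {
		vfs2.Override(st.Table)
	}
}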
diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
new file mode 100755
index 000000000..b9fb58464
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/sys_read.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Read implements linux syscall read(2). Note that we try to get a buffer that
+// is exactly the size requested because some applications like qemu expect
+// they can do large reads all at once. Bug for bug. Same for other read
+// calls below.
+func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ size := args[2].SizeT()
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer file.DecRef()
+
+ // Check that the file is readable.
+ if !file.IsReadable() {
+ return 0, nil, syserror.EBADF
+ }
+
+ // Check that the size is legitimate.
+ si := int(size)
+ if si < 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // Get the destination of the read.
+ dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ n, err := read(t, file, dst, vfs.ReadOptions{})
+ t.IOUsage().AccountReadSyscall(n)
+ return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file)
+}
+
+func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ n, err := file.Read(t, dst, opts)
+ if err != syserror.ErrWouldBlock {
+ return n, err
+ }
+
+ // Register for notifications.
+ _, ch := waiter.NewChannelEntry(nil)
+ // file.EventRegister(&w, EventMaskRead)
+
+ total := n
+ for {
+ // Shorten dst to reflect bytes previously read.
+ dst = dst.DropFirst(int(n))
+
+ // Issue the request and break out if it completes with anything other than
+ // "would block".
+ n, err = file.Read(t, dst, opts)
+ total += n
+ if err != syserror.ErrWouldBlock {
+ break
+ }
+ if err := t.Block(ch); err != nil {
+ break
+ }
+ }
+ //file.EventUnregister(&w)
+
+ return total, err
+}
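
The commented-out EventRegister/EventUnregister lines above are placeholders. A sketch of what the retry loop would look like once that registration is usable (assumption: EventRegister/EventUnregister and waiter.EventIn are available on the VFS2 FileDescription; this is not part of this commit):

package vfs2 // sketch only, not in this commit

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// readBlocking retries a read until it completes with anything other than
// "would block", waiting for read-readiness between attempts and
// unregistering on the way out.
func readBlocking(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence) (int64, error) {
	total, err := file.Read(t, dst, vfs.ReadOptions{})
	if err != syserror.ErrWouldBlock {
		return total, err
	}

	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, waiter.EventIn)
	defer file.EventUnregister(&w)

	for err == syserror.ErrWouldBlock {
		// Wait for a readiness notification, then retry on the unread tail.
		if err = t.Block(ch); err != nil {
			break
		}
		var n int64
		n, err = file.Read(t, dst.DropFirst(int(total)), vfs.ReadOptions{})
		total += n
	}
	return total, err
}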
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2_state_autogen.go b/pkg/sentry/syscalls/linux/vfs2/vfs2_state_autogen.go
new file mode 100755
index 000000000..b049dbca1
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package vfs2
+
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index a878bc2ce..35391030f 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -256,6 +256,9 @@ type Config struct {
//
// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
CPUNumFromQuota bool
+
+ // Enables VFS2 (not plumbed through yet).
+ VFS2 bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index fad72f4ab..9f0d5d7af 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,6 +26,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
@@ -42,6 +43,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -184,6 +186,13 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("setting up memory usage: %v", err)
}
+ if args.Conf.VFS2 {
+ st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
+ if ok {
+ vfs2.Override(st.Table)
+ }
+ }
+
// Create kernel and platform.
p, err := createPlatform(args.Conf, args.Device)
if err != nil {