summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/fd_table.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/fd_table.go')
-rw-r--r--pkg/sentry/kernel/fd_table.go324
1 files changed, 278 insertions, 46 deletions
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index cc3f43a45..ed40b5303 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -1,4 +1,4 @@
-// Copyright 2018 Google LLC
+// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,19 +15,20 @@
package kernel
import (
- "bytes"
"fmt"
"math"
- "sync"
+ "strings"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
)
// FDFlags define flags for an individual descriptor.
@@ -62,10 +63,14 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) {
// Note that this is immutable and can only be changed via operations on the
// descriptorTable.
//
+// It contains both VFS1 and VFS2 file types, but only one of them can be set.
+//
// +stateify savable
type descriptor struct {
- file *fs.File
- flags FDFlags
+ // TODO(gvisor.dev/issue/1624): Remove fs.File.
+ file *fs.File
+ fileVFS2 *vfs.FileDescription
+ flags FDFlags
}
// FDTable is used to manage File references and flags.
@@ -81,6 +86,9 @@ type FDTable struct {
// mu protects below.
mu sync.Mutex `state:"nosave"`
+ // next is start position to find fd.
+ next int32
+
// used contains the number of non-nil entries. It must be accessed
// atomically. It may be read atomically without holding mu (but not
// written).
@@ -92,10 +100,11 @@ type FDTable struct {
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
- file: file,
- flags: flags,
+ file: file,
+ fileVFS2: fileVFS2,
+ flags: flags,
}
})
return m
@@ -104,13 +113,17 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
f.init() // Initialize table.
for fd, d := range m {
- f.set(fd, d.file, d.flags)
-
- // Note that we do _not_ need to acquire a extra table
- // reference here. The table reference will already be
- // accounted for in the file, so we drop the reference taken by
- // set above.
- d.file.DecRef()
+ f.setAll(fd, d.file, d.fileVFS2, d.flags)
+
+ // Note that we do _not_ need to acquire a extra table reference here. The
+ // table reference will already be accounted for in the file, so we drop the
+ // reference taken by set above.
+ switch {
+ case d.file != nil:
+ d.file.DecRef()
+ case d.fileVFS2 != nil:
+ d.fileVFS2.DecRef()
+ }
}
}
@@ -136,6 +149,15 @@ func (f *FDTable) drop(file *fs.File) {
file.DecRef()
}
+// dropVFS2 drops the table reference.
+func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
+ // TODO(gvisor.dev/issue/1480): Release locks.
+ // TODO(gvisor.dev/issue/1479): Send inotify events.
+
+ // Drop the table reference.
+ file.DecRef()
+}
+
// ID returns a unique identifier for this FDTable.
func (f *FDTable) ID() uint64 {
return f.uid
@@ -153,7 +175,7 @@ func (k *Kernel) NewFDTable() *FDTable {
// destroy removes all of the file descriptors from the map.
func (f *FDTable) destroy() {
- f.RemoveIf(func(*fs.File, FDFlags) bool {
+ f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
return true
})
}
@@ -169,35 +191,65 @@ func (f *FDTable) Size() int {
return int(size)
}
-// forEach iterates over all non-nil files.
+// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
-func (f *FDTable) forEach(fn func(fd int32, file *fs.File, flags FDFlags)) {
+func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
+ // retries tracks the number of failed TryIncRef attempts for the same FD.
+ retries := 0
fd := int32(0)
for {
- file, flags, ok := f.get(fd)
+ file, fileVFS2, flags, ok := f.getAll(fd)
if !ok {
break
}
- if file != nil {
+ switch {
+ case file != nil:
if !file.TryIncRef() {
+ retries++
+ if retries > 1000 {
+ panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
+ }
continue // Race caught.
}
- fn(int32(fd), file, flags)
+ fn(fd, file, nil, flags)
file.DecRef()
+ case fileVFS2 != nil:
+ if !fileVFS2.TryIncRef() {
+ retries++
+ if retries > 1000 {
+ panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
+ }
+ continue // Race caught.
+ }
+ fn(fd, nil, fileVFS2, flags)
+ fileVFS2.DecRef()
}
+ retries = 0
fd++
}
}
// String is a stringer for FDTable.
func (f *FDTable) String() string {
- var b bytes.Buffer
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
- n, _ := file.Dirent.FullName(nil /* root */)
- b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", fd, n))
+ var buf strings.Builder
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ switch {
+ case file != nil:
+ n, _ := file.Dirent.FullName(nil /* root */)
+ fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n)
+
+ case fileVFS2 != nil:
+ vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
+ name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
+ if err != nil {
+ fmt.Fprintf(&buf, "<err: %v>\n", err)
+ return
+ }
+ fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
+ }
})
- return b.String()
+ return buf.String()
}
// NewFDs allocates new FDs guaranteed to be the lowest number available
@@ -226,6 +278,11 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
f.mu.Lock()
defer f.mu.Unlock()
+ // From f.next to find available fd.
+ if fd < f.next {
+ fd = f.next
+ }
+
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
@@ -242,21 +299,133 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
return nil, syscall.EMFILE
}
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fds[len(fds)-1] + 1
+ }
+
+ return fds, nil
+}
+
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // From f.next to find available fd.
+ if fd < f.next {
+ fd = f.next
+ }
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.getVFS2(i); d == nil {
+ f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fds[len(fds)-1] + 1
+ }
+
return fds, nil
}
+// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
+// the given file description. If it succeeds, it takes a reference on file.
+func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+ if minfd < 0 {
+ // Don't accept negative FDs.
+ return -1, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if minfd >= end {
+ return -1, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // From f.next to find available fd.
+ fd := minfd
+ if fd < f.next {
+ fd = f.next
+ }
+ for fd < end {
+ if d, _, _ := f.getVFS2(fd); d == nil {
+ f.setVFS2(fd, file, flags)
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fd + 1
+ }
+ return fd, nil
+ }
+ fd++
+ }
+ return -1, syscall.EMFILE
+}
+
// NewFDAt sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
+ return f.newFDAt(ctx, fd, file, nil, flags)
+}
+
+// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
+// reference for that FD, the ref count for that existing reference is
+// decremented.
+func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
+ return f.newFDAt(ctx, fd, nil, file, flags)
+}
+
+func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
}
- f.mu.Lock()
- defer f.mu.Unlock()
-
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@@ -265,7 +434,9 @@ func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FD
}
// Install the entry.
- f.set(fd, file, flags)
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ f.setAll(fd, file, fileVFS2, flags)
return nil
}
@@ -317,10 +488,38 @@ func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
}
}
-// GetFDs returns a list of valid fds.
+// GetVFS2 returns a reference to the file and the flags for the FD or nil if no
+// file is defined for the given fd.
+//
+// N.B. Callers are required to use DecRef when they are done.
+//
+//go:nosplit
+func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
+ if fd < 0 {
+ return nil, FDFlags{}
+ }
+
+ for {
+ file, flags, _ := f.getVFS2(fd)
+ if file != nil {
+ if !file.TryIncRef() {
+ continue // Race caught.
+ }
+ // Reference acquired.
+ return file, flags
+ }
+ // No file available.
+ return nil, FDFlags{}
+ }
+}
+
+// GetFDs returns a sorted list of valid fds.
+//
+// Precondition: The caller must be running on the task goroutine, or Task.mu
+// must be locked.
func (f *FDTable) GetFDs() []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
@@ -331,7 +530,19 @@ func (f *FDTable) GetFDs() []int32 {
// they're done using the slice.
func (f *FDTable) GetRefs() []*fs.File {
files := make([]*fs.File, 0, f.Size())
- f.forEach(func(_ int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
+ file.IncRef() // Acquire a reference for caller.
+ files = append(files, file)
+ })
+ return files
+}
+
+// GetRefsVFS2 returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
+ files := make([]*vfs.FileDescription, 0, f.Size())
+ f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
@@ -342,10 +553,15 @@ func (f *FDTable) GetRefs() []*fs.File {
func (f *FDTable) Fork() *FDTable {
clone := f.k.NewFDTable()
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
- clone.set(fd, file, flags)
+ switch {
+ case file != nil:
+ clone.set(fd, file, flags)
+ case fileVFS2 != nil:
+ clone.setVFS2(fd, fileVFS2, flags)
+ }
})
return clone
}
@@ -353,30 +569,46 @@ func (f *FDTable) Fork() *FDTable {
// Remove removes an FD from and returns a non-file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
-func (f *FDTable) Remove(fd int32) *fs.File {
+func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
- return nil
+ return nil, nil
}
f.mu.Lock()
defer f.mu.Unlock()
- orig, _, _ := f.get(fd)
- if orig != nil {
- orig.IncRef() // Reference for caller.
- f.set(fd, nil, FDFlags{}) // Zap entry.
+ // Update current available position.
+ if fd < f.next {
+ f.next = fd
+ }
+
+ orig, orig2, _, _ := f.getAll(fd)
+
+ // Add reference for caller.
+ switch {
+ case orig != nil:
+ orig.IncRef()
+ case orig2 != nil:
+ orig2.IncRef()
}
- return orig
+ if orig != nil || orig2 != nil {
+ f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
+ }
+ return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
-func (f *FDTable) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
f.mu.Lock()
defer f.mu.Unlock()
- f.forEach(func(fd int32, file *fs.File, flags FDFlags) {
- if cond(file, flags) {
+ f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
+ if cond(file, fileVFS2, flags) {
f.set(fd, nil, FDFlags{}) // Clear from table.
+ // Update current available position.
+ if fd < f.next {
+ f.next = fd
+ }
}
})
}