summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fsimpl
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fsimpl')
-rw-r--r--pkg/sentry/fsimpl/devpts/BUILD19
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go53
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts_test.go4
-rw-r--r--pkg/sentry/fsimpl/devpts/line_discipline.go55
-rw-r--r--pkg/sentry/fsimpl/devpts/master.go38
-rw-r--r--pkg/sentry/fsimpl/devpts/queue.go23
-rw-r--r--pkg/sentry/fsimpl/devpts/replica.go (renamed from pkg/sentry/fsimpl/devpts/slave.go)90
-rw-r--r--pkg/sentry/fsimpl/devpts/terminal.go37
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/BUILD1
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs.go15
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go193
-rw-r--r--pkg/sentry/fsimpl/eventfd/eventfd.go8
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go8
-rw-r--r--pkg/sentry/fsimpl/ext/ext_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go30
-rw-r--r--pkg/sentry/fsimpl/ext/symlink.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/BUILD31
-rw-r--r--pkg/sentry/fsimpl/fuse/connection.go348
-rw-r--r--pkg/sentry/fsimpl/fuse/connection_control.go (renamed from pkg/sentry/fsimpl/fuse/init.go)171
-rw-r--r--pkg/sentry/fsimpl/fuse/connection_test.go117
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go192
-rw-r--r--pkg/sentry/fsimpl/fuse/dev_test.go105
-rw-r--r--pkg/sentry/fsimpl/fuse/directory.go105
-rw-r--r--pkg/sentry/fsimpl/fuse/file.go133
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go584
-rw-r--r--pkg/sentry/fsimpl/fuse/read_write.go242
-rw-r--r--pkg/sentry/fsimpl/fuse/regular_file.go230
-rw-r--r--pkg/sentry/fsimpl/fuse/request_response.go229
-rw-r--r--pkg/sentry/fsimpl/fuse/utils_test.go132
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go33
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go185
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go158
-rw-r--r--pkg/sentry/fsimpl/gofer/p9file.go7
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go70
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go46
-rw-r--r--pkg/sentry/fsimpl/gofer/time.go31
-rw-r--r--pkg/sentry/fsimpl/host/BUILD26
-rw-r--r--pkg/sentry/fsimpl/host/host.go157
-rw-r--r--pkg/sentry/fsimpl/host/socket.go20
-rw-r--r--pkg/sentry/fsimpl/host/socket_unsafe.go4
-rw-r--r--pkg/sentry/fsimpl/host/tty.go52
-rw-r--r--pkg/sentry/fsimpl/kernfs/BUILD55
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go1
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go46
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go88
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go84
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go74
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go30
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go5
-rw-r--r--pkg/sentry/fsimpl/kernfs/synthetic_directory.go102
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go171
-rw-r--r--pkg/sentry/fsimpl/overlay/directory.go11
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go480
-rw-r--r--pkg/sentry/fsimpl/overlay/non_directory.go113
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go108
-rw-r--r--pkg/sentry/fsimpl/pipefs/pipefs.go8
-rw-r--r--pkg/sentry/fsimpl/proc/BUILD62
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go13
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go28
-rw-r--r--pkg/sentry/fsimpl/proc/task.go29
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go70
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go24
-rw-r--r--pkg/sentry/fsimpl/proc/task_net.go12
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go24
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go20
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go127
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys_test.go6
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_test.go4
-rw-r--r--pkg/sentry/fsimpl/signalfd/signalfd.go6
-rw-r--r--pkg/sentry/fsimpl/sockfs/sockfs.go13
-rw-r--r--pkg/sentry/fsimpl/sys/BUILD20
-rw-r--r--pkg/sentry/fsimpl/sys/kcov.go117
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go45
-rw-r--r--pkg/sentry/fsimpl/sys/sys_test.go2
-rw-r--r--pkg/sentry/fsimpl/timerfd/timerfd.go6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go5
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go87
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go4
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go73
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go150
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs_test.go2
-rw-r--r--pkg/sentry/fsimpl/verity/BUILD5
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go589
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go350
86 files changed, 5767 insertions, 1500 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 93512c9b6..48e13613a 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -1,7 +1,19 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "root_inode_refs",
+ out = "root_inode_refs.go",
+ package = "devpts",
+ prefix = "rootInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "rootInode",
+ },
+)
+
go_library(
name = "devpts",
srcs = [
@@ -9,13 +21,18 @@ go_library(
"line_discipline.go",
"master.go",
"queue.go",
- "slave.go",
+ "replica.go",
+ "root_inode_refs.go",
"terminal.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
+ "//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 7169e91af..f0f2e0be7 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -79,10 +79,11 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
// Construct the root directory. This is always inode id 1.
root := &rootInode{
- slaves: make(map[uint32]*slaveInode),
+ replicas: make(map[uint32]*replicaInode),
}
root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ root.EnableLeakCheck()
root.dentry.Init(root)
// Construct the pts master inode and dentry. Linux always uses inode
@@ -110,11 +111,13 @@ func (fs *filesystem) Release(ctx context.Context) {
// rootInode is the root directory inode for the devpts mounts.
type rootInode struct {
+ implStatFS
kernfs.AlwaysValid
kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNotSymlink
kernfs.OrderedChildren
+ rootInodeRefs
locks vfs.FileLocks
@@ -130,8 +133,8 @@ type rootInode struct {
// mu protects the fields below.
mu sync.Mutex
- // slaves maps pty ids to slave inodes.
- slaves map[uint32]*slaveInode
+ // replicas maps pty ids to replica inodes.
+ replicas map[uint32]*replicaInode
// nextIdx is the next pty index to use. Must be accessed atomically.
//
@@ -151,22 +154,22 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
idx := i.nextIdx
i.nextIdx++
- // Sanity check that slave with idx does not exist.
- if _, ok := i.slaves[idx]; ok {
+ // Sanity check that replica with idx does not exist.
+ if _, ok := i.replicas[idx]; ok {
panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
}
- // Create the new terminal and slave.
+ // Create the new terminal and replica.
t := newTerminal(idx)
- slave := &slaveInode{
+ replica := &replicaInode{
root: i,
t: t,
}
// Linux always uses pty index + 3 as the inode id. See
// fs/devpts/inode.c:devpts_pty_new().
- slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
- slave.dentry.Init(slave)
- i.slaves[idx] = slave
+ replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+ replica.dentry.Init(replica)
+ i.replicas[idx] = replica
return t, nil
}
@@ -176,16 +179,18 @@ func (i *rootInode) masterClose(t *Terminal) {
i.mu.Lock()
defer i.mu.Unlock()
- // Sanity check that slave with idx exists.
- if _, ok := i.slaves[t.n]; !ok {
+ // Sanity check that replica with idx exists.
+ if _, ok := i.replicas[t.n]; !ok {
panic(fmt.Sprintf("pty with index %d does not exist", t.n))
}
- delete(i.slaves, t.n)
+ delete(i.replicas, t.n)
}
// Open implements kernfs.Inode.Open.
func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
@@ -200,7 +205,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error
}
i.mu.Lock()
defer i.mu.Unlock()
- if si, ok := i.slaves[uint32(idx)]; ok {
+ if si, ok := i.replicas[uint32(idx)]; ok {
si.dentry.IncRef()
return si.dentry.VFSDentry(), nil
@@ -212,8 +217,8 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error
func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
i.mu.Lock()
defer i.mu.Unlock()
- ids := make([]int, 0, len(i.slaves))
- for id := range i.slaves {
+ ids := make([]int, 0, len(i.replicas))
+ for id := range i.replicas {
ids = append(ids, int(id))
}
sort.Ints(ids)
@@ -221,7 +226,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
dirent := vfs.Dirent{
Name: strconv.FormatUint(uint64(id), 10),
Type: linux.DT_CHR,
- Ino: i.slaves[uint32(id)].InodeAttrs.Ino(),
+ Ino: i.replicas[uint32(id)].InodeAttrs.Ino(),
NextOff: offset + 1,
}
if err := cb.Handle(dirent); err != nil {
@@ -231,3 +236,15 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
}
return offset, nil
}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *rootInode) DecRef(context.Context) {
+ i.rootInodeRefs.DecRef(i.Destroy)
+}
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
-func TestSimpleMasterToSlave(t *testing.T) {
- ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+ ld := newLineDiscipline(linux.DefaultReplicaTermios)
ctx := contexttest.Context(t)
inBytes := []byte("hello, tty\n")
src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..e6b0e81cf 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -41,7 +42,7 @@ const (
)
// lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
// pages are good resources for how to affect the line discipline:
//
@@ -52,8 +53,8 @@ const (
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
@@ -62,7 +63,7 @@ const (
// | (inputQueueWrite) +-------------+ (inputQueueRead) |
// | |
// | v
-// masterFD slaveFD
+// masterFD replicaFD
// ^ |
// | |
// | output to terminal +--------------+ output from process |
@@ -101,8 +102,8 @@ type lineDiscipline struct {
// masterWaiter is used to wait on the master end of the TTY.
masterWaiter waiter.Queue `state:"zerovalue"`
- // slaveWaiter is used to wait on the slave end of the TTY.
- slaveWaiter waiter.Queue `state:"zerovalue"`
+ // replicaWaiter is used to wait on the replica end of the TTY.
+ replicaWaiter waiter.Queue `state:"zerovalue"`
}
func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
}
// getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
// We must copy a Termios struct, not KernelTermios.
t := l.termios.ToTermios()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyOut(task, args[2].Pointer())
return 0, err
}
// setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
l.termiosMu.Lock()
defer l.termiosMu.Unlock()
oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
// We must copy a Termios struct, not KernelTermios.
var t linux.Termios
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := t.CopyIn(task, args[2].Pointer())
l.termios.FromTermios(t)
// If canonical mode is turned off, move bytes from inQueue's wait
@@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
l.inQueue.pushWaitBufLocked(l)
l.inQueue.readable = true
l.inQueue.mu.Unlock()
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return 0, err
}
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyOut(t, args[2].Pointer())
return err
}
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
l.sizeMu.Lock()
defer l.sizeMu.Unlock()
- _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := l.size.CopyIn(t, args[2].Pointer())
return err
}
@@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
}
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
l.termiosMu.RLock()
defer l.termiosMu.RUnlock()
return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
}
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.inQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
if n > 0 {
l.masterWaiter.Notify(waiter.EventOut)
if pushed {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
}
return n, nil
}
@@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventIn)
+ l.replicaWaiter.Notify(waiter.EventIn)
return n, nil
}
return 0, syserror.ErrWouldBlock
}
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
- return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+ return l.outQueue.readableSize(t, io, args)
}
func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
return 0, err
}
if n > 0 {
- l.slaveWaiter.Notify(waiter.EventOut)
+ l.replicaWaiter.Notify(waiter.EventOut)
if pushed {
l.masterWaiter.Notify(waiter.EventIn)
}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 3bb397f71..83d790b38 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,9 +17,11 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -30,6 +32,7 @@ import (
// masterInode is the inode for the master end of the Terminal.
type masterInode struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
@@ -130,46 +133,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the output queue read buffer.
- return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+ return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
case linux.TCGETS:
// N.B. TCGETS on the master actually returns the configuration
- // of the slave end.
- return mfd.t.ld.getTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.getTermios(t, args)
case linux.TCSETS:
// N.B. TCSETS on the master actually affects the configuration
- // of the slave end.
- return mfd.t.ld.setTermios(ctx, io, args)
+ // of the replica end.
+ return mfd.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return mfd.t.ld.setTermios(ctx, io, args)
+ return mfd.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(mfd.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCSPTLCK:
// TODO(b/29356795): Implement pty locking. For now just pretend we do.
return 0, nil
case linux.TIOCGWINSZ:
- return 0, mfd.t.ld.windowSize(ctx, io, args)
+ return 0, mfd.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+ return 0, mfd.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+ return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index dffb4232c..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,8 +17,10 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -30,7 +32,7 @@ import (
const waitBufMaxBytes = 131072
// queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
// full, at which point they are written to the wait buffer. Bytes are
// processed (i.e. undergo termios transformations) as they are added to the
// read buffer. The read buffer is readable when its length is nonzero and
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
}
// readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
q.mu.Lock()
defer q.mu.Unlock()
- var size int32
+ size := primitive.Int32(0)
if q.readable {
- size = int32(len(q.readBuf))
+ size = primitive.Int32(len(q.readBuf))
}
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err := size.CopyOut(t, args[2].Pointer())
return err
}
@@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
// as whether the read caused more readable data to become available (whether
// data was pushed from the wait buffer to the read buffer).
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
// write writes to q from userspace.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
// writeBytes writes to q from b.
//
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
q.mu.Lock()
defer q.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/replica.go
index 32e4e1908..58f6c1d3a 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -17,9 +17,11 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -27,8 +29,9 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
+// replicaInode is the inode for the replica end of the Terminal.
+type replicaInode struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
@@ -46,12 +49,12 @@ type slaveInode struct {
t *Terminal
}
-var _ kernfs.Inode = (*slaveInode)(nil)
+var _ kernfs.Inode = (*replicaInode)(nil)
// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (si *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
si.IncRef()
- fd := &slaveFileDescription{
+ fd := &replicaFileDescription{
inode: si,
}
fd.LockFD.Init(&si.locks)
@@ -64,109 +67,114 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
}
// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
- // Return valid if the slave still exists.
+func (si *replicaInode) Valid(context.Context) bool {
+ // Return valid if the replica still exists.
si.root.mu.Lock()
defer si.root.mu.Unlock()
- _, ok := si.root.slaves[si.t.n]
+ _, ok := si.root.replicas[si.t.n]
return ok
}
// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+func (si *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
if err != nil {
return linux.Statx{}, err
}
statx.Blksize = 1024
- statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
+ statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
statx.RdevMinor = si.t.n
return statx, nil
}
// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+func (si *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask&linux.STATX_SIZE != 0 {
return syserror.EINVAL
}
return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
}
-type slaveFileDescription struct {
+type replicaFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.LockFD
- inode *slaveInode
+ inode *replicaInode
}
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release(ctx context.Context) {
+func (sfd *replicaFileDescription) Release(ctx context.Context) {
sfd.inode.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
+func (sfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ sfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
- sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
+func (sfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+ sfd.inode.t.ld.replicaWaiter.EventUnregister(e)
}
// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sfd.inode.t.ld.slaveReadiness()
+func (sfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return sfd.inode.t.ld.replicaReadiness()
}
// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+func (sfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
return sfd.inode.t.ld.inputQueueRead(ctx, dst)
}
// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+func (sfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
return sfd.inode.t.ld.outputQueueWrite(ctx, src)
}
// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (sfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // ioctl(2) may only be called from a task goroutine.
+ return 0, syserror.ENOTTY
+ }
+
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the input queue read buffer.
- return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
+ return 0, sfd.inode.t.ld.inputQueueReadSize(t, io, args)
case linux.TCGETS:
- return sfd.inode.t.ld.getTermios(ctx, io, args)
+ return sfd.inode.t.ld.getTermios(t, args)
case linux.TCSETS:
- return sfd.inode.t.ld.setTermios(ctx, io, args)
+ return sfd.inode.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return sfd.inode.t.ld.setTermios(ctx, io, args)
+ return sfd.inode.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ nP := primitive.Uint32(sfd.inode.t.n)
+ _, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCGWINSZ:
- return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
+ return 0, sfd.inode.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
+ return 0, sfd.inode.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+ return 0, sfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+ return sfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
@@ -174,24 +182,24 @@ func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+func (sfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
return sfd.inode.SetStat(ctx, fs, creds, opts)
}
// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+func (sfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
return sfd.inode.Stat(ctx, fs, opts)
}
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+func (sfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
}
// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+func (sfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Terminal is a pseudoterminal.
@@ -36,25 +36,25 @@ type Terminal struct {
// this terminal. This field is immutable.
masterKTTY *kernel.TTY
- // slaveKTTY contains the controlling process of the slave end of this
+ // replicaKTTY contains the controlling process of the replica end of this
// terminal. This field is immutable.
- slaveKTTY *kernel.TTY
+ replicaKTTY *kernel.TTY
}
func newTerminal(n uint32) *Terminal {
- termios := linux.DefaultSlaveTermios
+ termios := linux.DefaultReplicaTermios
t := Terminal{
- n: n,
- ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{Index: n},
- slaveKTTY: &kernel.TTY{Index: n},
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{Index: n},
+ replicaKTTY: &kernel.TTY{Index: n},
}
return &t
}
// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
}
// foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
}
// Write it out to *arg.
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ retP := primitive.Int32(ret)
+ _, err = retP.CopyOut(task, args[2].Pointer())
return 0, err
}
// foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
panic("setForegroundProcessGroup must be called from a task context")
}
// Read in the process group ID.
- var pgid int32
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgid primitive.Int32
+ if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
@@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
if isMaster {
return tm.masterKTTY
}
- return tm.slaveKTTY
+ return tm.replicaKTTY
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
index aa0c2ad8c..01bbee5ad 100644
--- a/pkg/sentry/fsimpl/devtmpfs/BUILD
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -24,6 +24,7 @@ go_test(
library = ":devtmpfs",
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/tmpfs",
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 2ed5fa8a9..a23094e54 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -18,6 +18,7 @@ package devtmpfs
import (
"fmt"
+ "path"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -79,7 +80,7 @@ type Accessor struct {
// NewAccessor returns an Accessor that supports creation of device special
// files in the devtmpfs instance registered with name fsTypeName in vfsObj.
func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
if err != nil {
return nil, err
}
@@ -150,13 +151,11 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
// Create any parent directories. See
// devtmpfs.c:handle_create()=>path_create().
- for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
- pop := a.pathOperationAt(it.String())
- if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
- Mode: 0755,
- }); err != nil {
- return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
- }
+ parent := path.Dir(pathname)
+ if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
}
// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 747867cca..3a38b8bb4 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -15,9 +15,11 @@
package devtmpfs
import (
+ "path"
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
@@ -25,10 +27,13 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
-func TestDevtmpfs(t *testing.T) {
+const devPath = "/dev"
+
+func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) {
+ t.Helper()
+
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
-
vfsObj := &vfs.VirtualFilesystem{}
if err := vfsObj.Init(ctx); err != nil {
t.Fatalf("VFS init: %v", err)
@@ -43,14 +48,11 @@ func TestDevtmpfs(t *testing.T) {
})
// Create a test mount namespace with devtmpfs mounted at "/dev".
- const devPath = "/dev"
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
- defer mntns.DecRef(ctx)
root := mntns.Root()
- defer root.DecRef(ctx)
devpop := vfs.PathOperation{
Root: root,
Start: root,
@@ -61,10 +63,20 @@ func TestDevtmpfs(t *testing.T) {
}); err != nil {
t.Fatalf("failed to create mount point: %v", err)
}
- if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+ if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
t.Fatalf("failed to mount devtmpfs: %v", err)
}
+ return ctx, creds, vfsObj, root, func() {
+ root.DecRef(ctx)
+ mntns.DecRef(ctx)
+ }
+}
+
+func TestUserspaceInit(t *testing.T) {
+ ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+ defer cleanup()
+
a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
if err != nil {
t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
@@ -75,48 +87,143 @@ func TestDevtmpfs(t *testing.T) {
if err := a.UserspaceInit(ctx); err != nil {
t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
}
+
// Created files should be visible in the test mount namespace.
- abspath := devPath + "/fd"
- target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(abspath),
- })
- if want := "/proc/self/fd"; err != nil || target != want {
- t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+ links := []struct {
+ source string
+ target string
+ }{
+ {
+ source: "fd",
+ target: "/proc/self/fd",
+ },
+ {
+ source: "stdin",
+ target: "/proc/self/fd/0",
+ },
+ {
+ source: "stdout",
+ target: "/proc/self/fd/1",
+ },
+ {
+ source: "stderr",
+ target: "/proc/self/fd/2",
+ },
+ {
+ source: "ptmx",
+ target: "pts/ptmx",
+ },
}
- // Create a dummy device special file using a devtmpfs.Accessor.
- const (
- pathInDev = "dummy"
- kind = vfs.CharDevice
- major = 12
- minor = 34
- perms = 0600
- wantMode = linux.S_IFCHR | perms
- )
- if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
- t.Fatalf("failed to create device file: %v", err)
+ for _, link := range links {
+ abspath := path.Join(devPath, link.source)
+ if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }); err != nil || gotTarget != link.target {
+ t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target)
+ }
}
- // The device special file should be visible in the test mount namespace.
- abspath = devPath + "/" + pathInDev
- stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(abspath),
- }, &vfs.StatOptions{
- Mask: linux.STATX_TYPE | linux.STATX_MODE,
- })
- if err != nil {
- t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+
+ dirs := []string{"shm", "pts"}
+ for _, dir := range dirs {
+ abspath := path.Join(devPath, dir)
+ statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_MODE,
+ })
+ if err != nil {
+ t.Errorf("stat(%q): got error %v ", abspath, err)
+ continue
+ }
+ if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want {
+ t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want)
+ }
}
- if stat.Mode != wantMode {
- t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+}
+
+func TestCreateDeviceFile(t *testing.T) {
+ ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+ defer cleanup()
+
+ a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+ if err != nil {
+ t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
}
- if stat.RdevMajor != major {
- t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+ defer a.Release(ctx)
+
+ devFiles := []struct {
+ path string
+ kind vfs.DeviceKind
+ major uint32
+ minor uint32
+ perms uint16
+ }{
+ {
+ path: "dummy",
+ kind: vfs.CharDevice,
+ major: 12,
+ minor: 34,
+ perms: 0600,
+ },
+ {
+ path: "foo/bar",
+ kind: vfs.BlockDevice,
+ major: 13,
+ minor: 35,
+ perms: 0660,
+ },
+ {
+ path: "foo/baz",
+ kind: vfs.CharDevice,
+ major: 12,
+ minor: 40,
+ perms: 0666,
+ },
+ {
+ path: "a/b/c/d/e",
+ kind: vfs.BlockDevice,
+ major: 12,
+ minor: 34,
+ perms: 0600,
+ },
}
- if stat.RdevMinor != minor {
- t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+
+ for _, f := range devFiles {
+ if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil {
+ t.Fatalf("failed to create device file: %v", err)
+ }
+ // The device special file should be visible in the test mount namespace.
+ abspath := path.Join(devPath, f.path)
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(abspath),
+ }, &vfs.StatOptions{
+ Mask: linux.STATX_TYPE | linux.STATX_MODE,
+ })
+ if err != nil {
+ t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+ }
+ if stat.RdevMajor != f.major {
+ t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major)
+ }
+ if stat.RdevMinor != f.minor {
+ t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor)
+ }
+ wantMode := f.perms
+ switch f.kind {
+ case vfs.CharDevice:
+ wantMode |= linux.S_IFCHR
+ case vfs.BlockDevice:
+ wantMode |= linux.S_IFBLK
+ }
+ if stat.Mode != wantMode {
+ t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+ }
}
}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 812171fa3..bb0bf3a07 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,7 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
type EventFileDescription struct {
@@ -106,7 +106,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
return efd.hostfd, nil
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (efd *EventFileDescription) Release(context.Context) {
efd.mu.Lock()
defer efd.mu.Unlock()
@@ -119,7 +119,7 @@ func (efd *EventFileDescription) Release(context.Context) {
}
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
if dst.NumBytes() < 8 {
return 0, syscall.EINVAL
@@ -130,7 +130,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
return 8, nil
}
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
if src.NumBytes() < 8 {
return 0, syscall.EINVAL
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 8f7d5a9bb..c349b886e 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -59,7 +59,11 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
@@ -90,7 +94,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
ctx := contexttest.Context(b)
creds := auth.CredentialsFromContext(ctx)
- if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+ if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
GetFilesystemOptions: vfs.GetFilesystemOptions{
InternalData: int(f.Fd()),
},
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 2dbaee287..0989558cd 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -71,7 +71,11 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ })
if err != nil {
f.Close()
return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index c714ddf73..a4a6d8c55 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -81,9 +81,9 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
-// - !rp.Done().
-// - inode == vfsd.Impl().(*Dentry).inode.
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+// * inode == vfsd.Impl().(*Dentry).inode.
func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
@@ -166,7 +166,7 @@ func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, in
// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
+// * filesystem.mu must be locked (for writing if write param is true).
func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
@@ -194,8 +194,8 @@ func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.De
// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
//
// Preconditions:
-// - filesystem.mu must be locked (for writing if write param is true).
-// - !rp.Done().
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
inode := vfsd.Impl().(*dentry).inode
@@ -490,7 +490,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
_, inode, err := fs.walk(ctx, rp, false)
if err != nil {
@@ -504,8 +504,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return nil, err
@@ -513,8 +513,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return "", err
@@ -522,8 +522,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
@@ -531,8 +531,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
_, _, err := fs.walk(ctx, rp, false)
if err != nil {
return err
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..f33592d59 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -61,7 +61,7 @@ func (in *inode) isSymlink() bool {
return ok
}
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
// vfs.FileDescriptionImpl. which may only be used if open options contains
// O_PATH. For this reason most of the functions return EBADF.
type symlinkFD struct {
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 999111deb..045d7ab08 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -15,21 +15,41 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "fuse",
+ prefix = "inode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
go_library(
name = "fuse",
srcs = [
"connection.go",
+ "connection_control.go",
"dev.go",
+ "directory.go",
+ "file.go",
"fusefs.go",
- "init.go",
+ "inode_refs.go",
+ "read_write.go",
"register.go",
+ "regular_file.go",
"request_list.go",
+ "request_response.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/refs",
+ "//pkg/safemem",
"//pkg/sentry/fsimpl/devtmpfs",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
@@ -39,7 +59,6 @@ go_library(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -47,10 +66,15 @@ go_library(
go_test(
name = "fuse_test",
size = "small",
- srcs = ["dev_test.go"],
+ srcs = [
+ "connection_test.go",
+ "dev_test.go",
+ "utils_test.go",
+ ],
library = ":fuse",
deps = [
"//pkg/abi/linux",
+ "//pkg/marshal",
"//pkg/sentry/fsimpl/testutil",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
@@ -58,6 +82,5 @@ go_test(
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
- "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index 6df2728ab..dbc5e1954 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -15,31 +15,17 @@
package fuse
import (
- "errors"
- "fmt"
"sync"
- "sync/atomic"
- "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
-// maxActiveRequestsDefault is the default setting controlling the upper bound
-// on the number of active requests at any given time.
-const maxActiveRequestsDefault = 10000
-
-// Ordinary requests have even IDs, while interrupts IDs are odd.
-// Used to increment the unique ID for each FUSE request.
-var reqIDStep uint64 = 2
-
const (
// fuseDefaultMaxBackground is the default value for MaxBackground.
fuseDefaultMaxBackground = 12
@@ -52,43 +38,36 @@ const (
fuseDefaultMaxPagesPerReq = 32
)
-// Request represents a FUSE operation request that hasn't been sent to the
-// server yet.
-//
-// +stateify savable
-type Request struct {
- requestEntry
-
- id linux.FUSEOpID
- hdr *linux.FUSEHeaderIn
- data []byte
-}
-
-// Response represents an actual response from the server, including the
-// response payload.
-//
-// +stateify savable
-type Response struct {
- opcode linux.FUSEOpcode
- hdr linux.FUSEHeaderOut
- data []byte
-}
-
// connection is the struct by which the sentry communicates with the FUSE server daemon.
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
type connection struct {
fd *DeviceFD
+ // mu protects access to struct memebers.
+ mu sync.Mutex
+
+ // attributeVersion is the version of connection's attributes.
+ attributeVersion uint64
+
+ // We target FUSE 7.23.
// The following FUSE_INIT flags are currently unsupported by this implementation:
- // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
// - FUSE_EXPORT_SUPPORT
- // - FUSE_HANDLE_KILLPRIV
// - FUSE_POSIX_LOCKS: requires POSIX locks
// - FUSE_FLOCK_LOCKS: requires POSIX locks
// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
- // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
// - FUSE_ASYNC_DIO
- // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+ // - FUSE_PARALLEL_DIROPS (7.25)
+ // - FUSE_HANDLE_KILLPRIV (7.26)
+ // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+ // - FUSE_ABORT_ERROR (7.27)
+ // - FUSE_CACHE_SYMLINKS (7.28)
+ // - FUSE_NO_OPENDIR_SUPPORT (7.29)
+ // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+ // - FUSE_MAP_ALIGNMENT (7.31)
// initialized after receiving FUSE_INIT reply.
// Until it's set, suspend sending FUSE requests.
@@ -98,10 +77,6 @@ type connection struct {
// initializedChan is used to block requests before initialization.
initializedChan chan struct{}
- // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
- // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
- blocked bool
-
// connected (connection established) when a new FUSE file system is created.
// Set to false when:
// umount,
@@ -109,48 +84,55 @@ type connection struct {
// device release.
connected bool
- // aborted via sysfs.
- // TODO(gvisor.dev/issue/3185): abort all queued requests.
- aborted bool
-
// connInitError if FUSE_INIT encountered error (major version mismatch).
// Only set in INIT.
connInitError bool
// connInitSuccess if FUSE_INIT is successful.
// Only set in INIT.
- // Used for destory.
+ // Used for destory (not yet implemented).
connInitSuccess bool
- // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
-
- // NumberBackground is the number of requests in the background.
- numBackground uint16
+ // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
+ // Set only if abortErr is true and via fuse control fs (not yet implemented).
+ // TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+ aborted bool
- // congestionThreshold for NumBackground.
- // Negotiated in FUSE_INIT.
- congestionThreshold uint16
+ // numWating is the number of requests waiting to be
+ // sent to FUSE device or being processed by FUSE daemon.
+ numWaiting uint32
- // maxBackground is the maximum number of NumBackground.
- // Block connection when it is reached.
- // Negotiated in FUSE_INIT.
- maxBackground uint16
+ // Terminology note:
+ //
+ // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+ //
+ // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+ //
+ // We call the "background" requests in unix term as async requests.
+ // The "async requests" in unix term is our async requests that expect a reply,
+ // i.e. `!request.noReply`
- // numActiveBackground is the number of requests in background and has being marked as active.
- numActiveBackground uint16
+ // asyncMu protects the async request fields.
+ asyncMu sync.Mutex
- // numWating is the number of requests waiting for completion.
- numWaiting uint32
+ // asyncNum is the number of async requests.
+ // Protected by asyncMu.
+ asyncNum uint16
- // TODO(gvisor.dev/issue/3185): BgQueue
- // some queue for background queued requests.
+ // asyncCongestionThreshold the number of async requests.
+ // Negotiated in FUSE_INIT as "CongestionThreshold".
+ // TODO(gvisor.dev/issue/3529): add congestion control.
+ // Protected by asyncMu.
+ asyncCongestionThreshold uint16
- // bgLock protects:
- // MaxBackground, CongestionThreshold, NumBackground,
- // NumActiveBackground, BgQueue, Blocked.
- bgLock sync.Mutex
+ // asyncNumMax is the maximum number of asyncNum.
+ // Connection blocks the async requests when it is reached.
+ // Negotiated in FUSE_INIT as "MaxBackground".
+ // Protected by asyncMu.
+ asyncNumMax uint16
// maxRead is the maximum size of a read buffer in in bytes.
+ // Initialized from a fuse fs parameter.
maxRead uint32
// maxWrite is the maximum size of a write buffer in bytes.
@@ -165,23 +147,20 @@ type connection struct {
// Negotiated and only set in INIT.
minor uint32
- // asyncRead if read pages asynchronously.
+ // atomicOTrunc is true when FUSE does not send a separate SETATTR request
+ // before open with O_TRUNC flag.
// Negotiated and only set in INIT.
- asyncRead bool
+ atomicOTrunc bool
- // abortErr is true if kernel need to return an unique read error after abort.
+ // asyncRead if read pages asynchronously.
// Negotiated and only set in INIT.
- abortErr bool
+ asyncRead bool
// writebackCache is true for write-back cache policy,
// false for write-through policy.
// Negotiated and only set in INIT.
writebackCache bool
- // cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
- // Negotiated and only set in INIT.
- cacheSymlinks bool
-
// bigWrites if doing multi-page cached writes.
// Negotiated and only set in INIT.
bigWrites bool
@@ -189,116 +168,70 @@ type connection struct {
// dontMask if filestestem does not apply umask to creation modes.
// Negotiated in INIT.
dontMask bool
+
+ // noOpen if FUSE server doesn't support open operation.
+ // This flag only influence performance, not correctness of the program.
+ noOpen bool
}
// newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
// mount a FUSE filesystem.
fuseFD := fd.Impl().(*DeviceFD)
- fuseFD.mounted = true
// Create the writeBuf for the header to be stored in.
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
fuseFD.writeBuf = make([]byte, hdrLen)
fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
- fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+ fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
fuseFD.writeCursor = 0
return &connection{
- fd: fuseFD,
- maxBackground: fuseDefaultMaxBackground,
- congestionThreshold: fuseDefaultCongestionThreshold,
- maxPages: fuseDefaultMaxPagesPerReq,
- initializedChan: make(chan struct{}),
- connected: true,
- }, nil
-}
-
-// SetInitialized atomically sets the connection as initialized.
-func (conn *connection) SetInitialized() {
- // Unblock the requests sent before INIT.
- close(conn.initializedChan)
-
- // Close the channel first to avoid the non-atomic situation
- // where conn.initialized is true but there are
- // tasks being blocked on the channel.
- // And it prevents the newer tasks from gaining
- // unnecessary higher chance to be issued before the blocked one.
-
- atomic.StoreInt32(&(conn.initialized), int32(1))
-}
-
-// IsInitialized atomically check if the connection is initialized.
-// pairs with SetInitialized().
-func (conn *connection) Initialized() bool {
- return atomic.LoadInt32(&(conn.initialized)) != 0
-}
-
-// NewRequest creates a new request that can be sent to the FUSE server.
-func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
- conn.fd.mu.Lock()
- defer conn.fd.mu.Unlock()
- conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
-
- hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
- hdr := linux.FUSEHeaderIn{
- Len: uint32(hdrLen + payload.SizeBytes()),
- Opcode: opcode,
- Unique: conn.fd.nextOpID,
- NodeID: ino,
- UID: uint32(creds.EffectiveKUID),
- GID: uint32(creds.EffectiveKGID),
- PID: pid,
- }
-
- buf := make([]byte, hdr.Len)
- hdr.MarshalUnsafe(buf[:hdrLen])
- payload.MarshalUnsafe(buf[hdrLen:])
-
- return &Request{
- id: hdr.Unique,
- hdr: &hdr,
- data: buf,
+ fd: fuseFD,
+ asyncNumMax: fuseDefaultMaxBackground,
+ asyncCongestionThreshold: fuseDefaultCongestionThreshold,
+ maxRead: opts.maxRead,
+ maxPages: fuseDefaultMaxPagesPerReq,
+ initializedChan: make(chan struct{}),
+ connected: true,
}, nil
}
-// Call makes a request to the server and blocks the invoking task until a
-// server responds with a response. Task should never be nil.
-// Requests will not be sent before the connection is initialized.
-// For async tasks, use CallAsync().
-func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
- // Block requests sent before connection is initalized.
- if !conn.Initialized() {
- if err := t.Block(conn.initializedChan); err != nil {
- return nil, err
- }
- }
-
- return conn.call(t, r)
+// CallAsync makes an async (aka background) request.
+// It's a simple wrapper around Call().
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+ r.async = true
+ _, err := conn.Call(t, r)
+ return err
}
-// CallAsync makes an async (aka background) request.
-// Those requests either do not expect a response (e.g. release) or
-// the response should be handled by others (e.g. init).
-// Return immediately unless the connection is blocked (before initialization).
-// Async call example: init, release, forget, aio, interrupt.
+// Call makes a request to the server.
+// Block before the connection is initialized.
// When the Request is FUSE_INIT, it will not be blocked before initialization.
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (that do not expect a response immediately),
+// it returns directly unless being blocked either before initialization
+// or when there are too many async requests ongoing.
+//
+// Example for async request:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
// Block requests sent before connection is initalized.
if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
if err := t.Block(conn.initializedChan); err != nil {
- return err
+ return nil, err
}
}
- // This should be the only place that invokes call() with a nil task.
- _, err := conn.call(nil, r)
- return err
-}
-
-// call makes a call without blocking checks.
-func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
if !conn.connected {
return nil, syserror.ENOTCONN
}
@@ -315,31 +248,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
return fut.resolve(t)
}
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
- errno := r.hdr.Error
- if errno >= 0 {
- return nil
- }
-
- sysErrNo := syscall.Errno(-errno)
- return error(sysErrNo)
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
- hdrLen := r.hdr.SizeBytes()
- haveDataLen := r.hdr.Len - uint32(hdrLen)
- wantDataLen := uint32(m.SizeBytes())
-
- if haveDataLen < wantDataLen {
- return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
- }
-
- m.UnmarshalUnsafe(r.data[hdrLen:])
- return nil
-}
-
// callFuture makes a request to the server and returns a future response.
// Call resolve() when the response needs to be fulfilled.
func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -358,11 +266,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// if there are always too many ongoing requests all the time. The
// supported maxActiveRequests setting should be really high to avoid this.
for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
- if t == nil {
- // Since there is no task that is waiting. We must error out.
- return nil, errors.New("FUSE request queue full")
- }
-
log.Infof("Blocking request %v from being queued. Too many active requests: %v",
r.id, conn.fd.numActiveRequests)
conn.fd.mu.Unlock()
@@ -378,9 +281,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
// callFutureLocked makes a request to the server and returns a future response.
func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+ // Check connected again holding conn.mu.
+ conn.mu.Lock()
+ if !conn.connected {
+ conn.mu.Unlock()
+ // we checked connected before,
+ // this must be due to aborted connection.
+ return nil, syserror.ECONNABORTED
+ }
+ conn.mu.Unlock()
+
conn.fd.queue.PushBack(r)
- conn.fd.numActiveRequests += 1
- fut := newFutureResponse(r.hdr.Opcode)
+ conn.fd.numActiveRequests++
+ fut := newFutureResponse(r)
conn.fd.completions[r.id] = fut
// Signal the readers that there is something to read.
@@ -388,50 +301,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
return fut, nil
}
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
- opcode linux.FUSEOpcode
- ch chan struct{}
- hdr *linux.FUSEHeaderOut
- data []byte
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
- return &futureResponse{
- opcode: opcode,
- ch: make(chan struct{}),
- }
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
- // If there is no Task associated with this request - then we don't try to resolve
- // the response. Instead, the task writing the response (proxy to the server) will
- // process the response on our behalf.
- if t == nil {
- log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
- return nil, nil
- }
-
- if err := t.Block(f.ch); err != nil {
- return nil, err
- }
-
- return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
- return &Response{
- opcode: f.opcode,
- hdr: *f.hdr,
- data: f.data,
- }
-}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go
index 779c2bd3f..bfde78559 100644
--- a/pkg/sentry/fsimpl/fuse/init.go
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -15,7 +15,11 @@
package fuse
import (
+ "sync/atomic"
+ "syscall"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -29,9 +33,10 @@ const (
// Follow the same behavior as unix fuse implementation.
fuseMaxTimeGranNs = 1000000000
- // Minimum value for MaxWrite.
+ // Minimum value for MaxWrite and MaxRead.
// Follow the same behavior as unix fuse implementation.
fuseMinMaxWrite = 4096
+ fuseMinMaxRead = 4096
// Temporary default value for max readahead, 128kb.
fuseDefaultMaxReadahead = 131072
@@ -49,6 +54,26 @@ var (
MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
)
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+ // Unblock the requests sent before INIT.
+ close(conn.initializedChan)
+
+ // Close the channel first to avoid the non-atomic situation
+ // where conn.initialized is true but there are
+ // tasks being blocked on the channel.
+ // And it prevents the newer tasks from gaining
+ // unnecessary higher chance to be issued before the blocked one.
+
+ atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+ return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
// InitSend sends a FUSE_INIT request.
func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
in := linux.FUSEInitIn{
@@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
}
// InitRecv receives a FUSE_INIT reply and process it.
+//
+// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13.
func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
if err := res.Error(); err != nil {
return err
}
- var out linux.FUSEInitOut
- if err := res.UnmarshalPayload(&out); err != nil {
+ initRes := fuseInitRes{initLen: res.DataLen()}
+ if err := res.UnmarshalPayload(&initRes); err != nil {
return err
}
- return conn.initProcessReply(&out, hasSysAdminCap)
+ return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
}
// Process the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+ // No matter error or not, always set initialzied.
+ // to unblock the blocked requests.
+ defer conn.SetInitialized()
+
// No support for old major fuse versions.
if out.Major != linux.FUSE_KERNEL_VERSION {
conn.connInitError = true
-
- // Set the connection as initialized and unblock the blocked requests
- // (i.e. return error for them).
- conn.SetInitialized()
-
return nil
}
@@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.connInitSuccess = true
conn.minor = out.Minor
- // No support for limits before minor version 13.
- if out.Minor >= 13 {
- conn.bgLock.Lock()
-
- if out.MaxBackground > 0 {
- conn.maxBackground = out.MaxBackground
-
- if !hasSysAdminCap &&
- conn.maxBackground > MaxUserBackgroundRequest {
- conn.maxBackground = MaxUserBackgroundRequest
- }
- }
-
- if out.CongestionThreshold > 0 {
- conn.congestionThreshold = out.CongestionThreshold
-
- if !hasSysAdminCap &&
- conn.congestionThreshold > MaxUserCongestionThreshold {
- conn.congestionThreshold = MaxUserCongestionThreshold
- }
- }
-
- conn.bgLock.Unlock()
+ // No support for negotiating MaxWrite before minor version 5.
+ if out.Minor >= 5 {
+ conn.maxWrite = out.MaxWrite
+ } else {
+ conn.maxWrite = fuseMinMaxWrite
+ }
+ if conn.maxWrite < fuseMinMaxWrite {
+ conn.maxWrite = fuseMinMaxWrite
}
// No support for the following flags before minor version 6.
@@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
- conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
- conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
@@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
}
}
- // No support for negotiating MaxWrite before minor version 5.
- if out.Minor >= 5 {
- conn.maxWrite = out.MaxWrite
- } else {
- conn.maxWrite = fuseMinMaxWrite
+ // No support for limits before minor version 13.
+ if out.Minor >= 13 {
+ conn.asyncMu.Lock()
+
+ if out.MaxBackground > 0 {
+ conn.asyncNumMax = out.MaxBackground
+
+ if !hasSysAdminCap &&
+ conn.asyncNumMax > MaxUserBackgroundRequest {
+ conn.asyncNumMax = MaxUserBackgroundRequest
+ }
+ }
+
+ if out.CongestionThreshold > 0 {
+ conn.asyncCongestionThreshold = out.CongestionThreshold
+
+ if !hasSysAdminCap &&
+ conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+ conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+ }
+ }
+
+ conn.asyncMu.Unlock()
}
- if conn.maxWrite < fuseMinMaxWrite {
- conn.maxWrite = fuseMinMaxWrite
+
+ return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order.
+// All possible requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+ conn.mu.Lock()
+ conn.asyncMu.Lock()
+
+ if !conn.connected {
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+ conn.fd.mu.Unlock()
+ return
}
- // Set connection as initialized and unblock the requests
- // issued before init.
- conn.SetInitialized()
+ conn.connected = false
- return nil
+ // Empty the `fd.queue` that holds the requests
+ // not yet read by the FUSE daemon yet.
+ // These are a subset of the requests in `fuse.completion` map.
+ for !conn.fd.queue.Empty() {
+ req := conn.fd.queue.Front()
+ conn.fd.queue.Remove(req)
+ }
+
+ var terminate []linux.FUSEOpID
+
+ // 2. Collect the requests have not been sent to FUSE daemon,
+ // or have not received a reply.
+ for unique := range conn.fd.completions {
+ terminate = append(terminate, unique)
+ }
+
+ // Release locks to avoid deadlock.
+ conn.asyncMu.Unlock()
+ conn.mu.Unlock()
+
+ // 1. The requets blocked before initialization.
+ // Will reach call() `connected` check and return.
+ if !conn.Initialized() {
+ conn.SetInitialized()
+ }
+
+ // 2. Terminate the requests collected above.
+ // Set ECONNABORTED error.
+ // sendError() will remove them from `fd.completion` map.
+ // Will enter the path of a normally received error.
+ for _, toTerminate := range terminate {
+ conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+ }
+
+ // 3. The requests not yet written to FUSE device.
+ // Early terminate.
+ // Will reach callFutureLocked() `connected` check and return.
+ close(conn.fd.fullQueueCh)
+
+ // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
}
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "math/rand"
+ "syscall"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests if initialization
+// correctly blocks and unblocks the connection.
+// Since it's unfeasible to test kernelTask.Block() in unit test,
+// the code in Call() are not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+
+ conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ select {
+ case <-conn.initializedChan:
+ t.Fatalf("initializedChan should be blocking before SetInitialized")
+ default:
+ }
+
+ conn.SetInitialized()
+
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+}
+
+func TestConnectionAbort(t *testing.T) {
+ s := setup(t)
+ defer s.Destroy()
+
+ k := kernel.KernelFromContext(s.Ctx)
+ creds := auth.CredentialsFromContext(s.Ctx)
+ task := kernel.TaskFromContext(s.Ctx)
+
+ const numRequests uint64 = 256
+
+ conn, _, err := newTestConnection(s, k, numRequests)
+ if err != nil {
+ t.Fatalf("newTestConnection: %v", err)
+ }
+
+ testObj := &testPayload{
+ data: rand.Uint32(),
+ }
+
+ var futNormal []*futureResponse
+
+ for i := 0; i < int(numRequests); i++ {
+ req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ fut, err := conn.callFutureLocked(task, req)
+ if err != nil {
+ t.Fatalf("callFutureLocked failed: %v", err)
+ }
+ futNormal = append(futNormal, fut)
+ }
+
+ conn.Abort(s.Ctx)
+
+ // Abort should unblock the initialization channel.
+ // Note: no test requests are actually blocked on `conn.initializedChan`.
+ select {
+ case <-conn.initializedChan:
+ default:
+ t.Fatalf("initializedChan should not be blocking after SetInitialized")
+ }
+
+ // Abort will return ECONNABORTED error to unblocked requests.
+ for _, fut := range futNormal {
+ if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+ t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+ }
+ }
+
+ // After abort, Call() should return directly with ENOTCONN.
+ req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+ if err != nil {
+ t.Fatalf("NewRequest creation failed: %v", err)
+ }
+ _, err = conn.Call(task, req)
+ if err != syserror.ENOTCONN {
+ t.Fatalf("Incorrect error code received for Call() after connection aborted")
+ }
+
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index e522ff9a0..f690ef5ad 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,7 +19,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -56,9 +55,6 @@ type DeviceFD struct {
vfs.DentryMetadataFileDescriptionImpl
vfs.NoLockFD
- // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
- mounted bool
-
// nextOpID is used to create new requests.
nextOpID linux.FUSEOpID
@@ -99,14 +95,21 @@ type DeviceFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
- fd.fs.conn.connected = false
+func (fd *DeviceFD) Release(ctx context.Context) {
+ if fd.fs != nil {
+ fd.fs.conn.mu.Lock()
+ fd.fs.conn.connected = false
+ fd.fs.conn.mu.Unlock()
+
+ fd.fs.VFSFilesystem().DecRef(ctx)
+ fd.fs = nil
+ }
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -116,10 +119,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
+ return 0, syserror.ENODEV
+ }
+
// We require that any Read done on this filesystem have a sane minimum
// read buffer. It must have the capacity for the fixed parts of any request
// header (Linux uses the request header and the FUSEWriteIn header for this
@@ -143,58 +152,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R
}
// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+//
+// Preconditions: dst is large enough for any reasonable request.
func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- if fd.queue.Empty() {
- return 0, syserror.ErrWouldBlock
- }
+ var req *Request
- var readCursor uint32
- var bytesRead int64
- for {
- req := fd.queue.Front()
- if dst.NumBytes() < int64(req.hdr.Len) {
- // The request is too large. Cannot process it. All requests must be smaller than the
- // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
- // handshake.
- errno := -int32(syscall.EIO)
- if req.hdr.Opcode == linux.FUSE_SETXATTR {
- errno = -int32(syscall.E2BIG)
- }
+ // Find the first valid request.
+ // For the normal case this loop only execute once.
+ for !fd.queue.Empty() {
+ req = fd.queue.Front()
- // Return the error to the calling task.
- if err := fd.sendError(ctx, errno, req); err != nil {
- return 0, err
- }
+ if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+ break
+ }
- // We're done with this request.
- fd.queue.Remove(req)
+ // The request is too large. Cannot process it. All requests must be smaller than the
+ // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+ // handshake.
+ errno := -int32(syscall.EIO)
+ if req.hdr.Opcode == linux.FUSE_SETXATTR {
+ errno = -int32(syscall.E2BIG)
+ }
- // Restart the read as this request was invalid.
- log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
- return fd.readLocked(ctx, dst, opts)
+ // Return the error to the calling task.
+ if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+ return 0, err
}
- n, err := dst.CopyOut(ctx, req.data[readCursor:])
+ // We're done with this request.
+ fd.queue.Remove(req)
+ req = nil
+ }
+
+ if req == nil {
+ return 0, syserror.ErrWouldBlock
+ }
+
+ // We already checked the size: dst must be able to fit the whole request.
+ // Now we write the marshalled header, the payload,
+ // and the potential additional payload
+ // to the user memory IOSequence.
+
+ n, err := dst.CopyOut(ctx, req.data)
+ if err != nil {
+ return 0, err
+ }
+ if n != len(req.data) {
+ return 0, syserror.EIO
+ }
+
+ if req.hdr.Opcode == linux.FUSE_WRITE {
+ written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
if err != nil {
return 0, err
}
- readCursor += uint32(n)
- bytesRead += int64(n)
-
- if readCursor >= req.hdr.Len {
- // Fully done with this req, remove it from the queue.
- fd.queue.Remove(req)
- break
+ if written != len(req.payload) {
+ return 0, syserror.EIO
}
+ n += int(written)
+ }
+
+ // Fully done with this req, remove it from the queue.
+ fd.queue.Remove(req)
+
+ // Remove noReply ones from map of requests expecting a reply.
+ if req.noReply {
+ fd.numActiveRequests -= 1
+ delete(fd.completions, req.hdr.Unique)
}
- return bytesRead, nil
+ return int64(n), nil
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -211,10 +244,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.
// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
+ // Return ENODEV if the filesystem is umounted.
+ if fd.fs.umounted {
+ return 0, syserror.ENODEV
+ }
+
var cn, n int64
hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
@@ -276,7 +314,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
fut, ok := fd.completions[hdr.Unique]
if !ok {
- // Server sent us a response for a request we never sent?
+ // Server sent us a response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted), an unlikely event.
return 0, syserror.EINVAL
}
@@ -307,8 +346,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
// Readiness implements vfs.FileDescriptionImpl.Readiness.
func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ return fd.readinessLocked(mask)
+}
+
+// readinessLocked implements checking the readiness of the fuse device while
+// locked with DeviceFD.mu.
+func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
var ready waiter.EventMask
- ready |= waiter.EventOut // FD is always writable
+
+ if fd.fs.umounted {
+ ready |= waiter.EventErr
+ return ready & mask
+ }
+
+ // FD is always writable.
+ ready |= waiter.EventOut
if !fd.queue.Empty() {
// Have reqs available, FD is readable.
ready |= waiter.EventIn
@@ -330,7 +384,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
- if !fd.mounted {
+ if fd.fs == nil {
return 0, syserror.EPERM
}
@@ -338,59 +392,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
}
// sendResponse sends a response to the waiting task (if any).
+//
+// Preconditions: fd.mu must be held.
func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
- // See if the running task need to perform some action before returning.
- // Since we just finished writing the future, we can be sure that
- // getResponse generates a populated response.
- if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
- return err
- }
+ // Signal the task waiting on a response if any.
+ defer close(fut.ch)
// Signal that the queue is no longer full.
select {
case fd.fullQueueCh <- struct{}{}:
default:
}
- fd.numActiveRequests -= 1
+ fd.numActiveRequests--
+
+ if fut.async {
+ return fd.asyncCallBack(ctx, fut.getResponse())
+ }
- // Signal the task waiting on a response.
- close(fut.ch)
return nil
}
-// sendError sends an error response to the waiting task (if any).
-func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+// sendError sends an error response to the waiting task (if any) by calling sendResponse().
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
// Return the error to the calling task.
outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
respHdr := linux.FUSEHeaderOut{
Len: outHdrLen,
Error: errno,
- Unique: req.hdr.Unique,
+ Unique: unique,
}
fut, ok := fd.completions[respHdr.Unique]
if !ok {
- // Server sent us a response for a request we never sent?
+ // A response for a request we never sent,
+ // or for which we already received a reply (e.g. aborted).
return syserror.EINVAL
}
delete(fd.completions, respHdr.Unique)
fut.hdr = &respHdr
- if err := fd.sendResponse(ctx, fut); err != nil {
- return err
- }
-
- return nil
+ return fd.sendResponse(ctx, fut)
}
-// noReceiverAction has the calling kernel.Task do some action if its known that no
-// receiver is going to be waiting on the future channel. This is to be used by:
-// FUSE_INIT.
-func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
- if r.opcode == linux.FUSE_INIT {
+// asyncCallBack executes pre-defined callback function for async requests.
+// Currently used by: FUSE_INIT.
+func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
+ switch r.opcode {
+ case linux.FUSE_INIT:
creds := auth.CredentialsFromContext(ctx)
rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+ // TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
}
return nil
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 1ffe7ccd2..5986133e9 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -16,7 +16,6 @@ package fuse
import (
"fmt"
- "io"
"math/rand"
"testing"
@@ -28,17 +27,12 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
- "gvisor.dev/gvisor/tools/go_marshal/marshal"
)
// echoTestOpcode is the Opcode used during testing. The server used in tests
// will simply echo the payload back with the appropriate headers.
const echoTestOpcode linux.FUSEOpcode = 1000
-type testPayload struct {
- data uint32
-}
-
// TestFUSECommunication tests that the communication layer between the Sentry and the
// FUSE server daemon works as expected.
func TestFUSECommunication(t *testing.T) {
@@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F
}
}
}
-
-func setup(t *testing.T) *testutil.System {
- k, err := testutil.Boot()
- if err != nil {
- t.Fatalf("Error creating kernel: %v", err)
- }
-
- ctx := k.SupervisorContext()
- creds := auth.CredentialsFromContext(ctx)
-
- k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserList: true,
- AllowUserMount: true,
- })
-
- mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
- if err != nil {
- t.Fatalf("NewMountNamespace(): %v", err)
- }
-
- return testutil.NewSystem(ctx, t, k.VFS(), mntns)
-}
-
-// newTestConnection creates a fuse connection that the sentry can communicate with
-// and the FD for the server to communicate with.
-func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
- vfsObj := &vfs.VirtualFilesystem{}
- fuseDev := &DeviceFD{}
-
- if err := vfsObj.Init(system.Ctx); err != nil {
- return nil, nil, err
- }
-
- vd := vfsObj.NewAnonVirtualDentry("genCountFD")
- defer vd.DecRef(system.Ctx)
- if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
- return nil, nil, err
- }
-
- fsopts := filesystemOptions{
- maxActiveRequests: maxActiveRequests,
- }
- fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
- if err != nil {
- return nil, nil, err
- }
-
- return fs.conn, &fuseDev.vfsfd, nil
-}
-
-// SizeBytes implements marshal.Marshallable.SizeBytes.
-func (t *testPayload) SizeBytes() int {
- return 4
-}
-
-// MarshalBytes implements marshal.Marshallable.MarshalBytes.
-func (t *testPayload) MarshalBytes(dst []byte) {
- usermem.ByteOrder.PutUint32(dst[:4], t.data)
-}
-
-// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
-func (t *testPayload) UnmarshalBytes(src []byte) {
- *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
-}
-
-// Packed implements marshal.Marshallable.Packed.
-func (t *testPayload) Packed() bool {
- return true
-}
-
-// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
-func (t *testPayload) MarshalUnsafe(dst []byte) {
- t.MarshalBytes(dst)
-}
-
-// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
-func (t *testPayload) UnmarshalUnsafe(src []byte) {
- t.UnmarshalBytes(src)
-}
-
-// CopyOutN implements marshal.Marshallable.CopyOutN.
-func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
- panic("not implemented")
-}
-
-// CopyOut implements marshal.Marshallable.CopyOut.
-func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
- panic("not implemented")
-}
-
-// CopyIn implements marshal.Marshallable.CopyIn.
-func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
- panic("not implemented")
-}
-
-// WriteTo implements io.WriterTo.WriteTo.
-func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
- panic("not implemented")
-}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
new file mode 100644
index 000000000..8f220a04b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type directoryFD struct {
+ fileDescription
+}
+
+// Allocate implements directoryFD.Allocate.
+func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ return syserror.EISDIR
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+ fusefs := dir.inode().fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEReadIn{
+ Fh: dir.Fh,
+ Offset: uint64(atomic.LoadInt64(&dir.off)),
+ Size: linux.FUSE_PAGE_SIZE,
+ Flags: dir.statusFlags(),
+ }
+
+ // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := fusefs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ var out linux.FUSEDirents
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ for _, fuseDirent := range out.Dirents {
+ nextOff := int64(fuseDirent.Meta.Off)
+ dirent := vfs.Dirent{
+ Name: fuseDirent.Name,
+ Type: uint8(fuseDirent.Meta.Type),
+ Ino: fuseDirent.Meta.Ino,
+ NextOff: nextOff,
+ }
+
+ if err := callback.Handle(dirent); err != nil {
+ return err
+ }
+ atomic.StoreInt64(&dir.off, nextOff)
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+ vfs.DentryMetadataFileDescriptionImpl
+ vfs.NoLockFD
+
+ // the file handle used in userspace.
+ Fh uint64
+
+ // Nonseekable is indicate cannot perform seek on a file.
+ Nonseekable bool
+
+ // DirectIO suggest fuse to use direct io operation.
+ DirectIO bool
+
+ // OpenFlag is the flag returned by open.
+ OpenFlag uint32
+
+ // off is the file offset.
+ off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+ return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+ return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+ // no need to release if FUSE server doesn't implement Open.
+ conn := fd.inode().fs.conn
+ if conn.noOpen {
+ return
+ }
+
+ in := linux.FUSEReleaseIn{
+ Fh: fd.Fh,
+ Flags: fd.statusFlags(),
+ }
+ // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
+ var opcode linux.FUSEOpcode
+ if fd.inode().Mode().IsDir() {
+ opcode = linux.FUSE_RELEASEDIR
+ } else {
+ opcode = linux.FUSE_RELEASE
+ }
+ kernelTask := kernel.TaskFromContext(ctx)
+ // ignoring errors and FUSE server reply is analogous to Linux's behavior.
+ req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
+ if err != nil {
+ // No way to invoke Call() with an errored request.
+ return
+ }
+ // The reply will be ignored since no callback is defined in asyncCallBack().
+ conn.CallAsync(kernelTask, req)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.filesystem()
+ inode := fd.inode()
+ return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ fs := fd.filesystem()
+ creds := auth.CredentialsFromContext(ctx)
+ return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 83c24ec25..b3573f80d 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -16,21 +16,30 @@
package fuse
import (
+ "math"
"strconv"
+ "sync"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/marshal"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
)
// Name is the default filesystem name.
const Name = "fuse"
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
@@ -56,6 +65,11 @@ type filesystemOptions struct {
// exist at any time. Any further requests will block when trying to
// Call the server.
maxActiveRequests uint64
+
+ // maxRead is the max number of bytes to read,
+ // specified as "max_read" in fs parameters.
+ // If not specified by user, use math.MaxUint32 as default value.
+ maxRead uint32
}
// filesystem implements vfs.FilesystemImpl.
@@ -69,6 +83,9 @@ type filesystem struct {
// opts is the options the fusefs is initialized with.
opts *filesystemOptions
+
+ // umounted is true if filesystem.Release() has been called.
+ umounted bool
}
// Name implements vfs.FilesystemType.Name.
@@ -142,14 +159,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Set the maxInFlightRequests option.
fsopts.maxActiveRequests = maxActiveRequestsDefault
+ if maxReadStr, ok := mopts["max_read"]; ok {
+ delete(mopts, "max_read")
+ maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
+ if err != nil {
+ log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
+ return nil, nil, syserror.EINVAL
+ }
+ if maxRead < fuseMinMaxRead {
+ maxRead = fuseMinMaxRead
+ }
+ fsopts.maxRead = uint32(maxRead)
+ } else {
+ fsopts.maxRead = math.MaxUint32
+ }
+
// Check for unparsed options.
if len(mopts) != 0 {
- log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+ log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
return nil, nil, syserror.EINVAL
}
// Create a new FUSE filesystem.
- fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+ fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
if err != nil {
log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
return nil, nil, err
@@ -165,26 +197,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
// root is the fusefs root directory.
- root := fs.newInode(creds, fsopts.rootMode)
+ root := fs.newRootInode(creds, fsopts.rootMode)
return fs.VFSFilesystem(), root.VFSDentry(), nil
}
-// NewFUSEFilesystem creates a new FUSE filesystem.
-func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
- fs := &filesystem{
- devMinor: devMinor,
- opts: opts,
- }
-
- conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+// newFUSEFilesystem creates a new FUSE filesystem.
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+ conn, err := newFUSEConnection(ctx, device, opts)
if err != nil {
log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
return nil, syserror.EINVAL
}
- fs.conn = conn
fuseFD := device.Impl().(*DeviceFD)
+
+ fs := &filesystem{
+ devMinor: devMinor,
+ opts: opts,
+ conn: conn,
+ }
+
+ fs.VFSFilesystem().IncRef()
fuseFD.fs = fs
return fs, nil
@@ -192,39 +226,375 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
+ fs.conn.fd.mu.Lock()
+
+ fs.umounted = true
+ fs.conn.Abort(ctx)
+ // Notify all the waiters on this fd.
+ fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+ fs.conn.fd.mu.Unlock()
+
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
fs.Filesystem.Release(ctx)
}
// inode implements kernfs.Inode.
type inode struct {
+ inodeRefs
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
kernfs.OrderedChildren
+ dentry kernfs.Dentry
+
+ // the owning filesystem. fs is immutable.
+ fs *filesystem
+
+ // metaDataMu protects the metadata of this inode.
+ metadataMu sync.Mutex
+
+ nodeID uint64
+
locks vfs.FileLocks
- dentry kernfs.Dentry
+ // size of the file.
+ size uint64
+
+ // attributeVersion is the version of inode's attributes.
+ attributeVersion uint64
+
+ // attributeTime is the remaining vaild time of attributes.
+ attributeTime uint64
+
+ // version of the inode.
+ version uint64
+
+ // link is result of following a symbolic link.
+ link string
}
-func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
- i := &inode{}
- i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+ i := &inode{fs: fs}
+ i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
i.dentry.Init(i)
+ i.nodeID = 1
+
+ return &i.dentry
+}
+
+func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry {
+ i := &inode{fs: fs, nodeID: nodeID}
+ creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
+ i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+ atomic.StoreUint64(&i.size, attr.Size)
+ i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ i.EnableLeakCheck()
+ i.dentry.Init(i)
return &i.dentry
}
// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ isDir := i.InodeAttrs.Mode().IsDir()
+ // return error if specified to open directory but inode is not a directory.
+ if !isDir && opts.Mode.IsDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+ return nil, syserror.EOVERFLOW
+ }
+
+ var fd *fileDescription
+ var fdImpl vfs.FileDescriptionImpl
+ if isDir {
+ directoryFD := &directoryFD{}
+ fd = &(directoryFD.fileDescription)
+ fdImpl = directoryFD
+ } else {
+ regularFD := &regularFileFD{}
+ fd = &(regularFD.fileDescription)
+ fdImpl = regularFD
+ }
+ // FOPEN_KEEP_CACHE is the defualt flag for noOpen.
+ fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+ // Only send open request when FUSE server support open or is opening a directory.
+ if !i.fs.conn.noOpen || isDir {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+ return nil, syserror.EINVAL
+ }
+
+ // Build the request.
+ var opcode linux.FUSEOpcode
+ if isDir {
+ opcode = linux.FUSE_OPENDIR
+ } else {
+ opcode = linux.FUSE_OPEN
+ }
+
+ in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+ if !i.fs.conn.atomicOTrunc {
+ in.Flags &= ^uint32(linux.O_TRUNC)
+ }
+
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+ if err != nil {
+ return nil, err
+ }
+
+ // Send the request and receive the reply.
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return nil, err
+ }
+ if err := res.Error(); err == syserror.ENOSYS && !isDir {
+ i.fs.conn.noOpen = true
+ } else if err != nil {
+ return nil, err
+ } else {
+ out := linux.FUSEOpenOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return nil, err
+ }
+
+ // Process the reply.
+ fd.OpenFlag = out.OpenFlag
+ if isDir {
+ fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+ }
+
+ fd.Fh = out.Fh
+ }
+ }
+
+ // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode
+ fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+ fdOptions := &vfs.FileDescriptionOptions{}
+ if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+ fdOptions.DenyPRead = true
+ fdOptions.DenyPWrite = true
+ fd.Nonseekable = true
+ }
+
+ // If we don't send SETATTR before open (which is indicated by atomicOTrunc)
+ // and O_TRUNC is set, update the inode's version number and clean existing data
+ // by setting the file size to 0.
+ if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+ i.fs.conn.mu.Lock()
+ i.fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, 0)
+ i.fs.conn.mu.Unlock()
+ i.attributeTime = 0
+ }
+
+ if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), vfsd, fdOptions); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+ in := linux.FUSELookupIn{Name: name}
+ return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return offset, nil
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (*inode) Valid(ctx context.Context) bool {
+ return true
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ in := linux.FUSECreateIn{
+ CreateMeta: linux.FUSECreateMeta{
+ Flags: opts.Flags,
+ Mode: uint32(opts.Mode) | linux.S_IFREG,
+ Umask: uint32(kernelTask.FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+ in := linux.FUSEMknodIn{
+ MknodMeta: linux.FUSEMknodMeta{
+ Mode: uint32(opts.Mode),
+ Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+ in := linux.FUSESymLinkIn{
+ Name: name,
+ Target: target,
+ }
+ return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return syserror.EINVAL
+ }
+ in := linux.FUSEUnlinkIn{Name: name}
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
+ if err != nil {
+ return err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return err
+ }
+ // only return error, discard res.
+ if err := res.Error(); err != nil {
+ return err
+ }
+ return i.dentry.RemoveChildLocked(name, child)
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+ in := linux.FUSEMkdirIn{
+ MkdirMeta: linux.FUSEMkdirMeta{
+ Mode: uint32(opts.Mode),
+ Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+ },
+ Name: name,
+ }
+ return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+ fusefs := i.fs
+ task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSERmDirIn{Name: name}
+ req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
if err != nil {
+ return err
+ }
+
+ res, err := i.fs.conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+
+ // TODO(Before merging): When creating new nodes, should we add nodes to the ordered children?
+ // If so we'll probably need to call this. We will also need to add them with the writable flag when
+ // appropriate.
+ // return i.OrderedChildren.RmDir(ctx, name, child)
+
+ return nil
+}
+
+// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*vfs.Dentry, error) {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+ return nil, syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+ if err != nil {
+ return nil, err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return nil, err
+ }
+ if err := res.Error(); err != nil {
+ return nil, err
+ }
+ out := linux.FUSEEntryOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
return nil, err
}
- return fd.VFSFileDescription(), nil
+ if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+ return nil, syserror.EIO
+ }
+ child := i.fs.newInode(out.NodeID, out.Attr)
+ if opcode == linux.FUSE_LOOKUP {
+ i.dentry.InsertChildLocked(name, child)
+ } else {
+ i.dentry.InsertChild(name, child)
+ }
+ return child.VFSDentry(), nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ path, err := i.Readlink(ctx, mnt)
+ return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+ if i.Mode().FileType()&linux.S_IFLNK == 0 {
+ return "", syserror.EINVAL
+ }
+ if len(i.link) == 0 {
+ kernelTask := kernel.TaskFromContext(ctx)
+ if kernelTask == nil {
+ log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+ return "", syserror.EINVAL
+ }
+ req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+ if err != nil {
+ return "", err
+ }
+ res, err := i.fs.conn.Call(kernelTask, req)
+ if err != nil {
+ return "", err
+ }
+ i.link = string(res.data[res.hdr.SizeBytes():])
+ if !mnt.Options().ReadOnly {
+ i.attributeTime = 0
+ }
+ }
+ return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+ return linux.FUSEAttr{
+ Ino: i.Ino(),
+ Size: atomic.LoadUint64(&i.size),
+ Mode: uint32(i.Mode()),
+ }
}
// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -280,45 +650,179 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
return stat
}
-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- fusefs := fs.Impl().(*filesystem)
- conn := fusefs.conn
- task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request
+// or read from local cache. It updates the corresponding attributes if
+// necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+ attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+ // TODO(gvisor.dev/issue/3679): send the request only if
+ // - invalid local cache for fields specified in the opts.Mask
+ // - forced update
+ // - i.attributeTime expired
+ // If local cache is still valid, return local cache.
+ // Currently we always send a request,
+ // and we always set the metadata with the new result,
+ // unless attributeVersion has changed.
+
+ task := kernel.TaskFromContext(ctx)
if task == nil {
log.Warningf("couldn't get kernel task from context")
- return linux.Statx{}, syserror.EINVAL
+ return linux.FUSEAttr{}, syserror.EINVAL
}
- var in linux.FUSEGetAttrIn
- // We don't set any attribute in the request, because in VFS2 fstat(2) will
- // finally be translated into vfs.FilesystemImpl.StatAt() (see
- // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
- // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
- req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+ creds := auth.CredentialsFromContext(ctx)
+
+ in := linux.FUSEGetAttrIn{
+ GetAttrFlags: flags,
+ Fh: fh,
+ }
+ req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
if err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
- res, err := conn.Call(task, req)
+ res, err := i.fs.conn.Call(task, req)
if err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
if err := res.Error(); err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
var out linux.FUSEGetAttrOut
if err := res.UnmarshalPayload(&out); err != nil {
- return linux.Statx{}, err
+ return linux.FUSEAttr{}, err
}
- // Set all metadata into kernfs.InodeAttrs.
- if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
- Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+ // Local version is newer, return the local one.
+ // Skip the update.
+ if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
+ return i.getFUSEAttr(), nil
+ }
+
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
}); err != nil {
+ return linux.FUSEAttr{}, err
+ }
+
+ // Set the size if no error (after SetStat() check).
+ atomic.StoreUint64(&i.size, out.Attr.Size)
+
+ return out.Attr, nil
+}
+
+// reviseAttr attempts to update the attributes for internal purposes
+// by calling getAttr with a pre-specified mask.
+// Used by read, write, lseek.
+func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
+ // Never need atime for internal purposes.
+ _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
+ Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
+ }, flags, fh)
+ return err
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ attr, err := i.getAttr(ctx, fs, opts, 0, 0)
+ if err != nil {
return linux.Statx{}, err
}
- return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+ return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(context.Context) {
+ i.inodeRefs.DecRef(i.Destroy)
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
+ return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
+}
+
+// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask
+// aligned with the attribute mask defined in include/linux/fs.h.
+func fattrMaskFromStats(mask uint32) uint32 {
+ var fuseAttrMask uint32
+ maskMap := map[uint32]uint32{
+ linux.STATX_MODE: linux.FATTR_MODE,
+ linux.STATX_UID: linux.FATTR_UID,
+ linux.STATX_GID: linux.FATTR_GID,
+ linux.STATX_SIZE: linux.FATTR_SIZE,
+ linux.STATX_ATIME: linux.FATTR_ATIME,
+ linux.STATX_MTIME: linux.FATTR_MTIME,
+ linux.STATX_CTIME: linux.FATTR_CTIME,
+ }
+ for statxMask, fattrMask := range maskMap {
+ if mask&statxMask != 0 {
+ fuseAttrMask |= fattrMask
+ }
+ }
+ return fuseAttrMask
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ return i.setAttr(ctx, fs, creds, opts, false, 0)
+}
+
+func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
+ conn := i.fs.conn
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ log.Warningf("couldn't get kernel task from context")
+ return syserror.EINVAL
+ }
+
+ // We should retain the original file type when assigning new mode.
+ fileType := uint16(i.Mode()) & linux.S_IFMT
+ fattrMask := fattrMaskFromStats(opts.Stat.Mask)
+ if useFh {
+ fattrMask |= linux.FATTR_FH
+ }
+ in := linux.FUSESetAttrIn{
+ Valid: fattrMask,
+ Fh: fh,
+ Size: opts.Stat.Size,
+ Atime: uint64(opts.Stat.Atime.Sec),
+ Mtime: uint64(opts.Stat.Mtime.Sec),
+ Ctime: uint64(opts.Stat.Ctime.Sec),
+ AtimeNsec: opts.Stat.Atime.Nsec,
+ MtimeNsec: opts.Stat.Mtime.Nsec,
+ CtimeNsec: opts.Stat.Ctime.Nsec,
+ Mode: uint32(fileType | opts.Stat.Mode),
+ UID: opts.Stat.UID,
+ GID: opts.Stat.GID,
+ }
+ req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
+ if err != nil {
+ return err
+ }
+
+ res, err := conn.Call(task, req)
+ if err != nil {
+ return err
+ }
+ if err := res.Error(); err != nil {
+ return err
+ }
+ out := linux.FUSEGetAttrOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return err
+ }
+
+ // Set the metadata of kernfs.InodeAttrs.
+ if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{
+ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+ }); err != nil {
+ return err
+ }
+
+ return nil
}
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
new file mode 100644
index 000000000..625d1547f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -0,0 +1,242 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the size after round it up to
+// a multiple of page size, blocks on it for reply, processes the reply
+// and returns the payload (or joined payloads) as a byte slice.
+// This is used for the general purpose reading.
+// We do not support direct IO (which read the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+ attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return nil, 0, syserror.EINVAL
+ }
+
+ // Round up to a multiple of page size.
+ readSize, _ := usermem.PageRoundUp(uint64(size))
+
+ // One request cannnot exceed either maxRead or maxPages.
+ maxPages := fs.conn.maxRead >> usermem.PageShift
+ if maxPages > uint32(fs.conn.maxPages) {
+ maxPages = uint32(fs.conn.maxPages)
+ }
+
+ var outs [][]byte
+ var sizeRead uint32
+
+ // readSize is a multiple of usermem.PageSize.
+ // Always request bytes as a multiple of pages.
+ pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+ // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEReadIn{
+ Fh: fd.Fh,
+ LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+ ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ Flags: fd.statusFlags(),
+ }
+
+ // This loop is intended for fragmented read where the bytes to read is
+ // larger than either the maxPages or maxRead.
+ // For the majority of reads with normal size, this loop should only
+ // execute once.
+ for pagesRead < pagesToRead {
+ pagesCanRead := pagesToRead - pagesRead
+ if pagesCanRead > maxPages {
+ pagesCanRead = maxPages
+ }
+
+ in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+ in.Size = pagesCanRead << usermem.PageShift
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+ if err != nil {
+ return nil, 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3247): support async read.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return nil, 0, err
+ }
+ if err := res.Error(); err != nil {
+ return nil, 0, err
+ }
+
+ // Not enough bytes in response,
+ // either we reached EOF,
+ // or the FUSE server sends back a response
+ // that cannot even fit the hdr.
+ if len(res.data) <= res.hdr.SizeBytes() {
+ // We treat both case as EOF here for now
+ // since there is no reliable way to detect
+ // the over-short hdr case.
+ break
+ }
+
+ // Directly using the slice to avoid extra copy.
+ out := res.data[res.hdr.SizeBytes():]
+
+ outs = append(outs, out)
+ sizeRead += uint32(len(out))
+
+ pagesRead += pagesCanRead
+ }
+
+ defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+ // No bytes returned: offset >= EOF.
+ if len(outs) == 0 {
+ return nil, 0, io.EOF
+ }
+
+ return outs, sizeRead, nil
+}
+
+// ReadCallback updates several information after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+ // TODO(gvisor.dev/issue/3247): support async read.
+ // If this is called by an async read, correctly process it.
+ // May need to update the signature.
+
+ i := fd.inode()
+ // TODO(gvisor.dev/issue/1193): Invalidate or update atime.
+
+ // Reached EOF.
+ if sizeRead < size {
+ // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+ // Might need to update the buf to be returned from the Read().
+
+ // Update existing size.
+ newSize := off + uint64(sizeRead)
+ fs.conn.mu.Lock()
+ if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+ fs.conn.attributeVersion++
+ i.attributeVersion = i.fs.conn.attributeVersion
+ atomic.StoreUint64(&i.size, newSize)
+ }
+ fs.conn.mu.Unlock()
+ }
+}
+
+// Write sends FUSE_WRITE requests and return the bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ log.Warningf("fusefs.Read: couldn't get kernel task from context")
+ return 0, syserror.EINVAL
+ }
+
+ // One request cannnot exceed either maxWrite or maxPages.
+ maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+ if maxWrite > fs.conn.maxWrite {
+ maxWrite = fs.conn.maxWrite
+ }
+
+ // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+ in := linux.FUSEWriteIn{
+ Fh: fd.Fh,
+ // TODO(gvisor.dev/issue/3245): file lock
+ LockOwner: 0,
+ // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+ // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+ WriteFlags: 0,
+ Flags: fd.statusFlags(),
+ }
+
+ var written uint32
+
+ // This loop is intended for fragmented write where the bytes to write is
+ // larger than either the maxWrite or maxPages or when bigWrites is false.
+ // Unless a small value for max_write is explicitly used, this loop
+ // is expected to execute only once for the majority of the writes.
+ for written < size {
+ toWrite := size - written
+
+ // Limit the write size to one page.
+ // Note that the bigWrites flag is obsolete,
+ // latest libfuse always sets it on.
+ if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+ toWrite = usermem.PageSize
+ }
+
+ // Limit the write size to maxWrite.
+ if toWrite > maxWrite {
+ toWrite = maxWrite
+ }
+
+ in.Offset = off + uint64(written)
+ in.Size = toWrite
+
+ req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in)
+ if err != nil {
+ return 0, err
+ }
+
+ req.payload = data[written : written+toWrite]
+
+ // TODO(gvisor.dev/issue/3247): support async write.
+
+ res, err := fs.conn.Call(t, req)
+ if err != nil {
+ return 0, err
+ }
+ if err := res.Error(); err != nil {
+ return 0, err
+ }
+
+ out := linux.FUSEWriteOut{}
+ if err := res.UnmarshalPayload(&out); err != nil {
+ return 0, err
+ }
+
+ // Write more than requested? EIO.
+ if out.Size > toWrite {
+ return 0, syserror.EIO
+ }
+
+ written += out.Size
+
+ // Break if short write. Not necessarily an error.
+ if out.Size != toWrite {
+ break
+ }
+ }
+
+ return written, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset.
+ off int64
+ // offMu protects off.
+ offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ size := dst.NumBytes()
+ if size == 0 {
+ // Early return if count is 0.
+ return 0, nil
+ } else if size > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, syserror.EINVAL
+ }
+
+ // TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+ inode := fd.inode()
+
+ // Reading beyond EOF, update file size if outdated.
+ if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+ if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+ return 0, err
+ }
+ // If the offset after update is still too large, return error.
+ if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+ return 0, io.EOF
+ }
+ }
+
+ // Truncate the read with updated file size.
+ fileSize := atomic.LoadUint64(&inode.size)
+ if uint64(offset+size) > fileSize {
+ size = int64(fileSize) - offset
+ }
+
+ buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+ if err != nil {
+ return 0, err
+ }
+
+ // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+ // store the bytes that were read ahead.
+
+ // Update the number of bytes to copy for short read.
+ if n < uint32(size) {
+ size = int64(n)
+ }
+
+ // Copy the bytes read to the dst.
+ // This loop is intended for fragmented reads.
+ // For the majority of reads, this loop only execute once.
+ var copied int64
+ for _, buffer := range buffers {
+ toCopy := int64(len(buffer))
+ if copied+toCopy > size {
+ toCopy = size - copied
+ }
+ cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+ if err != nil {
+ return 0, err
+ }
+ if int64(cp) != toCopy {
+ return 0, syserror.EIO
+ }
+ copied += toCopy
+ }
+
+ return copied, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ n, _, err := fd.pwrite(ctx, src, offset, opts)
+ return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+ fd.off = off
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+ if offset < 0 {
+ return 0, offset, syserror.EINVAL
+ }
+
+ // Check that flags are supported.
+ //
+ // TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+ if opts.Flags&^linux.RWF_HIPRI != 0 {
+ return 0, offset, syserror.EOPNOTSUPP
+ }
+
+ inode := fd.inode()
+ inode.metadataMu.Lock()
+ defer inode.metadataMu.Unlock()
+
+ // If the file is opened with O_APPEND, update offset to file size.
+ // Note: since our Open() implements the interface of kernfs,
+ // and kernfs currently does not support O_APPEND, this will never
+ // be true before we switch out from kernfs.
+ if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+ // Locking inode.metadataMu is sufficient for reading size
+ offset = int64(inode.size)
+ }
+
+ srclen := src.NumBytes()
+
+ if srclen > math.MaxUint32 {
+ // FUSE only supports uint32 for size.
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+ if end := offset + srclen; end < offset {
+ // Overflow.
+ return 0, offset, syserror.EINVAL
+ }
+
+ srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if srclen == 0 {
+ // Return before causing any side effects.
+ return 0, offset, nil
+ }
+
+ src = src.TakeFirst64(srclen)
+
+ // TODO(gvisor.dev/issue/3237): Add cache support:
+ // buffer cache. Ideally we write from src to our buffer cache first.
+ // The slice passed to fs.Write() should be a slice from buffer cache.
+ data := make([]byte, srclen)
+ // Reason for making a copy here: connection.Call() blocks on kerneltask,
+ // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will
+ // attemp to acquire the mm.activeMu lock as well -> deadlock.
+ // We must finish reading from the userspace memory before
+ // t.Block() deactivates it.
+ cp, err := src.CopyIn(ctx, data)
+ if err != nil {
+ return 0, offset, err
+ }
+ if int64(cp) != srclen {
+ return 0, offset, syserror.EIO
+ }
+
+ n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+ if err != nil {
+ return 0, offset, err
+ }
+
+ if n == 0 {
+ // We have checked srclen != 0 previously.
+ // If err == nil, then it's a short write and we return EIO.
+ return 0, offset, syserror.EIO
+ }
+
+ written = int64(n)
+ finalOff = offset + written
+
+ if finalOff > int64(inode.size) {
+ atomic.StoreUint64(&inode.size, uint64(finalOff))
+ atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+ }
+
+ return
+}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of FUSE protocol, which contains a
+// linux.FUSEInitOut with less attributes.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+ marshal.StubMarshallable
+
+ // initOut contains the response from the FUSE server.
+ initOut linux.FUSEInitOut
+
+ // initLen is the total length of bytes of the response.
+ initLen uint32
+}
+
+// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+ out := &r.initOut
+
+ // Introduced before FUSE kernel version 7.13.
+ out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+
+ // Introduced in FUSE kernel version 7.23.
+ if len(src) >= 4 {
+ out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
+ src = src[4:]
+ }
+ // Introduced in FUSE kernel version 7.28.
+ if len(src) >= 2 {
+ out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
+ src = src[2:]
+ }
+}
+
+// SizeBytes is the size of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+ return int(r.initLen)
+}
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+ requestEntry
+
+ id linux.FUSEOpID
+ hdr *linux.FUSEHeaderIn
+ data []byte
+
+ // payload for this request: extra bytes to write after
+ // the data slice. Used by FUSE_WRITE.
+ payload []byte
+
+ // If this request is async.
+ async bool
+ // If we don't care its response.
+ // Manually set by the caller.
+ noReply bool
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+ conn.fd.mu.Lock()
+ defer conn.fd.mu.Unlock()
+ conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+ hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+ hdr := linux.FUSEHeaderIn{
+ Len: uint32(hdrLen + payload.SizeBytes()),
+ Opcode: opcode,
+ Unique: conn.fd.nextOpID,
+ NodeID: ino,
+ UID: uint32(creds.EffectiveKUID),
+ GID: uint32(creds.EffectiveKGID),
+ PID: pid,
+ }
+
+ buf := make([]byte, hdr.Len)
+
+ // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ hdr.MarshalBytes(buf[:hdrLen])
+ payload.MarshalBytes(buf[hdrLen:])
+
+ return &Request{
+ id: hdr.Unique,
+ hdr: &hdr,
+ data: buf,
+ }, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+ opcode linux.FUSEOpcode
+ ch chan struct{}
+ hdr *linux.FUSEHeaderOut
+ data []byte
+
+ // If this request is async.
+ async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+ return &futureResponse{
+ opcode: req.hdr.Opcode,
+ ch: make(chan struct{}),
+ async: req.async,
+ }
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+ // Return directly for async requests.
+ if f.async {
+ return nil, nil
+ }
+
+ if err := t.Block(f.ch); err != nil {
+ return nil, err
+ }
+
+ return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+ return &Response{
+ opcode: f.opcode,
+ hdr: *f.hdr,
+ data: f.data,
+ }
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+ opcode linux.FUSEOpcode
+ hdr linux.FUSEHeaderOut
+ data []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+ errno := r.hdr.Error
+ if errno >= 0 {
+ return nil
+ }
+
+ sysErrNo := syscall.Errno(-errno)
+ return error(sysErrNo)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+ return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+ hdrLen := r.hdr.SizeBytes()
+ haveDataLen := r.hdr.Len - uint32(hdrLen)
+ wantDataLen := uint32(m.SizeBytes())
+
+ if haveDataLen < wantDataLen {
+ return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen)
+ }
+
+ // The response data is empty unless there is some payload. And so, doesn't
+ // need to be unmarshalled.
+ if r.data == nil {
+ return nil
+ }
+
+ // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+ m.UnmarshalBytes(r.data[hdrLen:])
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+ "io"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+ k, err := testutil.Boot()
+ if err != nil {
+ t.Fatalf("Error creating kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+ creds := auth.CredentialsFromContext(ctx)
+
+ k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ AllowUserMount: true,
+ })
+
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+ if err != nil {
+ t.Fatalf("NewMountNamespace(): %v", err)
+ }
+
+ return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+ vfsObj := &vfs.VirtualFilesystem{}
+ fuseDev := &DeviceFD{}
+
+ if err := vfsObj.Init(system.Ctx); err != nil {
+ return nil, nil, err
+ }
+
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef(system.Ctx)
+ if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, nil, err
+ }
+
+ fsopts := filesystemOptions{
+ maxActiveRequests: maxActiveRequests,
+ }
+ fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+ marshal.StubMarshallable
+ data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+ return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+ usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+ *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+ return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+ t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+ t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+ panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+ panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+ panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 2a8011eb4..91d2ae199 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -34,8 +34,11 @@ func (d *dentry) isDir() bool {
return d.fileType() == linux.S_IFDIR
}
-// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
-// d.isDir(). child must be a newly-created dentry that has never had a parent.
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+// * child must be a newly-created dentry that has never had a parent.
func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
d.IncRef() // reference held by child on its parent
child.parent = d
@@ -46,7 +49,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
d.children[name] = child
}
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
func (d *dentry) cacheNegativeLookupLocked(name string) {
// Don't cache negative lookups if InteropModeShared is in effect (since
// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
@@ -79,10 +84,12 @@ type createSyntheticOpts struct {
// createSyntheticChildLocked creates a synthetic file with the given name
// in d.
//
-// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
-// a child with the given name.
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+// * d does not already contain a child with the given name.
func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
- d2 := &dentry{
+ child := &dentry{
refs: 1, // held by d
fs: d.fs,
ino: d.fs.nextSyntheticIno(),
@@ -97,16 +104,16 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
case linux.S_IFDIR:
// Nothing else needs to be done.
case linux.S_IFSOCK:
- d2.endpoint = opts.endpoint
+ child.endpoint = opts.endpoint
case linux.S_IFIFO:
- d2.pipe = opts.pipe
+ child.pipe = opts.pipe
default:
panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
}
- d2.pf.dentry = d2
- d2.vfsd.Init(d2)
+ child.pf.dentry = child
+ child.vfsd.Init(child)
- d.cacheNewChildLocked(d2, opts.name)
+ d.cacheNewChildLocked(child, opts.name)
d.syntheticChildren++
}
@@ -151,7 +158,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
return nil
}
-// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+// Preconditions:
+// * d.isDir().
+// * There exists at least one directoryFD representing d.
func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
// presence of concurrent mutation of an iterated directory, so
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index a3903db33..97b9165cc 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -115,9 +115,12 @@ func putDentrySlice(ds *[]*dentry) {
// Dentries which may become cached as a result of the traversal are appended
// to *ds.
//
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
-// must be up to date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+// to date.
//
// Postconditions: The returned dentry's cached metadata is up to date.
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
@@ -185,8 +188,11 @@ afterSymlink:
// getChildLocked returns a dentry representing the child of parent with the
// given name. If no such child exists, getChildLocked returns (nil, nil).
//
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
-// parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
//
// Postconditions: If getChildLocked returns a non-nil dentry, its cached
// metadata is up to date.
@@ -206,7 +212,8 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
}
-// Preconditions: As for getChildLocked. !parent.isSynthetic().
+// Preconditions: Same as getChildLocked, plus:
+// * !parent.isSynthetic().
func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
if child != nil {
// Need to lock child.metadataMu because we might be updating child
@@ -279,9 +286,11 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If
-// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
-// date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+// to date.
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
@@ -328,9 +337,10 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
// createInRemoteDir (if the parent directory is a real remote directory) or
// createInSyntheticDir (if the parent directory is synthetic) to do so.
//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -399,7 +409,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
// stale dentry exists, the dentry will fail revalidation next time it's
// used.
- if err := createInRemoteDir(parent, name); err != nil {
+ if err := createInRemoteDir(parent, name, &ds); err != nil {
return err
}
ev := linux.IN_CREATE
@@ -414,7 +424,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
// No cached dentry exists; however, there might still be an existing file
// at name. As above, we attempt the file creation RPC anyway.
- if err := createInRemoteDir(parent, name); err != nil {
+ if err := createInRemoteDir(parent, name, &ds); err != nil {
return err
}
if child, ok := parent.children[name]; ok && child == nil {
@@ -721,7 +731,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
@@ -754,7 +764,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
creds := rp.Credentials()
- return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
return err
@@ -789,34 +799,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
creds := rp.Credentials()
_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
- // If the gofer does not allow creating a socket or pipe, create a
- // synthetic one, i.e. one that is kept entirely in memory.
- if err == syserror.EPERM {
- switch opts.Mode.FileType() {
- case linux.S_IFSOCK:
- parent.createSyntheticChildLocked(&createSyntheticOpts{
- name: name,
- mode: opts.Mode,
- kuid: creds.EffectiveKUID,
- kgid: creds.EffectiveKGID,
- endpoint: opts.Endpoint,
- })
- return nil
- case linux.S_IFIFO:
- parent.createSyntheticChildLocked(&createSyntheticOpts{
- name: name,
- mode: opts.Mode,
- kuid: creds.EffectiveKUID,
- kgid: creds.EffectiveKGID,
- pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
- })
- return nil
- }
+ if err != syserror.EPERM {
+ return err
}
- return err
+
+ // EPERM means that gofer does not allow creating a socket or pipe. Fallback
+ // to creating a synthetic one, i.e. one that is kept entirely in memory.
+
+ // Check that we're not overriding an existing file with a synthetic one.
+ _, err = fs.stepLocked(ctx, rp, parent, true, ds)
+ switch {
+ case err == nil:
+ // Step succeeded, another file exists.
+ return syserror.EEXIST
+ case err != syserror.ENOENT:
+ // Unexpected error.
+ return err
+ }
+
+ switch opts.Mode.FileType() {
+ case linux.S_IFSOCK:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ endpoint: opts.Endpoint,
+ })
+ return nil
+ case linux.S_IFIFO:
+ parent.createSyntheticChildLocked(&createSyntheticOpts{
+ name: name,
+ mode: opts.Mode,
+ kuid: creds.EffectiveKUID,
+ kgid: creds.EffectiveKGID,
+ pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+ })
+ return nil
+ }
+ // Retain error from gofer if synthetic file cannot be created internally.
+ return syserror.EPERM
}, nil)
}
@@ -834,7 +859,14 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+ unlocked = true
+ }
+ }
+ defer unlock()
start := rp.Start().Impl().(*dentry)
if !start.cachedMetadataAuthoritative() {
@@ -851,7 +883,10 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
- return start.openLocked(ctx, rp, &opts)
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
+ return start.open(ctx, rp, &opts)
}
afterTrailingSymlink:
@@ -901,11 +936,15 @@ afterTrailingSymlink:
if rp.MustBeDir() && !child.isDir() {
return nil, syserror.ENOTDIR
}
- return child.openLocked(ctx, rp, &opts)
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
+ return child.open(ctx, rp, &opts)
}
-// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
return nil, err
@@ -968,7 +1007,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return nil, syserror.ENXIO
}
if d.fs.iopts.OpenSocketsByConnecting {
- return d.connectSocketLocked(ctx, opts)
+ return d.openSocketByConnecting(ctx, opts)
}
case linux.S_IFIFO:
if d.isSynthetic() {
@@ -977,7 +1016,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
}
if vfd == nil {
- if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+ if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil {
return nil, err
}
}
@@ -987,7 +1026,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
// step is required even if !d.cachedMetadataAuthoritative() because
// d.mappings has to be updated.
// d.metadataMu has already been acquired if trunc == true.
- d.updateFileSizeLocked(0)
+ d.updateSizeLocked(0)
if d.cachedMetadataAuthoritative() {
d.touchCMtimeLocked()
@@ -996,7 +1035,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return vfd, err
}
-func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
@@ -1016,7 +1055,7 @@ func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions)
return fd, nil
}
-func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
@@ -1058,8 +1097,10 @@ retry:
return &fd.vfsfd, nil
}
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions:
+// * d.fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !d.isSynthetic().
func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
@@ -1270,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !renamed.isDir() {
return syserror.EISDIR
}
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
} else {
if rp.MustBeDir() || renamed.isDir() {
return syserror.ENOTDIR
@@ -1320,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// with reference counts and queue oldParent for checkCachingLocked if the
// parent isn't actually changing.
if oldParent != newParent {
+ oldParent.decRefLocked()
ds = appendDentry(ds, oldParent)
newParent.IncRef()
if renamed.isSynthetic() {
oldParent.syntheticChildren--
newParent.syntheticChildren++
}
+ renamed.parent = newParent
}
- renamed.parent = newParent
renamed.name = newName
if newParent.children == nil {
newParent.children = make(map[string]*dentry)
@@ -1438,7 +1483,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
creds := rp.Credentials()
_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
return err
@@ -1450,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return fs.unlinkAt(ctx, rp, false /* dir */)
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
@@ -1471,13 +1516,15 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
path: opts.Addr,
}, nil
}
- return d.endpoint, nil
+ if d.endpoint != nil {
+ return d.endpoint, nil
+ }
}
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1485,11 +1532,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
if err != nil {
return nil, err
}
- return d.listxattr(ctx, rp.Credentials(), size)
+ return d.listXattr(ctx, rp.Credentials(), size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1497,11 +1544,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
if err != nil {
return "", err
}
- return d.getxattr(ctx, rp.Credentials(), &opts)
+ return d.getXattr(ctx, rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1509,7 +1556,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+ if err := d.setXattr(ctx, rp.Credentials(), &opts); err != nil {
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
@@ -1519,8 +1566,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1528,7 +1575,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
- if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+ if err := d.removeXattr(ctx, rp.Credentials(), name); err != nil {
fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
return err
}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 63e589859..aaad9c0d9 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -195,11 +195,7 @@ const (
// and consistent with Linux's semantics (in particular, it is not always
// possible for clients to set arbitrary atimes and mtimes depending on the
// remote filesystem implementation, and never possible for clients to set
- // arbitrary ctimes.) If a dentry containing a client-defined atime or
- // mtime is evicted from cache, client timestamps will be sent to the
- // remote filesystem on a best-effort basis to attempt to ensure that
- // timestamps will be preserved when another dentry representing the same
- // file is instantiated.
+ // arbitrary ctimes.)
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -703,6 +699,13 @@ type dentry struct {
locks vfs.FileLocks
// Inotify watches for this dentry.
+ //
+ // Note that inotify may behave unexpectedly in the presence of hard links,
+ // because dentries corresponding to the same file have separate inotify
+ // watches when they should share the same set. This is the case because it is
+ // impossible for us to know for sure whether two dentries correspond to the
+ // same underlying file (see the gofer filesystem section fo vfs/inotify.md for
+ // a more in-depth discussion on this matter).
watches vfs.Watches
}
@@ -830,7 +833,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
}
if mask.Size {
- d.updateFileSizeLocked(attr.Size)
+ d.updateSizeLocked(attr.Size)
}
}
@@ -984,7 +987,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
// d.size should be kept up to date, and privatized
// copy-on-write mappings of truncated pages need to be
// invalidated, even if InteropModeShared is in effect.
- d.updateFileSizeLocked(stat.Size)
+ d.updateSizeLocked(stat.Size)
}
}
if d.fs.opts.interop == InteropModeShared {
@@ -1021,8 +1024,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return nil
}
+// doAllocate performs an allocate operation on d. Note that d.metadataMu will
+// be held when allocate is called.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a noop.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ err := allocate()
+ if err != nil {
+ return err
+ }
+ d.updateSizeLocked(size)
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
+}
+
// Preconditions: d.metadataMu must be locked.
-func (d *dentry) updateFileSizeLocked(newSize uint64) {
+func (d *dentry) updateSizeLocked(newSize uint64) {
d.dataMu.Lock()
oldSize := d.size
atomic.StoreUint64(&d.size, newSize)
@@ -1060,6 +1086,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ // We only support xattrs prefixed with "user." (see b/148380782). Currently,
+ // there is no need to expose any other xattrs through a gofer.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
}
@@ -1293,30 +1334,19 @@ func (d *dentry) destroyLocked(ctx context.Context) {
d.handleMu.Unlock()
if !d.file.isNil() {
- if !d.isDeleted() {
- // Write dirty timestamps back to the remote filesystem.
- atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
- mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
- if atimeDirty || mtimeDirty {
- atime := atomic.LoadInt64(&d.atime)
- mtime := atomic.LoadInt64(&d.mtime)
- if err := d.file.setAttr(ctx, p9.SetAttrMask{
- ATime: atimeDirty,
- ATimeNotSystemTime: atimeDirty,
- MTime: mtimeDirty,
- MTimeNotSystemTime: mtimeDirty,
- }, p9.SetAttr{
- ATimeSeconds: uint64(atime / 1e9),
- ATimeNanoSeconds: uint64(atime % 1e9),
- MTimeSeconds: uint64(mtime / 1e9),
- MTimeNanoSeconds: uint64(mtime % 1e9),
- }); err != nil {
- log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
- }
- }
+ // Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+ // i.e. client and server timestamps may differ (because e.g. a client
+ // write was serviced by the page cache, and only written back to the
+ // remote file later). Ideally, we'd write client timestamps back to
+ // the remote filesystem so that timestamps for a new dentry
+ // instantiated for the same file would remain coherent. Unfortunately,
+ // this turns out to be too expensive in many cases, so for now we
+ // don't do this.
+ if err := d.file.close(ctx); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
}
- d.file.close(ctx)
d.file = p9file{}
+
// Remove d from the set of syncable dentries.
d.fs.syncMu.Lock()
delete(d.fs.syncableDentries, d)
@@ -1344,9 +1374,7 @@ func (d *dentry) setDeleted() {
atomic.StoreUint32(&d.deleted, 1)
}
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
if d.file.isNil() || !d.userXattrSupported() {
return nil, nil
}
@@ -1356,6 +1384,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
}
xattrs := make([]string, 0, len(xattrMap))
for x := range xattrMap {
+ // We only support xattrs in the user.* namespace.
if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
xattrs = append(xattrs, x)
}
@@ -1363,51 +1392,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
return xattrs, nil
}
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
if d.file.isNil() {
return "", syserror.ENODATA
}
- if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return "", syserror.ENODATA
- }
return d.file.getXattr(ctx, opts.Name, opts.Size)
}
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return syserror.EPERM
- }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
if d.file.isNil() {
return syserror.EPERM
}
- if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !d.userXattrSupported() {
- return syserror.EPERM
- }
return d.file.removeXattr(ctx, name)
}
@@ -1418,7 +1429,9 @@ func (d *dentry) userXattrSupported() bool {
return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
}
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+// Preconditions:
+// * !d.isSynthetic().
+// * d.isRegularFile() || d.isDir().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
// O_TRUNC unconditionally requires us to obtain a new handle (opened with
// O_TRUNC).
@@ -1463,8 +1476,9 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
return err
}
- if d.hostFD < 0 && openReadable && h.fd >= 0 {
- // We have no existing FD; use the new FD for at least reading.
+ if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
+ // We have no existing FD, and the new FD meets the requirements
+ // for d.hostFD, so start using it.
d.hostFD = h.fd
} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
// We have an existing read-only FD, but the file has just been
@@ -1656,30 +1670,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return nil
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
d := fd.dentry()
- if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+ if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
return err
}
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
return nil
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
d := fd.dentry()
- if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+ if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
return err
}
d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
return err
}
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+ ctx.UninterruptibleSleepStart(false)
+ err := f.file.SetAttrClose(valid, attr)
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+}
+
func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
ctx.UninterruptibleSleepStart(false)
fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 7e1cbf065..24f03ee94 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -56,10 +56,16 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
if !fd.vfsfd.IsWritable() {
return nil
}
- // Skip flushing if writes may be buffered by the client, since (as with
- // the VFS1 client) we don't flush buffered writes on close anyway.
+ // Skip flushing if there are client-buffered writes, since (as with the
+ // VFS1 client) we don't flush buffered writes on close anyway.
d := fd.dentry()
- if d.fs.opts.interop == InteropModeExclusive {
+ if d.fs.opts.interop != InteropModeExclusive {
+ return nil
+ }
+ d.dataMu.RLock()
+ haveDirtyPages := !d.dirty.IsEmpty()
+ d.dataMu.RUnlock()
+ if haveDirtyPages {
return nil
}
d.handleMu.RLock()
@@ -73,28 +79,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
d := fd.dentry()
- d.metadataMu.Lock()
- defer d.metadataMu.Unlock()
-
- // Allocating a smaller size is a noop.
- size := offset + length
- if d.cachedMetadataAuthoritative() && size <= d.size {
- return nil
- }
-
- d.handleMu.RLock()
- err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
- d.handleMu.RUnlock()
- if err != nil {
- return err
- }
- d.dataMu.Lock()
- atomic.StoreUint64(&d.size, size)
- d.dataMu.Unlock()
- if d.cachedMetadataAuthoritative() {
- d.touchCMtimeLocked()
- }
- return nil
+ return d.doAllocate(ctx, offset, length, func() error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
}
// PRead implements vfs.FileDescriptionImpl.PRead.
@@ -117,6 +106,10 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
return 0, io.EOF
}
+ var (
+ n int64
+ readErr error
+ )
if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
// Lock d.metadataMu for the rest of the read to prevent d.size from
// changing.
@@ -127,20 +120,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
return 0, err
}
- }
-
- rw := getDentryReadWriter(ctx, d, offset)
- if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+ rw := getDentryReadWriter(ctx, d, offset)
// Require the read to go to the remote file.
rw.direct = true
+ n, readErr = dst.CopyOutFrom(ctx, rw)
+ putDentryReadWriter(rw)
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ d.touchAtimeLocked(fd.vfsfd.Mount())
+ }
+ } else {
+ rw := getDentryReadWriter(ctx, d, offset)
+ n, readErr = dst.CopyOutFrom(ctx, rw)
+ putDentryReadWriter(rw)
+ if d.fs.opts.interop != InteropModeShared {
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ d.touchAtime(fd.vfsfd.Mount())
+ }
}
- n, err := dst.CopyOutFrom(ctx, rw)
- putDentryReadWriter(rw)
- if d.fs.opts.interop != InteropModeShared {
- // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
- d.touchAtime(fd.vfsfd.Mount())
- }
- return n, err
+ return n, readErr
}
// Read implements vfs.FileDescriptionImpl.Read.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a6368fdd0..dc960e5bf 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -39,8 +40,14 @@ type specialFileFD struct {
// handle is used for file I/O. handle is immutable.
handle handle
+ // isRegularFile is true if this FD represents a regular file which is only
+ // possible when filesystemOptions.regularFilesUseSpecialFileFD is in
+ // effect. isRegularFile is immutable.
+ isRegularFile bool
+
// seekable is true if this file description represents a file for which
- // file offset is significant, i.e. a regular file. seekable is immutable.
+ // file offset is significant, i.e. a regular file, character device or
+ // block device. seekable is immutable.
seekable bool
// haveQueue is true if this file description represents a file for which
@@ -55,12 +62,13 @@ type specialFileFD struct {
func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
ftype := d.fileType()
- seekable := ftype == linux.S_IFREG
+ seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK
haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
fd := &specialFileFD{
- handle: h,
- seekable: seekable,
- haveQueue: haveQueue,
+ handle: h,
+ isRegularFile: ftype == linux.S_IFREG,
+ seekable: seekable,
+ haveQueue: haveQueue,
}
fd.LockFD.Init(locks)
if haveQueue {
@@ -128,6 +136,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
fd.fileDescription.EventUnregister(e)
}
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if fd.isRegularFile {
+ d := fd.dentry()
+ return d.doAllocate(ctx, offset, length, func() error {
+ return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
+ }
+ return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if fd.seekable && offset < 0 {
@@ -200,13 +218,13 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
// If the regular file fd was opened with O_APPEND, make sure the file size
// is updated. There is a possible race here if size is modified externally
// after metadata cache is updated.
- if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+ if fd.isRegularFile && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
if err := d.updateFromGetattr(ctx); err != nil {
return 0, offset, err
}
}
- if fd.seekable {
+ if fd.isRegularFile {
// We need to hold the metadataMu *while* writing to a regular file.
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
@@ -236,18 +254,20 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
if err == syserror.EAGAIN {
err = syserror.ErrWouldBlock
}
- finalOff = offset
+ // Update offset if the offset is valid.
+ if offset >= 0 {
+ offset += int64(n)
+ }
// Update file size for regular files.
- if fd.seekable {
- finalOff += int64(n)
+ if fd.isRegularFile {
// d.metadataMu is already locked at this point.
- if uint64(finalOff) > d.size {
+ if uint64(offset) > d.size {
d.dataMu.Lock()
defer d.dataMu.Unlock()
- atomic.StoreUint64(&d.size, uint64(finalOff))
+ atomic.StoreUint64(&d.size, uint64(offset))
}
}
- return int64(n), finalOff, err
+ return int64(n), offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2cb8191b9..7e825caae 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -38,7 +38,7 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
// Preconditions: d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtime(mnt *vfs.Mount) {
- if mnt.Flags.NoATime {
+ if mnt.Flags.NoATime || mnt.ReadOnly() {
return
}
if err := mnt.CheckBeginWrite(); err != nil {
@@ -52,8 +52,23 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
mnt.EndWrite()
}
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
+ if mnt.Flags.NoATime || mnt.ReadOnly() {
+ return
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return
+ }
+ now := d.fs.clock.Now().Nanoseconds()
+ atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
+ mnt.EndWrite()
+}
+
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
@@ -61,8 +76,9 @@ func (d *dentry) touchCtime() {
d.metadataMu.Unlock()
}
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCMtime() {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
@@ -72,8 +88,9 @@ func (d *dentry) touchCMtime() {
d.metadataMu.Unlock()
}
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// locked d.metadataMu.
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has locked d.metadataMu.
func (d *dentry) touchCMtimeLocked() {
now := d.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index bd701bbc7..56bcf9bdb 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -1,12 +1,37 @@
load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "inode_refs",
+ out = "inode_refs.go",
+ package = "host",
+ prefix = "inode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "inode",
+ },
+)
+
+go_template_instance(
+ name = "connected_endpoint_refs",
+ out = "connected_endpoint_refs.go",
+ package = "host",
+ prefix = "ConnectedEndpoint",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "ConnectedEndpoint",
+ },
+)
+
go_library(
name = "host",
srcs = [
+ "connected_endpoint_refs.go",
"control.go",
"host.go",
+ "inode_refs.go",
"ioctl_unsafe.go",
"mmap.go",
"socket.go",
@@ -24,6 +49,7 @@ go_library(
"//pkg/fspath",
"//pkg/iovec",
"//pkg/log",
+ "//pkg/marshal/primitive",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index bd6caba06..db8536f26 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -27,7 +27,6 @@ import (
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
@@ -41,6 +40,44 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
+func newInode(fs *filesystem, hostFD int, fileType linux.FileMode, isTTY bool) (*inode, error) {
+ // Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+ // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+ // devices.
+ _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+ seekable := err != syserror.ESPIPE
+
+ i := &inode{
+ hostFD: hostFD,
+ ino: fs.NextIno(),
+ isTTY: isTTY,
+ wouldBlock: wouldBlock(uint32(fileType)),
+ seekable: seekable,
+ // NOTE(b/38213152): Technically, some obscure char devices can be memory
+ // mapped, but we only allow regular files.
+ canMap: fileType == linux.S_IFREG,
+ }
+ i.pf.inode = i
+ i.refs.EnableLeakCheck()
+
+ // Non-seekable files can't be memory mapped, assert this.
+ if !i.seekable && i.canMap {
+ panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+ }
+
+ // If the hostFD would block, we must set it to non-blocking and handle
+ // blocking behavior in the sentry.
+ if i.wouldBlock {
+ if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+ return nil, err
+ }
+ if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+ return nil, err
+ }
+ }
+ return i, nil
+}
+
// NewFDOptions contains options to NewFD.
type NewFDOptions struct {
// If IsTTY is true, the file descriptor is a TTY.
@@ -76,44 +113,11 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
flags = uint32(flagsInt)
}
- fileMode := linux.FileMode(s.Mode)
- fileType := fileMode.FileType()
-
- // Determine if hostFD is seekable. If not, this syscall will return ESPIPE
- // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
- // devices.
- _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
- seekable := err != syserror.ESPIPE
-
- i := &inode{
- hostFD: hostFD,
- ino: fs.NextIno(),
- isTTY: opts.IsTTY,
- wouldBlock: wouldBlock(uint32(fileType)),
- seekable: seekable,
- // NOTE(b/38213152): Technically, some obscure char devices can be memory
- // mapped, but we only allow regular files.
- canMap: fileType == linux.S_IFREG,
- }
- i.pf.inode = i
-
- // Non-seekable files can't be memory mapped, assert this.
- if !i.seekable && i.canMap {
- panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
- }
-
- // If the hostFD would block, we must set it to non-blocking and handle
- // blocking behavior in the sentry.
- if i.wouldBlock {
- if err := syscall.SetNonblock(i.hostFD, true); err != nil {
- return nil, err
- }
- if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
- return nil, err
- }
- }
-
d := &kernfs.Dentry{}
+ i, err := newInode(fs, hostFD, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
+ if err != nil {
+ return nil, err
+ }
d.Init(i)
// i.open will take a reference on d.
@@ -135,12 +139,12 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
// filesystemType implements vfs.FilesystemType.
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("host.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
func (filesystemType) Name() string {
return "none"
}
@@ -182,13 +186,14 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
// inode implements kernfs.Inode.
type inode struct {
+ kernfs.InodeNoStatFS
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
locks vfs.FileLocks
// When the reference count reaches zero, the host fd is closed.
- refs.AtomicRefCount
+ refs inodeRefs
// hostFD contains the host fd that this file was originally created from,
// which must be available at time of restore.
@@ -238,7 +243,7 @@ type inode struct {
pf inodePlatformFile
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -247,7 +252,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
}
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
func (i *inode) Mode() linux.FileMode {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -258,7 +263,7 @@ func (i *inode) Mode() linux.FileMode {
return linux.FileMode(s.Mode)
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
if opts.Mask&linux.STATX__RESERVED != 0 {
return linux.Statx{}, syserror.EINVAL
@@ -371,7 +376,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
}, nil
}
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
s := &opts.Stat
@@ -430,22 +435,29 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
return nil
}
-// DecRef implements kernfs.Inode.
-func (i *inode) DecRef(ctx context.Context) {
- i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy)
+// IncRef implements kernfs.Inode.IncRef.
+func (i *inode) IncRef() {
+ i.refs.IncRef()
}
-// Destroy implements kernfs.Inode.
-func (i *inode) Destroy(context.Context) {
- if i.wouldBlock {
- fdnotifier.RemoveFD(int32(i.hostFD))
- }
- if err := unix.Close(i.hostFD); err != nil {
- log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
- }
+// TryIncRef implements kernfs.Inode.TryIncRef.
+func (i *inode) TryIncRef() bool {
+ return i.refs.TryIncRef()
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(ctx context.Context) {
+ i.refs.DecRef(func() {
+ if i.wouldBlock {
+ fdnotifier.RemoveFD(int32(i.hostFD))
+ }
+ if err := unix.Close(i.hostFD); err != nil {
+ log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+ }
+ })
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
if i.Mode().FileType() == linux.S_IFSOCK {
@@ -484,7 +496,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
if i.isTTY {
fd := &TTYFileDescription{
fileDescription: fileDescription{inode: i},
- termios: linux.DefaultSlaveTermios,
+ termios: linux.DefaultReplicaTermios,
}
fd.LockFD.Init(&i.locks)
vfsfd := &fd.vfsfd
@@ -530,33 +542,28 @@ type fileDescription struct {
offset int64
}
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
}
-// Stat implements vfs.FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
}
-// Release implements vfs.FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (f *fileDescription) Release(context.Context) {
// noop
}
-// Allocate implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
- if !f.inode.seekable {
- return syserror.ESPIPE
- }
-
- // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
- return syserror.EOPNOTSUPP
+ return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
}
-// PRead implements FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -566,7 +573,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
}
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -603,7 +610,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
return int64(n), err
}
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
if !f.inode.seekable {
return 0, syserror.ESPIPE
@@ -612,7 +619,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
return f.writeToHostFD(ctx, src, offset, opts.Flags)
}
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -660,7 +667,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque
return int64(n), err
}
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Note that we do not support seeking on directories, since we do not even
// allow directory fds to be imported at all.
@@ -725,13 +732,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
return f.offset, nil
}
-// Sync implements FileDescriptionImpl.
+// Sync implements vfs.FileDescriptionImpl.Sync.
func (f *fileDescription) Sync(context.Context) error {
// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
return unix.Fsync(f.inode.hostFD)
}
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
if !f.inode.canMap {
return syserror.ENODEV
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 4979dd0a9..131145b85 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/socket/control"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -59,8 +58,7 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
//
// +stateify savable
type ConnectedEndpoint struct {
- // ref keeps track of references to a ConnectedEndpoint.
- ref refs.AtomicRefCount
+ ConnectedEndpointRefs
// mu protects fd below.
mu sync.RWMutex `state:"nosave"`
@@ -132,9 +130,9 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
return nil, err
}
- // AtomicRefCounters start off with a single reference. We need two.
- e.ref.IncRef()
- e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+ // ConnectedEndpointRefs start off with a single reference. We need two.
+ e.IncRef()
+ e.EnableLeakCheck()
return &e, nil
}
@@ -318,7 +316,7 @@ func (c *ConnectedEndpoint) destroyLocked() {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
func (c *ConnectedEndpoint) Release(ctx context.Context) {
- c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+ c.DecRef(func() {
c.mu.Lock()
c.destroyLocked()
c.mu.Unlock()
@@ -348,7 +346,7 @@ func (e *SCMConnectedEndpoint) Init() error {
// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
- e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+ e.DecRef(func() {
e.mu.Lock()
if err := syscall.Close(e.fd); err != nil {
log.Warningf("Failed to close host fd %d: %v", err)
@@ -378,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
return nil, err
}
- // AtomicRefCounters start off with a single reference. We need two.
- e.ref.IncRef()
- e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+ // ConnectedEndpointRefs start off with a single reference. We need two.
+ e.IncRef()
+ e.EnableLeakCheck()
return &e, nil
}
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
if n > length {
- return length, n, msg.Controllen, controlTrunc, err
+ return length, n, msg.Controllen, controlTrunc, nil
}
- return n, n, msg.Controllen, controlTrunc, err
+ return n, n, msg.Controllen, controlTrunc, nil
}
// fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index d372c60cb..e02b9b8f6 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -75,7 +76,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
t.fileDescription.Release(ctx)
}
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -93,7 +94,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
return t.fileDescription.PRead(ctx, dst, offset, opts)
}
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -111,7 +112,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
return t.fileDescription.Read(ctx, dst, opts)
}
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -126,7 +127,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.PWrite(ctx, src, offset, opts)
}
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -141,8 +142,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.Write(ctx, src, opts)
}
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ return 0, syserror.ENOTTY
+ }
+
// Ignore arg[0]. This is the real FD:
fd := t.inode.hostFD
ioctl := args[1].Uint64()
@@ -152,9 +158,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = termios.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -166,9 +170,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
}
var termios linux.Termios
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetTermios(fd, ioctl, &termios)
@@ -192,10 +194,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
defer t.mu.Unlock()
// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
- pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+ _, err := pgID.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSPGRP:
@@ -203,11 +203,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
// Equivalent to tcsetpgrp(fd, *argp).
// Set the foreground process group ID of this terminal.
- task := kernel.TaskFromContext(ctx)
- if task == nil {
- return 0, syserror.ENOTTY
- }
-
t.mu.Lock()
defer t.mu.Unlock()
@@ -226,12 +221,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
return 0, syserror.ENOTTY
}
- var pgID kernel.ProcessGroupID
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ var pgIDP primitive.Int32
+ if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
+ pgID := kernel.ProcessGroupID(pgIDP)
// pgID must be non-negative.
if pgID < 0 {
@@ -260,9 +254,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
if err != nil {
return 0, err
}
- _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- })
+ _, err = winsize.CopyOut(task, args[2].Pointer())
return 0, err
case linux.TIOCSWINSZ:
@@ -273,9 +265,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
// set the winsize.
var winsize linux.Winsize
- if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
- AddressSpaceActive: true,
- }); err != nil {
+ if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
return 0, err
}
err := ioctlSetWinsize(fd, &winsize)
@@ -376,7 +366,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
//
// Linux ignores the result of kill_pgrp().
_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
- return kernel.ERESTARTSYS
+ return syserror.ERESTARTSYS
}
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 3835557fe..5e91e0536 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -26,9 +26,54 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "dentry_refs",
+ out = "dentry_refs.go",
+ package = "kernfs",
+ prefix = "Dentry",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "Dentry",
+ },
+)
+
+go_template_instance(
+ name = "static_directory_refs",
+ out = "static_directory_refs.go",
+ package = "kernfs",
+ prefix = "StaticDirectory",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "StaticDirectory",
+ },
+)
+
+go_template_instance(
+ name = "dir_refs",
+ out = "dir_refs.go",
+ package = "kernfs_test",
+ prefix = "dir",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "dir",
+ },
+)
+
+go_template_instance(
+ name = "readonly_dir_refs",
+ out = "readonly_dir_refs.go",
+ package = "kernfs_test",
+ prefix = "readonlyDir",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "readonlyDir",
+ },
+)
+
go_library(
name = "kernfs",
srcs = [
+ "dentry_refs.go",
"dynamic_bytes_file.go",
"fd_impl_util.go",
"filesystem.go",
@@ -36,7 +81,9 @@ go_library(
"inode_impl_util.go",
"kernfs.go",
"slot_list.go",
+ "static_directory_refs.go",
"symlink.go",
+ "synthetic_directory.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
@@ -59,11 +106,17 @@ go_library(
go_test(
name = "kernfs_test",
size = "small",
- srcs = ["kernfs_test.go"],
+ srcs = [
+ "dir_refs.go",
+ "kernfs_test.go",
+ "readonly_dir_refs.go",
+ ],
deps = [
":kernfs",
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/log",
+ "//pkg/refs",
"//pkg/sentry/contexttest",
"//pkg/sentry/fsimpl/testutil",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 12adf727a..1ee089620 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -35,6 +35,7 @@ import (
// +stateify savable
type DynamicBytesFile struct {
InodeAttrs
+ InodeNoStatFS
InodeNoopRefCount
InodeNotDirectory
InodeNotSymlink
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index fcee6200a..6518ff5cd 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,7 +15,7 @@
package kernfs
import (
- "math"
+ "fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -28,9 +28,25 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+ // Consider the end of the file to be after the final static entry. This is
+ // the default option.
+ SeekEndStaticEntries = iota
+ // Consider the end of the file to be at offset 0.
+ SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+type GenericDirectoryFDOptions struct {
+ SeekEnd SeekEndConfig
+}
+
// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderChildren to track child nodes.
//
// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
// IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -45,6 +61,9 @@ type GenericDirectoryFD struct {
vfs.DirectoryFileDescriptionDefaultImpl
vfs.LockFD
+ // Immutable.
+ seekEnd SeekEndConfig
+
vfsfd vfs.FileDescription
children *OrderedChildren
@@ -57,9 +76,9 @@ type GenericDirectoryFD struct {
// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
// dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
fd := &GenericDirectoryFD{}
- if err := fd.Init(children, locks, opts); err != nil {
+ if err := fd.Init(children, locks, opts, fdOpts); err != nil {
return nil, err
}
if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -71,12 +90,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
// Init initializes a GenericDirectoryFD. Use it when overriding
// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
// correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
// Can't open directories for writing.
return syserror.EISDIR
}
fd.LockFD.Init(locks)
+ fd.seekEnd = fdOpts.SeekEnd
fd.children = children
return nil
}
@@ -209,9 +229,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
case linux.SEEK_CUR:
offset += fd.off
case linux.SEEK_END:
- // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
- // if they are added after SEEK_END.
- offset = math.MaxInt64
+ switch fd.seekEnd {
+ case SeekEndStaticEntries:
+ fd.children.mu.RLock()
+ offset += int64(len(fd.children.set))
+ offset += 2 // '.' and '..' aren't tracked in children.
+ fd.children.mu.RUnlock()
+ case SeekEndZero:
+ // No-op: offset += 0.
+ default:
+ panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd))
+ }
default:
return 0, syserror.EINVAL
}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index d7edb6342..89ed265dc 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -32,7 +32,9 @@ import (
//
// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
@@ -107,8 +109,11 @@ afterSymlink:
// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
// nil) to verify that the returned child (or lack thereof) is correct.
//
-// Preconditions: Filesystem.mu must be locked for at least reading.
-// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
@@ -135,7 +140,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
}
// Reference on childVFSD dropped by a corresponding Valid.
child = childVFSD.Impl().(*Dentry)
- parent.insertChildLocked(name, child)
+ parent.InsertChildLocked(name, child)
}
return child, nil
}
@@ -171,7 +176,9 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
@@ -193,8 +200,10 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
// checkCreateLocked checks that a file named rp.Component() may be created in
// directory parentVFSD, then returns rp.Component().
//
-// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
-// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parentInode == parentVFSD.Impl().(*Dentry).Inode.
+// * isDir(parentInode) == true.
func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return "", err
@@ -351,7 +360,10 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
defer rp.Mount().EndWrite()
childVFSD, err := parentInode.NewDir(ctx, pc, opts)
if err != nil {
- return err
+ if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+ return err
+ }
+ childVFSD = newSyntheticDirectory(rp.Credentials(), opts.Mode)
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
return nil
@@ -397,15 +409,21 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- defer fs.processDeferredDecRefs(ctx)
- defer fs.mu.RUnlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs(ctx)
return nil, err
}
if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs(ctx)
return nil, err
}
+ inode.IncRef()
+ defer inode.DecRef(ctx)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs(ctx)
return inode.Open(ctx, rp, vfsd, opts)
}
@@ -414,7 +432,14 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
vfsd := rp.Start()
inode := vfsd.Impl().(*Dentry).inode
fs.mu.Lock()
- defer fs.mu.Unlock()
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.mu.Unlock()
+ unlocked = true
+ }
+ }
+ defer unlock()
if rp.Done() {
if rp.MustBeDir() {
return nil, syserror.EISDIR
@@ -425,6 +450,9 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
+ inode.IncRef()
+ defer inode.DecRef(ctx)
+ unlock()
return inode.Open(ctx, rp, vfsd, opts)
}
afterTrailingSymlink:
@@ -466,6 +494,9 @@ afterTrailingSymlink:
}
child := childVFSD.Impl().(*Dentry)
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ child.inode.IncRef()
+ defer child.inode.DecRef(ctx)
+ unlock()
return child.inode.Open(ctx, rp, childVFSD, opts)
}
if err != nil {
@@ -499,6 +530,9 @@ afterTrailingSymlink:
if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
+ child.inode.IncRef()
+ defer child.inode.DecRef(ctx)
+ unlock()
return child.inode.Open(ctx, rp, &child.vfsd, opts)
}
@@ -514,7 +548,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
if !d.Impl().(*Dentry).isSymlink() {
return "", syserror.EINVAL
}
- return inode.Readlink(ctx)
+ return inode.Readlink(ctx, rp.Mount())
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -623,6 +657,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
+
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -652,7 +687,8 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+
+ if err := parentDentry.inode.RmDir(ctx, d.name, vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -690,14 +726,13 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
- _, _, err := fs.walkExistingLocked(ctx, rp)
+ _, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs(ctx)
if err != nil {
return linux.Statfs{}, err
}
- // TODO(gvisor.dev/issue/1193): actually implement statfs.
- return linux.Statfs{}, syserror.ENOSYS
+ return inode.StatFS(ctx, fs.VFSFilesystem())
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -732,6 +767,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
+
vfsd, _, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -757,7 +793,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -765,7 +801,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
@@ -780,8 +816,8 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
@@ -793,8 +829,8 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, syserror.ENOTSUP
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
@@ -806,8 +842,8 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", syserror.ENOTSUP
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
@@ -819,8 +855,8 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return syserror.ENOTSUP
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c3efcf3ec..6ee353ace 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -20,7 +20,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
@@ -48,10 +47,6 @@ func (InodeNoopRefCount) TryIncRef() bool {
return true
}
-// Destroy implements Inode.Destroy.
-func (InodeNoopRefCount) Destroy(context.Context) {
-}
-
// InodeDirectoryNoNewChildren partially implements the Inode interface.
// InodeDirectoryNoNewChildren represents a directory inode which does not
// support creation of new children.
@@ -177,7 +172,7 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
type InodeNotSymlink struct{}
// Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
return "", syserror.EINVAL
}
@@ -261,12 +256,29 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li
// SetStat implements Inode.SetStat.
func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+ return a.SetInodeStat(ctx, fs, creds, opts)
+}
+
+// SetInodeStat sets the corresponding attributes from opts to InodeAttrs.
+// This function can be used by other kernfs-based filesystem implementation to
+// sets the unexported attributes into kernfs.InodeAttrs.
+func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask == 0 {
return nil
}
- if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+
+ // Note that not all fields are modifiable. For example, the file type and
+ // inode numbers are immutable after node creation. Setting the size is often
+ // allowed by kernfs files but does not do anything. If some other behavior is
+ // needed, the embedder should consider extending SetStat.
+ //
+ // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+ if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 {
return syserror.EPERM
}
+ if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
+ return syserror.EISDIR
+ }
if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
}
@@ -289,13 +301,6 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
atomic.StoreUint32(&a.gid, stat.GID)
}
- // Note that not all fields are modifiable. For example, the file type and
- // inode numbers are immutable after node creation.
-
- // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
- // Also, STATX_SIZE will need some special handling, because read-only static
- // files should return EIO for truncate operations.
-
return nil
}
@@ -348,8 +353,6 @@ type OrderedChildrenOptions struct {
//
// Must be initialize with Init before first use.
type OrderedChildren struct {
- refs.AtomicRefCount
-
// Can children be modified by user syscalls? It set to false, interface
// methods that would modify the children return EPERM. Immutable.
writable bool
@@ -365,13 +368,10 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
o.set = make(map[string]*slot)
}
-// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef(ctx context.Context) {
- o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy)
-}
-
-// Destroy cleans up resources referenced by this OrderedChildren.
-func (o *OrderedChildren) Destroy(context.Context) {
+// Destroy clears the children stored in o. It should be called by structs
+// embedding OrderedChildren upon destruction, i.e. when their reference count
+// reaches zero.
+func (o *OrderedChildren) Destroy() {
o.mu.Lock()
defer o.mu.Unlock()
o.order.Reset()
@@ -556,21 +556,24 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
//
// +stateify savable
type StaticDirectory struct {
- InodeNotSymlink
- InodeDirectoryNoNewChildren
InodeAttrs
+ InodeDirectoryNoNewChildren
InodeNoDynamicLookup
+ InodeNoStatFS
+ InodeNotSymlink
OrderedChildren
+ StaticDirectoryRefs
- locks vfs.FileLocks
+ locks vfs.FileLocks
+ fdOpts GenericDirectoryFDOptions
}
var _ Inode = (*StaticDirectory)(nil)
// NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry {
inode := &StaticDirectory{}
- inode.Init(creds, devMajor, devMinor, ino, perm)
+ inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts)
dentry := &Dentry{}
dentry.Init(inode)
@@ -583,31 +586,46 @@ func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64
}
// Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
if perm&^linux.PermissionsMask != 0 {
panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
}
+ s.fdOpts = fdOpts
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+ fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+// DecRef implements kernfs.Inode.DecRef.
+func (s *StaticDirectory) DecRef(context.Context) {
+ s.StaticDirectoryRefs.DecRef(s.Destroy)
+}
+
// AlwaysValid partially implements kernfs.inodeDynamicLookup.
type AlwaysValid struct{}
-// Valid implements kernfs.inodeDynamicLookup.
+// Valid implements kernfs.inodeDynamicLookup.Valid.
func (*AlwaysValid) Valid(context.Context) bool {
return true
}
+
+// InodeNoStatFS partially implements the Inode interface, where the client
+// filesystem doesn't support statfs(2).
+type InodeNoStatFS struct{}
+
+// StatFS implements Inode.StatFS.
+func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return linux.Statfs{}, syserror.ENOSYS
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 080118841..163f26ceb 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -57,10 +57,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
@@ -161,9 +161,9 @@ const (
//
// Must be initialized by Init prior to first use.
type Dentry struct {
- vfsd vfs.Dentry
+ DentryRefs
- refs.AtomicRefCount
+ vfsd vfs.Dentry
// flags caches useful information about the dentry from the inode. See the
// dflags* consts above. Must be accessed by atomic ops.
@@ -194,6 +194,7 @@ func (d *Dentry) Init(inode Inode) {
if ftype == linux.ModeSymlink {
d.flags |= dflagsIsSymlink
}
+ d.EnableLeakCheck()
}
// VFSDentry returns the generic vfs dentry for this kernfs dentry.
@@ -213,16 +214,14 @@ func (d *Dentry) isSymlink() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *Dentry) DecRef(ctx context.Context) {
- d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
-}
-
-// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy(ctx context.Context) {
- d.inode.DecRef(ctx) // IncRef from Init.
- d.inode = nil
- if d.parent != nil {
- d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
- }
+ // Before the destructor is called, Dentry must be removed from VFS' dentry cache.
+ d.DentryRefs.DecRef(func() {
+ d.inode.DecRef(ctx) // IncRef from Init.
+ d.inode = nil
+ if d.parent != nil {
+ d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
+ }
+ })
}
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
@@ -248,15 +247,15 @@ func (d *Dentry) OnZeroWatches(context.Context) {}
// Precondition: d must represent a directory inode.
func (d *Dentry) InsertChild(name string, child *Dentry) {
d.dirMu.Lock()
- d.insertChildLocked(name, child)
+ d.InsertChildLocked(name, child)
d.dirMu.Unlock()
}
-// insertChildLocked is equivalent to InsertChild, with additional
+// InsertChildLocked is equivalent to InsertChild, with additional
// preconditions.
//
// Precondition: d.dirMu must be locked.
-func (d *Dentry) insertChildLocked(name string, child *Dentry) {
+func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
if !d.isDir() {
panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
}
@@ -269,6 +268,36 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) {
d.children[name] = child
}
+// RemoveChild removes child from the vfs dentry cache. This does not update the
+// directory inode or modify the inode to be unlinked. So calling this on its own
+// isn't sufficient to remove a child from a directory.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error {
+ d.dirMu.Lock()
+ defer d.dirMu.Unlock()
+ return d.RemoveChildLocked(name, child)
+}
+
+// RemoveChildLocked is equivalent to RemoveChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error {
+ if !d.isDir() {
+ panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d))
+ }
+ c, ok := d.children[name]
+ if !ok {
+ return syserror.ENOENT
+ }
+ if &c.vfsd != child {
+ panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child))
+ }
+ delete(d.children, name)
+ return nil
+}
+
// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
return d.inode
@@ -322,16 +351,17 @@ type Inode interface {
// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
// the inode on which Open() is being called.
Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+
+ // StatFS returns filesystem statistics for the client filesystem. This
+ // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
+ // doesn't support statfs(2), this should return ENOSYS.
+ StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
}
type inodeRefs interface {
IncRef()
DecRef(ctx context.Context)
TryIncRef() bool
- // Destroy is called when the inode reaches zero references. Destroy release
- // all resources (references) on objects referenced by the inode, including
- // any child dentries.
- Destroy(ctx context.Context)
}
type inodeMetadata interface {
@@ -426,7 +456,7 @@ type inodeDynamicLookup interface {
Valid(ctx context.Context) bool
// IterDirents is used to iterate over dynamically created entries. It invokes
- // cb on each entry in the directory represented by the FileDescription.
+ // cb on each entry in the directory represented by the Inode.
// 'offset' is the offset for the entire IterDirents call, which may include
// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
// inside the entries returned by this IterDirents invocation. In other words,
@@ -438,7 +468,7 @@ type inodeDynamicLookup interface {
type inodeSymlink interface {
// Readlink returns the target of a symbolic link. If an inode is not a
// symlink, the implementation should return EINVAL.
- Readlink(ctx context.Context) (string, error)
+ Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
// Getlink returns the target of a symbolic link, as used by path
// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c5d5afedf..09806a3f2 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+ mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create testfs root mount: %v", err)
}
@@ -96,10 +96,12 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
}
type readonlyDir struct {
+ readonlyDirRefs
attrs
- kernfs.InodeNotSymlink
- kernfs.InodeNoDynamicLookup
kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNoDynamicLookup
+ kernfs.InodeNoStatFS
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
locks vfs.FileLocks
@@ -111,6 +113,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
dir := &readonlyDir{}
dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ dir.EnableLeakCheck()
dir.dentry.Init(dir)
dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
@@ -119,18 +122,26 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
}
func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+func (d *readonlyDir) DecRef(context.Context) {
+ d.readonlyDirRefs.DecRef(d.Destroy)
+}
+
type dir struct {
+ dirRefs
attrs
- kernfs.InodeNotSymlink
kernfs.InodeNoDynamicLookup
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
+ kernfs.InodeNoStatFS
locks vfs.FileLocks
@@ -143,6 +154,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
dir.fs = fs
dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+ dir.EnableLeakCheck()
dir.dentry.Init(dir)
dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
@@ -151,13 +163,19 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
}
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+func (d *dir) DecRef(context.Context) {
+ d.dirRefs.DecRef(d.Destroy)
+}
+
func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
creds := auth.CredentialsFromContext(ctx)
dir := d.fs.newDir(creds, opts.Mode, nil)
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 2ab3f53fd..443121c99 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -28,6 +28,7 @@ type StaticSymlink struct {
InodeAttrs
InodeNoopRefCount
InodeSymlink
+ InodeNoStatFS
target string
}
@@ -50,8 +51,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
}
-// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+// Readlink implements Inode.Readlink.
+func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
return s.target, nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..01ba72fa8
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,102 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+ InodeAttrs
+ InodeNoStatFS
+ InodeNoopRefCount
+ InodeNoDynamicLookup
+ InodeNotSymlink
+ OrderedChildren
+
+ locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *vfs.Dentry {
+ inode := &syntheticDirectory{}
+ inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+ d := &Dentry{}
+ d.Init(inode)
+ return &d.vfsd
+}
+
+func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+ if perm&^linux.PermissionsMask != 0 {
+ panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+ }
+ dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+ dir.OrderedChildren.Init(OrderedChildrenOptions{
+ Writable: true,
+ })
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+ if err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+ if !opts.ForSyntheticMountpoint {
+ return nil, syserror.EPERM
+ }
+ subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+ if err := dir.OrderedChildren.Insert(name, subdird); err != nil {
+ subdird.DecRef(ctx)
+ return nil, err
+ }
+ return subdird, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index b3d19ff82..73b126669 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -22,6 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -40,6 +42,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
return nil
}
+ // Attach our credentials to the context, as some VFS operations use
+ // credentials from context rather an take an explicit creds parameter.
+ ctx = auth.ContextWithCredentials(ctx, d.fs.creds)
+
ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
switch ftype {
case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
@@ -76,6 +82,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
Start: d.parent.upperVD,
Path: fspath.Parse(d.name),
}
+ // Used during copy-up of memory-mapped regular files.
+ var mmapOpts *memmap.MMapOpts
cleanupUndoCopyUp := func() {
var err error
if ftype == linux.S_IFDIR {
@@ -84,7 +92,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
}
if err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
+ }
+ if d.upperVD.Ok() {
+ d.upperVD.DecRef(ctx)
+ d.upperVD = vfs.VirtualDentry{}
}
}
switch ftype {
@@ -127,6 +139,25 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
break
}
}
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if d.wrappedMappable != nil {
+ // We may have memory mappings of the file on the lower layer.
+ // Switch to mapping the file on the upper layer instead.
+ mmapOpts = &memmap.MMapOpts{
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.ReadWrite,
+ }
+ if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
+ if mmapOpts.MappingIdentity != nil {
+ mmapOpts.MappingIdentity.DecRef(ctx)
+ }
+ // Don't actually switch Mappables until the end of copy-up; see
+ // below for why.
+ }
if err := newFD.SetStat(ctx, vfs.SetStatOptions{
Stat: linux.Statx{
Mask: linux.STATX_UID | linux.STATX_GID,
@@ -229,7 +260,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
panic(fmt.Sprintf("unexpected file type %o", ftype))
}
- // TODO(gvisor.dev/issue/1199): copy up xattrs
+ if err := d.copyXattrsLocked(ctx); err != nil {
+ cleanupUndoCopyUp()
+ return err
+ }
// Update the dentry's device and inode numbers (except for directories,
// for which these remain overlay-assigned).
@@ -241,14 +275,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
Mask: linux.STATX_INO,
})
if err != nil {
- d.upperVD.DecRef(ctx)
- d.upperVD = vfs.VirtualDentry{}
cleanupUndoCopyUp()
return err
}
if upperStat.Mask&linux.STATX_INO == 0 {
- d.upperVD.DecRef(ctx)
- d.upperVD = vfs.VirtualDentry{}
cleanupUndoCopyUp()
return syserror.EREMOTE
}
@@ -257,6 +287,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
atomic.StoreUint64(&d.ino, upperStat.Ino)
}
+ if mmapOpts != nil && mmapOpts.Mappable != nil {
+ // Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+ // (from the S_IFREG path above).
+
+ // Propagate mappings of d to the new Mappable. Remember which mappings
+ // we added so we can remove them on failure.
+ upperMappable := mmapOpts.Mappable
+ allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ added := make(memmap.MappingsOfRange)
+ for m := range seg.Value() {
+ if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+ for m := range added {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ for mr, mappings := range allAdded {
+ for m := range mappings {
+ upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+ }
+ }
+ return err
+ }
+ added[m] = struct{}{}
+ }
+ allAdded[seg.Range()] = added
+ }
+
+ // Switch to the new Mappable. We do this at the end of copy-up
+ // because:
+ //
+ // - We need to switch Mappables (by changing d.wrappedMappable) before
+ // invalidating Translations from the old Mappable (to pick up
+ // Translations from the new one).
+ //
+ // - We need to lock d.dataMu while changing d.wrappedMappable, but
+ // must invalidate Translations with d.dataMu unlocked (due to lock
+ // ordering).
+ //
+ // - Consequently, once we unlock d.dataMu, other threads may
+ // immediately observe the new (copied-up) Mappable, which we want to
+ // delay until copy-up is guaranteed to succeed.
+ d.dataMu.Lock()
+ lowerMappable := d.wrappedMappable
+ d.wrappedMappable = upperMappable
+ d.dataMu.Unlock()
+ d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Remove mappings from the old Mappable.
+ for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ for m := range seg.Value() {
+ lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+ }
+ }
+ d.lowerMappings.RemoveAll()
+ }
+
atomic.StoreUint32(&d.copiedUp, 1)
return nil
}
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+ upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+ lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
+ if err != nil {
+ if err == syserror.EOPNOTSUPP {
+ // There are no guarantees as to the contents of lowerXattrs.
+ return nil
+ }
+ ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
+ return err
+ }
+
+ for _, name := range lowerXattrs {
+ // Do not copy up overlay attributes.
+ if isOverlayXattr(name) {
+ continue
+ }
+
+ value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
+ if err != nil {
+ ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
+ return err
+ }
+
+ if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+ ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
+ return err
+ }
+ }
+ return nil
+}
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+ dirents, err := d.getDirentsLocked(ctx)
+ if err != nil {
+ return err
+ }
+ for _, dirent := range dirents {
+ if dirent.Name == "." || dirent.Name == ".." {
+ continue
+ }
+ child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+ if err != nil {
+ return err
+ }
+ if err := child.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ if child.isDir() {
+ child.dirMu.Lock()
+ err := child.copyUpDescendantsLocked(ctx, ds)
+ child.dirMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index 6a79f7ffe..7ab42e71e 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -29,7 +29,9 @@ func (d *dentry) isDir() bool {
return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
}
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
vfsObj := d.fs.vfsfs.VirtualFilesystem()
var readdirErr error
@@ -141,7 +143,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
defer d.fs.renameMu.RUnlock()
d.dirMu.Lock()
defer d.dirMu.Unlock()
+ return d.getDirentsLocked(ctx)
+}
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
if d.dirents != nil {
return d.dirents, nil
}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 986b36ead..e9ce4bde1 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,8 @@
package overlay
import (
+ "fmt"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,10 +29,15 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
// opaque directories.
// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
func isWhiteout(stat *linux.Statx) bool {
return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
@@ -110,8 +117,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
// Dentries which may have a reference count of zero, and which therefore
// should be dropped once traversal is complete, are appended to ds.
//
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
@@ -159,7 +168,9 @@ afterSymlink:
return child, nil
}
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
if child, ok := parent.children[name]; ok {
return child, nil
@@ -177,7 +188,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
return child, nil
}
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
childPath := fspath.Parse(name)
child := fs.newDentry()
@@ -199,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
lookupErr = err
return false
}
+ defer childVD.DecRef(ctx)
mask := uint32(linux.STATX_TYPE)
if !existsOnAnyLayer {
@@ -237,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
}
// Update child to include this layer.
+ childVD.IncRef()
if isUpper {
child.upperVD = childVD
child.copiedUp = 1
@@ -261,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
// Directories are merged with directories from lower layers if they
// are not explicitly opaque.
- opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+ opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
Root: childVD,
Start: childVD,
- }, &vfs.GetxattrOptions{
+ }, &vfs.GetXattrOptions{
Name: _OVL_XATTR_OPAQUE,
Size: 1,
})
@@ -300,7 +315,9 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
// lookupLayerLocked is similar to lookupLocked, but only returns information
// about the file rather than a dentry.
//
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
childPath := fspath.Parse(name)
lookupLayer := lookupLayerNone
@@ -385,7 +402,9 @@ func (ll lookupLayer) existsInOverlay() bool {
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
@@ -425,8 +444,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
// doCreateAt checks that creating a file at rp is permitted, then invokes
// create to do so.
//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
@@ -493,7 +513,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil
func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
}
}
@@ -605,7 +625,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
}
@@ -644,7 +664,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -654,12 +674,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// There may be directories on lower layers (previously hidden by
// the whiteout) that the new directory should not be merged with.
// Mark it opaque to prevent merging.
- if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
Name: _OVL_XATTR_OPAQUE,
Value: "y",
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
} else {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -703,7 +723,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -717,17 +737,36 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
mayCreate := opts.Flags&linux.O_CREAT != 0
mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+ mayWrite := vfs.AccessTypesForOpenFlags(&opts).MayWrite()
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ unlocked = true
+ }
+ }
+ defer unlock()
start := rp.Start().Impl().(*dentry)
if rp.Done() {
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
if mustCreate {
return nil, syserror.EEXIST
}
- return start.openLocked(ctx, rp, &opts)
+ if mayWrite {
+ if err := start.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+ }
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
+ return start.openCopiedUp(ctx, rp, &opts)
}
afterTrailingSymlink:
@@ -739,6 +778,10 @@ afterTrailingSymlink:
if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
+ // Reject attempts to open directories with O_CREAT.
+ if mayCreate && rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -747,12 +790,11 @@ afterTrailingSymlink:
parent.dirMu.Unlock()
return fd, err
}
+ parent.dirMu.Unlock()
if err != nil {
- parent.dirMu.Unlock()
return nil, err
}
// Open existing child or follow symlink.
- parent.dirMu.Unlock()
if mustCreate {
return nil, syserror.EEXIST
}
@@ -767,20 +809,27 @@ afterTrailingSymlink:
start = parent
goto afterTrailingSymlink
}
- return child.openLocked(ctx, rp, &opts)
+ if rp.MustBeDir() && !child.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if mayWrite {
+ if err := child.copyUpLocked(ctx); err != nil {
+ return nil, err
+ }
+ }
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
+ return child.openCopiedUp(ctx, rp, &opts)
}
-// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has
+// been copied up.
+func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
- if ats.MayWrite() {
- if err := d.copyUpLocked(ctx); err != nil {
- return nil, err
- }
- }
mnt := rp.Mount()
// Directory FDs open FDs from each layer when directory entries are read,
@@ -792,7 +841,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return nil, syserror.EISDIR
}
// Can't open directories writably.
- if ats&vfs.MayWrite != 0 {
+ if ats.MayWrite() {
return nil, syserror.EISDIR
}
if opts.Flags&linux.O_DIRECT != 0 {
@@ -831,8 +880,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return &fd.vfsfd, nil
}
-// Preconditions: parent.dirMu must be locked. parent does not already contain
-// a child named rp.Component().
+// Preconditions:
+// * parent.dirMu must be locked.
+// * parent does not already contain a child named rp.Component().
func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
creds := rp.Credentials()
if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
@@ -893,7 +943,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -904,7 +954,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
child, err := fs.getChildLocked(ctx, parent, childName, ds)
if err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -970,9 +1020,223 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
defer mnt.EndWrite()
- // FIXME(gvisor.dev/issue/1199): Actually implement rename.
- _ = newParent
- return syserror.EXDEV
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ // We need a dentry representing the renamed file since, if it's a
+ // directory, we need to check for write permission on it.
+ oldParent.dirMu.Lock()
+ defer oldParent.dirMu.Unlock()
+ renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+ if err != nil {
+ return err
+ }
+ if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+ return err
+ }
+ if renamed.isDir() {
+ if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if oldParent != newParent {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ newParent.dirMu.Lock()
+ defer newParent.dirMu.Unlock()
+ }
+ if newParent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+ if err != nil {
+ return err
+ }
+ var (
+ replaced *dentry
+ replacedVFSD *vfs.Dentry
+ whiteouts map[string]bool
+ )
+ if replacedLayer.existsInOverlay() {
+ replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil {
+ return err
+ }
+ replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
+ replaced.dirMu.Lock()
+ defer replaced.dirMu.Unlock()
+ whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ }
+
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+
+ // renamed and oldParent need to be copied-up before they're renamed on the
+ // upper layer.
+ if err := renamed.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If renamed is a directory, all of its descendants need to be copied-up
+ // before they're renamed on the upper layer.
+ if renamed.isDir() {
+ if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+ return err
+ }
+ }
+ // newParent must be copied-up before it can contain renamed on the upper
+ // layer.
+ if err := newParent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If replaced exists, it doesn't need to be copied-up, but we do need to
+ // serialize with copy-up. Holding renameMu for writing should be
+ // sufficient, but out of an abundance of caution...
+ if replaced != nil {
+ replaced.copyMu.RLock()
+ defer replaced.copyMu.RUnlock()
+ }
+
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+ return err
+ }
+
+ newpop := vfs.PathOperation{
+ Root: newParent.upperVD,
+ Start: newParent.upperVD,
+ Path: fspath.Parse(newName),
+ }
+
+ needRecreateWhiteouts := false
+ cleanupRecreateWhiteouts := func() {
+ if !needRecreateWhiteouts {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+ }
+ }
+ }
+ if renamed.isDir() {
+ if replacedLayer == lookupLayerUpper {
+ // Remove whiteouts from the directory being replaced.
+ needRecreateWhiteouts = true
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ } else if replacedLayer == lookupLayerUpperWhiteout {
+ // We need to explicitly remove the whiteout since otherwise rename
+ // on the upper layer will fail with ENOTDIR.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ }
+
+ // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+ // regular rename and create the whiteout at the origin manually. Unlike
+ // RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+ // upper filesystem, but this is already the case for virtually all other
+ // overlay filesystem operations too.
+ oldpop := vfs.PathOperation{
+ Root: oldParent.upperVD,
+ Start: oldParent.upperVD,
+ Path: fspath.Parse(oldName),
+ }
+ if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+
+ // Below this point, the renamed dentry is now at newpop, and anything we
+ // replaced is gone forever. Commit the rename, update the overlay
+ // filesystem tree, and abandon attempts to recover from errors.
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+ delete(oldParent.children, oldName)
+ if replaced != nil {
+ ds = appendDentry(ds, replaced)
+ }
+ if oldParent != newParent {
+ newParent.dirents = nil
+ // This can't drop the last reference on oldParent because one is held
+ // by oldParentVD, so lock recursion is impossible.
+ oldParent.DecRef(ctx)
+ ds = appendDentry(ds, oldParent)
+ newParent.IncRef()
+ renamed.parent = newParent
+ }
+ renamed.name = newName
+ if newParent.children == nil {
+ newParent.children = make(map[string]*dentry)
+ }
+ newParent.children[newName] = renamed
+ oldParent.dirents = nil
+
+ if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+ }
+ if renamed.isDir() {
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+ }
+ }
+
+ return nil
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
@@ -1051,7 +1315,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
Start: child.upperVD,
Path: fspath.Parse(whiteoutName),
}); err != nil && err != syserror.EEXIST {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
}
}
}
@@ -1081,9 +1345,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// Don't attempt to recover from this: the original directory is
// already gone, so any dentries representing it are invalid, and
// creating a new directory won't undo that.
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
- vfsObj.AbortDeleteDentry(&child.vfsd)
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
}
vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
@@ -1197,7 +1459,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -1290,11 +1552,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
}
if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
- if child != nil {
- vfsObj.AbortDeleteDentry(&child.vfsd)
- }
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
}
if child != nil {
@@ -1306,54 +1564,146 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+ return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
}
- // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
- // but not any other xattr syscalls. For now we just reject all of them.
- return nil, syserror.ENOTSUP
+
+ return fs.listXattr(ctx, d, size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+ if err != nil {
+ return nil, err
+ }
+
+ // Filter out all overlay attributes.
+ n := 0
+ for _, name := range names {
+ if !isOverlayXattr(name) {
+ names[n] = name
+ n++
+ }
+ }
+ return names[:n], err
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
}
- return "", syserror.ENOTSUP
+
+ return fs.getXattr(ctx, d, rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+ return "", err
+ }
+
+ // Return EOPNOTSUPP when fetching an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_get().
+ if isOverlayXattr(opts.Name) {
+ return "", syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ top := d.topLayer()
+ return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
+}
+
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
}
- return syserror.ENOTSUP
+
+ return fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+ if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Return EOPNOTSUPP when setting an overlay attribute.
+ // See fs/overlayfs/super.c:ovl_own_xattr_set().
+ if isOverlayXattr(opts.Name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
- _, err := fs.resolveLocked(ctx, rp, &ds)
+ d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
}
- return syserror.ENOTSUP
+
+ return fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+ if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+ return err
+ }
+
+ // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
+ // Linux passes the remove request to xattr_handler->set.
+ // See fs/xattr.c:vfs_removexattr().
+ if isOverlayXattr(name) {
+ return syserror.EOPNOTSUPP
+ }
+
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ if err := d.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ vfsObj := d.fs.vfsfs.VirtualFilesystem()
+ return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index d3060a481..6e04705c7 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -121,7 +122,6 @@ func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
fd.cachedFlags = statusFlags
}
wrappedFD := fd.cachedFD
- defer wrappedFD.IncRef()
fd.mu.Unlock()
return wrappedFD.OnClose(ctx)
}
@@ -147,6 +147,16 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux
return stat, nil
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
d := fd.dentry()
@@ -257,10 +267,105 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- wrappedFD, err := fd.getCurrentFD(ctx)
+ if err := fd.ensureMappable(ctx, opts); err != nil {
+ return err
+ }
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+ d := fd.dentry()
+
+ // Fast path if we already have a Mappable for the current top layer.
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+
+ // Only permit mmap of regular files, since other file types may have
+ // unpredictable behavior when mmapped (e.g. /dev/zero).
+ if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+ return syserror.ENODEV
+ }
+
+ // Get a Mappable for the current top layer.
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ d.copyMu.RLock()
+ defer d.copyMu.RUnlock()
+ if atomic.LoadUint32(&d.isMappable) != 0 {
+ return nil
+ }
+ wrappedFD, err := fd.currentFDLocked(ctx)
if err != nil {
return err
}
- defer wrappedFD.DecRef(ctx)
- return wrappedFD.ConfigureMMap(ctx, opts)
+ if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+ return err
+ }
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.DecRef(ctx)
+ opts.MappingIdentity = nil
+ }
+ // Use this Mappable for all mappings of this layer (unless we raced with
+ // another call to ensureMappable).
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.dataMu.Lock()
+ defer d.dataMu.Unlock()
+ if d.wrappedMappable == nil {
+ d.wrappedMappable = opts.Mappable
+ atomic.StoreUint32(&d.isMappable, 1)
+ }
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, ar, offset, writable)
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+ if !d.isCopiedUp() {
+ d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+ return err
+ }
+ if !d.isCopiedUp() {
+ d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+ }
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ d.dataMu.RLock()
+ defer d.dataMu.RUnlock()
+ return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+ d.mapsMu.Lock()
+ defer d.mapsMu.Unlock()
+ return d.wrappedMappable.InvalidateUnsavable(ctx)
}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 75cc006bf..d0d26185e 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -22,6 +22,10 @@
// filesystem.renameMu
// dentry.dirMu
// dentry.copyMu
+// *** "memmap.Mappable locks" below this point
+// dentry.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
+// dentry.dataMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
@@ -37,6 +41,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -106,16 +111,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fsoptsRaw := opts.InternalData
fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
if fsoptsRaw != nil && !haveFSOpts {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
return nil, nil, syserror.EINVAL
}
if haveFSOpts {
if len(fsopts.LowerRoots) == 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
return nil, nil, syserror.EINVAL
}
if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
return nil, nil, syserror.EINVAL
}
// We don't enforce a maximum number of lower layers when not
@@ -132,7 +137,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, "workdir")
upperPath := fspath.Parse(upperPathname)
if !upperPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
return nil, nil, syserror.EINVAL
}
upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -144,13 +149,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer upperRoot.DecRef(ctx)
privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer privateUpperRoot.DecRef(ctx)
@@ -158,24 +163,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
lowerPathnamesStr, ok := mopts["lowerdir"]
if !ok {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
return nil, nil, syserror.EINVAL
}
delete(mopts, "lowerdir")
lowerPathnames := strings.Split(lowerPathnamesStr, ":")
const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
return nil, nil, syserror.EINVAL
}
if len(lowerPathnames) > maxLowerLayers {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
return nil, nil, syserror.EINVAL
}
for _, lowerPathname := range lowerPathnames {
lowerPath := fspath.Parse(lowerPathname)
if !lowerPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
return nil, nil, syserror.EINVAL
}
lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -187,13 +192,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer lowerRoot.DecRef(ctx)
privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer privateLowerRoot.DecRef(ctx)
@@ -201,7 +206,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
if len(mopts) != 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
return nil, nil, syserror.EINVAL
}
@@ -274,7 +279,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, syserror.EREMOTE
}
if isWhiteout(&rootStat) {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
root.destroyLocked(ctx)
fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
@@ -315,7 +320,11 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc
if err != nil {
return vfs.VirtualDentry{}, err
}
- return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+ // Take a reference on the dentry which will be owned by the returned
+ // VirtualDentry.
+ d := vd.Dentry()
+ d.IncRef()
+ return vfs.MakeVirtualDentry(newmnt, d), nil
}
// Release implements vfs.FilesystemImpl.Release.
@@ -415,6 +424,35 @@ type dentry struct {
devMinor uint32
ino uint64
+ // If this dentry represents a regular file, then:
+ //
+ // - mapsMu is used to synchronize between copy-up and memmap.Mappable
+ // methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+ //
+ // - dataMu is used to synchronize between copy-up and
+ // dentry.(memmap.Mappable).Translate.
+ //
+ // - lowerMappings tracks memory mappings of the file. lowerMappings is
+ // used to invalidate mappings of the lower layer when the file is copied
+ // up to ensure that they remain coherent with subsequent writes to the
+ // file. (Note that, as of this writing, Linux overlayfs does not do this;
+ // this feature is a gVisor extension.) lowerMappings is protected by
+ // mapsMu.
+ //
+ // - If this dentry is copied-up, then wrappedMappable is the Mappable
+ // obtained from a call to the current top layer's
+ // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+ // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil.
+ // wrappedMappable is protected by mapsMu and dataMu.
+ //
+ // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+ // accessed using atomic memory operations.
+ mapsMu sync.Mutex
+ lowerMappings memmap.MappingSet
+ dataMu sync.RWMutex
+ wrappedMappable memmap.Mappable
+ isMappable uint32
+
locks vfs.FileLocks
}
@@ -482,7 +520,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
// destroyLocked destroys the dentry.
//
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
func (d *dentry) destroyLocked(ctx context.Context) {
switch atomic.LoadInt64(&d.refs) {
case 0:
@@ -564,6 +604,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
+ }
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
// statInternalMask is the set of stat fields that is set by
// dentry.statInternalTo().
const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -616,6 +666,32 @@ func (fd *fileDescription) dentry() *dentry {
return fd.vfsfd.Dentry().Impl().(*dentry)
}
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+ fs := fd.filesystem()
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+ fs := fd.filesystem()
+ fs.renameMu.RLock()
+ defer fs.renameMu.RUnlock()
+ return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+}
+
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 2ca793db9..7053ad6db 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -143,14 +143,16 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
return syserror.EPERM
}
-// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
-// statfs, from which we should indicate PIPEFS_MAGIC.
-
// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
}
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil
+}
+
// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
//
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index f074e6056..2e086e34c 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,18 +1,79 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "fd_dir_inode_refs",
+ out = "fd_dir_inode_refs.go",
+ package = "proc",
+ prefix = "fdDirInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "fdDirInode",
+ },
+)
+
+go_template_instance(
+ name = "fd_info_dir_inode_refs",
+ out = "fd_info_dir_inode_refs.go",
+ package = "proc",
+ prefix = "fdInfoDirInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "fdInfoDirInode",
+ },
+)
+
+go_template_instance(
+ name = "subtasks_inode_refs",
+ out = "subtasks_inode_refs.go",
+ package = "proc",
+ prefix = "subtasksInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "subtasksInode",
+ },
+)
+
+go_template_instance(
+ name = "task_inode_refs",
+ out = "task_inode_refs.go",
+ package = "proc",
+ prefix = "taskInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "taskInode",
+ },
+)
+
+go_template_instance(
+ name = "tasks_inode_refs",
+ out = "tasks_inode_refs.go",
+ package = "proc",
+ prefix = "tasksInode",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "tasksInode",
+ },
+)
+
go_library(
name = "proc",
srcs = [
+ "fd_dir_inode_refs.go",
+ "fd_info_dir_inode_refs.go",
"filesystem.go",
"subtasks.go",
+ "subtasks_inode_refs.go",
"task.go",
"task_fds.go",
"task_files.go",
+ "task_inode_refs.go",
"task_net.go",
"tasks.go",
"tasks_files.go",
+ "tasks_inode_refs.go",
"tasks_sys.go",
],
visibility = ["//pkg/sentry:internal"],
@@ -36,6 +97,7 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
"//pkg/tcpip/network/ipv4",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 2463d51cd..03b5941b9 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -110,8 +110,21 @@ func newStaticFile(data string) *staticFile {
return &staticFile{StaticData: vfs.StaticData{Data: data}}
}
+func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
+ return kernfs.NewStaticDir(creds, devMajor, devMinor, ino, perm, children, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
+}
+
// InternalData contains internal data passed in to the procfs mount via
// vfs.GetFilesystemOptions.InternalData.
type InternalData struct {
Cgroups map[string]string
}
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 79c2725f3..57f026040 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,11 +31,13 @@ import (
//
// +stateify savable
type subtasksInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ implStatFS
+ kernfs.AlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
- kernfs.AlwaysValid
+ subtasksInodeRefs
locks vfs.FileLocks
@@ -57,6 +59,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
// Note: credentials are overridden by taskOwnedInode.
subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ subInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: subInode, owner: task}
dentry := &kernfs.Dentry{}
@@ -65,7 +68,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
@@ -84,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
return subTaskDentry.VFSDentry(), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
@@ -152,10 +155,12 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
return fd.GenericDirectoryFD.SetStat(ctx, opts)
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &subtasksFD{task: i.task}
- if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
+ if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ }); err != nil {
return nil, err
}
if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -164,7 +169,7 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v
return fd.VFSFileDescription(), nil
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
@@ -176,7 +181,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
return stat, nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *subtasksInode) DecRef(context.Context) {
+ i.subtasksInodeRefs.DecRef(i.Destroy)
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a5c7aa470..e24c8a031 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,11 +32,13 @@ import (
//
// +stateify savable
type taskInode struct {
- kernfs.InodeNotSymlink
+ implStatFS
+ kernfs.InodeAttrs
kernfs.InodeDirectoryNoNewChildren
kernfs.InodeNoDynamicLookup
- kernfs.InodeAttrs
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
+ taskInodeRefs
locks vfs.FileLocks
@@ -84,6 +86,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace
taskInode := &taskInode{task: task}
// Note: credentials are overridden by taskOwnedInode.
taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ taskInode.EnableLeakCheck()
inode := &taskOwnedInode{Inode: taskInode, owner: task}
dentry := &kernfs.Dentry{}
@@ -103,20 +106,27 @@ func (i *taskInode) Valid(ctx context.Context) bool {
return i.task.ExitState() != kernel.TaskExitDead
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *taskInode) DecRef(context.Context) {
+ i.taskInodeRefs.DecRef(i.Destroy)
+}
+
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
type taskOwnedInode struct {
@@ -142,7 +152,10 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
dir := &kernfs.StaticDirectory{}
// Note: credentials are overridden by taskOwnedInode.
- dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
+ dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
+ dir.EnableLeakCheck()
inode := &taskOwnedInode{Inode: dir, owner: task}
d := &kernfs.Dentry{}
@@ -155,7 +168,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
return d
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.Inode.Stat(ctx, fs, opts)
if err != nil {
@@ -173,7 +186,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.
return stat, nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
mode := i.Mode()
uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index f0d3f7f5e..c492bcfa7 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -63,7 +62,7 @@ type fdDir struct {
produceSymlink bool
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
@@ -87,26 +86,33 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
Name: strconv.FormatUint(uint64(fd), 10),
Type: typ,
Ino: i.fs.NextIno(),
- NextOff: offset + 1,
+ NextOff: int64(fd) + 3,
}
if err := cb.Handle(dirent); err != nil {
- return offset, err
+ // Getdents should iterate correctly despite mutation
+ // of fds, so we return the next fd to serialize plus
+ // 2 (which accounts for the "." and ".." tracked by
+ // kernfs) as the offset.
+ return int64(fd) + 2, err
}
- offset++
}
- return offset, nil
+ // We serialized them all. Next offset should be higher than last
+ // serialized fd.
+ return int64(fds[len(fds)-1]) + 3, nil
}
// fdDirInode represents the inode for /proc/[pid]/fd directory.
//
// +stateify savable
type fdDirInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ fdDir
+ fdDirInodeRefs
+ implStatFS
+ kernfs.AlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
- kernfs.AlwaysValid
- fdDir
}
var _ kernfs.Inode = (*fdDirInode)(nil)
@@ -120,6 +126,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
},
}
inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
dentry := &kernfs.Dentry{}
dentry.Init(inode)
@@ -128,7 +135,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
@@ -142,16 +149,18 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return taskDentry.VFSDentry(), nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
//
// This is to match Linux, which uses a special permission handler to guarantee
// that a process can still access /proc/self/fd after it has executed
@@ -173,10 +182,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
return err
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdDirInode) DecRef(context.Context) {
+ i.fdDirInodeRefs.DecRef(i.Destroy)
+}
+
// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
//
// +stateify savable
type fdSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -199,7 +214,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker
return d
}
-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
return "", syserror.ENOENT
@@ -225,12 +240,14 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
//
// +stateify savable
type fdInfoDirInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ fdDir
+ fdInfoDirInodeRefs
+ implStatFS
+ kernfs.AlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
- kernfs.AlwaysValid
- fdDir
}
var _ kernfs.Inode = (*fdInfoDirInode)(nil)
@@ -243,6 +260,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
},
}
inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
dentry := &kernfs.Dentry{}
dentry.Init(inode)
@@ -251,7 +269,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
@@ -269,21 +287,27 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
return dentry.VFSDentry(), nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdInfoDirInode) DecRef(context.Context) {
+ i.fdInfoDirInodeRefs.DecRef(i.Destroy)
+}
+
// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
//
// +stateify savable
type fdInfoData struct {
kernfs.DynamicBytesFile
- refs.AtomicRefCount
task *kernel.Task
fd int32
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 830b78949..8f7e9b801 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -543,7 +543,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var vss, rss, data uint64
s.task.WithMuLocked(func(t *kernel.Task) {
if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ fds = fdTable.CurrentMaxFDs()
}
if mm := t.MemoryManager(); mm != nil {
vss = mm.VirtualMemorySize()
@@ -648,6 +648,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
//
// +stateify savable
type exeSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -666,8 +667,8 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr
return d
}
-// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
if !kernel.ContextCanTrace(ctx, s.task, false) {
return "", syserror.EACCES
}
@@ -806,15 +807,15 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
return d
}
-// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
if err := checkTaskState(s.task); err != nil {
return "", err
}
- return s.StaticSymlink.Readlink(ctx)
+ return s.StaticSymlink.Readlink(ctx, mnt)
}
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
if err := checkTaskState(s.task); err != nil {
return vfs.VirtualDentry{}, "", err
@@ -832,6 +833,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*.
type namespaceInode struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeNotDirectory
@@ -850,7 +852,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}
-// Open implements Inode.Open.
+// Open implements kernfs.Inode.Open.
func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &namespaceFD{inode: i}
i.IncRef()
@@ -873,20 +875,20 @@ type namespaceFD struct {
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
return fd.inode.Stat(ctx, vfs, opts)
}
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
creds := auth.CredentialsFromContext(ctx)
return fd.inode.SetStat(ctx, vfs, creds, opts)
}
-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (fd *namespaceFD) Release(ctx context.Context) {
fd.inode.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index a4c884bf9..1607eac19 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// For now, we always redact this pointer.
fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
- s.Refs()-1, // RefCount, don't count our own ref.
+ s.ReadRefs()-1, // RefCount, don't count our own ref.
0, // Protocol, always 0 for UDS.
sockFlags, // Flags.
sops.Endpoint().Type(), // Type.
@@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
// Field: refcount. Don't count the ref we obtain while deferencing
// the weakref to this socket.
- fmt.Fprintf(buf, "%d ", s.Refs()-1)
+ fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Field: ref; reference count on the socket inode. Don't count the ref
// we obtain while deferencing the weakref to this socket.
- fmt.Fprintf(buf, "%d ", s.Refs()-1)
+ fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
// Field: Socket struct address. Redacted due to the same reason as
// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -660,7 +660,7 @@ func sprintSlice(s []uint64) string {
return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
}
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
types := []interface{}{
&inet.StatSNMPIP{},
@@ -709,7 +709,7 @@ type netRouteData struct {
var _ dynamicInode = (*netRouteData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +773,7 @@ type netStatData struct {
var _ dynamicInode = (*netStatData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 6d2b90a8b..6d60acc30 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,11 +37,13 @@ const (
//
// +stateify savable
type tasksInode struct {
- kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren
+ implStatFS
+ kernfs.AlwaysValid
kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
kernfs.OrderedChildren
- kernfs.AlwaysValid
+ tasksInodeRefs
locks vfs.FileLocks
@@ -84,6 +86,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
cgroupControllers: cgroupControllers,
}
inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+ inode.EnableLeakCheck()
dentry := &kernfs.Dentry{}
dentry.Init(inode)
@@ -95,7 +98,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
return inode, dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
// Try to lookup a corresponding task.
tid, err := strconv.ParseUint(name, 10, 64)
@@ -119,7 +122,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return taskDentry.VFSDentry(), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
const FIRST_PROCESS_ENTRY = 256
@@ -197,9 +200,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
return maxTaskID, nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
if err != nil {
return nil, err
}
@@ -224,6 +229,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
return stat, nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (i *tasksInode) DecRef(context.Context) {
+ i.tasksInodeRefs.DecRef(i.Destroy)
+}
+
// staticFileSetStat implements a special static file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 7d8983aa5..459a8e52e 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -32,6 +32,7 @@ import (
)
type selfSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -50,7 +51,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns
return d
}
-func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -63,17 +64,18 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
return strconv.FormatUint(uint64(tgid), 10), nil
}
-func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
type threadSelfSymlink struct {
+ implStatFS
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
@@ -92,7 +94,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64,
return d
}
-func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// Who is reading this link?
@@ -106,12 +108,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
-func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
- target, err := s.Readlink(ctx)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+ target, err := s.Readlink(ctx, mnt)
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
@@ -123,7 +125,7 @@ type dynamicBytesFileSetAttr struct {
kernfs.DynamicBytesFile
}
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6768aa880..a3ffbb15e 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -25,21 +25,29 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/usermem"
)
+type tcpMemDir int
+
+const (
+ tcpRMem tcpMemDir = iota
+ tcpWMem
+)
+
// newSysDir returns the dentry corresponding to /proc/sys directory.
func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
- return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+ return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+ "kernel": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
"shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
"shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
"shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
}),
- "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+ "vm": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
"mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
}),
@@ -55,9 +63,12 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
// network namespace of the calling process.
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
contents = map[string]*kernfs.Dentry{
- "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+ "ipv4": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
+ "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
"tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+ "tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+ "ip_forward": fs.newDentry(root, fs.NextIno(), 0444, &ipForwarding{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack, most of these files are configuration related. We use the
@@ -100,7 +111,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
"tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
"tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
}),
- "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+ "core": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
"message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
@@ -114,7 +125,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
}
}
- return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+ return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
}
// mmapMinAddrData implements vfs.DynamicBytesSource for
@@ -165,7 +176,7 @@ type tcpSackData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if d.enabled == nil {
sack, err := d.stack.TCPSACKEnabled()
@@ -182,10 +193,11 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Tough luck.
val = "1\n"
}
- buf.WriteString(val)
- return nil
+ _, err := buf.WriteString(val)
+ return err
}
+// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
if offset != 0 {
// No need to handle partial writes thus far.
@@ -201,7 +213,7 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
var v int32
n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
if err != nil {
- return n, err
+ return 0, err
}
if d.enabled == nil {
d.enabled = new(bool)
@@ -222,17 +234,18 @@ type tcpRecoveryData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
recovery, err := d.stack.TCPRecovery()
if err != nil {
return err
}
- buf.WriteString(fmt.Sprintf("%d\n", recovery))
- return nil
+ _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery))
+ return err
}
+// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
if offset != 0 {
// No need to handle partial writes thus far.
@@ -256,6 +269,94 @@ func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, off
return n, nil
}
+// tcpMemData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem.
+//
+// +stateify savable
+type tcpMemData struct {
+ kernfs.DynamicBytesFile
+
+ dir tcpMemDir
+ stack inet.Stack `state:"wait"`
+
+ // mu protects against concurrent reads/writes to FDs based on the dentry
+ // backing this byte source.
+ mu sync.Mutex `state:"nosave"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ size, err := d.readSizeLocked()
+ if err != nil {
+ return err
+ }
+ _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max))
+ return err
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ // Limit the amount of memory allocated.
+ src = src.TakeFirst(usermem.PageSize - 1)
+ size, err := d.readSizeLocked()
+ if err != nil {
+ return 0, err
+ }
+ buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)}
+ n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ newSize := inet.TCPBufferSize{
+ Min: int(buf[0]),
+ Default: int(buf[1]),
+ Max: int(buf[2]),
+ }
+ if err := d.writeSizeLocked(newSize); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) {
+ switch d.dir {
+ case tcpRMem:
+ return d.stack.TCPReceiveBufferSize()
+ case tcpWMem:
+ return d.stack.TCPSendBufferSize()
+ default:
+ panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+ }
+}
+
+// Precondition: d.mu must be locked.
+func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
+ switch d.dir {
+ case tcpRMem:
+ return d.stack.SetTCPReceiveBufferSize(size)
+ case tcpWMem:
+ return d.stack.SetTCPSendBufferSize(size)
+ default:
+ panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
+ }
+}
+
// ipForwarding implements vfs.WritableDynamicBytesSource for
// /proc/sys/net/ipv4/ip_forwarding.
//
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index 1abf56da2..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -137,12 +137,12 @@ func TestConfigureIPForwarding(t *testing.T) {
// Write the values.
src := usermem.BytesIOSequence([]byte(c.str))
if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
- t.Errorf("file.Write(ctx, nil, %v, 0) = (%d, %v); wanted (%d, nil)", c.str, n, err, len(c.str))
+ t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
}
// Read the values from the stack and check them.
- if s.IPForwarding != c.final {
- t.Errorf("s.IPForwarding = %v; wanted %v", s.IPForwarding, c.final)
+ if got, want := s.IPForwarding, c.final; got != want {
+ t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
}
})
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 3c9297dee..f693f9060 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -104,7 +104,7 @@ func setup(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("NewMountNamespace(): %v", err)
}
@@ -132,7 +132,7 @@ func setup(t *testing.T) *testutil.System {
},
},
}
- if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+ if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
t.Fatalf("MountAt(/proc): %v", err)
}
return testutil.NewSystem(ctx, t, k.VFS(), mntns)
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 6297e1df4..3c02af8c9 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -26,7 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -83,7 +83,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
sfd.mask = mask
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
// Attempt to dequeue relevant signals.
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -132,5 +132,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
sfd.target.SignalUnregister(entry)
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index c61818ff6..80b41aa9e 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -30,12 +30,12 @@ import (
// filesystemType implements vfs.FilesystemType.
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("sockfs.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
//
// Note that registering sockfs is unnecessary, except for the fact that it
// will not show up under /proc/filesystems as a result. This is a very minor
@@ -81,10 +81,10 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
// inode implements kernfs.Inode.
type inode struct {
- kernfs.InodeNotDirectory
- kernfs.InodeNotSymlink
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
}
// Open implements kernfs.Inode.Open.
@@ -92,6 +92,11 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
return nil, syserror.ENXIO
}
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
+}
+
// NewDentry constructs and returns a sockfs dentry.
//
// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 1b548ccd4..906cd52cb 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,21 +1,41 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "dir_refs",
+ out = "dir_refs.go",
+ package = "sys",
+ prefix = "dir",
+ template = "//pkg/refs_vfs2:refs_template",
+ types = {
+ "T": "dir",
+ },
+)
+
go_library(
name = "sys",
srcs = [
+ "dir_refs.go",
+ "kcov.go",
"sys.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/coverage",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/sentry/arch",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
"//pkg/sentry/vfs",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
new file mode 100644
index 000000000..73f3d3309
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) *kernfs.Dentry {
+ k := &kcovInode{}
+ k.InodeAttrs.Init(creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
+ d := &kernfs.Dentry{}
+ d.Init(k)
+ return d
+}
+
+// kcovInode implements kernfs.Inode.
+type kcovInode struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeNotDirectory
+ kernfs.InodeNotSymlink
+ implStatFS
+}
+
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ panic("KernelFromContext returned nil")
+ }
+ fd := &kcovFD{
+ inode: i,
+ kcov: k.NewKcov(),
+ }
+
+ if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+type kcovFD struct {
+ vfs.FileDescriptionDefaultImpl
+ vfs.NoLockFD
+
+ vfsfd vfs.FileDescription
+ inode *kcovInode
+ kcov *kernel.Kcov
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ cmd := uint32(args[1].Int())
+ arg := args[2].Uint64()
+ switch uint32(cmd) {
+ case linux.KCOV_INIT_TRACE:
+ return 0, fd.kcov.InitTrace(arg)
+ case linux.KCOV_ENABLE:
+ return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+ case linux.KCOV_DISABLE:
+ if arg != 0 {
+ // This arg is unused; it should be 0.
+ return 0, syserror.EINVAL
+ }
+ return 0, fd.kcov.DisableTrace(ctx)
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+// ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+ // kcov instances have reference counts in Linux, but this seems sufficient
+ // for our purposes.
+ fd.kcov.Reset()
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ creds := auth.CredentialsFromContext(ctx)
+ fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0401726b6..39952d2d0 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/coverage"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -73,7 +74,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}),
"firmware": fs.newDir(creds, defaultSysDirMode, nil),
"fs": fs.newDir(creds, defaultSysDirMode, nil),
- "kernel": fs.newDir(creds, defaultSysDirMode, nil),
+ "kernel": kernelDir(ctx, fs, creds),
"module": fs.newDir(creds, defaultSysDirMode, nil),
"power": fs.newDir(creds, defaultSysDirMode, nil),
})
@@ -94,6 +95,21 @@ func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernf
return fs.newDir(creds, defaultSysDirMode, children)
}
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+ // If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
+ // should be mounted at debug/, but for our purposes, it is sufficient to
+ // keep it in sys.
+ var children map[string]*kernfs.Dentry
+ if coverage.KcovAvailable() {
+ children = map[string]*kernfs.Dentry{
+ "debug": fs.newDir(creds, linux.FileMode(0700), map[string]*kernfs.Dentry{
+ "kcov": fs.newKcovFile(ctx, creds),
+ }),
+ }
+ }
+ return fs.newDir(creds, defaultSysDirMode, children)
+}
+
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
@@ -102,6 +118,7 @@ func (fs *filesystem) Release(ctx context.Context) {
// dir implements kernfs.Inode.
type dir struct {
+ dirRefs
kernfs.InodeAttrs
kernfs.InodeNoDynamicLookup
kernfs.InodeNotSymlink
@@ -117,6 +134,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
d := &dir{}
d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ d.EnableLeakCheck()
d.dentry.Init(d)
d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
@@ -124,23 +142,37 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
return &d.dentry
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
// Open implements kernfs.Inode.Open.
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndStaticEntries,
+ })
if err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
}
+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(context.Context) {
+ d.dirRefs.DecRef(d.Destroy)
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
// cpuFile implements kernfs.Inode.
type cpuFile struct {
+ implStatFS
kernfs.DynamicBytesFile
+
maxCores uint
}
@@ -157,3 +189,10 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li
d.Init(c)
return d
}
+
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 9fd38b295..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
AllowUserMount: true,
})
- mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+ mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
if err != nil {
t.Fatalf("Failed to create new mount namespace: %v", err)
}
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 86beaa0a8..ac8a4e3bb 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,7 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
type TimerFileDescription struct {
vfsfd vfs.FileDescription
@@ -62,7 +62,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock,
return &tfd.vfsfd, nil
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
const sizeofUint64 = 8
if dst.NumBytes() < sizeofUint64 {
@@ -128,7 +128,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
tfd.timer.Resume()
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (tfd *TimerFileDescription) Release(context.Context) {
tfd.timer.Destroy()
}
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index d263147c2..5209a17af 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -376,7 +376,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -405,7 +405,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
}
defer mountPoint.DecRef(ctx)
// Create and mount the submount.
- if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+ if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
b.Fatalf("failed to mount tmpfs submount: %v", err)
}
filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 78b4fc5be..070c75e68 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -57,8 +57,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi
return dir
}
-// Preconditions: filesystem.mu must be locked for writing. dir must not
-// already contain a child with the given name.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * dir must not already contain a child with the given name.
func (dir *directory) insertChildLocked(child *dentry, name string) {
child.parent = &dir.dentry
child.name = name
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index cb8b2d944..1362c1602 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -39,7 +38,9 @@ func (fs *filesystem) Sync(ctx context.Context) error {
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
dir, ok := d.inode.impl.(*directory)
if !ok {
@@ -97,7 +98,9 @@ afterSymlink:
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
for !rp.Final() {
next, err := stepLocked(ctx, rp, d)
@@ -139,8 +142,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error)
// doCreateAt is loosely analogous to a conjunction of Linux's
// fs/namei.c:filename_create() and done_path_create().
//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
fs.mu.Lock()
defer fs.mu.Unlock()
@@ -307,18 +311,28 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// don't need fs.mu for writing.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
- defer fs.mu.RUnlock()
d, err := resolveLocked(ctx, rp)
if err != nil {
+ fs.mu.RUnlock()
return nil, err
}
+ d.IncRef()
+ defer d.DecRef(ctx)
+ fs.mu.RUnlock()
return d.open(ctx, rp, &opts, false /* afterCreate */)
}
mustCreate := opts.Flags&linux.O_EXCL != 0
start := rp.Start().Impl().(*dentry)
fs.mu.Lock()
- defer fs.mu.Unlock()
+ unlocked := false
+ unlock := func() {
+ if !unlocked {
+ fs.mu.Unlock()
+ unlocked = true
+ }
+ }
+ defer unlock()
if rp.Done() {
// Reject attempts to open mount root directory with O_CREAT.
if rp.MustBeDir() {
@@ -327,6 +341,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if mustCreate {
return nil, syserror.EEXIST
}
+ start.IncRef()
+ defer start.DecRef(ctx)
+ unlock()
return start.open(ctx, rp, &opts, false /* afterCreate */)
}
afterTrailingSymlink:
@@ -364,6 +381,7 @@ afterTrailingSymlink:
creds := rp.Credentials()
child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
parentDir.insertChildLocked(child, name)
+ unlock()
fd, err := child.open(ctx, rp, &opts, true)
if err != nil {
return nil, err
@@ -392,9 +410,14 @@ afterTrailingSymlink:
if rp.MustBeDir() && !child.inode.isDir() {
return nil, syserror.ENOTDIR
}
+ child.IncRef()
+ defer child.DecRef(ctx)
+ unlock()
return child.open(ctx, rp, &opts, false)
}
+// Preconditions: The caller must hold no locks (since opening pipes may block
+// indefinitely).
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
if !afterCreate {
@@ -682,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
if _, err := resolveLocked(ctx, rp); err != nil {
return linux.Statfs{}, err
}
- statfs := linux.Statfs{
- Type: linux.TMPFS_MAGIC,
- BlockSize: usermem.PageSize,
- FragmentSize: usermem.PageSize,
- NameLength: linux.NAME_MAX,
- // TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
- Blocks: 0,
- BlocksFree: 0,
- }
- return statfs, nil
+ return globalStatfs, nil
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -756,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
@@ -769,43 +783,46 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
}
switch impl := d.inode.impl.(type) {
case *socketFile:
+ if impl.ep == nil {
+ return nil, syserror.ECONNREFUSED
+ }
return impl.ep, nil
default:
return nil, syserror.ECONNREFUSED
}
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
d, err := resolveLocked(ctx, rp)
if err != nil {
return nil, err
}
- return d.inode.listxattr(size)
+ return d.inode.listXattr(size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
d, err := resolveLocked(ctx, rp)
if err != nil {
return "", err
}
- return d.inode.getxattr(rp.Credentials(), &opts)
+ return d.inode.getXattr(rp.Credentials(), &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
fs.mu.RLock()
d, err := resolveLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
return err
}
- if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+ if err := d.inode.setXattr(rp.Credentials(), &opts); err != nil {
fs.mu.RUnlock()
return err
}
@@ -815,15 +832,15 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return nil
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
fs.mu.RLock()
d, err := resolveLocked(ctx, rp)
if err != nil {
fs.mu.RUnlock()
return err
}
- if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+ if err := d.inode.removeXattr(rp.Credentials(), name); err != nil {
fs.mu.RUnlock()
return err
}
@@ -848,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
if d.parent == nil {
if d.name != "" {
- // This must be an anonymous memfd file.
+ // This file must have been created by
+ // newUnlinkedRegularFileDescription(). In Linux,
+ // mm/shmem.c:__shmem_file_setup() =>
+ // fs/file_table.c:alloc_file_pseudo() sets the created
+ // dentry's dentry_operations to anon_ops, for which d_dname ==
+ // simple_dname. fs/d_path.c:simple_dname() defines the
+ // dentry's pathname to be its name, prefixed with "/" and
+ // suffixed with " (deleted)".
b.PrependComponent("/" + d.name)
+ b.AppendString(" (deleted)")
return vfs.PrependPathSyntheticError{}
}
return vfs.PrependPathAtNonMountRootError{}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 739350cf0..5b0471ff4 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -28,8 +28,8 @@ type namedPipe struct {
}
// Preconditions:
-// * fs.mu must be locked.
-// * rp.Mount().CheckBeginWrite() has been called successfully.
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index ec2701d8b..be29a2363 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -158,7 +158,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 0710b65db..b8699d064 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -42,6 +42,10 @@ type regularFile struct {
// memFile is a platform.File used to allocate pages to this regularFile.
memFile *pgalloc.MemoryFile
+ // memoryUsageKind is the memory accounting category under which pages backing
+ // this regularFile's contents are accounted.
+ memoryUsageKind usage.MemoryKind
+
// mapsMu protects mappings.
mapsMu sync.Mutex `state:"nosave"`
@@ -86,14 +90,75 @@ type regularFile struct {
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &regularFile{
- memFile: fs.memFile,
- seals: linux.F_SEAL_SEAL,
+ memFile: fs.memFile,
+ memoryUsageKind: usage.Tmpfs,
+ seals: linux.F_SEAL_SEAL,
}
file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+ fs, ok := mount.Filesystem().Impl().(*filesystem)
+ if !ok {
+ panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+ }
+
+ inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+ d := fs.newDentry(inode)
+ defer d.DecRef(ctx)
+ d.name = name
+
+ fd := &regularFileFD{}
+ fd.Init(&inode.locks)
+ flags := uint32(linux.O_RDWR)
+ if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+ // Compare mm/shmem.c:shmem_zero_setup().
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+ if err != nil {
+ return nil, err
+ }
+ rf := fd.inode().impl.(*regularFile)
+ rf.memoryUsageKind = usage.Anonymous
+ rf.size = size
+ return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+ if err != nil {
+ return nil, err
+ }
+ if allowSeals {
+ fd.inode().impl.(*regularFile).seals = 0
+ }
+ return &fd.vfsfd, nil
+}
+
// truncate grows or shrinks the file to the given size. It returns true if the
// file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -226,7 +291,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
optional.End = pgend
}
- cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
@@ -575,7 +640,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
case gap.Ok():
// Allocate memory for the write.
gapMR := gap.Range().Intersect(pgMR)
- fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
if err != nil {
retErr = err
goto exitLoop
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index de2af6d01..4658e1533 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -201,6 +201,25 @@ func (fs *filesystem) Release(ctx context.Context) {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
+// immutable
+var globalStatfs = linux.Statfs{
+ Type: linux.TMPFS_MAGIC,
+ BlockSize: usermem.PageSize,
+ FragmentSize: usermem.PageSize,
+ NameLength: linux.NAME_MAX,
+
+ // tmpfs currently does not support configurable size limits. In Linux,
+ // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+ // statfs(2). However, many applications treat this as having a size limit
+ // of 0. To work around this, claim to have a very large but non-zero size,
+ // chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+ // applications may also handle incorrectly).
+ // TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+ Blocks: math.MaxInt64 / usermem.PageSize,
+ BlocksFree: math.MaxInt64 / usermem.PageSize,
+ BlocksAvailable: math.MaxInt64 / usermem.PageSize,
+}
+
// dentry implements vfs.DentryImpl.
type dentry struct {
vfsd vfs.Dentry
@@ -340,8 +359,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth
// incLinksLocked increments i's link count.
//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+// * i.nlink < maxLinks.
func (i *inode) incLinksLocked() {
if i.nlink == 0 {
panic("tmpfs.inode.incLinksLocked() called with no existing links")
@@ -355,7 +376,9 @@ func (i *inode) incLinksLocked() {
// decLinksLocked decrements i's link count. If the link count reaches 0, we
// remove a reference on i as well.
//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
func (i *inode) decLinksLocked(ctx context.Context) {
if i.nlink == 0 {
panic("tmpfs.inode.decLinksLocked() called with no existing links")
@@ -594,62 +617,53 @@ func (i *inode) touchCMtime() {
i.mu.Unlock()
}
-// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
-// inode.mu.
+// Preconditions:
+// * The caller has called vfs.Mount.CheckBeginWrite().
+// * inode.mu must be locked.
func (i *inode) touchCMtimeLocked() {
now := i.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&i.mtime, now)
atomic.StoreInt64(&i.ctime, now)
}
-func (i *inode) listxattr(size uint64) ([]string, error) {
- return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+ return i.xattrs.ListXattr(size)
}
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
- if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+ if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return "", syserror.EOPNOTSUPP
- }
- if !i.userXattrSupported() {
- return "", syserror.ENODATA
- }
- return i.xattrs.Getxattr(opts)
+ return i.xattrs.GetXattr(opts)
}
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
- if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+ if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
- return syserror.EOPNOTSUPP
- }
- if !i.userXattrSupported() {
- return syserror.EPERM
- }
- return i.xattrs.Setxattr(opts)
+ return i.xattrs.SetXattr(opts)
}
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
- if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
+ if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
- if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return i.xattrs.RemoveXattr(name)
+}
+
+func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+ // We currently only support extended attributes in the user.* and
+ // trusted.* namespaces. See b/148380782.
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
return syserror.EOPNOTSUPP
}
- if !i.userXattrSupported() {
- return syserror.EPERM
+ mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+ kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+ kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+ if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+ return err
}
- return i.xattrs.Removexattr(name)
-}
-
-// Extended attributes in the user.* namespace are only supported for regular
-// files and directories.
-func (i *inode) userXattrSupported() bool {
- filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
- return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+ return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
}
// fileDescription is embedded by tmpfs implementations of
@@ -693,20 +707,25 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return nil
}
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.inode().listxattr(size)
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ return globalStatfs, nil
+}
+
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+ return fd.inode().listXattr(size)
}
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
- return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+ return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
}
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
d := fd.dentry()
- if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
return err
}
@@ -715,10 +734,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
return nil
}
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
d := fd.dentry()
- if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
return err
}
@@ -727,37 +746,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
return nil
}
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
- fs, ok := mount.Filesystem().Impl().(*filesystem)
- if !ok {
- panic("NewMemfd() called with non-tmpfs mount")
- }
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
- // S_IRWXUGO.
- inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
- rf := inode.impl.(*regularFile)
- if allowSeals {
- rf.seals = 0
- }
-
- d := fs.newDentry(inode)
- defer d.DecRef(ctx)
- d.name = name
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
- // FMODE_READ | FMODE_WRITE.
- var fd regularFileFD
- fd.Init(&inode.locks)
- flags := uint32(linux.O_RDWR)
- if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
- }
- return &fd.vfsfd, nil
-}
-
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 6f3e3ae6f..99c8e3c0f 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -41,7 +41,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
if err != nil {
return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 28d2a4bcb..bc8e38431 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -13,11 +13,16 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/marshal/primitive",
+ "//pkg/merkletree",
+ "//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 78c6074bd..26b117ca4 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -15,9 +15,16 @@
package verity
import (
+ "bytes"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/merkletree"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -91,10 +98,350 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
putDentrySlice(*ds)
}
-// resolveLocked resolves rp to an existing file.
-func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
- // TODO(b/159261227): Implement resolveLocked.
- return nil, nil
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+
+ if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+afterSymlink:
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
+ }
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
+ }
+ child, err := fs.getChildLocked(ctx, d, name, ds)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return child, nil
+}
+
+// verifyChild verifies the root hash of child against the already verified
+// root hash of the parent to ensure the child is expected. verifyChild
+// triggers a sentry panic if unexpected modifications to the file system are
+// detected. In noCrashOnVerificationFailure mode it returns a syserror
+// instead.
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
+func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ // Get the path to the child dentry. This is only used to provide path
+ // information in failure case.
+ childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ verityMu.RLock()
+ defer verityMu.RUnlock()
+ // Read the offset of the child from the extended attributes of the
+ // corresponding Merkle tree file.
+ // This is the offset of the root hash for child in its parent's Merkle
+ // tree file.
+ off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: child.lowerMerkleVD,
+ Start: child.lowerMerkleVD,
+ }, &vfs.GetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+ // The offset xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ offset, err := strconv.Atoi(off)
+ if err != nil {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+ }
+
+ // Open parent Merkle tree file to read and verify child's root hash.
+ parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerMerkleVD,
+ Start: parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The parent Merkle tree file should have been created. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // dataSize is the size of raw data for the Merkle tree. For a file,
+ // dataSize is the size of the whole file. For a directory, dataSize is
+ // the size of all its children's root hashes.
+ dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the file or the xattr does not
+ // exist, it indicates unexpected modifications to the file system.
+ if err == syserror.ENOENT || err == syserror.ENODATA {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ parentSize, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return nil, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+ }
+
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: parentMerkleFD,
+ Ctx: ctx,
+ }
+
+ // Since we are verifying against a directory Merkle tree, buf should
+ // contain the root hash of the children in the parent Merkle tree when
+ // Verify returns with success.
+ var buf bytes.Buffer
+ if _, err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF {
+ return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+ }
+
+ // Cache child root hash when it's verified the first time.
+ if len(child.rootHash) == 0 {
+ child.rootHash = buf.Bytes()
+ }
+ return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+ if child, ok := parent.children[name]; ok {
+ // If enabling verification on files/directories is not allowed
+ // during runtime, all cached children are already verified. If
+ // runtime enable is allowed and the parent directory is
+ // enabled, we should verify the child root hash here because
+ // it may be cached before enabled.
+ if fs.allowRuntimeEnable && len(parent.rootHash) != 0 {
+ if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ return nil, err
+ }
+ }
+ return child, nil
+ }
+ child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+ if err != nil {
+ return nil, err
+ }
+ if parent.children == nil {
+ parent.children = make(map[string]*dentry)
+ }
+ parent.children[name] = child
+ // child's refcount is initially 0, so it may be dropped after traversal.
+ *ds = appendDentry(*ds, child)
+ return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+ vfsObj := fs.vfsfs.VirtualFilesystem()
+
+ childFilename := fspath.Parse(name)
+ childVD, childErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: childFilename,
+ }, &vfs.GetDentryOptions{})
+
+ // We will handle ENOENT separately, as it may indicate unexpected
+ // modifications to the file system, and may cause a sentry panic.
+ if childErr != nil && childErr != syserror.ENOENT {
+ return nil, childErr
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ if childErr == nil {
+ defer childVD.DecRef(ctx)
+ }
+
+ childMerkleFilename := merklePrefix + name
+ childMerkleVD, childMerkleErr := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: fspath.Parse(childMerkleFilename),
+ }, &vfs.GetDentryOptions{})
+
+ // We will handle ENOENT separately, as it may indicate unexpected
+ // modifications to the file system, and may cause a sentry panic.
+ if childMerkleErr != nil && childMerkleErr != syserror.ENOENT {
+ return nil, childMerkleErr
+ }
+
+ // The dentry needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity child dentry is successfully created.
+ if childMerkleErr == nil {
+ defer childMerkleVD.DecRef(ctx)
+ }
+
+ // Get the path to the parent dentry. This is only used to provide path
+ // information in failure case.
+ parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // TODO(b/166474175): Investigate all possible errors of childErr and
+ // childMerkleErr, and make sure we differentiate all errors that
+ // indicate unexpected modifications to the file system from the ones
+ // that are not harmful.
+ if childErr == syserror.ENOENT && childMerkleErr == nil {
+ // Failed to get child file/directory dentry. However the
+ // corresponding Merkle tree is found. This indicates an
+ // unexpected modification to the file system that
+ // removed/renamed the child.
+ return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+ } else if childErr == nil && childMerkleErr == syserror.ENOENT {
+ // If in allowRuntimeEnable mode, and the Merkle tree file is
+ // not created yet, we create an empty Merkle tree file, so that
+ // if the file is enabled through ioctl, we have the Merkle tree
+ // file open and ready to use.
+ // This may cause empty and unused Merkle tree files in
+ // allowRuntimeEnable mode, if they are never enabled. This
+ // does not affect verification, as we rely on cached root hash
+ // to decide whether to perform verification, not the existence
+ // of the Merkle tree file. Also, those Merkle tree files are
+ // always hidden and cannot be accessed by verity fs users.
+ if fs.allowRuntimeEnable {
+ childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: fspath.Parse(childMerkleFilename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ return nil, err
+ }
+ childMerkleFD.DecRef(ctx)
+ childMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: parent.lowerVD,
+ Start: parent.lowerVD,
+ Path: fspath.Parse(childMerkleFilename),
+ }, &vfs.GetDentryOptions{})
+ if err != nil {
+ return nil, err
+ }
+ } else {
+ // If runtime enable is not allowed. This indicates an
+ // unexpected modification to the file system that
+ // removed/renamed the Merkle tree file.
+ return nil, alertIntegrityViolation(childMerkleErr, fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+ }
+ } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+ // Both the child and the corresponding Merkle tree are missing.
+ // This could be an unexpected modification or due to incorrect
+ // parameter.
+ // TODO(b/167752508): Investigate possible ways to differentiate
+ // cases that both files are deleted from cases that they never
+ // exist in the file system.
+ return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
+ }
+
+ mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+ stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: childVD,
+ Start: childVD,
+ }, &vfs.StatOptions{
+ Mask: mask,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ child := fs.newDentry()
+ child.lowerVD = childVD
+ child.lowerMerkleVD = childMerkleVD
+
+ // Increase the reference for both childVD and childMerkleVD as they are
+ // held by child. If this function fails and the child is destroyed, the
+ // references will be decreased in destroyLocked.
+ childVD.IncRef()
+ childMerkleVD.IncRef()
+
+ parent.IncRef()
+ child.parent = parent
+ child.name = name
+
+ // TODO(b/162788573): Verify child metadata.
+ child.mode = uint32(stat.Mode)
+ child.uid = stat.UID
+ child.gid = stat.GID
+
+ // Verify child root hash. This should always be performed unless in
+ // allowRuntimeEnable mode and the parent directory hasn't been enabled
+ // yet.
+ if !(fs.allowRuntimeEnable && len(parent.rootHash) == 0) {
+ if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ child.destroyLocked(ctx)
+ return nil, err
+ }
+ }
+
+ return child, nil
}
// walkParentDirLocked resolves all but the last path component of rp to an
@@ -104,8 +451,39 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
//
// Preconditions: fs.renameMu must be locked. !rp.Done().
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
- // TODO(b/159261227): Implement walkParentDirLocked.
- return nil, nil
+ for !rp.Final() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ d.dirMu.Lock()
+ next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
}
// AccessAt implements vfs.Filesystem.Impl.AccessAt.
@@ -179,8 +557,181 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- //TODO(b/159261227): Implement OpenAt.
- return nil, nil
+ // Verity fs is read-only.
+ if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+ return nil, syserror.EROFS
+ }
+
+ var ds *[]*dentry
+ fs.renameMu.RLock()
+ defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+ start := rp.Start().Impl().(*dentry)
+ if rp.Done() {
+ return start.openLocked(ctx, rp, &opts)
+ }
+
+afterTrailingSymlink:
+ parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+ if err != nil {
+ return nil, err
+ }
+
+ // Check for search permission in the parent directory.
+ if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+
+ // Open existing child or follow symlink.
+ parent.dirMu.Lock()
+ child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds)
+ parent.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ if child.isSymlink() && rp.ShouldFollowSymlink() {
+ target, err := child.readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ start = parent
+ goto afterTrailingSymlink
+ }
+ return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Users should not open the Merkle tree files. Those are for verity fs
+ // use only.
+ if strings.Contains(d.name, merklePrefix) {
+ return nil, syserror.EPERM
+ }
+ ats := vfs.AccessTypesForOpenFlags(opts)
+ if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+
+ // Verity fs is read-only.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EROFS
+ }
+
+ // Get the path to the target file. This is only used to provide path
+ // information in failure case.
+ path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+ if err != nil {
+ return nil, err
+ }
+
+ // Open the file in the underlying file system.
+ lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerVD,
+ Start: d.lowerVD,
+ }, opts)
+
+ // The file should exist, as we succeeded in finding its dentry. If it's
+ // missing, it indicates an unexpected modification to the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("File %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // lowerFD needs to be cleaned up if any error occurs. IncRef will be
+ // called if a verity FD is successfully created.
+ defer lowerFD.DecRef(ctx)
+
+ // Open the Merkle tree file corresponding to the current file/directory
+ // to be used later for verifying Read/Walk.
+ merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+
+ // The Merkle tree file should exist, as we succeeded in finding its
+ // dentry. If it's missing, it indicates an unexpected modification to
+ // the file system.
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+
+ // merkleReader needs to be cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is successfully created.
+ defer merkleReader.DecRef(ctx)
+
+ lowerFlags := lowerFD.StatusFlags()
+ lowerFDOpts := lowerFD.Options()
+ var merkleWriter *vfs.FileDescription
+ var parentMerkleWriter *vfs.FileDescription
+
+ // Only open the Merkle tree files for write if in allowRuntimeEnable
+ // mode.
+ if d.fs.allowRuntimeEnable {
+ merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.lowerMerkleVD,
+ Start: d.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path))
+ }
+ return nil, err
+ }
+ // merkleWriter is cleaned up if any error occurs. IncRef will
+ // be called if a verity FD is created successfully.
+ defer merkleWriter.DecRef(ctx)
+
+ parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+ Root: d.parent.lowerMerkleVD,
+ Start: d.parent.lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY | linux.O_APPEND,
+ })
+ if err != nil {
+ if err == syserror.ENOENT {
+ parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+ return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+ }
+ return nil, err
+ }
+ // parentMerkleWriter is cleaned up if any error occurs. IncRef
+ // will be called if a verity FD is created successfully.
+ defer parentMerkleWriter.DecRef(ctx)
+ }
+
+ fd := &fileDescription{
+ d: d,
+ lowerFD: lowerFD,
+ merkleReader: merkleReader,
+ merkleWriter: merkleWriter,
+ parentMerkleWriter: parentMerkleWriter,
+ isDir: d.isDir(),
+ }
+
+ if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+ return nil, err
+ }
+ lowerFD.IncRef()
+ merkleReader.IncRef()
+ if merkleWriter != nil {
+ merkleWriter.IncRef()
+ }
+ if parentMerkleWriter != nil {
+ parentMerkleWriter.IncRef()
+ }
+ return &fd.vfsfd, err
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -256,7 +807,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
@@ -267,8 +818,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
return nil, syserror.ECONNREFUSED
}
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -277,14 +828,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
return nil, err
}
lowerVD := d.lowerVD
- return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: lowerVD,
Start: lowerVD,
}, size)
}
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -293,20 +844,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
return "", err
}
lowerVD := d.lowerVD
- return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+ return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
Root: lowerVD,
Start: lowerVD,
}, &opts)
}
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
// Verity file system is read-only.
return syserror.EROFS
}
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
// Verity file system is read-only.
return syserror.EROFS
}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index cb29d33a5..9182df317 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,24 +22,56 @@
package verity
import (
+ "fmt"
+ "strconv"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/marshal/primitive"
+ "gvisor.dev/gvisor/pkg/merkletree"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Name is the default filesystem name.
const Name = "verity"
-// testOnlyDebugging allows verity file system to return error instead of
-// crashing the application when a malicious action is detected. This should
-// only be set for tests.
-var testOnlyDebugging bool
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// merkleoffsetInParentXattr is the extended attribute name specifying the
+// offset of child root hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's root hashes.
+const merkleSizeXattr = "user.merkle.size"
+
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
+
+// noCrashOnVerificationFailure indicates whether the sandbox should panic
+// whenever verification fails. If true, an error is returned instead of
+// panicking. This should only be set for tests.
+// TOOD(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
+
+// verityMu synchronizes enabling verity files, protects files or directories
+// from being enabled by different threads simultaneously. It also ensures that
+// verity does not access files that are being enabled.
+var verityMu sync.RWMutex
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
@@ -93,10 +125,10 @@ type InternalFilesystemOptions struct {
// system wrapped by verity file system.
LowerGetFSOptions vfs.GetFilesystemOptions
- // TestOnlyDebugging allows verity file system to return error instead
- // of crashing the application when a malicious action is detected. This
- // should only be set for tests.
- TestOnlyDebugging bool
+ // NoCrashOnVerificationFailure indicates whether the sandbox should
+ // panic whenever verification fails. If true, an error is returned
+ // instead of panicking. This should only be set for tests.
+ NoCrashOnVerificationFailure bool
}
// Name implements vfs.FilesystemType.Name.
@@ -104,10 +136,120 @@ func (FilesystemType) Name() string {
return Name
}
+// alertIntegrityViolation alerts a violation of integrity, which usually means
+// unexpected modification to the file system is detected. In
+// noCrashOnVerificationFailure mode, it returns an error, otherwise it panic.
+func alertIntegrityViolation(err error, msg string) error {
+ if noCrashOnVerificationFailure {
+ return err
+ }
+ panic(msg)
+}
+
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- //TODO(b/159261227): Implement GetFilesystem.
- return nil, nil, nil
+ iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+ if !ok {
+ ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+ return nil, nil, syserror.EINVAL
+ }
+ noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+ // Mount the lower file system. The lower file system is wrapped inside
+ // verity, and should not be exposed or connected.
+ mopts := &vfs.MountOptions{
+ GetFilesystemOptions: iopts.LowerGetFSOptions,
+ }
+ mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ fs := &filesystem{
+ creds: creds.Fork(),
+ lowerMount: mnt,
+ allowRuntimeEnable: iopts.AllowRuntimeEnable,
+ }
+ fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+ // Construct the root dentry.
+ d := fs.newDentry()
+ d.refs = 1
+ lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+ lowerVD.IncRef()
+ d.lowerVD = lowerVD
+
+ rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+ lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+
+ // If runtime enable is allowed, the root merkle tree may be absent. We
+ // should create the tree file.
+ if err == syserror.ENOENT && fs.allowRuntimeEnable {
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ lowerMerkleFD.DecRef(ctx)
+ lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ Path: fspath.Parse(rootMerkleName),
+ }, &vfs.GetDentryOptions{})
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+ } else if err != nil {
+ // Failed to get dentry for the root Merkle file. This
+ // indicates an unexpected modification that removed/renamed
+ // the root Merkle file, or it's never generated.
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, alertIntegrityViolation(err, "Failed to find root Merkle file")
+ }
+ d.lowerMerkleVD = lowerMerkleVD
+
+ // Get metadata from the underlying file system.
+ const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.StatOptions{
+ Mask: statMask,
+ })
+ if err != nil {
+ fs.vfsfs.DecRef(ctx)
+ d.DecRef(ctx)
+ return nil, nil, err
+ }
+
+ // TODO(b/162788573): Verify Metadata.
+ d.mode = uint32(stat.Mode)
+ d.uid = stat.UID
+ d.gid = stat.GID
+
+ d.rootHash = make([]byte, len(iopts.RootHash))
+ copy(d.rootHash, iopts.RootHash)
+ d.vfsd.Init(d)
+
+ fs.rootDentry = d
+
+ return &fs.vfsfs, &d.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
@@ -344,6 +486,194 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
return syserror.EPERM
}
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root
+// hash of the generated Merkle tree and the data size is returned.
+// If fd points to a regular file, the data is the content of the file. If fd
+// points to a directory, the data is all root hahes of its children, written
+// to the Merkle tree file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+ fdReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+ merkleWriter := vfs.FileReadWriteSeeker{
+ FD: fd.merkleWriter,
+ Ctx: ctx,
+ }
+ var rootHash []byte
+ var dataSize uint64
+
+ switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+ case linux.S_IFREG:
+ // For a regular file, generate a Merkle tree based on its
+ // content.
+ var err error
+ stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+ dataSize = stat.Size
+
+ rootHash, err = merkletree.Generate(&fdReader, int64(dataSize), &merkleReader, &merkleWriter, false /* dataAndTreeInSameFile */)
+ if err != nil {
+ return nil, 0, err
+ }
+ case linux.S_IFDIR:
+ // For a directory, generate a Merkle tree based on the root
+ // hashes of its children that has already been written to the
+ // Merkle tree file.
+ merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return nil, 0, err
+ }
+ dataSize = merkleStat.Size
+
+ rootHash, err = merkletree.Generate(&merkleReader, int64(dataSize), &merkleReader, &merkleWriter, true /* dataAndTreeInSameFile */)
+ if err != nil {
+ return nil, 0, err
+ }
+ default:
+ // TODO(b/167728857): Investigate whether and how we should
+ // enable other types of file.
+ return nil, 0, syserror.EINVAL
+ }
+ return rootHash, dataSize, nil
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its root hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ if !fd.d.fs.allowRuntimeEnable {
+ return 0, syserror.EPERM
+ }
+
+ // Lock to prevent other threads performing enable or access the file
+ // while it's being enabled.
+ verityMu.Lock()
+ defer verityMu.Unlock()
+
+ if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || fd.parentMerkleWriter == nil {
+ return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds")
+ }
+
+ rootHash, dataSize, err := fd.generateMerkle(ctx)
+ if err != nil {
+ return 0, err
+ }
+
+ stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return 0, err
+ }
+
+ // Write the root hash of fd to the parent directory's Merkle tree
+ // file, as it should be part of the parent Merkle tree data.
+ // parentMerkleWriter is open with O_APPEND, so it should write
+ // directly to the end of the file.
+ if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil {
+ return 0, err
+ }
+
+ // Record the offset of the root hash of fd in parent directory's
+ // Merkle tree file.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleOffsetInParentXattr,
+ Value: strconv.Itoa(int(stat.Size)),
+ }); err != nil {
+ return 0, err
+ }
+
+ // Record the size of the data being hashed for fd.
+ if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+ Name: merkleSizeXattr,
+ Value: strconv.Itoa(int(dataSize)),
+ }); err != nil {
+ return 0, err
+ }
+ fd.d.rootHash = append(fd.d.rootHash, rootHash...)
+ return 0, nil
+}
+
+func (fd *fileDescription) getFlags(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ f := int32(0)
+
+ // All enabled files should store a root hash. This flag is not settable
+ // via FS_IOC_SETFLAGS.
+ if len(fd.d.rootHash) != 0 {
+ f |= linux.FS_VERITY_FL
+ }
+
+ t := kernel.TaskFromContext(ctx)
+ addr := args[2].Pointer()
+ _, err := primitive.CopyInt32Out(t, addr, f)
+ return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch cmd := args[1].Uint(); cmd {
+ case linux.FS_IOC_ENABLE_VERITY:
+ return fd.enableVerity(ctx, uio, args)
+ case linux.FS_IOC_GETFLAGS:
+ return fd.getFlags(ctx, uio, args)
+ default:
+ return fd.lowerFD.Ioctl(ctx, uio, args)
+ }
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ // No need to verify if the file is not enabled yet in
+ // allowRuntimeEnable mode.
+ if fd.d.fs.allowRuntimeEnable && len(fd.d.rootHash) == 0 {
+ return fd.lowerFD.PRead(ctx, dst, offset, opts)
+ }
+
+ // dataSize is the size of the whole file.
+ dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+ Name: merkleSizeXattr,
+ Size: sizeOfStringInt32,
+ })
+
+ // The Merkle tree file for the child should have been created and
+ // contains the expected xattrs. If the xattr does not exist, it
+ // indicates unexpected modifications to the file system.
+ if err == syserror.ENODATA {
+ return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+ }
+ if err != nil {
+ return 0, err
+ }
+
+ // The dataSize xattr should be an integer. If it's not, it indicates
+ // unexpected modifications to the file system.
+ size, err := strconv.Atoi(dataSize)
+ if err != nil {
+ return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+ }
+
+ dataReader := vfs.FileReadWriteSeeker{
+ FD: fd.lowerFD,
+ Ctx: ctx,
+ }
+
+ merkleReader := vfs.FileReadWriteSeeker{
+ FD: fd.merkleReader,
+ Ctx: ctx,
+ }
+
+ n, err := merkletree.Verify(dst.Writer(ctx), &dataReader, &merkleReader, int64(size), offset, dst.NumBytes(), fd.d.rootHash, false /* dataAndTreeInSameFile */)
+ if err != nil {
+ return 0, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Verification failed: %v", err))
+ }
+ return n, err
+}
+
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)