diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
123 files changed, 5527 insertions, 1733 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 3f64fab3a..48e13613a 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -21,8 +21,8 @@ go_library( "line_discipline.go", "master.go", "queue.go", + "replica.go", "root_inode_refs.go", - "slave.go", "terminal.go", ], visibility = ["//pkg/sentry:internal"], @@ -30,6 +30,8 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 57580f4d4..903135fae 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -35,6 +35,8 @@ import ( const Name = "devpts" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -58,6 +60,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -79,7 +82,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds // Construct the root directory. This is always inode id 1. root := &rootInode{ - slaves: make(map[uint32]*slaveInode), + replicas: make(map[uint32]*replicaInode), } root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) @@ -110,6 +113,8 @@ func (fs *filesystem) Release(ctx context.Context) { } // rootInode is the root directory inode for the devpts mounts. +// +// +stateify savable type rootInode struct { implStatFS kernfs.AlwaysValid @@ -131,10 +136,10 @@ type rootInode struct { root *rootInode // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` - // slaves maps pty ids to slave inodes. - slaves map[uint32]*slaveInode + // replicas maps pty ids to replica inodes. + replicas map[uint32]*replicaInode // nextIdx is the next pty index to use. Must be accessed atomically. // @@ -154,22 +159,22 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) idx := i.nextIdx i.nextIdx++ - // Sanity check that slave with idx does not exist. - if _, ok := i.slaves[idx]; ok { + // Sanity check that replica with idx does not exist. + if _, ok := i.replicas[idx]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) } - // Create the new terminal and slave. + // Create the new terminal and replica. t := newTerminal(idx) - slave := &slaveInode{ + replica := &replicaInode{ root: i, t: t, } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). - slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) - slave.dentry.Init(slave) - i.slaves[idx] = slave + replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) + replica.dentry.Init(replica) + i.replicas[idx] = replica return t, nil } @@ -179,16 +184,16 @@ func (i *rootInode) masterClose(t *Terminal) { i.mu.Lock() defer i.mu.Unlock() - // Sanity check that slave with idx exists. - if _, ok := i.slaves[t.n]; !ok { + // Sanity check that replica with idx exists. + if _, ok := i.replicas[t.n]; !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } - delete(i.slaves, t.n) + delete(i.replicas, t.n) } // Open implements kernfs.Inode.Open. -func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { @@ -198,16 +203,16 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D } // Lookup implements kernfs.Inode.Lookup. -func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (i *rootInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } i.mu.Lock() defer i.mu.Unlock() - if si, ok := i.slaves[uint32(idx)]; ok { + if si, ok := i.replicas[uint32(idx)]; ok { si.dentry.IncRef() - return si.dentry.VFSDentry(), nil + return &si.dentry, nil } return nil, syserror.ENOENT @@ -217,8 +222,8 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { i.mu.Lock() defer i.mu.Unlock() - ids := make([]int, 0, len(i.slaves)) - for id := range i.slaves { + ids := make([]int, 0, len(i.replicas)) + for id := range i.replicas { ids = append(ids, int(id)) } sort.Ints(ids) @@ -226,7 +231,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(id), 10), Type: linux.DT_CHR, - Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + Ino: i.replicas[uint32(id)].InodeAttrs.Ino(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -237,11 +242,12 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, return offset, nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *rootInode) DecRef(context.Context) { i.rootInodeRefs.DecRef(i.Destroy) } +// +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go index b7c149047..448390cfe 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_test.go +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func TestSimpleMasterToSlave(t *testing.T) { - ld := newLineDiscipline(linux.DefaultSlaveTermios) +func TestSimpleMasterToReplica(t *testing.T) { + ld := newLineDiscipline(linux.DefaultReplicaTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index f7bc325d1..e6b0e81cf 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,7 +42,7 @@ const ( ) // lineDiscipline dictates how input and output are handled between the -// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // @@ -52,8 +53,8 @@ const ( // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the -// terminal master to slave (the input queue) and one from slave to master (the -// output queue). When bytes are written to one end of the pty, the line +// terminal master to replica (the input queue) and one from replica to master +// (the output queue). When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // @@ -62,7 +63,7 @@ const ( // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v -// masterFD slaveFD +// masterFD replicaFD // ^ | // | | // | output to terminal +--------------+ output from process | @@ -101,8 +102,8 @@ type lineDiscipline struct { // masterWaiter is used to wait on the master end of the TTY. masterWaiter waiter.Queue `state:"zerovalue"` - // slaveWaiter is used to wait on the slave end of the TTY. - slaveWaiter waiter.Queue `state:"zerovalue"` + // replicaWaiter is used to wait on the replica end of the TTY. + replicaWaiter waiter.Queue `state:"zerovalue"` } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { } // getTermios gets the linux.Termios for the tty. -func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. -func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait @@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return 0, err } -func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyOut(t, args[2].Pointer()) return err } -func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyIn(t, args[2].Pointer()) return err } @@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } -func (l *lineDiscipline) slaveReadiness() waiter.EventMask { +func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.inQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(t, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque if n > 0 { l.masterWaiter.Notify(waiter.EventOut) if pushed { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return n, nil } @@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) return n, nil } return 0, syserror.ErrWouldBlock } -func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.outQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(t, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventOut) + l.replicaWaiter.Notify(waiter.EventOut) if pushed { l.masterWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 60feb1993..69c2fe951 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -17,9 +17,11 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -29,6 +31,8 @@ import ( ) // masterInode is the inode for the master end of the Terminal. +// +// +stateify savable type masterInode struct { implStatFS kernfs.InodeAttrs @@ -48,20 +52,18 @@ type masterInode struct { var _ kernfs.Inode = (*masterInode)(nil) // Open implements kernfs.Inode.Open. -func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { t, err := mi.root.allocateTerminal(rp.Credentials()) if err != nil { return nil, err } - mi.IncRef() fd := &masterFileDescription{ inode: mi, t: t, } fd.LockFD.Init(&mi.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef(ctx) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -87,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } +// +stateify savable type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -101,7 +104,6 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. @@ -131,46 +133,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. - return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args) + return 0, mfd.t.ld.outputQueueReadSize(t, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration - // of the slave end. - return mfd.t.ld.getTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration - // of the slave end. - return mfd.t.ld.setTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return mfd.t.ld.setTermios(ctx, io, args) + return mfd.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(mfd.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: - return 0, mfd.t.ld.windowSize(ctx, io, args) + return 0, mfd.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, mfd.t.ld.setWindowSize(ctx, io, args) + return 0, mfd.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index 331c13997..55bff3e60 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,8 +17,10 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -30,7 +32,7 @@ import ( const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is +// replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and @@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { } // readableSize writes the number of readable bytes to userspace. -func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() - var size int32 + size := primitive.Int32(0) if q.readable { - size = int32(len(q.readBuf)) + size = primitive.Int32(len(q.readBuf)) } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := size.CopyOut(t, args[2].Pointer()) return err } diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go new file mode 100644 index 000000000..6515c5536 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -0,0 +1,204 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// replicaInode is the inode for the replica end of the Terminal. +// +// +stateify savable +type replicaInode struct { + implStatFS + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. + t *Terminal +} + +var _ kernfs.Inode = (*replicaInode)(nil) + +// Open implements kernfs.Inode.Open. +func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &replicaFileDescription{ + inode: ri, + } + fd.LockFD.Init(&ri.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (ri *replicaInode) Valid(context.Context) bool { + // Return valid if the replica still exists. + ri.root.mu.Lock() + defer ri.root.mu.Unlock() + _, ok := ri.root.replicas[ri.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR + statx.RdevMinor = ri.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +// +stateify savable +type replicaFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *replicaInode +} + +var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil) + +// Release implements fs.FileOperations.Release. +func (rfd *replicaFileDescription) Release(ctx context.Context) {} + +// EventRegister implements waiter.Waitable.EventRegister. +func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) { + rfd.inode.t.ld.replicaWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return rfd.inode.t.ld.replicaReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return rfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return rfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args) + case linux.TCGETS: + return rfd.inode.t.ld.getTermios(t, args) + case linux.TCSETS: + return rfd.inode.t.ld.setTermios(t, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return rfd.inode.t.ld.setTermios(t, args) + case linux.TIOCGPTN: + nP := primitive.Uint32(rfd.inode.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) + return 0, err + case linux.TIOCGWINSZ: + return 0, rfd.inode.t.ld.windowSize(t, args) + case linux.TIOCSWINSZ: + return 0, rfd.inode.t.ld.setWindowSize(t, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.Stat(ctx, fs, opts) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go deleted file mode 100644 index a9da7af64..000000000 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package devpts - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// slaveInode is the inode for the slave end of the Terminal. -type slaveInode struct { - implStatFS - kernfs.InodeAttrs - kernfs.InodeNoopRefCount - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink - - locks vfs.FileLocks - - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - - // root is the devpts root inode. - root *rootInode - - // t is the connected Terminal. - t *Terminal -} - -var _ kernfs.Inode = (*slaveInode)(nil) - -// Open implements kernfs.Inode.Open. -func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - si.IncRef() - fd := &slaveFileDescription{ - inode: si, - } - fd.LockFD.Init(&si.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef(ctx) - return nil, err - } - return &fd.vfsfd, nil - -} - -// Valid implements kernfs.Inode.Valid. -func (si *slaveInode) Valid(context.Context) bool { - // Return valid if the slave still exists. - si.root.mu.Lock() - defer si.root.mu.Unlock() - _, ok := si.root.slaves[si.t.n] - return ok -} - -// Stat implements kernfs.Inode.Stat. -func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts) - if err != nil { - return linux.Statx{}, err - } - statx.Blksize = 1024 - statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR - statx.RdevMinor = si.t.n - return statx, nil -} - -// SetStat implements kernfs.Inode.SetStat -func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL - } - return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) -} - -type slaveFileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.LockFD - - inode *slaveInode -} - -var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) - -// Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release(ctx context.Context) { - sfd.inode.DecRef(ctx) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { - sfd.inode.t.ld.slaveWaiter.EventUnregister(e) -} - -// Readiness implements waiter.Waitable.Readiness. -func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - return sfd.inode.t.ld.slaveReadiness() -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - return sfd.inode.t.ld.inputQueueRead(ctx, dst) -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - return sfd.inode.t.ld.outputQueueWrite(ctx, src) -} - -// Ioctl implements vfs.FileDescriptionImpl.Ioctl. -func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch cmd := args[1].Uint(); cmd { - case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ - // Get the number of bytes in the input queue read buffer. - return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) - case linux.TCGETS: - return sfd.inode.t.ld.getTermios(ctx, io, args) - case linux.TCSETS: - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TCSETSW: - // TODO(b/29356795): This should drain the output queue first. - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case linux.TIOCGWINSZ: - return 0, sfd.inode.t.ld.windowSize(ctx, io, args) - case linux.TIOCSWINSZ: - return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) - case linux.TIOCSCTTY: - // Make the given terminal the controlling terminal of the - // calling process. - return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCNOTTY: - // Release this process's controlling terminal. - return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCGPGRP: - // Get the foreground process group. - return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) - case linux.TIOCSPGRP: - // Set the foreground process group. - return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) - default: - maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY - } -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - creds := auth.CredentialsFromContext(ctx) - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.SetStat(ctx, fs, creds, opts) -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.Stat(ctx, fs, opts) -} - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence) -} diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go index 7d2781c54..510bd6d89 100644 --- a/pkg/sentry/fsimpl/devpts/terminal.go +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -17,9 +17,9 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) // Terminal is a pseudoterminal. @@ -36,25 +36,25 @@ type Terminal struct { // this terminal. This field is immutable. masterKTTY *kernel.TTY - // slaveKTTY contains the controlling process of the slave end of this + // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. - slaveKTTY *kernel.TTY + replicaKTTY *kernel.TTY } func newTerminal(n uint32) *Terminal { - termios := linux.DefaultSlaveTermios + termios := linux.DefaultReplicaTermios t := Terminal{ - n: n, - ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{Index: n}, - slaveKTTY: &kernel.TTY{Index: n}, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + replicaKTTY: &kernel.TTY{Index: n}, } return &t } // setControllingTTY makes tm the controlling terminal of the calling thread // group. -func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("setControllingTTY must be called from a task context") @@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a // releaseControllingTTY removes tm as the controlling terminal of the calling // thread group. -func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("releaseControllingTTY must be called from a task context") @@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar } // foregroundProcessGroup gets the process group ID of tm's foreground process. -func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("foregroundProcessGroup must be called from a task context") @@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a } // Write it out to *arg. - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ - AddressSpaceActive: true, - }) + retP := primitive.Int32(ret) + _, err = retP.CopyOut(task, args[2].Pointer()) return 0, err } // foregroundProcessGroup sets tm's foreground process. -func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("setForegroundProcessGroup must be called from a task context") } // Read in the process group ID. - var pgid int32 - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgid primitive.Int32 + if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } @@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } - return tm.slaveKTTY + return tm.replicaKTTY } diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 52f44f66d..6d1753080 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -33,8 +33,10 @@ import ( const Name = "devtmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct { - initOnce sync.Once + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported. initErr error // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. @@ -80,7 +82,7 @@ type Accessor struct { // NewAccessor returns an Accessor that supports creation of device special // files in the devtmpfs instance registered with name fsTypeName in vfsObj. func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{}) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 827a608cb..3a38b8bb4 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -48,7 +48,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu }) // Create a test mount namespace with devtmpfs mounted at "/dev". - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 812171fa3..1c27ad700 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -30,9 +30,11 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// EventFileDescription implements FileDescriptionImpl for file-based event +// EventFileDescription implements vfs.FileDescriptionImpl for file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. +// +// +stateify savable type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -106,7 +108,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { return efd.hostfd, nil } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() @@ -119,7 +121,7 @@ func (efd *EventFileDescription) Release(context.Context) { } } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, syscall.EINVAL @@ -130,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc return 8, nil } -// Write implements FileDescriptionImpl.Write. +// Write implements vfs.FileDescriptionImpl.Write. func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, syscall.EINVAL diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index abc610ef3..7b1eec3da 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", @@ -86,9 +88,9 @@ go_test( library = ":ext", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index a2cc9b59f..c349b886e 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -59,7 +59,11 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index 8bb104ff0..1165234f9 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -18,7 +18,7 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/syserror" ) @@ -34,19 +34,19 @@ type blockMapFile struct { // directBlks are the direct blocks numbers. The physical blocks pointed by // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]uint32 + directBlks [numDirectBlks]primitive.Uint32 // indirectBlk is the physical block which contains (blkSize/4) direct block // numbers (as uint32 integers). - indirectBlk uint32 + indirectBlk primitive.Uint32 // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect // block numbers (as uint32 integers). - doubleIndirectBlk uint32 + doubleIndirectBlk primitive.Uint32 // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly // indirect block numbers (as uint32 integers). - tripleIndirectBlk uint32 + tripleIndirectBlk primitive.Uint32 // coverage at (i)th index indicates the amount of file data a node at // height (i) covers. Height 0 is the direct block. @@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { } blkMap := file.regFile.inode.diskInode.Data() - binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) - binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk) + for i := 0; i < numDirectBlks; i++ { + file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) + } + file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) + file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) + file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) return file, nil } @@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { switch { case offset < dirBlksEnd: // Direct block. - curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:]) + curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) case offset < indirBlkEnd: // Indirect block. - curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:]) + curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) case offset < doubIndirBlkEnd: // Doubly indirect block. - curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:]) + curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) default: // Triply indirect block. - curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:]) + curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) } read += curR @@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds read := 0 curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { - var childPhyBlk uint32 + var childPhyBlk primitive.Uint32 err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } - n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:]) + n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) read += n if err != nil { return read, err diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 6fa84e7aa..ed98b482e 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -20,7 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) var fileData []byte blkNums := newBlkNumGen() - var data []byte + off := 0 + data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) // Write the direct blocks. for i := 0; i < numDirectBlks; i++ { - curBlkNum := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, curBlkNum) - fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(data[off:]) + off += curBlkNum.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) } // Write to indirect block. - indirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, indirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...) - - // Write to indirect block. - doublyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...) - - // Write to indirect block. - triplyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) + indirectBlk := primitive.Uint32(blkNums.next()) + indirectBlk.MarshalBytes(data[off:]) + off += indirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) + + // Write to double indirect block. + doublyIndirectBlk := primitive.Uint32(blkNums.next()) + doublyIndirectBlk.MarshalBytes(data[off:]) + off += doublyIndirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) + + // Write to triple indirect block. + triplyIndirectBlk := primitive.Uint32(blkNums.next()) + triplyIndirectBlk.MarshalBytes(data[off:]) + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) args := inodeArgs{ fs: &filesystem{ @@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN var fileData []byte for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := blkNums.next() - copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum)) - fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(disk[off : off+4]) + fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) } return fileData } diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 7a1b4219f..9bfed883a 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -20,6 +20,8 @@ import ( ) // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 0fc01668d..0ad79b381 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -16,7 +16,6 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -28,6 +27,8 @@ import ( ) // directory represents a directory inode. It holds the childList in memory. +// +// +stateify savable type directory struct { inode inode @@ -39,7 +40,7 @@ type directory struct { // Lock Order (outermost locks must be taken first): // directory.mu // filesystem.mu - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // childList is a list containing (1) child dirents and (2) fake dirents // (with diskDirent == nil) that represent the iteration position of @@ -98,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { } else { curDirent.diskDirent = &disklayout.DirentOld{} } - binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + curDirent.diskDirent.UnmarshalBytes(buf) if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { // Inode number and name length fields being set to 0 is used to indicate @@ -120,6 +121,8 @@ func (i *inode) isDir() bool { } // dirent is the directory.childList node. +// +// +stateify savable type dirent struct { diskDirent disklayout.Dirent @@ -129,6 +132,8 @@ type dirent struct { // directoryFD represents a directory file description. It implements // vfs.FileDescriptionImpl. +// +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index 9bd9c76c0..d98a05dd8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -22,10 +22,11 @@ go_library( "superblock_old.go", "test_utils.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", + "//pkg/marshal", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go index ad6f4fef8..0d56ae9da 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // BlockGroup represents a Linux ext block group descriptor. An ext file system // is split into a series of block groups. This provides an access layer to // information needed to access and use a block group. @@ -30,6 +34,8 @@ package disklayout // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { + marshal.Marshallable + // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go index 3e16c76db..a35fa22a0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go @@ -17,6 +17,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. +// +// +marshal type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go index 9a809197a..d54d1d345 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go @@ -18,6 +18,8 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. +// +// +marshal type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go index 0ef4294c0..e4ce484e4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go @@ -21,6 +21,8 @@ import ( // TestBlockGroupSize tests that the block group descriptor structs are of the // correct size. func TestBlockGroupSize(t *testing.T) { - assertSize(t, BlockGroup32Bit{}, 32) - assertSize(t, BlockGroup64Bit{}, 64) + var bgSmall BlockGroup32Bit + assertSize(t, &bgSmall, 32) + var bgBig BlockGroup64Bit + assertSize(t, &bgBig, 64) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go index 417b6cf65..568c8cb4c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go @@ -15,6 +15,7 @@ package disklayout import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fs" ) @@ -51,6 +52,8 @@ var ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. type Dirent interface { + marshal.Marshallable + // Inode returns the absolute inode number of the underlying inode. // Inode number 0 signifies an unused dirent. Inode() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go index 29ae4a5c2..51f9c2946 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go @@ -29,12 +29,14 @@ import ( // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentNew struct { InodeNumber uint32 RecordLength uint16 NameLength uint8 FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentNew implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go index 6fff12a6e..d4b19e086 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go @@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs" // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentOld struct { InodeNumber uint32 RecordLength uint16 NameLength uint16 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentOld implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go index 934919f8a..3486864dc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go @@ -21,6 +21,8 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. func TestDirentSize(t *testing.T) { - assertSize(t, DirentOld{}, uintptr(DirentSize)) - assertSize(t, DirentNew{}, uintptr(DirentSize)) + var dOld DirentOld + assertSize(t, &dOld, DirentSize) + var dNew DirentNew + assertSize(t, &dNew, DirentSize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go index bdf4e2132..0834e9ba8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go @@ -36,8 +36,6 @@ // escape analysis on an unknown implementation at compile time. // // Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. // - All structures on disk are in little-endian order. Only jbd2 (journal) // structures are in big-endian order. // - All OS dependent fields in these structures will be interpretted using diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go index 4110649ab..b13999bfc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // Extents were introduced in ext4 and provide huge performance gains in terms // data locality and reduced metadata block usage. Extents are organized in // extent trees. The root node is contained in inode.BlocksRaw. @@ -64,6 +68,8 @@ type ExtentNode struct { // ExtentEntry represents an extent tree node entry. The entry can either be // an ExtentIdx or Extent itself. This exists to simplify navigation logic. type ExtentEntry interface { + marshal.Marshallable + // FileBlock returns the first file block number covered by this entry. FileBlock() uint32 @@ -75,6 +81,8 @@ type ExtentEntry interface { // tree node begins with this and is followed by `NumEntries` number of: // - Extent if `Depth` == 0 // - ExtentIdx otherwise +// +// +marshal type ExtentHeader struct { // Magic in the extent magic number, must be 0xf30a. Magic uint16 @@ -96,6 +104,8 @@ type ExtentHeader struct { // internal nodes. Sorted in ascending order based on FirstFileBlock since // Linux does a binary search on this. This points to a block containing the // child node. +// +// +marshal type ExtentIdx struct { FirstFileBlock uint32 ChildBlockLo uint32 @@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 { // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a // binary search on this. This points to an array of data blocks containing the // file data. It covers `Length` data blocks starting from `StartBlock`. +// +// +marshal type Extent struct { FirstFileBlock uint32 Length uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go index 8762b90db..c96002e19 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -21,7 +21,10 @@ import ( // TestExtentSize tests that the extent structs are of the correct // size. func TestExtentSize(t *testing.T) { - assertSize(t, ExtentHeader{}, ExtentHeaderSize) - assertSize(t, ExtentIdx{}, ExtentEntrySize) - assertSize(t, Extent{}, ExtentEntrySize) + var h ExtentHeader + assertSize(t, &h, ExtentHeaderSize) + var i ExtentIdx + assertSize(t, &i, ExtentEntrySize) + var e Extent + assertSize(t, &e, ExtentEntrySize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go index 88ae913f5..ef25040a9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go @@ -16,6 +16,7 @@ package disklayout import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) @@ -38,6 +39,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. type Inode interface { + marshal.Marshallable + // Mode returns the linux file mode which is majorly used to extract // information like: // - File permissions (read/write/execute by user/group/others). diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go index 8f9f574ce..a4503f5cf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go @@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time" // are used to provide nanoscond precision. Hence, these timestamps will now // overflow in May 2446. // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +// +// +marshal type InodeNew struct { InodeOld diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go index db25b11b6..e6b28babf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go @@ -30,6 +30,8 @@ const ( // // All fields representing time are in seconds since the epoch. Which means that // they will overflow in January 2038. +// +// +marshal type InodeOld struct { ModeRaw uint16 UIDLo uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go index dd03ee50e..90744e956 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go @@ -24,10 +24,12 @@ import ( // TestInodeSize tests that the inode structs are of the correct size. func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, OldInodeSize) + var iOld InodeOld + assertSize(t, &iOld, OldInodeSize) // This was updated from 156 bytes to 160 bytes in Oct 2015. - assertSize(t, InodeNew{}, 160) + var iNew InodeNew + assertSize(t, &iNew, 160) } // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go index 8bb327006..70948ebe9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + const ( // SbOffset is the absolute offset at which the superblock is placed. SbOffset = 1024 @@ -38,6 +42,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. type SuperBlock interface { + marshal.Marshallable + // InodesCount returns the total number of inodes in this filesystem. InodesCount() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go index 53e515fd3..4dc6080fb 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go @@ -17,6 +17,8 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if // RevLevel = DynamicRev and 64-bit feature is disabled. +// +// +marshal type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go index 7c1053fb4..2c9039327 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go @@ -19,6 +19,8 @@ package disklayout // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. Should only be used when the 64-bit // feature is set. +// +// +marshal type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go index 9221e0251..e4709f23c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go @@ -16,6 +16,8 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct. Should be used only if RevLevel = OldRev. +// +// +marshal type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go index 463b5ba21..b734b6987 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go @@ -21,7 +21,10 @@ import ( // TestSuperBlockSize tests that the superblock structs are of the correct // size. func TestSuperBlockSize(t *testing.T) { - assertSize(t, SuperBlockOld{}, 84) - assertSize(t, SuperBlock32Bit{}, 336) - assertSize(t, SuperBlock64Bit{}, 1024) + var sbOld SuperBlockOld + assertSize(t, &sbOld, 84) + var sb32 SuperBlock32Bit + assertSize(t, &sb32, 336) + var sb64 SuperBlock64Bit + assertSize(t, &sb64, 1024) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go index 9c63f04c0..a4bc08411 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go @@ -18,13 +18,13 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" ) -func assertSize(t *testing.T, v interface{}, want uintptr) { +func assertSize(t *testing.T, v marshal.Marshallable, want int) { t.Helper() - if got := binary.Size(v); got != want { + if got := v.SizeBytes(); got != want { t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) } } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 08ffc2834..aca258d40 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -34,6 +34,8 @@ import ( const Name = "ext" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // Compiles only if FilesystemType implements vfs.FilesystemType. diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 2dbaee287..0989558cd 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -71,7 +71,11 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index c36225a7c..778460107 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -18,12 +18,13 @@ import ( "io" "sort" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // extentFile is a type of regular file which uses extents to store file data. +// +// +stateify savable type extentFile struct { regFile regularFile @@ -58,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) { func (f *extentFile) buildExtTree() error { rootNodeData := f.regFile.inode.diskInode.Data() - binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) + f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. if f.root.Header.NumEntries > 4 { @@ -77,7 +78,7 @@ func (f *extentFile) buildExtTree() error { // Internal node. curEntry = &disklayout.ExtentIdx{} } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) + curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) f.root.Entries[i].Entry = curEntry } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index cd10d46ee..985f76ac0 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -21,7 +21,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] // writeTree writes the tree represented by `root` to the inode and disk. It // also writes random file data on disk. func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + rootData := in.diskInode.Data() + root.Header.MarshalBytes(rootData) + off := root.Header.SizeBytes() for _, ep := range root.Entries { - rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(rootData[off:]) + off += ep.Entry.SizeBytes() } - copy(in.diskInode.Data(), rootData) - var fileData []byte for _, ep := range root.Entries { if root.Header.Height == 0 { @@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl // writeTreeToDisk is the recursive step for writeTree which writes the tree // on the disk only. Also writes random file data on disk. func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] + curNode.Node.Header.MarshalBytes(nodeData) + off := curNode.Node.Header.SizeBytes() for _, ep := range curNode.Node.Entries { - nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(nodeData[off:]) + off += ep.Entry.SizeBytes() } - copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) - var fileData []byte for _, ep := range curNode.Node.Entries { if curNode.Node.Header.Height == 0 { diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 8565d1a66..917f1873d 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -38,11 +38,13 @@ var ( ) // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // dev represents the underlying fs device. It does not require protection // because io.ReaderAt permits concurrent read calls to it. It translates to @@ -490,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { _, inode, err := fs.walk(ctx, rp, false) if err != nil { @@ -504,8 +506,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err @@ -513,8 +515,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err @@ -522,8 +524,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { _, _, err := fs.walk(ctx, rp, false) if err != nil { return err @@ -531,8 +533,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { _, _, err := fs.walk(ctx, rp, false) if err != nil { return err diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 30636cf66..9009ba3c7 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -37,6 +37,8 @@ import ( // |-- regular-- // |-- extent file // |-- block map file +// +// +stateify savable type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index e73e740d6..4a5539b37 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -31,6 +31,8 @@ import ( // regularFile represents a regular file's inode. This too follows the // inheritance pattern prevelant in the vfs layer described in // pkg/sentry/vfs/README.md. +// +// +stateify savable type regularFile struct { inode inode @@ -67,6 +69,8 @@ func (in *inode) isRegular() bool { // directoryFD represents a directory file description. It implements // vfs.FileDescriptionImpl. +// +// +stateify savable type regularFileFD struct { fileDescription vfs.LockFD @@ -75,7 +79,7 @@ type regularFileFD struct { off int64 // offMu serializes operations that may mutate off. - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 2fd0d1fa8..5e2bcc837 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -23,6 +23,8 @@ import ( ) // symlink represents a symlink inode. +// +// +stateify savable type symlink struct { inode inode target string // immutable @@ -61,9 +63,11 @@ func (in *inode) isSymlink() bool { return ok } -// symlinkFD represents a symlink file description and implements implements +// symlinkFD represents a symlink file description and implements // vfs.FileDescriptionImpl. which may only be used if open options contains // O_PATH. For this reason most of the functions return EBADF. +// +// +stateify savable type symlinkFD struct { fileDescription vfs.NoLockFD diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go index d8b728f8c..58ef7b9b8 100644 --- a/pkg/sentry/fsimpl/ext/utils.go +++ b/pkg/sentry/fsimpl/ext/utils.go @@ -17,21 +17,21 @@ package ext import ( "io" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // readFromDisk performs a binary read from disk into the given struct from // the absolute offset provided. -func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { - n := binary.Size(v) +func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { + n := v.SizeBytes() buf := make([]byte, n) if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } - binary.Unmarshal(buf, binary.LittleEndian, v) + v.UnmarshalBytes(buf) return nil } diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 53a4f3012..045d7ab08 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -30,19 +30,26 @@ go_library( name = "fuse", srcs = [ "connection.go", + "connection_control.go", "dev.go", + "directory.go", + "file.go", "fusefs.go", - "init.go", "inode_refs.go", + "read_write.go", "register.go", + "regular_file.go", "request_list.go", + "request_response.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/marshal", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", @@ -52,7 +59,6 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -60,10 +66,15 @@ go_library( go_test( name = "fuse_test", size = "small", - srcs = ["dev_test.go"], + srcs = [ + "connection_test.go", + "dev_test.go", + "utils_test.go", + ], library = ":fuse", deps = [ "//pkg/abi/linux", + "//pkg/marshal", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -71,6 +82,5 @@ go_test( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index 6df2728ab..8ccda1264 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -15,31 +15,17 @@ package fuse import ( - "errors" - "fmt" "sync" - "sync/atomic" - "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// maxActiveRequestsDefault is the default setting controlling the upper bound -// on the number of active requests at any given time. -const maxActiveRequestsDefault = 10000 - -// Ordinary requests have even IDs, while interrupts IDs are odd. -// Used to increment the unique ID for each FUSE request. -var reqIDStep uint64 = 2 - const ( // fuseDefaultMaxBackground is the default value for MaxBackground. fuseDefaultMaxBackground = 12 @@ -52,43 +38,39 @@ const ( fuseDefaultMaxPagesPerReq = 32 ) -// Request represents a FUSE operation request that hasn't been sent to the -// server yet. +// connection is the struct by which the sentry communicates with the FUSE server daemon. // -// +stateify savable -type Request struct { - requestEntry - - id linux.FUSEOpID - hdr *linux.FUSEHeaderIn - data []byte -} - -// Response represents an actual response from the server, including the -// response payload. +// Lock order: +// - conn.fd.mu +// - conn.mu +// - conn.asyncMu // // +stateify savable -type Response struct { - opcode linux.FUSEOpcode - hdr linux.FUSEHeaderOut - data []byte -} - -// connection is the struct by which the sentry communicates with the FUSE server daemon. type connection struct { fd *DeviceFD + // mu protects access to struct memebers. + mu sync.Mutex `state:"nosave"` + + // attributeVersion is the version of connection's attributes. + attributeVersion uint64 + + // We target FUSE 7.23. // The following FUSE_INIT flags are currently unsupported by this implementation: - // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) // - FUSE_EXPORT_SUPPORT - // - FUSE_HANDLE_KILLPRIV // - FUSE_POSIX_LOCKS: requires POSIX locks // - FUSE_FLOCK_LOCKS: requires POSIX locks // - FUSE_AUTO_INVAL_DATA: requires page caching eviction - // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation // - FUSE_ASYNC_DIO - // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler + // - FUSE_PARALLEL_DIROPS (7.25) + // - FUSE_HANDLE_KILLPRIV (7.26) + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26) + // - FUSE_ABORT_ERROR (7.27) + // - FUSE_CACHE_SYMLINKS (7.28) + // - FUSE_NO_OPENDIR_SUPPORT (7.29) + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30) + // - FUSE_MAP_ALIGNMENT (7.31) // initialized after receiving FUSE_INIT reply. // Until it's set, suspend sending FUSE requests. @@ -96,11 +78,7 @@ type connection struct { initialized int32 // initializedChan is used to block requests before initialization. - initializedChan chan struct{} - - // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). - // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. - blocked bool + initializedChan chan struct{} `state:".(bool)"` // connected (connection established) when a new FUSE file system is created. // Set to false when: @@ -109,48 +87,55 @@ type connection struct { // device release. connected bool - // aborted via sysfs. - // TODO(gvisor.dev/issue/3185): abort all queued requests. - aborted bool - // connInitError if FUSE_INIT encountered error (major version mismatch). // Only set in INIT. connInitError bool // connInitSuccess if FUSE_INIT is successful. // Only set in INIT. - // Used for destory. + // Used for destory (not yet implemented). connInitSuccess bool - // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. - - // NumberBackground is the number of requests in the background. - numBackground uint16 + // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV). + // Set only if abortErr is true and via fuse control fs (not yet implemented). + // TODO(gvisor.dev/issue/3525): set this to true when user aborts. + aborted bool - // congestionThreshold for NumBackground. - // Negotiated in FUSE_INIT. - congestionThreshold uint16 + // numWating is the number of requests waiting to be + // sent to FUSE device or being processed by FUSE daemon. + numWaiting uint32 - // maxBackground is the maximum number of NumBackground. - // Block connection when it is reached. - // Negotiated in FUSE_INIT. - maxBackground uint16 + // Terminology note: + // + // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct. + // + // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct. + // + // We call the "background" requests in unix term as async requests. + // The "async requests" in unix term is our async requests that expect a reply, + // i.e. `!request.noReply` - // numActiveBackground is the number of requests in background and has being marked as active. - numActiveBackground uint16 + // asyncMu protects the async request fields. + asyncMu sync.Mutex `state:"nosave"` - // numWating is the number of requests waiting for completion. - numWaiting uint32 + // asyncNum is the number of async requests. + // Protected by asyncMu. + asyncNum uint16 - // TODO(gvisor.dev/issue/3185): BgQueue - // some queue for background queued requests. + // asyncCongestionThreshold the number of async requests. + // Negotiated in FUSE_INIT as "CongestionThreshold". + // TODO(gvisor.dev/issue/3529): add congestion control. + // Protected by asyncMu. + asyncCongestionThreshold uint16 - // bgLock protects: - // MaxBackground, CongestionThreshold, NumBackground, - // NumActiveBackground, BgQueue, Blocked. - bgLock sync.Mutex + // asyncNumMax is the maximum number of asyncNum. + // Connection blocks the async requests when it is reached. + // Negotiated in FUSE_INIT as "MaxBackground". + // Protected by asyncMu. + asyncNumMax uint16 // maxRead is the maximum size of a read buffer in in bytes. + // Initialized from a fuse fs parameter. maxRead uint32 // maxWrite is the maximum size of a write buffer in bytes. @@ -165,23 +150,20 @@ type connection struct { // Negotiated and only set in INIT. minor uint32 - // asyncRead if read pages asynchronously. + // atomicOTrunc is true when FUSE does not send a separate SETATTR request + // before open with O_TRUNC flag. // Negotiated and only set in INIT. - asyncRead bool + atomicOTrunc bool - // abortErr is true if kernel need to return an unique read error after abort. + // asyncRead if read pages asynchronously. // Negotiated and only set in INIT. - abortErr bool + asyncRead bool // writebackCache is true for write-back cache policy, // false for write-through policy. // Negotiated and only set in INIT. writebackCache bool - // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. - // Negotiated and only set in INIT. - cacheSymlinks bool - // bigWrites if doing multi-page cached writes. // Negotiated and only set in INIT. bigWrites bool @@ -189,116 +171,86 @@ type connection struct { // dontMask if filestestem does not apply umask to creation modes. // Negotiated in INIT. dontMask bool + + // noOpen if FUSE server doesn't support open operation. + // This flag only influence performance, not correctness of the program. + noOpen bool +} + +func (conn *connection) saveInitializedChan() bool { + select { + case <-conn.initializedChan: + return true // Closed. + default: + return false // Not closed. + } +} + +func (conn *connection) loadInitializedChan(closed bool) { + conn.initializedChan = make(chan struct{}, 1) + if closed { + close(conn.initializedChan) + } } // newFUSEConnection creates a FUSE connection to fd. -func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) - fuseFD.mounted = true // Create the writeBuf for the header to be stored in. hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) fuseFD.writeBuf = make([]byte, hdrLen) fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) - fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) fuseFD.writeCursor = 0 return &connection{ - fd: fuseFD, - maxBackground: fuseDefaultMaxBackground, - congestionThreshold: fuseDefaultCongestionThreshold, - maxPages: fuseDefaultMaxPagesPerReq, - initializedChan: make(chan struct{}), - connected: true, - }, nil -} - -// SetInitialized atomically sets the connection as initialized. -func (conn *connection) SetInitialized() { - // Unblock the requests sent before INIT. - close(conn.initializedChan) - - // Close the channel first to avoid the non-atomic situation - // where conn.initialized is true but there are - // tasks being blocked on the channel. - // And it prevents the newer tasks from gaining - // unnecessary higher chance to be issued before the blocked one. - - atomic.StoreInt32(&(conn.initialized), int32(1)) -} - -// IsInitialized atomically check if the connection is initialized. -// pairs with SetInitialized(). -func (conn *connection) Initialized() bool { - return atomic.LoadInt32(&(conn.initialized)) != 0 -} - -// NewRequest creates a new request that can be sent to the FUSE server. -func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { - conn.fd.mu.Lock() - defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) - - hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() - hdr := linux.FUSEHeaderIn{ - Len: uint32(hdrLen + payload.SizeBytes()), - Opcode: opcode, - Unique: conn.fd.nextOpID, - NodeID: ino, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - PID: pid, - } - - buf := make([]byte, hdr.Len) - hdr.MarshalUnsafe(buf[:hdrLen]) - payload.MarshalUnsafe(buf[hdrLen:]) - - return &Request{ - id: hdr.Unique, - hdr: &hdr, - data: buf, + fd: fuseFD, + asyncNumMax: fuseDefaultMaxBackground, + asyncCongestionThreshold: fuseDefaultCongestionThreshold, + maxRead: opts.maxRead, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } -// Call makes a request to the server and blocks the invoking task until a -// server responds with a response. Task should never be nil. -// Requests will not be sent before the connection is initialized. -// For async tasks, use CallAsync(). -func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { - // Block requests sent before connection is initalized. - if !conn.Initialized() { - if err := t.Block(conn.initializedChan); err != nil { - return nil, err - } - } - - return conn.call(t, r) +// CallAsync makes an async (aka background) request. +// It's a simple wrapper around Call(). +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + r.async = true + _, err := conn.Call(t, r) + return err } -// CallAsync makes an async (aka background) request. -// Those requests either do not expect a response (e.g. release) or -// the response should be handled by others (e.g. init). -// Return immediately unless the connection is blocked (before initialization). -// Async call example: init, release, forget, aio, interrupt. +// Call makes a request to the server. +// Block before the connection is initialized. // When the Request is FUSE_INIT, it will not be blocked before initialization. -func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { +// Task should never be nil. +// +// For a sync request, it blocks the invoking task until +// a server responds with a response. +// +// For an async request (that do not expect a response immediately), +// it returns directly unless being blocked either before initialization +// or when there are too many async requests ongoing. +// +// Example for async request: +// init, readahead, write, async read/write, fuse_notify_reply, +// non-sync release, interrupt, forget. +// +// The forget request does not have a reply, +// as documented in include/uapi/linux/fuse.h:FUSE_FORGET. +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { // Block requests sent before connection is initalized. if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { if err := t.Block(conn.initializedChan); err != nil { - return err + return nil, err } } - // This should be the only place that invokes call() with a nil task. - _, err := conn.call(nil, r) - return err -} - -// call makes a call without blocking checks. -func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { if !conn.connected { return nil, syserror.ENOTCONN } @@ -315,31 +267,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { return fut.resolve(t) } -// Error returns the error of the FUSE call. -func (r *Response) Error() error { - errno := r.hdr.Error - if errno >= 0 { - return nil - } - - sysErrNo := syscall.Errno(-errno) - return error(sysErrNo) -} - -// UnmarshalPayload unmarshals the response data into m. -func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { - hdrLen := r.hdr.SizeBytes() - haveDataLen := r.hdr.Len - uint32(hdrLen) - wantDataLen := uint32(m.SizeBytes()) - - if haveDataLen < wantDataLen { - return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) - } - - m.UnmarshalUnsafe(r.data[hdrLen:]) - return nil -} - // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { @@ -358,11 +285,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // if there are always too many ongoing requests all the time. The // supported maxActiveRequests setting should be really high to avoid this. for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { - if t == nil { - // Since there is no task that is waiting. We must error out. - return nil, errors.New("FUSE request queue full") - } - log.Infof("Blocking request %v from being queued. Too many active requests: %v", r.id, conn.fd.numActiveRequests) conn.fd.mu.Unlock() @@ -378,9 +300,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, // callFutureLocked makes a request to the server and returns a future response. func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + // Check connected again holding conn.mu. + conn.mu.Lock() + if !conn.connected { + conn.mu.Unlock() + // we checked connected before, + // this must be due to aborted connection. + return nil, syserror.ECONNABORTED + } + conn.mu.Unlock() + conn.fd.queue.PushBack(r) - conn.fd.numActiveRequests += 1 - fut := newFutureResponse(r.hdr.Opcode) + conn.fd.numActiveRequests++ + fut := newFutureResponse(r) conn.fd.completions[r.id] = fut // Signal the readers that there is something to read. @@ -388,50 +320,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes return fut, nil } - -// futureResponse represents an in-flight request, that may or may not have -// completed yet. Convert it to a resolved Response by calling Resolve, but note -// that this may block. -// -// +stateify savable -type futureResponse struct { - opcode linux.FUSEOpcode - ch chan struct{} - hdr *linux.FUSEHeaderOut - data []byte -} - -// newFutureResponse creates a future response to a FUSE request. -func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { - return &futureResponse{ - opcode: opcode, - ch: make(chan struct{}), - } -} - -// resolve blocks the task until the server responds to its corresponding request, -// then returns a resolved response. -func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { - // If there is no Task associated with this request - then we don't try to resolve - // the response. Instead, the task writing the response (proxy to the server) will - // process the response on our behalf. - if t == nil { - log.Infof("fuse.Response.resolve: Not waiting on a response from server.") - return nil, nil - } - - if err := t.Block(f.ch); err != nil { - return nil, err - } - - return f.getResponse(), nil -} - -// getResponse creates a Response from the data the futureResponse has. -func (f *futureResponse) getResponse() *Response { - return &Response{ - opcode: f.opcode, - hdr: *f.hdr, - data: f.data, - } -} diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go index 779c2bd3f..bfde78559 100644 --- a/pkg/sentry/fsimpl/fuse/init.go +++ b/pkg/sentry/fsimpl/fuse/connection_control.go @@ -15,7 +15,11 @@ package fuse import ( + "sync/atomic" + "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -29,9 +33,10 @@ const ( // Follow the same behavior as unix fuse implementation. fuseMaxTimeGranNs = 1000000000 - // Minimum value for MaxWrite. + // Minimum value for MaxWrite and MaxRead. // Follow the same behavior as unix fuse implementation. fuseMinMaxWrite = 4096 + fuseMinMaxRead = 4096 // Temporary default value for max readahead, 128kb. fuseDefaultMaxReadahead = 131072 @@ -49,6 +54,26 @@ var ( MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold ) +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + // InitSend sends a FUSE_INIT request. func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { in := linux.FUSEInitIn{ @@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { } // InitRecv receives a FUSE_INIT reply and process it. +// +// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13. func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { if err := res.Error(); err != nil { return err } - var out linux.FUSEInitOut - if err := res.UnmarshalPayload(&out); err != nil { + initRes := fuseInitRes{initLen: res.DataLen()} + if err := res.UnmarshalPayload(&initRes); err != nil { return err } - return conn.initProcessReply(&out, hasSysAdminCap) + return conn.initProcessReply(&initRes.initOut, hasSysAdminCap) } // Process the FUSE_INIT reply from the FUSE server. +// It tries to acquire the conn.asyncMu lock if minor version is newer than 13. func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No matter error or not, always set initialzied. + // to unblock the blocked requests. + defer conn.SetInitialized() + // No support for old major fuse versions. if out.Major != linux.FUSE_KERNEL_VERSION { conn.connInitError = true - - // Set the connection as initialized and unblock the blocked requests - // (i.e. return error for them). - conn.SetInitialized() - return nil } @@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.connInitSuccess = true conn.minor = out.Minor - // No support for limits before minor version 13. - if out.Minor >= 13 { - conn.bgLock.Lock() - - if out.MaxBackground > 0 { - conn.maxBackground = out.MaxBackground - - if !hasSysAdminCap && - conn.maxBackground > MaxUserBackgroundRequest { - conn.maxBackground = MaxUserBackgroundRequest - } - } - - if out.CongestionThreshold > 0 { - conn.congestionThreshold = out.CongestionThreshold - - if !hasSysAdminCap && - conn.congestionThreshold > MaxUserCongestionThreshold { - conn.congestionThreshold = MaxUserCongestionThreshold - } - } - - conn.bgLock.Unlock() + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite } // No support for the following flags before minor version 6. @@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 - conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 - conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). @@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap } } - // No support for negotiating MaxWrite before minor version 5. - if out.Minor >= 5 { - conn.maxWrite = out.MaxWrite - } else { - conn.maxWrite = fuseMinMaxWrite + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.asyncMu.Lock() + + if out.MaxBackground > 0 { + conn.asyncNumMax = out.MaxBackground + + if !hasSysAdminCap && + conn.asyncNumMax > MaxUserBackgroundRequest { + conn.asyncNumMax = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.asyncCongestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.asyncCongestionThreshold > MaxUserCongestionThreshold { + conn.asyncCongestionThreshold = MaxUserCongestionThreshold + } + } + + conn.asyncMu.Unlock() } - if conn.maxWrite < fuseMinMaxWrite { - conn.maxWrite = fuseMinMaxWrite + + return nil +} + +// Abort this FUSE connection. +// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order. +// All possible requests waiting or blocking will be aborted. +// +// Preconditions: conn.fd.mu is locked. +func (conn *connection) Abort(ctx context.Context) { + conn.mu.Lock() + conn.asyncMu.Lock() + + if !conn.connected { + conn.asyncMu.Unlock() + conn.mu.Unlock() + conn.fd.mu.Unlock() + return } - // Set connection as initialized and unblock the requests - // issued before init. - conn.SetInitialized() + conn.connected = false - return nil + // Empty the `fd.queue` that holds the requests + // not yet read by the FUSE daemon yet. + // These are a subset of the requests in `fuse.completion` map. + for !conn.fd.queue.Empty() { + req := conn.fd.queue.Front() + conn.fd.queue.Remove(req) + } + + var terminate []linux.FUSEOpID + + // 2. Collect the requests have not been sent to FUSE daemon, + // or have not received a reply. + for unique := range conn.fd.completions { + terminate = append(terminate, unique) + } + + // Release locks to avoid deadlock. + conn.asyncMu.Unlock() + conn.mu.Unlock() + + // 1. The requets blocked before initialization. + // Will reach call() `connected` check and return. + if !conn.Initialized() { + conn.SetInitialized() + } + + // 2. Terminate the requests collected above. + // Set ECONNABORTED error. + // sendError() will remove them from `fd.completion` map. + // Will enter the path of a normally received error. + for _, toTerminate := range terminate { + conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate) + } + + // 3. The requests not yet written to FUSE device. + // Early terminate. + // Will reach callFutureLocked() `connected` check and return. + close(conn.fd.fullQueueCh) + + // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs. } diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go new file mode 100644 index 000000000..91d16c1cf --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection_test.go @@ -0,0 +1,117 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "math/rand" + "syscall" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TestConnectionInitBlock tests if initialization +// correctly blocks and unblocks the connection. +// Since it's unfeasible to test kernelTask.Block() in unit test, +// the code in Call() are not tested here. +func TestConnectionInitBlock(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + + conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + select { + case <-conn.initializedChan: + t.Fatalf("initializedChan should be blocking before SetInitialized") + default: + } + + conn.SetInitialized() + + select { + case <-conn.initializedChan: + default: + t.Fatalf("initializedChan should not be blocking after SetInitialized") + } +} + +func TestConnectionAbort(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + creds := auth.CredentialsFromContext(s.Ctx) + task := kernel.TaskFromContext(s.Ctx) + + const numRequests uint64 = 256 + + conn, _, err := newTestConnection(s, k, numRequests) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + testObj := &testPayload{ + data: rand.Uint32(), + } + + var futNormal []*futureResponse + + for i := 0; i < int(numRequests); i++ { + req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + fut, err := conn.callFutureLocked(task, req) + if err != nil { + t.Fatalf("callFutureLocked failed: %v", err) + } + futNormal = append(futNormal, fut) + } + + conn.Abort(s.Ctx) + + // Abort should unblock the initialization channel. + // Note: no test requests are actually blocked on `conn.initializedChan`. + select { + case <-conn.initializedChan: + default: + t.Fatalf("initializedChan should not be blocking after SetInitialized") + } + + // Abort will return ECONNABORTED error to unblocked requests. + for _, fut := range futNormal { + if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) { + t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error) + } + } + + // After abort, Call() should return directly with ENOTCONN. + req, err := conn.NewRequest(creds, 0, 0, 0, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + _, err = conn.Call(task, req) + if err != syserror.ENOTCONN { + t.Fatalf("Incorrect error code received for Call() after connection aborted") + } + +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index e522ff9a0..1b86a4b4c 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -32,6 +31,8 @@ import ( const fuseDevMinor = 229 // fuseDevice implements vfs.Device for /dev/fuse. +// +// +stateify savable type fuseDevice struct{} // Open implements vfs.Device.Open. @@ -50,15 +51,14 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. +// +// +stateify savable type DeviceFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD - // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. - mounted bool - // nextOpID is used to create new requests. nextOpID linux.FUSEOpID @@ -83,7 +83,7 @@ type DeviceFD struct { writeCursorFR *futureResponse // mu protects all the queues, maps, buffers and cursors and nextOpID. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // waitQueue is used to notify interested parties when the device becomes // readable or writable. @@ -92,21 +92,36 @@ type DeviceFD struct { // fullQueueCh is a channel used to synchronize the readers with the writers. // Writers (inbound requests to the filesystem) block if there are too many // unprocessed in-flight requests. - fullQueueCh chan struct{} + fullQueueCh chan struct{} `state:".(int)"` // fs is the FUSE filesystem that this FD is being used for. fs *filesystem } +func (fd *DeviceFD) saveFullQueueCh() int { + return cap(fd.fullQueueCh) +} + +func (fd *DeviceFD) loadFullQueueCh(capacity int) { + fd.fullQueueCh = make(chan struct{}, capacity) +} + // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release(context.Context) { - fd.fs.conn.connected = false +func (fd *DeviceFD) Release(ctx context.Context) { + if fd.fs != nil { + fd.fs.conn.mu.Lock() + fd.fs.conn.connected = false + fd.fs.conn.mu.Unlock() + + fd.fs.VFSFilesystem().DecRef(ctx) + fd.fs = nil + } } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -116,10 +131,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. + return 0, syserror.ENODEV + } + // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this @@ -143,58 +164,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R } // readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +// +// Preconditions: dst is large enough for any reasonable request. func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - if fd.queue.Empty() { - return 0, syserror.ErrWouldBlock - } + var req *Request - var readCursor uint32 - var bytesRead int64 - for { - req := fd.queue.Front() - if dst.NumBytes() < int64(req.hdr.Len) { - // The request is too large. Cannot process it. All requests must be smaller than the - // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT - // handshake. - errno := -int32(syscall.EIO) - if req.hdr.Opcode == linux.FUSE_SETXATTR { - errno = -int32(syscall.E2BIG) - } + // Find the first valid request. + // For the normal case this loop only execute once. + for !fd.queue.Empty() { + req = fd.queue.Front() - // Return the error to the calling task. - if err := fd.sendError(ctx, errno, req); err != nil { - return 0, err - } + if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() { + break + } - // We're done with this request. - fd.queue.Remove(req) + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } - // Restart the read as this request was invalid. - log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") - return fd.readLocked(ctx, dst, opts) + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { + return 0, err } - n, err := dst.CopyOut(ctx, req.data[readCursor:]) + // We're done with this request. + fd.queue.Remove(req) + req = nil + } + + if req == nil { + return 0, syserror.ErrWouldBlock + } + + // We already checked the size: dst must be able to fit the whole request. + // Now we write the marshalled header, the payload, + // and the potential additional payload + // to the user memory IOSequence. + + n, err := dst.CopyOut(ctx, req.data) + if err != nil { + return 0, err + } + if n != len(req.data) { + return 0, syserror.EIO + } + + if req.hdr.Opcode == linux.FUSE_WRITE { + written, err := dst.DropFirst(n).CopyOut(ctx, req.payload) if err != nil { return 0, err } - readCursor += uint32(n) - bytesRead += int64(n) - - if readCursor >= req.hdr.Len { - // Fully done with this req, remove it from the queue. - fd.queue.Remove(req) - break + if written != len(req.payload) { + return 0, syserror.EIO } + n += int(written) } - return bytesRead, nil + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + + // Remove noReply ones from map of requests expecting a reply. + if req.noReply { + fd.numActiveRequests -= 1 + delete(fd.completions, req.hdr.Unique) + } + + return int64(n), nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -211,10 +256,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs. // writeLocked implements writing to the fuse device while locked with DeviceFD.mu. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + return 0, syserror.ENODEV + } + var cn, n int64 hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) @@ -276,7 +326,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt fut, ok := fd.completions[hdr.Unique] if !ok { - // Server sent us a response for a request we never sent? + // Server sent us a response for a request we never sent, + // or for which we already received a reply (e.g. aborted), an unlikely event. return 0, syserror.EINVAL } @@ -307,8 +358,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readinessLocked(mask) +} + +// readinessLocked implements checking the readiness of the fuse device while +// locked with DeviceFD.mu. +func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - ready |= waiter.EventOut // FD is always writable + + if fd.fs.umounted { + ready |= waiter.EventErr + return ready & mask + } + + // FD is always writable. + ready |= waiter.EventOut if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.EventIn @@ -330,7 +396,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. - if !fd.mounted { + if fd.fs == nil { return 0, syserror.EPERM } @@ -338,59 +404,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 } // sendResponse sends a response to the waiting task (if any). +// +// Preconditions: fd.mu must be held. func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { - // See if the running task need to perform some action before returning. - // Since we just finished writing the future, we can be sure that - // getResponse generates a populated response. - if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { - return err - } + // Signal the task waiting on a response if any. + defer close(fut.ch) // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } - fd.numActiveRequests -= 1 + fd.numActiveRequests-- + + if fut.async { + return fd.asyncCallBack(ctx, fut.getResponse()) + } - // Signal the task waiting on a response. - close(fut.ch) return nil } -// sendError sends an error response to the waiting task (if any). -func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { +// sendError sends an error response to the waiting task (if any) by calling sendResponse(). +// +// Preconditions: fd.mu must be held. +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { // Return the error to the calling task. outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) respHdr := linux.FUSEHeaderOut{ Len: outHdrLen, Error: errno, - Unique: req.hdr.Unique, + Unique: unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { - // Server sent us a response for a request we never sent? + // A response for a request we never sent, + // or for which we already received a reply (e.g. aborted). return syserror.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr - if err := fd.sendResponse(ctx, fut); err != nil { - return err - } - - return nil + return fd.sendResponse(ctx, fut) } -// noReceiverAction has the calling kernel.Task do some action if its known that no -// receiver is going to be waiting on the future channel. This is to be used by: -// FUSE_INIT. -func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { - if r.opcode == linux.FUSE_INIT { +// asyncCallBack executes pre-defined callback function for async requests. +// Currently used by: FUSE_INIT. +func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { + switch r.opcode { + case linux.FUSE_INIT: creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) + // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 1ffe7ccd2..5986133e9 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -16,7 +16,6 @@ package fuse import ( "fmt" - "io" "math/rand" "testing" @@ -28,17 +27,12 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) // echoTestOpcode is the Opcode used during testing. The server used in tests // will simply echo the payload back with the appropriate headers. const echoTestOpcode linux.FUSEOpcode = 1000 -type testPayload struct { - data uint32 -} - // TestFUSECommunication tests that the communication layer between the Sentry and the // FUSE server daemon works as expected. func TestFUSECommunication(t *testing.T) { @@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F } } } - -func setup(t *testing.T) *testutil.System { - k, err := testutil.Boot() - if err != nil { - t.Fatalf("Error creating kernel: %v", err) - } - - ctx := k.SupervisorContext() - creds := auth.CredentialsFromContext(ctx) - - k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserList: true, - AllowUserMount: true, - }) - - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) - if err != nil { - t.Fatalf("NewMountNamespace(): %v", err) - } - - return testutil.NewSystem(ctx, t, k.VFS(), mntns) -} - -// newTestConnection creates a fuse connection that the sentry can communicate with -// and the FD for the server to communicate with. -func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { - vfsObj := &vfs.VirtualFilesystem{} - fuseDev := &DeviceFD{} - - if err := vfsObj.Init(system.Ctx); err != nil { - return nil, nil, err - } - - vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef(system.Ctx) - if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { - return nil, nil, err - } - - fsopts := filesystemOptions{ - maxActiveRequests: maxActiveRequests, - } - fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) - if err != nil { - return nil, nil, err - } - - return fs.conn, &fuseDev.vfsfd, nil -} - -// SizeBytes implements marshal.Marshallable.SizeBytes. -func (t *testPayload) SizeBytes() int { - return 4 -} - -// MarshalBytes implements marshal.Marshallable.MarshalBytes. -func (t *testPayload) MarshalBytes(dst []byte) { - usermem.ByteOrder.PutUint32(dst[:4], t.data) -} - -// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. -func (t *testPayload) UnmarshalBytes(src []byte) { - *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} -} - -// Packed implements marshal.Marshallable.Packed. -func (t *testPayload) Packed() bool { - return true -} - -// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. -func (t *testPayload) MarshalUnsafe(dst []byte) { - t.MarshalBytes(dst) -} - -// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. -func (t *testPayload) UnmarshalUnsafe(src []byte) { - t.UnmarshalBytes(src) -} - -// CopyOutN implements marshal.Marshallable.CopyOutN. -func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - panic("not implemented") -} - -// CopyOut implements marshal.Marshallable.CopyOut. -func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { - panic("not implemented") -} - -// CopyIn implements marshal.Marshallable.CopyIn. -func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { - panic("not implemented") -} - -// WriteTo implements io.WriterTo.WriteTo. -func (t *testPayload) WriteTo(w io.Writer) (int64, error) { - panic("not implemented") -} diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go new file mode 100644 index 000000000..8f220a04b --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -0,0 +1,105 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type directoryFD struct { + fileDescription +} + +// Allocate implements directoryFD.Allocate. +func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { + fusefs := dir.inode().fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSEReadIn{ + Fh: dir.Fh, + Offset: uint64(atomic.LoadInt64(&dir.off)), + Size: linux.FUSE_PAGE_SIZE, + Flags: dir.statusFlags(), + } + + // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) + if err != nil { + return err + } + + res, err := fusefs.conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEDirents + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + for _, fuseDirent := range out.Dirents { + nextOff := int64(fuseDirent.Meta.Off) + dirent := vfs.Dirent{ + Name: fuseDirent.Name, + Type: uint8(fuseDirent.Meta.Type), + Ino: fuseDirent.Meta.Ino, + NextOff: nextOff, + } + + if err := callback.Handle(dirent); err != nil { + return err + } + atomic.StoreInt64(&dir.off, nextOff) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go new file mode 100644 index 000000000..83f2816b7 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/file.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fileDescription implements vfs.FileDescriptionImpl for fuse. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // the file handle used in userspace. + Fh uint64 + + // Nonseekable is indicate cannot perform seek on a file. + Nonseekable bool + + // DirectIO suggest fuse to use direct io operation. + DirectIO bool + + // OpenFlag is the flag returned by open. + OpenFlag uint32 + + // off is the file offset. + off int64 +} + +func (fd *fileDescription) dentry() *kernfs.Dentry { + return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry) +} + +func (fd *fileDescription) inode() *inode { + return fd.dentry().Inode().(*inode) +} + +func (fd *fileDescription) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *fileDescription) statusFlags() uint32 { + return fd.vfsfd.StatusFlags() +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + // no need to release if FUSE server doesn't implement Open. + conn := fd.inode().fs.conn + if conn.noOpen { + return + } + + in := linux.FUSEReleaseIn{ + Fh: fd.Fh, + Flags: fd.statusFlags(), + } + // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner. + var opcode linux.FUSEOpcode + if fd.inode().Mode().IsDir() { + opcode = linux.FUSE_RELEASEDIR + } else { + opcode = linux.FUSE_RELEASE + } + kernelTask := kernel.TaskFromContext(ctx) + // ignoring errors and FUSE server reply is analogous to Linux's behavior. + req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) + if err != nil { + // No way to invoke Call() with an errored request. + return + } + // The reply will be ignored since no callback is defined in asyncCallBack(). + conn.CallAsync(kernelTask, req) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh) +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 810819ae4..65786e42a 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -16,24 +16,36 @@ package fuse import ( + "math" "strconv" + "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" ) // Name is the default filesystem name. const Name = "fuse" +// maxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const maxActiveRequestsDefault = 10000 + // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} +// +stateify savable type filesystemOptions struct { // userID specifies the numeric uid of the mount owner. // This option should not be specified by the filesystem owner. @@ -56,9 +68,16 @@ type filesystemOptions struct { // exist at any time. Any further requests will block when trying to // Call the server. maxActiveRequests uint64 + + // maxRead is the max number of bytes to read, + // specified as "max_read" in fs parameters. + // If not specified by user, use math.MaxUint32 as default value. + maxRead uint32 } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 @@ -69,6 +88,9 @@ type filesystem struct { // opts is the options the fusefs is initialized with. opts *filesystemOptions + + // umounted is true if filesystem.Release() has been called. + umounted bool } // Name implements vfs.FilesystemType.Name. @@ -142,14 +164,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Set the maxInFlightRequests option. fsopts.maxActiveRequests = maxActiveRequestsDefault + if maxReadStr, ok := mopts["max_read"]; ok { + delete(mopts, "max_read") + maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) + return nil, nil, syserror.EINVAL + } + if maxRead < fuseMinMaxRead { + maxRead = fuseMinMaxRead + } + fsopts.maxRead = uint32(maxRead) + } else { + fsopts.maxRead = math.MaxUint32 + } + // Check for unparsed options. if len(mopts) != 0 { - log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) + log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } // Create a new FUSE filesystem. - fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err @@ -165,26 +202,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // root is the fusefs root directory. - root := fs.newInode(creds, fsopts.rootMode) + root := fs.newRootInode(creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } -// NewFUSEFilesystem creates a new FUSE filesystem. -func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { - fs := &filesystem{ - devMinor: devMinor, - opts: opts, - } - - conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) +// newFUSEFilesystem creates a new FUSE filesystem. +func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + conn, err := newFUSEConnection(ctx, device, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL } - fs.conn = conn fuseFD := device.Impl().(*DeviceFD) + + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + conn: conn, + } + + fs.VFSFilesystem().IncRef() fuseFD.fs = fs return fs, nil @@ -192,11 +231,22 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { + fs.conn.fd.mu.Lock() + + fs.umounted = true + fs.conn.Abort(ctx) + // Notify all the waiters on this fd. + fs.conn.fd.waitQueue.Notify(waiter.EventIn) + + fs.conn.fd.mu.Unlock() + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { inodeRefs kernfs.InodeAttrs @@ -205,14 +255,50 @@ type inode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren + dentry kernfs.Dentry + + // the owning filesystem. fs is immutable. + fs *filesystem + + // metaDataMu protects the metadata of this inode. + metadataMu sync.Mutex + + nodeID uint64 + locks vfs.FileLocks - dentry kernfs.Dentry + // size of the file. + size uint64 + + // attributeVersion is the version of inode's attributes. + attributeVersion uint64 + + // attributeTime is the remaining vaild time of attributes. + attributeTime uint64 + + // version of the inode. + version uint64 + + // link is result of following a symbolic link. + link string +} + +func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &inode{fs: fs} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() + i.dentry.Init(i) + i.nodeID = 1 + + return &i.dentry } -func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &inode{} - i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) +func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry { + i := &inode{fs: fs, nodeID: nodeID} + creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} + i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) + atomic.StoreUint64(&i.size, attr.Size) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.EnableLeakCheck() i.dentry.Init(i) @@ -221,14 +307,292 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke } // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ - SeekEnd: kernfs.SeekEndStaticEntries, - }) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + isDir := i.InodeAttrs.Mode().IsDir() + // return error if specified to open directory but inode is not a directory. + if !isDir && opts.Mode.IsDir() { + return nil, syserror.ENOTDIR + } + if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { + return nil, syserror.EOVERFLOW + } + + var fd *fileDescription + var fdImpl vfs.FileDescriptionImpl + if isDir { + directoryFD := &directoryFD{} + fd = &(directoryFD.fileDescription) + fdImpl = directoryFD + } else { + regularFD := ®ularFileFD{} + fd = &(regularFD.fileDescription) + fdImpl = regularFD + } + // FOPEN_KEEP_CACHE is the defualt flag for noOpen. + fd.OpenFlag = linux.FOPEN_KEEP_CACHE + + // Only send open request when FUSE server support open or is opening a directory. + if !i.fs.conn.noOpen || isDir { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") + return nil, syserror.EINVAL + } + + // Build the request. + var opcode linux.FUSEOpcode + if isDir { + opcode = linux.FUSE_OPENDIR + } else { + opcode = linux.FUSE_OPEN + } + + in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} + if !i.fs.conn.atomicOTrunc { + in.Flags &= ^uint32(linux.O_TRUNC) + } + + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) + if err != nil { + return nil, err + } + + // Send the request and receive the reply. + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return nil, err + } + if err := res.Error(); err == syserror.ENOSYS && !isDir { + i.fs.conn.noOpen = true + } else if err != nil { + return nil, err + } else { + out := linux.FUSEOpenOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + + // Process the reply. + fd.OpenFlag = out.OpenFlag + if isDir { + fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) + } + + fd.Fh = out.Fh + } + } + + // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode + fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 + fdOptions := &vfs.FileDescriptionOptions{} + if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { + fdOptions.DenyPRead = true + fdOptions.DenyPWrite = true + fd.Nonseekable = true + } + + // If we don't send SETATTR before open (which is indicated by atomicOTrunc) + // and O_TRUNC is set, update the inode's version number and clean existing data + // by setting the file size to 0. + if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 { + i.fs.conn.mu.Lock() + i.fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, 0) + i.fs.conn.mu.Unlock() + i.attributeTime = 0 + } + + if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *inode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { + in := linux.FUSELookupIn{Name: name} + return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) +} + +// IterDirents implements kernfs.Inode.IterDirents. +func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + +// Valid implements kernfs.Inode.Valid. +func (*inode) Valid(ctx context.Context) bool { + return true +} + +// NewFile implements kernfs.Inode.NewFile. +func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + in := linux.FUSECreateIn{ + CreateMeta: linux.FUSECreateMeta{ + Flags: opts.Flags, + Mode: uint32(opts.Mode) | linux.S_IFREG, + Umask: uint32(kernelTask.FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) +} + +// NewNode implements kernfs.Inode.NewNode. +func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*kernfs.Dentry, error) { + in := linux.FUSEMknodIn{ + MknodMeta: linux.FUSEMknodMeta{ + Mode: uint32(opts.Mode), + Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) +} + +// NewSymlink implements kernfs.Inode.NewSymlink. +func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.Dentry, error) { + in := linux.FUSESymLinkIn{ + Name: name, + Target: target, + } + return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) +} + +// Unlink implements kernfs.Inode.Unlink. +func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) error { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return syserror.EINVAL + } + in := linux.FUSEUnlinkIn{Name: name} + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) + if err != nil { + return err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return err + } + // only return error, discard res. + if err := res.Error(); err != nil { + return err + } + return i.dentry.RemoveChildLocked(name, child) +} + +// NewDir implements kernfs.Inode.NewDir. +func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) { + in := linux.FUSEMkdirIn{ + MkdirMeta: linux.FUSEMkdirMeta{ + Mode: uint32(opts.Mode), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) +} + +// RmDir implements kernfs.Inode.RmDir. +func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) error { + fusefs := i.fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSERmDirIn{Name: name} + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) + if err != nil { + return err + } + + res, err := i.fs.conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + + return i.dentry.RemoveChildLocked(name, child) +} + +// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response. +// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. +func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*kernfs.Dentry, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) + if err != nil { + return nil, err + } + res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return nil, err } - return fd.VFSFileDescription(), nil + if err := res.Error(); err != nil { + return nil, err + } + out := linux.FUSEEntryOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { + return nil, syserror.EIO + } + child := i.fs.newInode(out.NodeID, out.Attr) + return child, nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + path, err := i.Readlink(ctx, mnt) + return vfs.VirtualDentry{}, path, err +} + +// Readlink implements kernfs.Inode.Readlink. +func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if i.Mode().FileType()&linux.S_IFLNK == 0 { + return "", syserror.EINVAL + } + if len(i.link) == 0 { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") + return "", syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) + if err != nil { + return "", err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return "", err + } + i.link = string(res.data[res.hdr.SizeBytes():]) + if !mnt.Options().ReadOnly { + i.attributeTime = 0 + } + } + return i.link, nil +} + +// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache. +// TODO(gvisor.dev/issue/3679): Add support for other fields. +func (i *inode) getFUSEAttr() linux.FUSEAttr { + return linux.FUSEAttr{ + Ino: i.Ino(), + Size: atomic.LoadUint64(&i.size), + Mode: uint32(i.Mode()), + } } // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The @@ -284,50 +648,92 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { return stat } -// Stat implements kernfs.Inode.Stat. -func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - fusefs := fs.Impl().(*filesystem) - conn := fusefs.conn - task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) +// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request +// or read from local cache. It updates the corresponding attributes if +// necessary. +func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { + attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion) + + // TODO(gvisor.dev/issue/3679): send the request only if + // - invalid local cache for fields specified in the opts.Mask + // - forced update + // - i.attributeTime expired + // If local cache is still valid, return local cache. + // Currently we always send a request, + // and we always set the metadata with the new result, + // unless attributeVersion has changed. + + task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") - return linux.Statx{}, syserror.EINVAL + return linux.FUSEAttr{}, syserror.EINVAL } - var in linux.FUSEGetAttrIn - // We don't set any attribute in the request, because in VFS2 fstat(2) will - // finally be translated into vfs.FilesystemImpl.StatAt() (see - // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow - // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2. - req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in) + creds := auth.CredentialsFromContext(ctx) + + in := linux.FUSEGetAttrIn{ + GetAttrFlags: flags, + Fh: fh, + } + req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } - res, err := conn.Call(task, req) + res, err := i.fs.conn.Call(task, req) if err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } if err := res.Error(); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err } var out linux.FUSEGetAttrOut if err := res.UnmarshalPayload(&out); err != nil { - return linux.Statx{}, err + return linux.FUSEAttr{}, err + } + + // Local version is newer, return the local one. + // Skip the update. + if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { + return i.getFUSEAttr(), nil } - // Set all metadata into kernfs.InodeAttrs. - if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{ - Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor), + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), }); err != nil { + return linux.FUSEAttr{}, err + } + + // Set the size if no error (after SetStat() check). + atomic.StoreUint64(&i.size, out.Attr.Size) + + return out.Attr, nil +} + +// reviseAttr attempts to update the attributes for internal purposes +// by calling getAttr with a pre-specified mask. +// Used by read, write, lseek. +func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { + // Never need atime for internal purposes. + _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, + }, flags, fh) + return err +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + attr, err := i.getAttr(ctx, fs, opts, 0, 0) + if err != nil { return linux.Statx{}, err } - return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil + return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(context.Context) { i.inodeRefs.DecRef(i.Destroy) } @@ -337,3 +743,84 @@ func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, e // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil } + +// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask +// aligned with the attribute mask defined in include/linux/fs.h. +func fattrMaskFromStats(mask uint32) uint32 { + var fuseAttrMask uint32 + maskMap := map[uint32]uint32{ + linux.STATX_MODE: linux.FATTR_MODE, + linux.STATX_UID: linux.FATTR_UID, + linux.STATX_GID: linux.FATTR_GID, + linux.STATX_SIZE: linux.FATTR_SIZE, + linux.STATX_ATIME: linux.FATTR_ATIME, + linux.STATX_MTIME: linux.FATTR_MTIME, + linux.STATX_CTIME: linux.FATTR_CTIME, + } + for statxMask, fattrMask := range maskMap { + if mask&statxMask != 0 { + fuseAttrMask |= fattrMask + } + } + return fuseAttrMask +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return i.setAttr(ctx, fs, creds, opts, false, 0) +} + +func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { + conn := i.fs.conn + task := kernel.TaskFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return syserror.EINVAL + } + + // We should retain the original file type when assigning new mode. + fileType := uint16(i.Mode()) & linux.S_IFMT + fattrMask := fattrMaskFromStats(opts.Stat.Mask) + if useFh { + fattrMask |= linux.FATTR_FH + } + in := linux.FUSESetAttrIn{ + Valid: fattrMask, + Fh: fh, + Size: opts.Stat.Size, + Atime: uint64(opts.Stat.Atime.Sec), + Mtime: uint64(opts.Stat.Mtime.Sec), + Ctime: uint64(opts.Stat.Ctime.Sec), + AtimeNsec: opts.Stat.Atime.Nsec, + MtimeNsec: opts.Stat.Mtime.Nsec, + CtimeNsec: opts.Stat.Ctime.Nsec, + Mode: uint32(fileType | opts.Stat.Mode), + UID: opts.Stat.UID, + GID: opts.Stat.GID, + } + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) + if err != nil { + return err + } + + res, err := conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + out := linux.FUSEGetAttrOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.SetInodeStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + }); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go new file mode 100644 index 000000000..625d1547f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -0,0 +1,242 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ReadInPages sends FUSE_READ requests for the size after round it up to +// a multiple of page size, blocks on it for reply, processes the reply +// and returns the payload (or joined payloads) as a byte slice. +// This is used for the general purpose reading. +// We do not support direct IO (which read the exact number of bytes) +// at this moment. +func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) { + attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion) + + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return nil, 0, syserror.EINVAL + } + + // Round up to a multiple of page size. + readSize, _ := usermem.PageRoundUp(uint64(size)) + + // One request cannnot exceed either maxRead or maxPages. + maxPages := fs.conn.maxRead >> usermem.PageShift + if maxPages > uint32(fs.conn.maxPages) { + maxPages = uint32(fs.conn.maxPages) + } + + var outs [][]byte + var sizeRead uint32 + + // readSize is a multiple of usermem.PageSize. + // Always request bytes as a multiple of pages. + pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift) + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEReadIn{ + Fh: fd.Fh, + LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock + ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + Flags: fd.statusFlags(), + } + + // This loop is intended for fragmented read where the bytes to read is + // larger than either the maxPages or maxRead. + // For the majority of reads with normal size, this loop should only + // execute once. + for pagesRead < pagesToRead { + pagesCanRead := pagesToRead - pagesRead + if pagesCanRead > maxPages { + pagesCanRead = maxPages + } + + in.Offset = off + (uint64(pagesRead) << usermem.PageShift) + in.Size = pagesCanRead << usermem.PageShift + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) + if err != nil { + return nil, 0, err + } + + // TODO(gvisor.dev/issue/3247): support async read. + + res, err := fs.conn.Call(t, req) + if err != nil { + return nil, 0, err + } + if err := res.Error(); err != nil { + return nil, 0, err + } + + // Not enough bytes in response, + // either we reached EOF, + // or the FUSE server sends back a response + // that cannot even fit the hdr. + if len(res.data) <= res.hdr.SizeBytes() { + // We treat both case as EOF here for now + // since there is no reliable way to detect + // the over-short hdr case. + break + } + + // Directly using the slice to avoid extra copy. + out := res.data[res.hdr.SizeBytes():] + + outs = append(outs, out) + sizeRead += uint32(len(out)) + + pagesRead += pagesCanRead + } + + defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion) + + // No bytes returned: offset >= EOF. + if len(outs) == 0 { + return nil, 0, io.EOF + } + + return outs, sizeRead, nil +} + +// ReadCallback updates several information after receiving a read response. +// Due to readahead, sizeRead can be larger than size. +func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) { + // TODO(gvisor.dev/issue/3247): support async read. + // If this is called by an async read, correctly process it. + // May need to update the signature. + + i := fd.inode() + // TODO(gvisor.dev/issue/1193): Invalidate or update atime. + + // Reached EOF. + if sizeRead < size { + // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole. + // Might need to update the buf to be returned from the Read(). + + // Update existing size. + newSize := off + uint64(sizeRead) + fs.conn.mu.Lock() + if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) { + fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, newSize) + } + fs.conn.mu.Unlock() + } +} + +// Write sends FUSE_WRITE requests and return the bytes +// written according to the response. +// +// Preconditions: len(data) == size. +func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return 0, syserror.EINVAL + } + + // One request cannnot exceed either maxWrite or maxPages. + maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift + if maxWrite > fs.conn.maxWrite { + maxWrite = fs.conn.maxWrite + } + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEWriteIn{ + Fh: fd.Fh, + // TODO(gvisor.dev/issue/3245): file lock + LockOwner: 0, + // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet) + WriteFlags: 0, + Flags: fd.statusFlags(), + } + + var written uint32 + + // This loop is intended for fragmented write where the bytes to write is + // larger than either the maxWrite or maxPages or when bigWrites is false. + // Unless a small value for max_write is explicitly used, this loop + // is expected to execute only once for the majority of the writes. + for written < size { + toWrite := size - written + + // Limit the write size to one page. + // Note that the bigWrites flag is obsolete, + // latest libfuse always sets it on. + if !fs.conn.bigWrites && toWrite > usermem.PageSize { + toWrite = usermem.PageSize + } + + // Limit the write size to maxWrite. + if toWrite > maxWrite { + toWrite = maxWrite + } + + in.Offset = off + uint64(written) + in.Size = toWrite + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_WRITE, &in) + if err != nil { + return 0, err + } + + req.payload = data[written : written+toWrite] + + // TODO(gvisor.dev/issue/3247): support async write. + + res, err := fs.conn.Call(t, req) + if err != nil { + return 0, err + } + if err := res.Error(); err != nil { + return 0, err + } + + out := linux.FUSEWriteOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return 0, err + } + + // Write more than requested? EIO. + if out.Size > toWrite { + return 0, syserror.EIO + } + + written += out.Size + + // Break if short write. Not necessarily an error. + if out.Size != toWrite { + break + } + } + + return written, nil +} diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go new file mode 100644 index 000000000..5bdd096c3 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -0,0 +1,230 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type regularFileFD struct { + fileDescription + + // off is the file offset. + off int64 + // offMu protects off. + offMu sync.Mutex +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + size := dst.NumBytes() + if size == 0 { + // Early return if count is 0. + return 0, nil + } else if size > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, syserror.EINVAL + } + + // TODO(gvisor.dev/issue/3678): Add direct IO support. + + inode := fd.inode() + + // Reading beyond EOF, update file size if outdated. + if uint64(offset+size) > atomic.LoadUint64(&inode.size) { + if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { + return 0, err + } + // If the offset after update is still too large, return error. + if uint64(offset) >= atomic.LoadUint64(&inode.size) { + return 0, io.EOF + } + } + + // Truncate the read with updated file size. + fileSize := atomic.LoadUint64(&inode.size) + if uint64(offset+size) > fileSize { + size = int64(fileSize) - offset + } + + buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size)) + if err != nil { + return 0, err + } + + // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching), + // store the bytes that were read ahead. + + // Update the number of bytes to copy for short read. + if n < uint32(size) { + size = int64(n) + } + + // Copy the bytes read to the dst. + // This loop is intended for fragmented reads. + // For the majority of reads, this loop only execute once. + var copied int64 + for _, buffer := range buffers { + toCopy := int64(len(buffer)) + if copied+toCopy > size { + toCopy = size - copied + } + cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy]) + if err != nil { + return 0, err + } + if int64(cp) != toCopy { + return 0, syserror.EIO + } + copied += toCopy + } + + return copied, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off + fd.offMu.Unlock() + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { + if offset < 0 { + return 0, offset, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, offset, syserror.EOPNOTSUPP + } + + inode := fd.inode() + inode.metadataMu.Lock() + defer inode.metadataMu.Unlock() + + // If the file is opened with O_APPEND, update offset to file size. + // Note: since our Open() implements the interface of kernfs, + // and kernfs currently does not support O_APPEND, this will never + // be true before we switch out from kernfs. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking inode.metadataMu is sufficient for reading size + offset = int64(inode.size) + } + + srclen := src.NumBytes() + + if srclen > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, offset, syserror.EINVAL + } + if end := offset + srclen; end < offset { + // Overflow. + return 0, offset, syserror.EINVAL + } + + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, offset, err + } + + if srclen == 0 { + // Return before causing any side effects. + return 0, offset, nil + } + + src = src.TakeFirst64(srclen) + + // TODO(gvisor.dev/issue/3237): Add cache support: + // buffer cache. Ideally we write from src to our buffer cache first. + // The slice passed to fs.Write() should be a slice from buffer cache. + data := make([]byte, srclen) + // Reason for making a copy here: connection.Call() blocks on kerneltask, + // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will + // attemp to acquire the mm.activeMu lock as well -> deadlock. + // We must finish reading from the userspace memory before + // t.Block() deactivates it. + cp, err := src.CopyIn(ctx, data) + if err != nil { + return 0, offset, err + } + if int64(cp) != srclen { + return 0, offset, syserror.EIO + } + + n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) + if err != nil { + return 0, offset, err + } + + if n == 0 { + // We have checked srclen != 0 previously. + // If err == nil, then it's a short write and we return EIO. + return 0, offset, syserror.EIO + } + + written = int64(n) + finalOff = offset + written + + if finalOff > int64(inode.size) { + atomic.StoreUint64(&inode.size, uint64(finalOff)) + atomic.AddUint64(&inode.fs.conn.attributeVersion, 1) + } + + return +} diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go new file mode 100644 index 000000000..7fa00569b --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/request_response.go @@ -0,0 +1,229 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE +// server may implement an older version of FUSE protocol, which contains a +// linux.FUSEInitOut with less attributes. +// +// Dynamically-sized objects cannot be marshalled. +type fuseInitRes struct { + marshal.StubMarshallable + + // initOut contains the response from the FUSE server. + initOut linux.FUSEInitOut + + // initLen is the total length of bytes of the response. + initLen uint32 +} + +// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes. +func (r *fuseInitRes) UnmarshalBytes(src []byte) { + out := &r.initOut + + // Introduced before FUSE kernel version 7.13. + out.Major = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + + // Introduced in FUSE kernel version 7.23. + if len(src) >= 4 { + out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + } + // Introduced in FUSE kernel version 7.28. + if len(src) >= 2 { + out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + } +} + +// SizeBytes is the size of the payload of the FUSE_INIT response. +func (r *fuseInitRes) SizeBytes() int { + return int(r.initLen) +} + +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte + + // payload for this request: extra bytes to write after + // the data slice. Used by FUSE_WRITE. + payload []byte + + // If this request is async. + async bool + // If we don't care its response. + // Manually set by the caller. + noReply bool +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + hdr.MarshalBytes(buf[:hdrLen]) + payload.MarshalBytes(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte + + // If this request is async. + async bool +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(req *Request) *futureResponse { + return &futureResponse{ + opcode: req.hdr.Opcode, + ch: make(chan struct{}), + async: req.async, + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // Return directly for async requests. + if f.async { + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// DataLen returns the size of the response without the header. +func (r *Response) DataLen() uint32 { + return r.hdr.Len - uint32(r.hdr.SizeBytes()) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + // The response data is empty unless there is some payload. And so, doesn't + // need to be unmarshalled. + if r.data == nil { + return nil + } + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + m.UnmarshalBytes(r.data[hdrLen:]) + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go new file mode 100644 index 000000000..e1d9e3365 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/utils_test.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +func setup(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Error creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + AllowUserMount: true, + }) + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + + return testutil.NewSystem(ctx, t, k.VFS(), mntns) +} + +// newTestConnection creates a fuse connection that the sentry can communicate with +// and the FD for the server to communicate with. +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { + vfsObj := &vfs.VirtualFilesystem{} + fuseDev := &DeviceFD{} + + if err := vfsObj.Init(system.Ctx); err != nil { + return nil, nil, err + } + + vd := vfsObj.NewAnonVirtualDentry("genCountFD") + defer vd.DecRef(system.Ctx) + if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, nil, err + } + + fsopts := filesystemOptions{ + maxActiveRequests: maxActiveRequests, + } + fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) + if err != nil { + return nil, nil, err + } + + return fs.conn, &fuseDev.vfsfd, nil +} + +type testPayload struct { + marshal.StubMarshallable + data uint32 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (t *testPayload) SizeBytes() int { + return 4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (t *testPayload) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], t.data) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (t *testPayload) UnmarshalBytes(src []byte) { + *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} +} + +// Packed implements marshal.Marshallable.Packed. +func (t *testPayload) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (t *testPayload) MarshalUnsafe(dst []byte) { + t.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (t *testPayload) UnmarshalUnsafe(src []byte) { + t.UnmarshalBytes(src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { + panic("not implemented") +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// WriteTo implements io.WriterTo.WriteTo. +func (t *testPayload) WriteTo(w io.Writer) (int64, error) { + panic("not implemented") +} diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 91d2ae199..18c884b59 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -117,11 +117,12 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { d.syntheticChildren++ } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 5d0f487db..94d96261b 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1026,7 +1026,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // step is required even if !d.cachedMetadataAuthoritative() because // d.mappings has to be updated. // d.metadataMu has already been acquired if trunc == true. - d.updateFileSizeLocked(0) + d.updateSizeLocked(0) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() @@ -1311,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !renamed.isDir() { return syserror.EISDIR } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } } else { if rp.MustBeDir() || renamed.isDir() { return syserror.ENOTDIR @@ -1361,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // with reference counts and queue oldParent for checkCachingLocked if the // parent isn't actually changing. if oldParent != newParent { + oldParent.decRefLocked() ds = appendDentry(ds, oldParent) newParent.IncRef() if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } + renamed.parent = newParent } - renamed.parent = newParent renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) @@ -1412,11 +1416,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -1491,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -1519,8 +1523,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -1528,11 +1532,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si if err != nil { return nil, err } - return d.listxattr(ctx, rp.Credentials(), size) + return d.listXattr(ctx, rp.Credentials(), size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) @@ -1540,11 +1544,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return "", err } - return d.getxattr(ctx, rp.Credentials(), &opts) + return d.getXattr(ctx, rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) @@ -1552,18 +1556,18 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.setXattr(ctx, rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) @@ -1571,11 +1575,11 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + err = d.removeXattr(ctx, rp.Credentials(), name) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 57bff1789..8608471f8 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -62,9 +62,13 @@ import ( const Name = "9p" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -77,7 +81,7 @@ type filesystem struct { iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. - client *p9.Client + client *p9.Client `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock @@ -95,7 +99,7 @@ type filesystem struct { // reference count (such that it is usable as vfs.ResolvingPath.Start() or // is reachable from its children), or if it is a child dentry (such that // it is reachable from its parent). - renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` // cachedDentries contains all dentries with 0 references. (Due to race // conditions, it may also contain dentries with non-zero references.) @@ -107,7 +111,7 @@ type filesystem struct { // syncableDentries contains all dentries in this filesystem for which // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. // These fields are protected by syncMu. - syncMu sync.Mutex + syncMu sync.Mutex `state:"nosave"` syncableDentries map[*dentry]struct{} specialFileFDs map[*specialFileFD]struct{} @@ -120,6 +124,8 @@ type filesystem struct { // dentries, it comes from QID.Path from the 9P server. Synthetic dentries // have have their inodeNumber generated sequentially, with the MSB reserved to // prevent conflicts with regular dentries. +// +// +stateify savable type inodeNumber uint64 // Reserve MSB for synthetic mounts. @@ -132,6 +138,7 @@ func inoFromPath(path uint64) inodeNumber { return inodeNumber(path &^ syntheticInoMask) } +// +stateify savable type filesystemOptions struct { // "Standard" 9P options. fd int @@ -177,6 +184,8 @@ type filesystemOptions struct { // InteropMode controls the client's interaction with other remote filesystem // users. +// +// +stateify savable type InteropMode uint32 const ( @@ -195,11 +204,7 @@ const ( // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set - // arbitrary ctimes.) If a dentry containing a client-defined atime or - // mtime is evicted from cache, client timestamps will be sent to the - // remote filesystem on a best-effort basis to attempt to ensure that - // timestamps will be preserved when another dentry representing the same - // file is instantiated. + // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -239,6 +244,8 @@ const ( // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable type InternalFilesystemOptions struct { // If LeakConnection is true, do not close the connection to the server // when the Filesystem is released. This is necessary for deployments in @@ -538,6 +545,8 @@ func (fs *filesystem) Release(ctx context.Context) { } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -567,7 +576,7 @@ type dentry struct { // If file.isNil(), this dentry represents a synthetic file, i.e. a file // that does not exist on the remote filesystem. As of this writing, the // only files that can be synthetic are sockets, pipes, and directories. - file p9file + file p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. @@ -579,7 +588,7 @@ type dentry struct { cached bool dentryEntry - dirMu sync.Mutex + dirMu sync.Mutex `state:"nosave"` // If this dentry represents a directory, children contains: // @@ -611,7 +620,7 @@ type dentry struct { // To mutate: // - Lock metadataMu and use atomic operations to update because we might // have atomic readers that don't hold the lock. - metadataMu sync.Mutex + metadataMu sync.Mutex `state:"nosave"` ino inodeNumber // immutable mode uint32 // type is immutable, perms are mutable uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic @@ -642,7 +651,7 @@ type dentry struct { // other metadata fields. nlink uint32 - mapsMu sync.Mutex + mapsMu sync.Mutex `state:"nosave"` // If this dentry represents a regular file, mappings tracks mappings of // the file into memmap.MappingSpaces. mappings is protected by mapsMu. @@ -666,12 +675,12 @@ type dentry struct { // either p9.File transitions from closed (isNil() == true) to open // (isNil() == false), it may be mutated with handleMu locked, but cannot // be closed until the dentry is destroyed. - handleMu sync.RWMutex - readFile p9file - writeFile p9file + handleMu sync.RWMutex `state:"nosave"` + readFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. + writeFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. hostFD int32 - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // If this dentry represents a regular file that is client-cached, cache // maps offsets into the cached file to offsets into @@ -837,7 +846,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) } if mask.Size { - d.updateFileSizeLocked(attr.Size) + d.updateSizeLocked(attr.Size) } } @@ -991,7 +1000,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. - d.updateFileSizeLocked(stat.Size) + d.updateSizeLocked(stat.Size) } } if d.fs.opts.interop == InteropModeShared { @@ -1028,8 +1037,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } +// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// be held when allocate is called. +func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Allocating a smaller size is a noop. + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { + return nil + } + + err := allocate() + if err != nil { + return err + } + d.updateSizeLocked(size) + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // Preconditions: d.metadataMu must be locked. -func (d *dentry) updateFileSizeLocked(newSize uint64) { +func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() oldSize := d.size atomic.StoreUint64(&d.size, newSize) @@ -1067,6 +1099,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We only support xattrs prefixed with "user." (see b/148380782). Currently, + // there is no need to expose any other xattrs through a gofer. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + kuid := auth.KUID(atomic.LoadUint32(&d.uid)) + kgid := auth.KGID(atomic.LoadUint32(&d.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err + } + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) +} + func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid))) } @@ -1300,30 +1347,19 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.handleMu.Unlock() if !d.file.isNil() { - if !d.isDeleted() { - // Write dirty timestamps back to the remote filesystem. - atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0 - mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0 - if atimeDirty || mtimeDirty { - atime := atomic.LoadInt64(&d.atime) - mtime := atomic.LoadInt64(&d.mtime) - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - ATime: atimeDirty, - ATimeNotSystemTime: atimeDirty, - MTime: mtimeDirty, - MTimeNotSystemTime: mtimeDirty, - }, p9.SetAttr{ - ATimeSeconds: uint64(atime / 1e9), - ATimeNanoSeconds: uint64(atime % 1e9), - MTimeSeconds: uint64(mtime / 1e9), - MTimeNanoSeconds: uint64(mtime % 1e9), - }); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err) - } - } + // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + // i.e. client and server timestamps may differ (because e.g. a client + // write was serviced by the page cache, and only written back to the + // remote file later). Ideally, we'd write client timestamps back to + // the remote filesystem so that timestamps for a new dentry + // instantiated for the same file would remain coherent. Unfortunately, + // this turns out to be too expensive in many cases, so for now we + // don't do this. + if err := d.file.close(ctx); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) } - d.file.close(ctx) d.file = p9file{} + // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() delete(d.fs.syncableDentries, d) @@ -1351,9 +1387,7 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -// We only support xattrs prefixed with "user." (see b/148380782). Currently, -// there is no need to expose any other xattrs through a gofer. -func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { +func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { if d.file.isNil() || !d.userXattrSupported() { return nil, nil } @@ -1363,6 +1397,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui } xattrs := make([]string, 0, len(xattrMap)) for x := range xattrMap { + // We only support xattrs in the user.* namespace. if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { xattrs = append(xattrs, x) } @@ -1370,51 +1405,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { +func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { return "", syserror.ENODATA } - if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return "", syserror.ENODATA - } return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { +func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return syserror.EPERM - } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { +func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !d.userXattrSupported() { - return syserror.EPERM - } return d.file.removeXattr(ctx, name) } @@ -1623,12 +1640,14 @@ func (d *dentry) decLinks() { // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD - lockLogging sync.Once + lockLogging sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } func (fd *fileDescription) filesystem() *filesystem { @@ -1666,30 +1685,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 104157512..a9ebe1206 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -25,6 +25,8 @@ import ( // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. +// +// These are explicitly not savable. type handle struct { file p9file fd int32 // -1 if unavailable diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 87f0b877f..21b4a96fe 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error { return err } +func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttrClose(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) fdobj, qid, iounit, err := f.file.Open(flags) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index a2e9342d5..eeaf6e444 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 } @@ -79,28 +80,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - - // Allocating a smaller size is a noop. - size := offset + length - if d.cachedMetadataAuthoritative() && size <= d.size { - return nil - } - - d.handleMu.RLock() - err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) - d.handleMu.RUnlock() - if err != nil { - return err - } - d.dataMu.Lock() - atomic.StoreUint64(&d.size, size) - d.dataMu.Unlock() - if d.cachedMetadataAuthoritative() { - d.touchCMtimeLocked() - } - return nil + return d.doAllocate(ctx, offset, length, func() error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) } // PRead implements vfs.FileDescriptionImpl.PRead. @@ -915,6 +899,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.hostFD >= 0), and that FD is used for application // memory mappings (i.e. !filesystem.opts.forcePageCache). +// +// +stateify savable type dentryPlatformFile struct { *dentry @@ -927,7 +913,7 @@ type dentryPlatformFile struct { hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. - hostFileMapperInitOnce sync.Once + hostFileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } // IncRef implements memmap.File.IncRef. diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index 85d2bee72..326b940a7 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -36,12 +36,14 @@ func (d *dentry) isSocket() bool { // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() // is called and either BoundEndpoint.BidirectionalConnect or // BoundEndpoint.UnidirectionalConnect is called. +// +// +stateify savable type endpoint struct { // dentry is the filesystem dentry which produced this endpoint. dentry *dentry // file is the p9 file that contains a single unopened fid. - file p9.File + file p9.File `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // path is the sentry path where this endpoint is bound. path string diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 3c39aa9b7..71581736c 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -33,11 +34,13 @@ import ( // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is // in effect) regular files. specialFileFD differs from regularFileFD by using // per-FD handles instead of shared per-dentry handles, and never buffering I/O. +// +// +stateify savable type specialFileFD struct { fileDescription // handle is used for file I/O. handle is immutable. - handle handle + handle handle `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. // isRegularFile is true if this FD represents a regular file which is only // possible when filesystemOptions.regularFilesUseSpecialFileFD is in @@ -55,7 +58,7 @@ type specialFileFD struct { queue waiter.Queue // If seekable is true, off is the file offset. off is protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 } @@ -135,6 +138,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { fd.fileDescription.EventUnregister(e) } +func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + if fd.isRegularFile { + d := fd.dentry() + return d.doAllocate(ctx, offset, length, func() error { + return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) + } + return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { @@ -235,11 +248,12 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d.touchCMtime() } buf := make([]byte, src.NumBytes()) - // Don't do partial writes if we get a partial read from src. - if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, offset, err + copied, copyErr := src.CopyIn(ctx, buf) + if copied == 0 && copyErr != nil { + // Only return the error if we didn't get any data. + return 0, offset, copyErr } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } @@ -256,7 +270,10 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off atomic.StoreUint64(&d.size, uint64(offset)) } } - return int64(n), offset, err + if err != nil { + return int64(n), offset, err + } + return int64(n), offset, copyErr } // Write implements vfs.FileDescriptionImpl.Write. diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index be1c88c82..56bcf9bdb 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -49,6 +49,7 @@ go_library( "//pkg/fspath", "//pkg/iovec", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", @@ -72,7 +73,6 @@ go_library( "//pkg/unet", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/primitive", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7561f821c..ffe4ddb32 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -58,7 +58,7 @@ func newInode(fs *filesystem, hostFD int, fileType linux.FileMode, isTTY bool) ( canMap: fileType == linux.S_IFREG, } i.pf.inode = i - i.refs.EnableLeakCheck() + i.EnableLeakCheck() // Non-seekable files can't be memory mapped, assert this. if !i.seekable && i.canMap { @@ -126,7 +126,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. - return i.open(ctx, d.VFSDentry(), mnt, flags) + return i.open(ctx, d, mnt, flags) } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. @@ -137,14 +137,16 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs } // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("host.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "none" } @@ -166,6 +168,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -185,6 +189,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { kernfs.InodeNoStatFS kernfs.InodeNotDirectory @@ -193,7 +199,7 @@ type inode struct { locks vfs.FileLocks // When the reference count reaches zero, the host fd is closed. - refs inodeRefs + inodeRefs // hostFD contains the host fd that this file was originally created from, // which must be available at time of restore. @@ -233,7 +239,7 @@ type inode struct { canMap bool // mapsMu protects mappings. - mapsMu sync.Mutex + mapsMu sync.Mutex `state:"nosave"` // If canMap is true, mappings tracks mappings of hostFD into // memmap.MappingSpaces. @@ -243,7 +249,7 @@ type inode struct { pf inodePlatformFile } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -252,7 +258,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } -// Mode implements kernfs.Inode. +// Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -263,7 +269,7 @@ func (i *inode) Mode() linux.FileMode { return linux.FileMode(s.Mode) } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL @@ -376,7 +382,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } -// SetStat implements kernfs.Inode. +// SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := &opts.Stat @@ -435,19 +441,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return nil } -// IncRef implements kernfs.Inode. -func (i *inode) IncRef() { - i.refs.IncRef() -} - -// TryIncRef implements kernfs.Inode. -func (i *inode) TryIncRef() bool { - return i.refs.TryIncRef() -} - -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(ctx context.Context) { - i.refs.DecRef(func() { + i.inodeRefs.DecRef(func() { if i.wouldBlock { fdnotifier.RemoveFD(int32(i.hostFD)) } @@ -457,16 +453,16 @@ func (i *inode) DecRef(ctx context.Context) { }) } -// Open implements kernfs.Inode. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { return nil, syserror.ENXIO } - return i.open(ctx, vfsd, rp.Mount(), opts.Flags) + return i.open(ctx, d, rp.Mount(), opts.Flags) } -func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err @@ -490,17 +486,17 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return nil, err } // Currently, we only allow Unix sockets to be imported. - return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks) case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR: if i.isTTY { fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, - termios: linux.DefaultSlaveTermios, + termios: linux.DefaultReplicaTermios, } fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil @@ -509,7 +505,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fd := &fileDescription{inode: i} fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil @@ -521,6 +517,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -535,40 +533,35 @@ type fileDescription struct { inode *inode // offsetMu protects offset. - offsetMu sync.Mutex + offsetMu sync.Mutex `state:"nosave"` // offset specifies the current file offset. It is only meaningful when // inode.seekable is true. offset int64 } -// SetStat implements vfs.FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } -// Stat implements vfs.FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } -// Release implements vfs.FileDescriptionImpl. +// Release implements vfs.FileDescriptionImpl.Release. func (f *fileDescription) Release(context.Context) { // noop } -// Allocate implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { - if !f.inode.seekable { - return syserror.ESPIPE - } - - // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds. - return syserror.EOPNOTSUPP + return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) } -// PRead implements FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -578,7 +571,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) } -// Read implements FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { i := f.inode if !i.seekable { @@ -615,7 +608,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off return int64(n), err } -// PWrite implements FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { return 0, syserror.ESPIPE @@ -624,7 +617,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of return f.writeToHostFD(ctx, src, offset, opts.Flags) } -// Write implements FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { @@ -672,7 +665,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque return int64(n), err } -// Seek implements FileDescriptionImpl. +// Seek implements vfs.FileDescriptionImpl.Seek. // // Note that we do not support seeking on directories, since we do not even // allow directory fds to be imported at all. @@ -737,13 +730,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return f.offset, nil } -// Sync implements FileDescriptionImpl. +// Sync implements vfs.FileDescriptionImpl.Sync. func (f *fileDescription) Sync(context.Context) error { // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } -// ConfigureMMap implements FileDescriptionImpl. +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { if !f.inode.canMap { return syserror.ENODEV diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 65d3af38c..b51a17bed 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -27,11 +27,13 @@ import ( // cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. +// +// +stateify savable type inodePlatformFile struct { *inode // fdRefsMu protects fdRefs. - fdRefsMu sync.Mutex + fdRefsMu sync.Mutex `state:"nosave"` // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. @@ -41,7 +43,7 @@ type inodePlatformFile struct { fileMapper fsutil.HostFileMapper // fileMapperInitOnce is used to lazily initialize fileMapper. - fileMapperInitOnce sync.Once + fileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. } // IncRef implements memmap.File.IncRef. diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go index 35ded24bc..c0bf45f08 100644 --- a/pkg/sentry/fsimpl/host/socket_unsafe.go +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 7a9be4b97..f5c596fec 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -17,6 +17,7 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -25,11 +26,12 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/primitive" ) // TTYFileDescription implements vfs.FileDescriptionImpl for a host file // descriptor that wraps a TTY FD. +// +// +stateify savable type TTYFileDescription struct { fileDescription @@ -76,7 +78,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) { t.fileDescription.Release(ctx) } -// PRead implements vfs.FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -94,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, return t.fileDescription.PRead(ctx, dst, offset, opts) } -// Read implements vfs.FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -112,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o return t.fileDescription.Read(ctx, dst, opts) } -// PWrite implements vfs.FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -127,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, return t.fileDescription.PWrite(ctx, src, offset, opts) } -// Write implements vfs.FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -142,7 +144,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, return t.fileDescription.Write(ctx, src, opts) } -// Ioctl implements vfs.FileDescriptionImpl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 637dca70c..5e91e0536 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -83,6 +83,7 @@ go_library( "slot_list.go", "static_directory_refs.go", "symlink.go", + "synthetic_directory.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1ee089620..b929118b1 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -56,9 +56,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil { + if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -87,12 +87,12 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } - fd.inode = d.Impl().(*Dentry).inode + fd.inode = d.inode fd.SetDataSource(data) return nil } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 6518ff5cd..0a4cd4057 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -29,6 +29,8 @@ import ( ) // SeekEndConfig describes the SEEK_END behaviour for FDs. +// +// +stateify savable type SeekEndConfig int // Constants related to SEEK_END behaviour for FDs. @@ -41,6 +43,8 @@ const ( ) // GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD. +// +// +stateify savable type GenericDirectoryFDOptions struct { SeekEnd SeekEndConfig } @@ -56,6 +60,8 @@ type GenericDirectoryFDOptions struct { // Must be initialize with Init before first use. // // Lock ordering: mu => children.mu. +// +// +stateify savable type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl @@ -68,7 +74,7 @@ type GenericDirectoryFD struct { children *OrderedChildren // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // off is the current directory offset. Protected by "mu". off int64 @@ -76,12 +82,12 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil @@ -195,8 +201,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { - inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(ctx, fd.filesystem(), opts) + stat, err := it.Dentry.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 0e3011689..5cc1c4281 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -37,8 +37,7 @@ import ( // * !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) { - d := vfsd.Impl().(*Dentry) +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -55,20 +54,20 @@ afterSymlink: // calls d_revalidate(), but walk_component() => handle_dots() does not. if name == "." { rp.Advance() - return vfsd, nil + return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() - return vfsd, nil + return d, nil } - if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil { return nil, err } rp.Advance() - return &d.parent.vfsd, nil + return d.parent, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG @@ -79,7 +78,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(ctx, &next.vfsd); err != nil { + if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -102,7 +101,7 @@ afterSymlink: goto afterSymlink } rp.Advance() - return &next.vfsd, nil + return next, nil } // revalidateChildLocked must be called after a call to parent.vfsd.Child(name) @@ -122,25 +121,21 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir if !child.inode.Valid(ctx) { delete(parent.children, name) vfsObj.InvalidateDentry(ctx, &child.vfsd) - fs.deferDecRef(&child.vfsd) // Reference from Lookup. + fs.deferDecRef(child) // Reference from Lookup. child = nil } } if child == nil { - // Dentry isn't cached; it either doesn't exist or failed - // revalidation. Attempt to resolve it via Lookup. - // - // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return - // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes - // that all dentries in the filesystem are (kernfs.)Dentry and performs - // vfs.DentryImpl casts accordingly. - childVFSD, err := parent.inode.Lookup(ctx, name) + // Dentry isn't cached; it either doesn't exist or failed revalidation. + // Attempt to resolve it via Lookup. + c, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } - // Reference on childVFSD dropped by a corresponding Valid. - child = childVFSD.Impl().(*Dentry) - parent.insertChildLocked(name, child) + // Reference on c (provided by Lookup) will be dropped when the dentry + // fails validation. + parent.InsertChildLocked(name, c) + child = c } return child, nil } @@ -153,20 +148,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Preconditions: Filesystem.mu must be locked for at least reading. // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Done() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if rp.MustBeDir() && !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -181,20 +175,19 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP // * !rp.Done(). // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Final() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // checkCreateLocked checks that a file named rp.Component() may be created in @@ -202,10 +195,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // // Preconditions: // * Filesystem.mu must be locked for at least reading. -// * parentInode == parentVFSD.Impl().(*Dentry).Inode. // * isDir(parentInode) == true. -func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -215,11 +207,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. - if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { + if _, ok := parent.children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDead() { + if parent.VFSDentry().IsDead() { return "", syserror.ENOENT } return pc, nil @@ -228,8 +219,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parent := vfsd.Impl().(*Dentry).parent +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { + parent := d.parent if parent == nil { return syserror.EBUSY } @@ -258,11 +249,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } - return inode.CheckPermissions(ctx, creds, ats) + return d.inode.CheckPermissions(ctx, creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -270,20 +261,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { - d := vfsd.Impl().(*Dentry) if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } + vfsd := d.VFSDentry() vfsd.IncRef() // Ownership transferred to caller. return vfsd, nil } @@ -293,12 +284,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + d, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } - vfsd.IncRef() // Ownership transferred to caller. - return vfsd, nil + d.IncRef() // Ownership transferred to caller. + return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. @@ -308,12 +299,15 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } fs.mu.Lock() defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + parent, err := fs.walkParentDirLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -330,11 +324,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) + child, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + parent.InsertChildLocked(pc, child) return nil } @@ -345,12 +339,15 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } fs.mu.Lock() defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + parent, err := fs.walkParentDirLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -358,11 +355,14 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewDir(ctx, pc, opts) + child, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + child = newSyntheticDirectory(rp.Credentials(), opts.Mode) } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + parent.InsertChildLocked(pc, child) return nil } @@ -373,12 +373,15 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } fs.mu.Lock() defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + parent, err := fs.walkParentDirLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -386,11 +389,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - newVFSD, err := parentInode.NewNode(ctx, pc, opts) + newD, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) + parent.InsertChildLocked(pc, newD) return nil } @@ -406,28 +409,27 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + d.inode.IncRef() + defer d.inode.DecRef(ctx) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) - return inode.Open(ctx, rp, vfsd, opts) + return d.inode.Open(ctx, rp, d, opts) } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + d := rp.Start().Impl().(*Dentry) fs.mu.Lock() unlocked := false unlock := func() { @@ -444,22 +446,22 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + d.inode.IncRef() + defer d.inode.DecRef(ctx) unlock() - return inode.Open(ctx, rp, vfsd, opts) + return d.inode.Open(ctx, rp, d, opts) } afterTrailingSymlink: - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + parent, err := fs.walkParentDirLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -474,10 +476,10 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */) + child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -485,16 +487,18 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - childVFSD, err = parentInode.NewFile(ctx, pc, opts) + child, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } - child := childVFSD.Impl().(*Dentry) - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + // FIXME(gvisor.dev/issue/1193): Race between checking existence with + // fs.stepExistingLocked and parent.InsertChild. If possible, we should hold + // dirMu from one to the other. + parent.InsertChild(pc, child) child.inode.IncRef() defer child.inode.DecRef(ctx) unlock() - return child.inode.Open(ctx, rp, childVFSD, opts) + return child.inode.Open(ctx, rp, child, opts) } if err != nil { return nil, err @@ -503,7 +507,6 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - child := childVFSD.Impl().(*Dentry) if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { @@ -530,22 +533,22 @@ afterTrailingSymlink: child.inode.IncRef() defer child.inode.DecRef(ctx) unlock() - return child.inode.Open(ctx, rp, &child.vfsd, opts) + return child.inode.Open(ctx, rp, child, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() - d, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { return "", err } - if !d.Impl().(*Dentry).isSymlink() { + if !d.isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx) + return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -562,11 +565,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve the destination directory first to verify that it's on this // Mount. - dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + dstDir, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -584,16 +586,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } - srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? var dst *Dentry - pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDir) switch err { case nil: // Ok, continue with rename as replacement. @@ -604,14 +605,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } dst = dstDir.children[pc] if dst == nil { - panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir)) } default: return err } var dstVFSD *vfs.Dentry if dst != nil { - dstVFSD = &dst.vfsd + dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) @@ -627,17 +628,18 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer dstDir.dirMu.Unlock() } + srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) + replaced, err := srcDir.inode.Rename(ctx, src.name, pc, src, dstDir) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { - fs.deferDecRef(srcDirVFSD) + fs.deferDecRef(srcDir) dstDir.IncRef() } src.parent = dstDir @@ -646,7 +648,11 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa dstDir.children = make(map[string]*Dentry) } dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) + var replaceVFSD *vfs.Dentry + if replaced != nil { + replaceVFSD = replaced.VFSDentry() + } + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) return nil } @@ -654,7 +660,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + + d, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err @@ -663,14 +670,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if !d.isDir() { return syserror.ENOTDIR } - if inode.HasChildren() { + if d.inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() @@ -680,10 +686,12 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + + if err := parentDentry.inode.RmDir(ctx, d.name, d); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } @@ -694,7 +702,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { @@ -703,31 +711,31 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) + return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statx{}, err } - return inode.Stat(ctx, fs.VFSFilesystem(), opts) + return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } - return inode.StatFS(ctx, fs.VFSFilesystem()) + return d.inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -737,12 +745,15 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ } fs.mu.Lock() defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + parent, err := fs.walkParentDirLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -750,11 +761,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewSymlink(ctx, pc, target) + child, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + parent.InsertChildLocked(pc, child) return nil } @@ -762,7 +773,8 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - vfsd, _, err := fs.walkExistingLocked(ctx, rp) + + d, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { return err @@ -771,10 +783,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if d.isDir() { return syserror.EISDIR } @@ -784,10 +795,11 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, d); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } @@ -795,25 +807,25 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { @@ -823,10 +835,10 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { @@ -836,10 +848,10 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { @@ -849,10 +861,10 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs(ctx) if err != nil { diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index c0b863ba4..49210e748 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -31,6 +31,8 @@ import ( // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. +// +// +stateify savable type InodeNoopRefCount struct { } @@ -50,30 +52,32 @@ func (InodeNoopRefCount) TryIncRef() bool { // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. +// +// +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*Dentry, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*Dentry, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) { return nil, syserror.EPERM } @@ -81,6 +85,8 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. +// +// +stateify savable type InodeNotDirectory struct { } @@ -90,47 +96,47 @@ func (InodeNotDirectory) HasChildren() bool { } // NewFile implements Inode.NewFile. -func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*Dentry, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*Dentry, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, *Dentry) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, *Dentry) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, *Dentry, *Dentry) (*Dentry, error) { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*Dentry, error) { panic("Lookup called on non-directory inode") } @@ -149,10 +155,12 @@ func (InodeNotDirectory) Valid(context.Context) bool { // dymanic entries (i.e. entries that are not "hashed" into the // vfs.Dentry.children) can embed this to provide no-op implementations for // functions related to dynamic entries. +// +// +stateify savable type InodeNoDynamicLookup struct{} // Lookup implements Inode.Lookup. -func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*Dentry, error) { return nil, syserror.ENOENT } @@ -169,10 +177,12 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. +// +// +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", syserror.EINVAL } @@ -186,6 +196,8 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // inode attributes. // // Must be initialized by Init prior to first use. +// +// +stateify savable type InodeAttrs struct { devMajor uint32 devMinor uint32 @@ -256,12 +268,29 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return a.SetInodeStat(ctx, fs, creds, opts) +} + +// SetInodeStat sets the corresponding attributes from opts to InodeAttrs. +// This function can be used by other kernfs-based filesystem implementation to +// sets the unexported attributes into kernfs.InodeAttrs. +func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } - if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. Setting the size is often + // allowed by kernfs files but does not do anything. If some other behavior is + // needed, the embedder should consider extending SetStat. + // + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 { return syserror.EPERM } + if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { + return syserror.EISDIR + } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -284,13 +313,6 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut atomic.StoreUint32(&a.gid, stat.GID) } - // Note that not all fields are modifiable. For example, the file type and - // inode numbers are immutable after node creation. - - // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. - // Also, STATX_SIZE will need some special handling, because read-only static - // files should return EIO for truncate operations. - return nil } @@ -320,13 +342,16 @@ func (a *InodeAttrs) DecLinks() { } } +// +stateify savable type slot struct { Name string - Dentry *vfs.Dentry + Dentry *Dentry slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. +// +// +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to @@ -342,12 +367,14 @@ type OrderedChildrenOptions struct { // directories. // // Must be initialize with Init before first use. +// +// +stateify savable type OrderedChildren struct { // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } @@ -380,7 +407,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3 if child.isDir() { links++ } - if err := o.Insert(name, child.VFSDentry()); err != nil { + if err := o.Insert(name, child); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) } d.InsertChild(name, child) @@ -397,7 +424,7 @@ func (o *OrderedChildren) HasChildren() bool { // Insert inserts child into o. This ignores the writability of o, as this is // not part of the vfs.FilesystemImpl interface, and is a lower-level operation. -func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { +func (o *OrderedChildren) Insert(name string, child *Dentry) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { @@ -421,10 +448,10 @@ func (o *OrderedChildren) removeLocked(name string) { } // Precondition: caller must hold o.mu for writing. -func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { +func (o *OrderedChildren) replaceChildLocked(name string, new *Dentry) *Dentry { if s, ok := o.set[name]; ok { // Existing slot with given name, simply replace the dentry. - var old *vfs.Dentry + var old *Dentry old, s.Dentry = s.Dentry, new return old } @@ -440,7 +467,7 @@ func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs. } // Precondition: caller must hold o.mu for reading or writing. -func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { +func (o *OrderedChildren) checkExistingLocked(name string, child *Dentry) error { s, ok := o.set[name] if !ok { return syserror.ENOENT @@ -452,7 +479,7 @@ func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) er } // Unlink implements Inode.Unlink. -func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry) error { if !o.writable { return syserror.EPERM } @@ -468,12 +495,13 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De } // Rmdir implements Inode.Rmdir. -func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *Dentry) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } +// +stateify savable type renameAcrossDifferentImplementationsError struct{} func (renameAcrossDifferentImplementationsError) Error() string { @@ -489,8 +517,8 @@ func (renameAcrossDifferentImplementationsError) Error() string { // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. -func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { - dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (*Dentry, error) { + dst, ok := dstDir.inode.(interface{}).(*OrderedChildren) if !ok { return nil, renameAcrossDifferentImplementationsError{} } @@ -532,12 +560,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } // InodeSymlink partially implements Inode interface for symlinks. +// +// +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. -func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -564,6 +594,7 @@ var _ Inode = (*StaticDirectory)(nil) func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry { inode := &StaticDirectory{} inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts) + inode.EnableLeakCheck() dentry := &Dentry{} dentry.Init(inode) @@ -584,9 +615,9 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3 s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode. -func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) +// Open implements kernfs.Inode.Open. +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } @@ -598,21 +629,25 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (s *StaticDirectory) DecRef(context.Context) { s.StaticDirectoryRefs.DecRef(s.Destroy) } // AlwaysValid partially implements kernfs.inodeDynamicLookup. +// +// +stateify savable type AlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup. +// Valid implements kernfs.inodeDynamicLookup.Valid. func (*AlwaysValid) Valid(context.Context) bool { return true } // InodeNoStatFS partially implements the Inode interface, where the client // filesystem doesn't support statfs(2). +// +// +stateify savable type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 88fcd54aa..6d3d79333 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -29,7 +29,7 @@ // // Reference Model: // -// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// Kernfs dentries represents named pointers to inodes. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a // reference on the inode it points to. Multiple dentries can point to the same @@ -60,20 +60,23 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. +// +// +stateify savable type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex + droppedDentriesMu sync.Mutex `state:"nosave"` // droppedDentries is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by droppedDentriesMu. - droppedDentries []*vfs.Dentry + droppedDentries []*Dentry // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -96,7 +99,7 @@ type Filesystem struct { // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // nextInoMinusOne is used to to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. @@ -107,7 +110,7 @@ type Filesystem struct { // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // // Precondition: d must not already be pending destruction. -func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { +func (fs *Filesystem) deferDecRef(d *Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) fs.droppedDentriesMu.Unlock() @@ -159,6 +162,8 @@ const ( // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. +// +// +stateify savable type Dentry struct { DentryRefs @@ -172,7 +177,11 @@ type Dentry struct { name string // dirMu protects children and the names of child Dentries. - dirMu sync.Mutex + // + // Note that holding fs.mu for writing is not sufficient; + // revalidateChildLocked(), which is a very hot path, may modify children with + // fs.mu acquired for reading only. + dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode @@ -239,24 +248,25 @@ func (d *Dentry) Watches() *vfs.Watches { func (d *Dentry) OnZeroWatches(context.Context) {} // InsertChild inserts child into the vfs dentry cache with the given name under -// this dentry. This does not update the directory inode, so calling this on -// its own isn't sufficient to insert a child into a directory. InsertChild -// updates the link count on d if required. +// this dentry. This does not update the directory inode, so calling this on its +// own isn't sufficient to insert a child into a directory. // // Precondition: d must represent a directory inode. func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() - d.insertChildLocked(name, child) + d.InsertChildLocked(name, child) d.dirMu.Unlock() } -// insertChildLocked is equivalent to InsertChild, with additional +// InsertChildLocked is equivalent to InsertChild, with additional // preconditions. // -// Precondition: d.dirMu must be locked. -func (d *Dentry) insertChildLocked(name string, child *Dentry) { +// Preconditions: +// * d must represent a directory inode. +// * d.dirMu must be locked. +func (d *Dentry) InsertChildLocked(name string, child *Dentry) { if !d.isDir() { - panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + panic(fmt.Sprintf("InsertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent = d @@ -267,6 +277,36 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) { d.children[name] = child } +// RemoveChild removes child from the vfs dentry cache. This does not update the +// directory inode or modify the inode to be unlinked. So calling this on its own +// isn't sufficient to remove a child from a directory. +// +// Precondition: d must represent a directory inode. +func (d *Dentry) RemoveChild(name string, child *Dentry) error { + d.dirMu.Lock() + defer d.dirMu.Unlock() + return d.RemoveChildLocked(name, child) +} + +// RemoveChildLocked is equivalent to RemoveChild, with additional +// preconditions. +// +// Precondition: d.dirMu must be locked. +func (d *Dentry) RemoveChildLocked(name string, child *Dentry) error { + if !d.isDir() { + panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d)) + } + c, ok := d.children[name] + if !ok { + return syserror.ENOENT + } + if c != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child)) + } + delete(d.children, name) + return nil +} + // Inode returns the dentry's inode. func (d *Dentry) Inode() Inode { return d.inode @@ -287,7 +327,6 @@ func (d *Dentry) Inode() Inode { // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. -// - Updating link and reference counts. // // Specific responsibilities of implementations are documented below. type Inode interface { @@ -297,7 +336,8 @@ type Inode interface { inodeRefs // Methods related to node metadata. A generic implementation is provided by - // InodeAttrs. + // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for + // managing link counts. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a @@ -315,11 +355,11 @@ type Inode interface { // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the - // inode for its lifetime. + // dentry for its lifetime. // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) // StatFS returns filesystem statistics for the client filesystem. This // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem @@ -369,30 +409,30 @@ type inodeDirectory interface { HasChildren() bool // NewFile creates a new regular file inode. - NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) // NewDir creates a new directory inode. - NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. - NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) // NewSymlink creates a new symbolic link inode. - NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + NewSymlink(ctx context.Context, name, target string) (*Dentry, error) // NewNode creates a new filesystem node for a mknod syscall. - NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) // Unlink removes a child dentry from this directory inode. - Unlink(ctx context.Context, name string, child *vfs.Dentry) error + Unlink(ctx context.Context, name string, child *Dentry) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. - RmDir(ctx context.Context, name string, child *vfs.Dentry) error + RmDir(ctx context.Context, name string, child *Dentry) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source @@ -400,7 +440,7 @@ type inodeDirectory interface { // should return the replaced dentry or nil otherwise. // // Precondition: Caller must serialize concurrent calls to Rename. - Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) + Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (replaced *Dentry, err error) } type inodeDynamicLookup interface { @@ -418,14 +458,14 @@ type inodeDynamicLookup interface { // // Lookup returns the child with an extra reference and the caller owns this // reference. - Lookup(ctx context.Context, name string) (*vfs.Dentry, error) + Lookup(ctx context.Context, name string) (*Dentry, error) // Valid should return true if this inode is still valid, or needs to // be resolved again by a call to Lookup. Valid(ctx context.Context) bool // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the FileDescription. + // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, @@ -437,7 +477,7 @@ type inodeDynamicLookup interface { type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context) (string, error) + Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 675587c6b..e413242dc 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } @@ -121,8 +121,8 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { @@ -162,8 +162,8 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { @@ -176,38 +176,36 @@ func (d *dir) DecRef(context.Context) { d.dirRefs.DecRef(d.Destroy) } -func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) { creds := auth.CredentialsFromContext(ctx) dir := d.fs.newDir(creds, opts.Mode, nil) - dirVFSD := dir.VFSDentry() - if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, dir); err != nil { dir.DecRef(ctx) return nil, err } d.IncLinks(1) - return dirVFSD, nil + return dir, nil } -func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) { creds := auth.CredentialsFromContext(ctx) f := d.fs.newFile(creds, "") - fVFSD := f.VFSDentry() - if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, f); err != nil { f.DecRef(ctx) return nil, err } - return fVFSD, nil + return f, nil } -func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { +func (*dir) NewLink(context.Context, string, kernfs.Inode) (*kernfs.Dentry, error) { return nil, syserror.EPERM } -func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (*dir) NewSymlink(context.Context, string, string) (*kernfs.Dentry, error) { return nil, syserror.EPERM } -func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*kernfs.Dentry, error) { return nil, syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 64731a3e4..58a93eaac 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -24,6 +24,8 @@ import ( // StaticSymlink provides an Inode implementation for symlinks that point to // a immutable target. +// +// +stateify savable type StaticSymlink struct { InodeAttrs InodeNoopRefCount @@ -51,8 +53,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } -// Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { +// Readlink implements Inode.Readlink. +func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go new file mode 100644 index 000000000..ea7f073eb --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -0,0 +1,102 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// syntheticDirectory implements kernfs.Inode for a directory created by +// MkdirAt(ForSyntheticMountpoint=true). +// +// +stateify savable +type syntheticDirectory struct { + InodeAttrs + InodeNoStatFS + InodeNoopRefCount + InodeNoDynamicLookup + InodeNotSymlink + OrderedChildren + + locks vfs.FileLocks +} + +var _ Inode = (*syntheticDirectory)(nil) + +func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *Dentry { + inode := &syntheticDirectory{} + inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm) + d := &Dentry{} + d.Init(inode) + return d +} + +func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm)) + } + dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm) + dir.OrderedChildren.Init(OrderedChildrenOptions{ + Writable: true, + }) +} + +// Open implements Inode.Open. +func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{}) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// NewFile implements Inode.NewFile. +func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) { + if !opts.ForSyntheticMountpoint { + return nil, syserror.EPERM + } + subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) + if err := dir.OrderedChildren.Insert(name, subdird); err != nil { + subdird.DecRef(ctx) + return nil, err + } + return subdird, nil +} + +// NewLink implements Inode.NewLink. +func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*Dentry, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) { + return nil, syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 13735eb05..73b126669 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -81,6 +82,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Start: d.parent.upperVD, Path: fspath.Parse(d.name), } + // Used during copy-up of memory-mapped regular files. + var mmapOpts *memmap.MMapOpts cleanupUndoCopyUp := func() { var err error if ftype == linux.S_IFDIR { @@ -89,7 +92,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) } if err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)) + } + if d.upperVD.Ok() { + d.upperVD.DecRef(ctx) + d.upperVD = vfs.VirtualDentry{} } } switch ftype { @@ -132,6 +139,25 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { break } } + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if d.wrappedMappable != nil { + // We may have memory mappings of the file on the lower layer. + // Switch to mapping the file on the upper layer instead. + mmapOpts = &memmap.MMapOpts{ + Perms: usermem.ReadWrite, + MaxPerms: usermem.ReadWrite, + } + if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { + cleanupUndoCopyUp() + return err + } + if mmapOpts.MappingIdentity != nil { + mmapOpts.MappingIdentity.DecRef(ctx) + } + // Don't actually switch Mappables until the end of copy-up; see + // below for why. + } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID, @@ -234,7 +260,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { panic(fmt.Sprintf("unexpected file type %o", ftype)) } - // TODO(gvisor.dev/issue/1199): copy up xattrs + if err := d.copyXattrsLocked(ctx); err != nil { + cleanupUndoCopyUp() + return err + } // Update the dentry's device and inode numbers (except for directories, // for which these remain overlay-assigned). @@ -246,14 +275,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Mask: linux.STATX_INO, }) if err != nil { - d.upperVD.DecRef(ctx) - d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return err } if upperStat.Mask&linux.STATX_INO == 0 { - d.upperVD.DecRef(ctx) - d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return syserror.EREMOTE } @@ -262,6 +287,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { atomic.StoreUint64(&d.ino, upperStat.Ino) } + if mmapOpts != nil && mmapOpts.Mappable != nil { + // Note that if mmapOpts != nil, then d.mapsMu is locked for writing + // (from the S_IFREG path above). + + // Propagate mappings of d to the new Mappable. Remember which mappings + // we added so we can remove them on failure. + upperMappable := mmapOpts.Mappable + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + + // Switch to the new Mappable. We do this at the end of copy-up + // because: + // + // - We need to switch Mappables (by changing d.wrappedMappable) before + // invalidating Translations from the old Mappable (to pick up + // Translations from the new one). + // + // - We need to lock d.dataMu while changing d.wrappedMappable, but + // must invalidate Translations with d.dataMu unlocked (due to lock + // ordering). + // + // - Consequently, once we unlock d.dataMu, other threads may + // immediately observe the new (copied-up) Mappable, which we want to + // delay until copy-up is guaranteed to succeed. + d.dataMu.Lock() + lowerMappable := d.wrappedMappable + d.wrappedMappable = upperMappable + d.dataMu.Unlock() + d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove mappings from the old Mappable. + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + } + d.lowerMappings.RemoveAll() + } + atomic.StoreUint32(&d.copiedUp, 1) return nil } + +// copyXattrsLocked copies a subset of lower's extended attributes to upper. +// Attributes that configure an overlay in the lower are not copied up. +// +// Preconditions: d.copyMu must be locked for writing. +func (d *dentry) copyXattrsLocked(ctx context.Context) error { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]} + upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD} + + lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0) + if err != nil { + if err == syserror.EOPNOTSUPP { + // There are no guarantees as to the contents of lowerXattrs. + return nil + } + ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err) + return err + } + + for _, name := range lowerXattrs { + // Do not copy up overlay attributes. + if isOverlayXattr(name) { + continue + } + + value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0}) + if err != nil { + ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err) + return err + } + + if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil { + ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err) + return err + } + } + return nil +} + +// copyUpDescendantsLocked ensures that all descendants of d are copied up. +// +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error { + dirents, err := d.getDirentsLocked(ctx) + if err != nil { + return err + } + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + continue + } + child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds) + if err != nil { + return err + } + if err := child.copyUpLocked(ctx); err != nil { + return err + } + if child.isDir() { + child.dirMu.Lock() + err := child.copyUpDescendantsLocked(ctx, ds) + child.dirMu.Unlock() + if err != nil { + return err + } + } + } + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index b1b292e83..df4492346 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -100,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string return whiteouts, readdirErr } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } @@ -116,10 +117,12 @@ func (fd *directoryFD) Release(ctx context.Context) { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + d := fd.dentry() + defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) + fd.mu.Lock() defer fd.mu.Unlock() - d := fd.dentry() if fd.dirents == nil { ds, err := d.getDirents(ctx) if err != nil { @@ -143,7 +146,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() + return d.getDirentsLocked(ctx) +} +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) { if d.dirents != nil { return d.dirents, nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index e720bfb0b..bd11372d5 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -15,6 +15,8 @@ package overlay import ( + "fmt" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -27,10 +29,15 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs +// attributes. +// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX +const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay." + // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for // opaque directories. // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE -const _OVL_XATTR_OPAQUE = linux.XATTR_TRUSTED_PREFIX + "overlay.opaque" +const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque" func isWhiteout(stat *linux.Statx) bool { return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 @@ -205,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str lookupErr = err return false } + defer childVD.DecRef(ctx) mask := uint32(linux.STATX_TYPE) if !existsOnAnyLayer { @@ -243,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str } // Update child to include this layer. + childVD.IncRef() if isUpper { child.upperVD = childVD child.copiedUp = 1 @@ -267,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str // Directories are merged with directories from lower layers if they // are not explicitly opaque. - opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{ + opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ Root: childVD, Start: childVD, - }, &vfs.GetxattrOptions{ + }, &vfs.GetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Size: 1, }) @@ -490,7 +499,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil { return err } + parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil } @@ -504,7 +519,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)) } } @@ -616,12 +631,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) } return err } + old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil }) } @@ -655,7 +671,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -665,12 +681,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // There may be directories on lower layers (previously hidden by // the whiteout) that the new directory should not be merged with. // Mark it opaque to prevent merging. - if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{ + if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Value: "y", }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)) } else { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -714,7 +730,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -743,6 +759,9 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf start := rp.Start().Impl().(*dentry) if rp.Done() { + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } if mustCreate { return nil, syserror.EEXIST } @@ -766,6 +785,10 @@ afterTrailingSymlink: if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } + // Reject attempts to open directories with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) @@ -774,12 +797,11 @@ afterTrailingSymlink: parent.dirMu.Unlock() return fd, err } + parent.dirMu.Unlock() if err != nil { - parent.dirMu.Unlock() return nil, err } // Open existing child or follow symlink. - parent.dirMu.Unlock() if mustCreate { return nil, syserror.EEXIST } @@ -794,6 +816,9 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } + if rp.MustBeDir() && !child.isDir() { + return nil, syserror.ENOTDIR + } if mayWrite { if err := child.copyUpLocked(ctx); err != nil { return nil, err @@ -925,7 +950,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -936,7 +961,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving child, err := fs.getChildLocked(ctx, parent, childName, ds) if err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -957,6 +982,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving // just can't open it anymore for some reason. return nil, err } + parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */) return &fd.vfsfd, nil } @@ -1002,9 +1028,224 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - // FIXME(gvisor.dev/issue/1199): Actually implement rename. - _ = newParent - return syserror.EXDEV + oldParent := oldParentVD.Dentry().Impl().(*dentry) + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) + if err != nil { + return err + } + if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil { + return err + } + if renamed.isDir() { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.vfsd.IsDead() { + return syserror.ENOENT + } + replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName) + if err != nil { + return err + } + var ( + replaced *dentry + replacedVFSD *vfs.Dentry + whiteouts map[string]bool + ) + if replacedLayer.existsInOverlay() { + replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds) + if err != nil { + return err + } + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } + replaced.dirMu.Lock() + defer replaced.dirMu.Unlock() + whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + + // renamed and oldParent need to be copied-up before they're renamed on the + // upper layer. + if err := renamed.copyUpLocked(ctx); err != nil { + return err + } + // If renamed is a directory, all of its descendants need to be copied-up + // before they're renamed on the upper layer. + if renamed.isDir() { + if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil { + return err + } + } + // newParent must be copied-up before it can contain renamed on the upper + // layer. + if err := newParent.copyUpLocked(ctx); err != nil { + return err + } + // If replaced exists, it doesn't need to be copied-up, but we do need to + // serialize with copy-up. Holding renameMu for writing should be + // sufficient, but out of an abundance of caution... + if replaced != nil { + replaced.copyMu.RLock() + defer replaced.copyMu.RUnlock() + } + + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + + newpop := vfs.PathOperation{ + Root: newParent.upperVD, + Start: newParent.upperVD, + Path: fspath.Parse(newName), + } + + needRecreateWhiteouts := false + cleanupRecreateWhiteouts := func() { + if !needRecreateWhiteouts { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) + } + } + } + if renamed.isDir() { + if replacedLayer == lookupLayerUpper { + // Remove whiteouts from the directory being replaced. + needRecreateWhiteouts = true + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } else if replacedLayer == lookupLayerUpperWhiteout { + // We need to explicitly remove the whiteout since otherwise rename + // on the upper layer will fail with ENOTDIR. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } + + // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a + // regular rename and create the whiteout at the origin manually. Unlike + // RENAME_WHITEOUT, this isn't atomic with respect to other users of the + // upper filesystem, but this is already the case for virtually all other + // overlay filesystem operations too. + oldpop := vfs.PathOperation{ + Root: oldParent.upperVD, + Start: oldParent.upperVD, + Path: fspath.Parse(oldName), + } + if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + + // Below this point, the renamed dentry is now at newpop, and anything we + // replaced is gone forever. Commit the rename, update the overlay + // filesystem tree, and abandon attempts to recover from errors. + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) + delete(oldParent.children, oldName) + if replaced != nil { + ds = appendDentry(ds, replaced) + } + if oldParent != newParent { + newParent.dirents = nil + // This can't drop the last reference on oldParent because one is held + // by oldParentVD, so lock recursion is impossible. + oldParent.DecRef(ctx) + ds = appendDentry(ds, oldParent) + newParent.IncRef() + renamed.parent = newParent + } + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + oldParent.dirents = nil + + if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err)) + } + if renamed.isDir() { + if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err)) + } + } + + vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) + return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. @@ -1083,7 +1324,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error Start: child.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil && err != syserror.EEXIST { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) } } } @@ -1113,15 +1354,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // Don't attempt to recover from this: the original directory is // already gone, so any dentries representing it are invalid, and // creating a new directory won't undo that. - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err) - vfsObj.AbortDeleteDentry(&child.vfsd) - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)) } vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */) return nil } @@ -1129,12 +1369,25 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) return err } + err = d.setStatLocked(ctx, rp, opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent) + } + return nil +} +// Precondition: d.fs.renameMu must be held for reading. +func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err @@ -1229,7 +1482,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr) + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } @@ -1322,70 +1575,175 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } } if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { - ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err) - if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - } - return err + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)) } + var cw *vfs.Watches if child != nil { vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) + cw = &child.watches } + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) parent.dirents = nil return nil } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// isOverlayXattr returns whether the given extended attribute configures the +// overlay. +func isOverlayXattr(name string) bool { + return strings.HasPrefix(name, _OVL_XATTR_PREFIX) +} + +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } - // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr, - // but not any other xattr syscalls. For now we just reject all of them. - return nil, syserror.ENOTSUP + + return fs.listXattr(ctx, d, size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size) + if err != nil { + return nil, err + } + + // Filter out all overlay attributes. + n := 0 + for _, name := range names { + if !isOverlayXattr(name) { + names[n] = name + n++ + } + } + return names[:n], err +} + +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } - return "", syserror.ENOTSUP + + return fs.getXattr(ctx, d, rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { + return "", err + } + + // Return EOPNOTSUPP when fetching an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_get(). + if isOverlayXattr(opts.Name) { + return "", syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts) +} + +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + + err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } - return syserror.ENOTSUP + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil +} + +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { + return err + } + + // Return EOPNOTSUPP when setting an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_set(). + if isOverlayXattr(opts.Name) { + return syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts) } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + + err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } - return syserror.ENOTSUP + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil +} + +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { + return err + } + + // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute. + // Linux passes the remove request to xattr_handler->set. + // See fs/xattr.c:vfs_removexattr(). + if isOverlayXattr(name) { + return syserror.EOPNOTSUPP + } + + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index 268b32537..853aee951 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,6 +39,7 @@ func (d *dentry) readlink(ctx context.Context) (string, error) { }) } +// +stateify savable type nonDirectoryFD struct { fileDescription @@ -46,7 +48,7 @@ type nonDirectoryFD struct { // fileDescription.dentry().upperVD. cachedFlags is the last known value of // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are // protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` copiedUp bool cachedFD *vfs.FileDescription cachedFlags uint32 @@ -146,6 +148,16 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux return stat, nil } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Allocate(ctx, mode, offset, length) +} + // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() @@ -172,6 +184,9 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } d.updateAfterSetStatLocked(&opts) + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) + } return nil } @@ -256,10 +271,105 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - wrappedFD, err := fd.getCurrentFD(ctx) + if err := fd.ensureMappable(ctx, opts); err != nil { + return err + } + return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) +} + +// ensureMappable ensures that fd.dentry().wrappedMappable is not nil. +func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + + // Fast path if we already have a Mappable for the current top layer. + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + + // Only permit mmap of regular files, since other file types may have + // unpredictable behavior when mmapped (e.g. /dev/zero). + if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { + return syserror.ENODEV + } + + // Get a Mappable for the current top layer. + fd.mu.Lock() + defer fd.mu.Unlock() + d.copyMu.RLock() + defer d.copyMu.RUnlock() + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return err } - defer wrappedFD.DecRef(ctx) - return wrappedFD.ConfigureMMap(ctx, opts) + if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { + return err + } + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef(ctx) + opts.MappingIdentity = nil + } + // Use this Mappable for all mappings of this layer (unless we raced with + // another call to ensureMappable). + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if d.wrappedMappable == nil { + d.wrappedMappable = opts.Mappable + atomic.StoreUint32(&d.isMappable, 1) + } + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, ar, offset, writable) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) + if !d.isCopiedUp() { + d.lowerMappings.RemoveMapping(ms, ar, offset, writable) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, dstAR, offset, writable) + } + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.dataMu.RLock() + defer d.dataMu.RUnlock() + return d.wrappedMappable.Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + return d.wrappedMappable.InvalidateUnsavable(ctx) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 00562667f..dfbccd05f 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -22,6 +22,10 @@ // filesystem.renameMu // dentry.dirMu // dentry.copyMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -37,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -46,6 +51,8 @@ import ( const Name = "overlay" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -55,6 +62,8 @@ func (FilesystemType) Name() string { // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to // FilesystemType.GetFilesystem. +// +// +stateify savable type FilesystemOptions struct { // Callers passing FilesystemOptions to // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that @@ -71,6 +80,8 @@ type FilesystemOptions struct { } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -93,7 +104,7 @@ type filesystem struct { // renameMu synchronizes renaming with non-renaming operations in order to // ensure consistent lock ordering between dentry.dirMu in different // dentries. - renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` // lastDirIno is the last inode number assigned to a directory. lastDirIno // is accessed using atomic memory operations. @@ -106,16 +117,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsoptsRaw := opts.InternalData fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !haveFSOpts { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) return nil, nil, syserror.EINVAL } if haveFSOpts { if len(fsopts.LowerRoots) == 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") + ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") return nil, nil, syserror.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") return nil, nil, syserror.EINVAL } // We don't enforce a maximum number of lower layers when not @@ -132,7 +143,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, "workdir") upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) return nil, nil, syserror.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -144,13 +155,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } defer privateUpperRoot.DecRef(ctx) @@ -158,24 +169,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } lowerPathnamesStr, ok := mopts["lowerdir"] if !ok { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") + ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") return nil, nil, syserror.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") return nil, nil, syserror.EINVAL } if len(lowerPathnames) > maxLowerLayers { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) + ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) return nil, nil, syserror.EINVAL } for _, lowerPathname := range lowerPathnames { lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) return nil, nil, syserror.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ @@ -187,13 +198,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt CheckSearchable: true, }) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer privateLowerRoot.DecRef(ctx) @@ -201,7 +212,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } if len(mopts) != 0 { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) return nil, nil, syserror.EINVAL } @@ -274,7 +285,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { - ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL @@ -362,6 +373,8 @@ func (fs *filesystem) newDirIno() uint64 { } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -394,7 +407,7 @@ type dentry struct { // and dirents (if not nil) is a cache of dirents as returned by // directoryFDs representing this directory. children is protected by // dirMu. - dirMu sync.Mutex + dirMu sync.Mutex `state:"nosave"` children map[string]*dentry dirents []vfs.Dirent @@ -404,7 +417,7 @@ type dentry struct { // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. // be copied up) with copyMu locked for writing; otherwise, it is // immutable. lowerVDs is always immutable. - copyMu sync.RWMutex + copyMu sync.RWMutex `state:"nosave"` upperVD vfs.VirtualDentry lowerVDs []vfs.VirtualDentry @@ -419,7 +432,43 @@ type dentry struct { devMinor uint32 ino uint64 + // If this dentry represents a regular file, then: + // + // - mapsMu is used to synchronize between copy-up and memmap.Mappable + // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. + // + // - dataMu is used to synchronize between copy-up and + // dentry.(memmap.Mappable).Translate. + // + // - lowerMappings tracks memory mappings of the file. lowerMappings is + // used to invalidate mappings of the lower layer when the file is copied + // up to ensure that they remain coherent with subsequent writes to the + // file. (Note that, as of this writing, Linux overlayfs does not do this; + // this feature is a gVisor extension.) lowerMappings is protected by + // mapsMu. + // + // - If this dentry is copied-up, then wrappedMappable is the Mappable + // obtained from a call to the current top layer's + // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil + // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil. + // wrappedMappable is protected by mapsMu and dataMu. + // + // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is + // accessed using atomic memory operations. + mapsMu sync.Mutex + lowerMappings memmap.MappingSet + dataMu sync.RWMutex + wrappedMappable memmap.Mappable + isMappable uint32 + locks vfs.FileLocks + + // watches is the set of inotify watches on the file repesented by this dentry. + // + // Note that hard links to the same file will not share the same set of + // watches, due to the fact that we do not have inode structures in this + // overlay implementation. + watches vfs.Watches } // newDentry creates a new dentry. The dentry initially has no references; it @@ -479,6 +528,14 @@ func (d *dentry) checkDropLocked(ctx context.Context) { if atomic.LoadInt64(&d.refs) != 0 { return } + + // Make sure that we do not lose watches on dentries that have not been + // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so + // d.vfsd.IsDead() indicates that d was deleted. + if !d.vfsd.IsDead() && d.watches.Size() > 0 { + return + } + // Refs is still zero; destroy it. d.destroyLocked(ctx) return @@ -507,6 +564,8 @@ func (d *dentry) destroyLocked(ctx context.Context) { lowerVD.DecRef(ctx) } + d.watches.HandleDeletion(ctx) + if d.parent != nil { d.parent.dirMu.Lock() if !d.vfsd.IsDead() { @@ -525,19 +584,36 @@ func (d *dentry) destroyLocked(ctx context.Context) { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { - // TODO(gvisor.dev/issue/1479): Implement inotify. + if d.isDir() { + events |= linux.IN_ISDIR + } + + // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + deleted := d.vfsd.IsDead() + + d.fs.renameMu.RLock() + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) + } + d.watches.Notify(ctx, "", events, cookie, et, deleted) + d.fs.renameMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { - // TODO(gvisor.dev/issue/1479): Implement inotify. - return nil + return &d.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) OnZeroWatches(context.Context) {} +func (d *dentry) OnZeroWatches(ctx context.Context) { + if atomic.LoadInt64(&d.refs) == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } +} // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. @@ -570,6 +646,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + kuid := auth.KUID(atomic.LoadUint32(&d.uid)) + kgid := auth.KGID(atomic.LoadUint32(&d.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err + } + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) +} + // statInternalMask is the set of stat fields that is set by // dentry.statInternalTo(). const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO @@ -608,6 +694,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { // fileDescription is embedded by overlay implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -622,6 +710,48 @@ func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.filesystem().listXattr(ctx, fd.dentry(), size) +} + +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { + fs := fd.filesystem() + d := fd.dentry() + + fs.renameMu.RLock() + err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) + fs.renameMu.RUnlock() + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { + fs := fd.filesystem() + d := fd.dentry() + + fs.renameMu.RLock() + err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) + fs.renameMu.RUnlock() + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 7053ad6db..4e2da4810 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type filesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -43,6 +44,7 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -76,6 +78,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -144,8 +148,8 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. } // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks) } // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index a45b44440..2e086e34c 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -100,6 +100,7 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 03b5941b9..05d7948ea 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -41,6 +41,7 @@ func (FilesystemType) Name() string { return Name } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -84,6 +85,8 @@ func (fs *filesystem) Release(ctx context.Context) { // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. +// +// +stateify savable type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource @@ -99,6 +102,7 @@ func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux. return d } +// +stateify savable type staticFile struct { kernfs.DynamicBytesFile vfs.StaticData @@ -118,10 +122,13 @@ func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64 // InternalData contains internal data passed in to the procfs mount via // vfs.GetFilesystemOptions.InternalData. +// +// +stateify savable type InternalData struct { Cgroups map[string]string } +// +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index d57d94dbc..47ecd941c 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -68,8 +68,8 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, return dentry } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDynamicLookup.Lookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -82,12 +82,10 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } - - subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) - return subTaskDentry.VFSDentry(), nil + return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers), nil } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { @@ -118,6 +116,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb return offset, nil } +// +stateify savable type subtasksFD struct { kernfs.GenericDirectoryFD @@ -155,21 +154,21 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro return fd.GenericDirectoryFD.SetStat(ctx, opts) } -// Open implements kernfs.Inode. -func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }); err != nil { return nil, err } - if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { @@ -181,12 +180,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs return stat, nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *subtasksInode) DecRef(context.Context) { i.subtasksInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index dbdb5d929..a7cd6f57e 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -53,6 +53,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), "comm": fs.newComm(task, fs.NextIno(), 0444), + "cwd": fs.newCwdSymlink(task, fs.NextIno()), "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), "exe": fs.newExeSymlink(task, fs.NextIno()), "fd": fs.newFDDirInode(task), @@ -106,9 +107,9 @@ func (i *taskInode) Valid(ctx context.Context) bool { return i.task.ExitState() != kernel.TaskExitDead } -// Open implements kernfs.Inode. -func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ +// Open implements kernfs.Inode.Open. +func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { @@ -117,18 +118,20 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *taskInode) DecRef(context.Context) { i.taskInodeRefs.DecRef(i.Destroy) } // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. +// +// +stateify savable type taskOwnedInode struct { kernfs.Inode @@ -168,7 +171,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. return d } -// Stat implements kernfs.Inode. +// Stat implements kernfs.Inode.Stat. func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { @@ -186,7 +189,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs. return stat, nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 3f0d78461..0866cea2b 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -51,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { return true } +// +stateify savable type fdDir struct { locks vfs.FileLocks @@ -62,7 +63,7 @@ type fdDir struct { produceSymlink bool } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { @@ -86,14 +87,19 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off Name: strconv.FormatUint(uint64(fd), 10), Type: typ, Ino: i.fs.NextIno(), - NextOff: offset + 1, + NextOff: int64(fd) + 3, } if err := cb.Handle(dirent); err != nil { - return offset, err + // Getdents should iterate correctly despite mutation + // of fds, so we return the next fd to serialize plus + // 2 (which accounts for the "." and ".." tracked by + // kernfs) as the offset. + return int64(fd) + 2, err } - offset++ } - return offset, nil + // We serialized them all. Next offset should be higher than last + // serialized fd. + return int64(fds[len(fds)-1]) + 3, nil } // fdDirInode represents the inode for /proc/[pid]/fd directory. @@ -130,8 +136,8 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { return dentry } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDynamicLookup.Lookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -140,13 +146,12 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } - taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) - return taskDentry.VFSDentry(), nil + return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil } -// Open implements kernfs.Inode. -func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ +// Open implements kernfs.Inode.Open. +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { @@ -155,7 +160,7 @@ func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs. return fd.VFSFileDescription(), nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. // // This is to match Linux, which uses a special permission handler to guarantee // that a process can still access /proc/self/fd after it has executed @@ -177,7 +182,7 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *fdDirInode) DecRef(context.Context) { i.fdDirInodeRefs.DecRef(i.Destroy) } @@ -209,7 +214,7 @@ func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *ker return d } -func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { +func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", syserror.ENOENT @@ -264,8 +269,8 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { return dentry } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDynamicLookup.Lookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -278,13 +283,12 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, task: i.task, fd: fd, } - dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) - return dentry.VFSDentry(), nil + return i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data), nil } -// Open implements kernfs.Inode. -func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ +// Open implements kernfs.Inode.Open. +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { @@ -293,7 +297,7 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *fdInfoDirInode) DecRef(context.Context) { i.fdInfoDirInodeRefs.DecRef(i.Destroy) } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 356036b9b..3fbf081a6 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -543,7 +543,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + fds = fdTable.CurrentMaxFDs() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() @@ -667,20 +667,24 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr return d } -// Readlink implements kernfs.Inode. -func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { - if !kernel.ContextCanTrace(ctx, s.task, false) { - return "", syserror.EACCES - } - - // Pull out the executable for /proc/[pid]/exe. - exec, err := s.executable() +// Readlink implements kernfs.Inode.Readlink. +func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { + exec, _, err := s.Getlink(ctx, nil) if err != nil { return "", err } defer exec.DecRef(ctx) - return exec.PathnameWithDeleted(ctx), nil + root := vfs.RootFromContext(ctx) + if !root.Ok() { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) + + vfsObj := exec.Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) + return name, nil } // Getlink implements kernfs.Inode.Getlink. @@ -688,23 +692,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", syserror.EACCES } - - exec, err := s.executable() - if err != nil { - return vfs.VirtualDentry{}, "", err - } - defer exec.DecRef(ctx) - - vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() - vd.IncRef() - return vd, "", nil -} - -func (s *exeSymlink) executable() (file fsbridge.File, err error) { if err := checkTaskState(s.task); err != nil { - return nil, err + return vfs.VirtualDentry{}, "", err } + var err error + var exec fsbridge.File s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { @@ -715,12 +708,78 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). - file = mm.Executable() - if file == nil { + exec = mm.Executable() + if exec == nil { err = syserror.ESRCH } }) - return + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef(ctx) + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +// cwdSymlink is an symlink for the /proc/[pid]/cwd file. +// +// +stateify savable +type cwdSymlink struct { + implStatFS + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*cwdSymlink)(nil) + +func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { + inode := &cwdSymlink{task: task} + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink implements kernfs.Inode.Readlink. +func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { + cwd, _, err := s.Getlink(ctx, nil) + if err != nil { + return "", err + } + defer cwd.DecRef(ctx) + + root := vfs.RootFromContext(ctx) + if !root.Ok() { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) + + vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) + return name, nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + cwd := s.task.FSContext().WorkingDirectoryVFS2() + if !cwd.Ok() { + // It could have raced with process deletion. + return vfs.VirtualDentry{}, "", syserror.ESRCH + } + return cwd, "", nil } // mountInfoData is used to implement /proc/[pid]/mountinfo. @@ -785,6 +844,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// +stateify savable type namespaceSymlink struct { kernfs.StaticSymlink @@ -807,15 +867,15 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri return d } -// Readlink implements Inode. -func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { +// Readlink implements kernfs.Inode.Readlink. +func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } - return s.StaticSymlink.Readlink(ctx) + return s.StaticSymlink.Readlink(ctx, mnt) } -// Getlink implements Inode.Getlink. +// Getlink implements kernfs.Inode.Getlink. func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err @@ -832,6 +892,8 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir // namespaceInode is a synthetic inode created to represent a namespace in // /proc/[pid]/ns/*. +// +// +stateify savable type namespaceInode struct { implStatFS kernfs.InodeAttrs @@ -852,12 +914,12 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32 i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } -// Open implements Inode.Open. -func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() fd.LockFD.Init(&i.locks) - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -865,6 +927,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * // namespace FD is a synthetic file that represents a namespace in // /proc/[pid]/ns/*. +// +// +stateify savable type namespaceFD struct { vfs.FileDescriptionDefaultImpl vfs.LockFD @@ -875,20 +939,20 @@ type namespaceFD struct { var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) -// Stat implements FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, vfs, opts) } -// SetStat implements FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() creds := auth.CredentialsFromContext(ctx) return fd.inode.SetStat(ctx, vfs, creds, opts) } -// Release implements FileDescriptionImpl. +// Release implements vfs.FileDescriptionImpl.Release. func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 4e69782c7..e7f748655 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -616,6 +616,7 @@ type netSnmpData struct { var _ dynamicInode = (*netSnmpData)(nil) +// +stateify savable type snmpLine struct { prefix string header string @@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string { return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []interface{}{ &inet.StatSNMPIP{}, @@ -709,7 +710,7 @@ type netRouteData struct { var _ dynamicInode = (*netRouteData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") @@ -773,7 +774,7 @@ type netStatData struct { var _ dynamicInode = (*netStatData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " + diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 3ea00ab87..d8f5dd509 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -52,8 +52,8 @@ type tasksInode struct { // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. - selfSymlink *vfs.Dentry - threadSelfSymlink *vfs.Dentry + selfSymlink *kernfs.Dentry + threadSelfSymlink *kernfs.Dentry // cgroupControllers is a map of controller name to directory in the // cgroup hierarchy. These controllers are immutable and will be listed @@ -81,8 +81,8 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace inode := &tasksInode{ pidns: pidns, fs: fs, - selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), - threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), + selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns), + threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns), cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) @@ -98,8 +98,8 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace return inode, dentry } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDynamicLookup.Lookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { // Try to lookup a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) if err != nil { @@ -118,11 +118,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) - return taskDentry.VFSDentry(), nil + return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers), nil } -// IterDirents implements kernfs.inodeDynamicLookup. +// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 @@ -200,9 +199,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback return maxTaskID, nil } -// Open implements kernfs.Inode. -func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ +// Open implements kernfs.Inode.Open. +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { @@ -229,7 +228,7 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } -// DecRef implements kernfs.Inode. +// DecRef implements kernfs.Inode.DecRef. func (i *tasksInode) DecRef(context.Context) { i.tasksInodeRefs.DecRef(i.Destroy) } @@ -237,6 +236,8 @@ func (i *tasksInode) DecRef(context.Context) { // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type staticFileSetStat struct { dynamicBytesFileSetAttr vfs.StaticData diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 8c41729e4..f268c59b0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type selfSymlink struct { implStatFS kernfs.InodeAttrs @@ -51,7 +52,7 @@ func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns return d } -func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -64,16 +65,17 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// +stateify savable type threadSelfSymlink struct { implStatFS kernfs.InodeAttrs @@ -94,7 +96,7 @@ func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, return d } -func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -108,12 +110,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -121,16 +123,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden // dynamicBytesFileSetAttr implements a special file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } -// SetStat implements Inode.SetStat. +// SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. +// +// +stateify savable type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. user uint64 diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 038a194c7..3312b0418 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -27,9 +27,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type tcpMemDir int const ( @@ -67,6 +69,7 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}), "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), "tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}), + "ip_forward": fs.newDentry(root, fs.NextIno(), 0444, &ipForwarding{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -174,7 +177,7 @@ type tcpSackData struct { var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() @@ -232,7 +235,7 @@ type tcpRecoveryData struct { var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { recovery, err := d.stack.TCPRecovery() if err != nil { @@ -284,7 +287,7 @@ type tcpMemData struct { var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.mu.Lock() defer d.mu.Unlock() @@ -354,3 +357,63 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } + +// ipForwarding implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/ip_forwarding. +// +// +stateify savable +type ipForwarding struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` + enabled *bool +} + +var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { + if ipf.enabled == nil { + enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber) + ipf.enabled = &enabled + } + + val := "0\n" + if *ipf.enabled { + // Technically, this is not quite compatible with Linux. Linux stores these + // as an integer, so if you write "2" into tcp_sack, you should get 2 back. + // Tough luck. + val = "1\n" + } + buf.WriteString(val) + + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + if ipf.enabled == nil { + ipf.enabled = new(bool) + } + *ipf.enabled = v != 0 + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { + return 0, err + } + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go index be54897bb..6cee22823 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -20,8 +20,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/usermem" ) func newIPv6TestStack() *inet.TestStack { @@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) { t.Errorf("Got n.contents() = %v, want = %v", got, want) } } + +// TestIPForwarding tests the implementation of +// /proc/sys/net/ipv4/ip_forwarding +func TestConfigureIPForwarding(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + + var cases = []struct { + comment string + initial bool + str string + final bool + }{ + { + comment: `Forwarding is disabled; write 1 and enable forwarding`, + initial: false, + str: "1", + final: true, + }, + { + comment: `Forwarding is disabled; write 0 and disable forwarding`, + initial: false, + str: "0", + final: false, + }, + { + comment: `Forwarding is enabled; write 1 and enable forwarding`, + initial: true, + str: "1", + final: true, + }, + { + comment: `Forwarding is enabled; write 0 and disable forwarding`, + initial: true, + str: "0", + final: false, + }, + { + comment: `Forwarding is disabled; write 2404 and enable forwarding`, + initial: false, + str: "2404", + final: true, + }, + { + comment: `Forwarding is enabled; write 2404 and enable forwarding`, + initial: true, + str: "2404", + final: true, + }, + } + for _, c := range cases { + t.Run(c.comment, func(t *testing.T) { + s.IPForwarding = c.initial + + file := &ipForwarding{stack: s, enabled: &c.initial} + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if got, want := s.IPForwarding, c.final; got != want { + t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index d82b3d2f3..6975af5a7 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -67,6 +67,7 @@ var ( taskStaticFiles = map[string]testutil.DirentType{ "auxv": linux.DT_REG, "cgroup": linux.DT_REG, + "cwd": linux.DT_LNK, "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, @@ -104,7 +105,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 6297e1df4..bf11b425a 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -26,7 +26,9 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// SignalFileDescription implements FileDescriptionImpl for signal fds. +// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds. +// +// +stateify savable type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -43,7 +45,7 @@ type SignalFileDescription struct { target *kernel.Task // mu protects mask. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // mask is the signal mask. Protected by mu. mask linux.SignalSet @@ -83,7 +85,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { sfd.mask = mask } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { // Attempt to dequeue relevant signals. info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) @@ -132,5 +134,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { sfd.target.SignalUnregister(entry) } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 94a998568..29e5371d6 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -28,14 +28,16 @@ import ( ) // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("sockfs.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. // // Note that registering sockfs is unnecessary, except for the fact that it // will not show up under /proc/filesystems as a result. This is a very minor @@ -44,6 +46,7 @@ func (filesystemType) Name() string { return "sockfs" } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -80,6 +83,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount @@ -88,7 +93,7 @@ type inode struct { } // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index 73f3d3309..b75d70ae6 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -36,6 +36,8 @@ func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) } // kcovInode implements kernfs.Inode. +// +// +stateify savable type kcovInode struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount @@ -44,7 +46,7 @@ type kcovInode struct { implStatFS } -func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { panic("KernelFromContext returned nil") @@ -54,7 +56,7 @@ func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D kcov: k.NewKcov(), } - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{ + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, }); err != nil { @@ -63,6 +65,7 @@ func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D return &fd.vfsfd, nil } +// +stateify savable type kcovFD struct { vfs.FileDescriptionDefaultImpl vfs.NoLockFD diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 39952d2d0..1568c581f 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -34,9 +34,13 @@ const Name = "sysfs" const defaultSysDirMode = linux.FileMode(0755) // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -117,6 +121,8 @@ func (fs *filesystem) Release(ctx context.Context) { } // dir implements kernfs.Inode. +// +// +stateify savable type dir struct { dirRefs kernfs.InodeAttrs @@ -148,8 +154,8 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set } // Open implements kernfs.Inode.Open. -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { @@ -169,6 +175,8 @@ func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, err } // cpuFile implements kernfs.Inode. +// +// +stateify savable type cpuFile struct { implStatFS kernfs.DynamicBytesFile @@ -190,6 +198,7 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li return d } +// +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 9fd38b295..0a0d914cc 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System { AllowUserMount: true, }) - mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 86beaa0a8..8853c8ad2 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -26,8 +26,10 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. +// +// +stateify savable type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -62,7 +64,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, return &tfd.vfsfd, nil } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { @@ -128,7 +130,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } -// Release implements FileDescriptionImpl.Release() +// Release implements vfs.FileDescriptionImpl.Release. func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index e5a4218e8..5209a17af 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -376,7 +376,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index ac54d420d..9129d35b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) +// +stateify savable type deviceFile struct { inode inode kind vfs.DeviceKind diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 070c75e68..e90669cf0 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory @@ -44,7 +45,7 @@ type directory struct { // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. - iterMu sync.Mutex + iterMu sync.Mutex `state:"nosave"` childList dentryList } @@ -86,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e0de04e05..e39cd305b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -673,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setStat(ctx, rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -770,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() @@ -792,59 +792,59 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } - return d.inode.listxattr(size) + return d.inode.listXattr(size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return "", err } - return d.inode.getxattr(rp.Credentials(), &opts) + return d.inode.getXattr(rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setXattr(rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.removexattr(rp.Credentials(), name); err != nil { - fs.mu.RUnlock() + err = d.inode.removeXattr(rp.Credentials(), name) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil @@ -865,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } if d.parent == nil { if d.name != "" { - // This must be an anonymous memfd file. + // This file must have been created by + // newUnlinkedRegularFileDescription(). In Linux, + // mm/shmem.c:__shmem_file_setup() => + // fs/file_table.c:alloc_file_pseudo() sets the created + // dentry's dentry_operations to anon_ops, for which d_dname == + // simple_dname. fs/d_path.c:simple_dname() defines the + // dentry's pathname to be its name, prefixed with "/" and + // suffixed with " (deleted)". b.PrependComponent("/" + d.name) + b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 5b0471ff4..d772db9e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type namedPipe struct { inode inode diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index ec2701d8b..be29a2363 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -158,7 +158,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 0710b65db..a199eb33d 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -36,12 +36,18 @@ import ( ) // regularFile is a regular (=S_IFREG) tmpfs file. +// +// +stateify savable type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile + // memoryUsageKind is the memory accounting category under which pages backing + // this regularFile's contents are accounted. + memoryUsageKind usage.MemoryKind + // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` @@ -62,7 +68,7 @@ type regularFile struct { writableMappingPages uint64 // dataMu protects the fields below. - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. @@ -86,14 +92,75 @@ type regularFile struct { func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ - memFile: fs.memFile, - seals: linux.F_SEAL_SEAL, + memFile: fs.memFile, + memoryUsageKind: usage.Tmpfs, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } +// newUnlinkedRegularFileDescription creates a regular file on the tmpfs +// filesystem represented by mount and returns an FD representing that file. +// The new file is not reachable by path traversal from any other file. +// +// newUnlinkedRegularFileDescription is analogous to Linux's +// mm/shmem.c:__shmem_file_setup(). +// +// Preconditions: mount must be a tmpfs mount. +func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") + } + + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + d := fs.newDentry(inode) + defer d.DecRef(ctx) + d.name = name + + fd := ®ularFileFD{} + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// NewZeroFile creates a new regular file and file description as for +// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is +// initially (implicitly) filled with zeroes. +// +// Preconditions: mount must be a tmpfs mount. +func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { + // Compare mm/shmem.c:shmem_zero_setup(). + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") + if err != nil { + return nil, err + } + rf := fd.inode().impl.(*regularFile) + rf.memoryUsageKind = usage.Anonymous + rf.size = size + return &fd.vfsfd, err +} + +// NewMemfd creates a new regular file and file description as for +// memfd_create. +// +// Preconditions: mount must be a tmpfs mount. +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) + if err != nil { + return nil, err + } + if allowSeals { + fd.inode().impl.(*regularFile).seals = 0 + } + return &fd.vfsfd, nil +} + // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { @@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. @@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go index 3ed650474..5699d5975 100644 --- a/pkg/sentry/fsimpl/tmpfs/socket_file.go +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -21,6 +21,8 @@ import ( ) // socketFile is a socket (=S_IFSOCK) tmpfs file. +// +// +stateify savable type socketFile struct { inode inode ep transport.BoundEndpoint diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index b0de5fabe..a102a2ee2 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) +// +stateify savable type symlink struct { inode inode target string // immutable diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index c4cec4130..cefec8fde 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -51,9 +51,13 @@ import ( const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -67,7 +71,7 @@ type filesystem struct { devMinor uint32 // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` nextInoMinusOne uint64 // accessed using atomic memory operations } @@ -78,6 +82,8 @@ func (FilesystemType) Name() string { } // FilesystemOpts is used to pass configuration data to tmpfs. +// +// +stateify savable type FilesystemOpts struct { // RootFileType is the FileType of the filesystem root. Valid values // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. @@ -221,6 +227,8 @@ var globalStatfs = linux.Statfs{ } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -300,6 +308,8 @@ func (d *dentry) Watches() *vfs.Watches { func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. +// +// +stateify savable type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem @@ -316,12 +326,12 @@ type inode struct { // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. - mu sync.Mutex - mode uint32 // file type and mode - nlink uint32 // protected by filesystem.mu instead of inode.mu - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - ino uint64 // immutable + mu sync.Mutex `state:"nosave"` + mode uint32 // file type and mode + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds @@ -626,74 +636,50 @@ func (i *inode) touchCMtimeLocked() { atomic.StoreInt64(&i.ctime, now) } -func (i *inode) listxattr(size uint64) ([]string, error) { - return i.xattrs.Listxattr(size) +func (i *inode) listXattr(size uint64) ([]string, error) { + return i.xattrs.ListXattr(size) } -func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { +func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - return i.xattrs.Getxattr(opts) + return i.xattrs.GetXattr(opts) } -func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { +func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - return i.xattrs.Setxattr(opts) + return i.xattrs.SetXattr(opts) } -func (i *inode) removexattr(creds *auth.Credentials, name string) error { +func (i *inode) removeXattr(creds *auth.Credentials, name string) error { if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - return i.xattrs.Removexattr(name) + return i.xattrs.RemoveXattr(name) } func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { - switch { - case ats&vfs.MayRead == vfs.MayRead: - if err := i.checkPermissions(creds, vfs.MayRead); err != nil { - return err - } - case ats&vfs.MayWrite == vfs.MayWrite: - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { - return err - } - default: - panic(fmt.Sprintf("checkXattrPermissions called with impossible AccessTypes: %v", ats)) + // We currently only support extended attributes in the user.* and + // trusted.* namespaces. See b/148380782. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { + return syserror.EOPNOTSUPP } - - switch { - case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX): - // The trusted.* namespace can only be accessed by privileged - // users. - if creds.HasCapability(linux.CAP_SYS_ADMIN) { - return nil - } - if ats&vfs.MayWrite == vfs.MayWrite { - return syserror.EPERM - } - return syserror.ENODATA - case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): - // Extended attributes in the user.* namespace are only - // supported for regular files and directories. - filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) - if filetype == linux.S_IFREG || filetype == linux.S_IFDIR { - return nil - } - if ats&vfs.MayWrite == vfs.MayWrite { - return syserror.EPERM - } - return syserror.ENODATA - + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err } - return syserror.EOPNOTSUPP + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -738,20 +724,20 @@ func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return globalStatfs, nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.inode().listxattr(size) +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listXattr(size) } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { return err } @@ -760,10 +746,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { return err } @@ -772,37 +758,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return nil } -// NewMemfd creates a new tmpfs regular file and file description that can back -// an anonymous fd created by memfd_create. -func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { - fs, ok := mount.Filesystem().Impl().(*filesystem) - if !ok { - panic("NewMemfd() called with non-tmpfs mount") - } - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with - // S_IRWXUGO. - inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) - rf := inode.impl.(*regularFile) - if allowSeals { - rf.seals = 0 - } - - d := fs.newDentry(inode) - defer d.DecRef(ctx) - d.name = name - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with - // FMODE_READ | FMODE_WRITE. - var fd regularFileFD - fd.Init(&inode.locks) - flags := uint32(linux.O_RDWR) - if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index 6f3e3ae6f..99c8e3c0f 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -41,7 +41,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index 326c4ed90..0ca750281 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") licenses(["notice"]) @@ -13,12 +13,35 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", "//pkg/merkletree", + "//pkg/sentry/arch", "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "verity_test", + srcs = [ + "verity_test.go", + ], + library = ":verity", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 0e17dbddc..a560b0797 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -19,6 +19,7 @@ import ( "fmt" "io" "strconv" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -179,23 +180,19 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // corresponding Merkle tree file. // This is the offset of the root hash for child in its parent's Merkle // tree file. - off, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{ + off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ Root: child.lowerMerkleVD, Start: child.lowerMerkleVD, - }, &vfs.GetxattrOptions{ + }, &vfs.GetXattrOptions{ Name: merkleOffsetInParentXattr, - // Offset is a 32 bit integer. - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the file or the xattr does not // exist, it indicates unexpected modifications to the file system. if err == syserror.ENOENT || err == syserror.ENODATA { - if noCrashOnVerificationFailure { - return nil, err - } - panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) } if err != nil { return nil, err @@ -204,10 +201,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // unexpected modifications to the file system. offset, err := strconv.Atoi(off) if err != nil { - if noCrashOnVerificationFailure { - return nil, syserror.EINVAL - } - panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) } // Open parent Merkle tree file to read and verify child's root hash. @@ -221,10 +215,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // The parent Merkle tree file should have been created. If it's // missing, it indicates an unexpected modification to the file system. if err == syserror.ENOENT { - if noCrashOnVerificationFailure { - return nil, err - } - panic(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err)) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err)) } if err != nil { return nil, err @@ -233,19 +224,16 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // dataSize is the size of raw data for the Merkle tree. For a file, // dataSize is the size of the whole file. For a directory, dataSize is // the size of all its children's root hashes. - dataSize, err := parentMerkleFD.Getxattr(ctx, &vfs.GetxattrOptions{ + dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and // contains the expected xattrs. If the file or the xattr does not // exist, it indicates unexpected modifications to the file system. if err == syserror.ENOENT || err == syserror.ENODATA { - if noCrashOnVerificationFailure { - return nil, err - } - panic(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err)) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err)) } if err != nil { return nil, err @@ -255,10 +243,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // unexpected modifications to the file system. parentSize, err := strconv.Atoi(dataSize) if err != nil { - if noCrashOnVerificationFailure { - return nil, syserror.EINVAL - } - panic(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + return nil, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) } fdReader := vfs.FileReadWriteSeeker{ @@ -270,11 +255,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // contain the root hash of the children in the parent Merkle tree when // Verify returns with success. var buf bytes.Buffer - if err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF { - if noCrashOnVerificationFailure { - return nil, syserror.EIO - } - panic(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) + if _, err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF { + return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err)) } // Cache child root hash when it's verified the first time. @@ -370,10 +352,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // corresponding Merkle tree is found. This indicates an // unexpected modification to the file system that // removed/renamed the child. - if noCrashOnVerificationFailure { - return nil, childErr - } - panic(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name)) + return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name)) } else if childErr == nil && childMerkleErr == syserror.ENOENT { // If in allowRuntimeEnable mode, and the Merkle tree file is // not created yet, we create an empty Merkle tree file, so that @@ -392,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, Path: fspath.Parse(childMerkleFilename), }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, }) if err != nil { return nil, err @@ -409,11 +389,16 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // If runtime enable is not allowed. This indicates an // unexpected modification to the file system that // removed/renamed the Merkle tree file. - if noCrashOnVerificationFailure { - return nil, childMerkleErr - } - panic(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name)) + return nil, alertIntegrityViolation(childMerkleErr, fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name)) } + } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT { + // Both the child and the corresponding Merkle tree are missing. + // This could be an unexpected modification or due to incorrect + // parameter. + // TODO(b/167752508): Investigate possible ways to differentiate + // cases that both files are deleted from cases that they never + // exist in the file system. + return nil, alertIntegrityViolation(childErr, fmt.Sprintf("Failed to find file %s", parentPath+"/"+name)) } mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) @@ -572,8 +557,183 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - //TODO(b/159261227): Implement OpenAt. - return nil, nil + // Verity fs is read-only. + if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { + return nil, syserror.EROFS + } + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + + // Open existing child or follow symlink. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds) + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + // Users should not open the Merkle tree files. Those are for verity fs + // use only. + if strings.Contains(d.name, merklePrefix) { + return nil, syserror.EPERM + } + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + + // Verity fs is read-only. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EROFS + } + + // Get the path to the target file. This is only used to provide path + // information in failure case. + path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return nil, err + } + + // Open the file in the underlying file system. + lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, opts) + + // The file should exist, as we succeeded in finding its dentry. If it's + // missing, it indicates an unexpected modification to the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("File %s expected but not found", path)) + } + return nil, err + } + + // lowerFD needs to be cleaned up if any error occurs. IncRef will be + // called if a verity FD is successfully created. + defer lowerFD.DecRef(ctx) + + // Open the Merkle tree file corresponding to the current file/directory + // to be used later for verifying Read/Walk. + merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The Merkle tree file should exist, as we succeeded in finding its + // dentry. If it's missing, it indicates an unexpected modification to + // the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + + // merkleReader needs to be cleaned up if any error occurs. IncRef will + // be called if a verity FD is successfully created. + defer merkleReader.DecRef(ctx) + + lowerFlags := lowerFD.StatusFlags() + lowerFDOpts := lowerFD.Options() + var merkleWriter *vfs.FileDescription + var parentMerkleWriter *vfs.FileDescription + + // Only open the Merkle tree files for write if in allowRuntimeEnable + // mode. + if d.fs.allowRuntimeEnable { + merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + // merkleWriter is cleaned up if any error occurs. IncRef will + // be called if a verity FD is created successfully. + defer merkleWriter.DecRef(ctx) + + if d.parent != nil { + parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.parent.lowerMerkleVD, + Start: d.parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + } + return nil, err + } + // parentMerkleWriter is cleaned up if any error occurs. IncRef + // will be called if a verity FD is created successfully. + defer parentMerkleWriter.DecRef(ctx) + } + } + + fd := &fileDescription{ + d: d, + lowerFD: lowerFD, + merkleReader: merkleReader, + merkleWriter: merkleWriter, + parentMerkleWriter: parentMerkleWriter, + isDir: d.isDir(), + } + + if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + return nil, err + } + lowerFD.IncRef() + merkleReader.IncRef() + if merkleWriter != nil { + merkleWriter.IncRef() + } + if parentMerkleWriter != nil { + parentMerkleWriter.IncRef() + } + return &fd.vfsfd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -649,7 +809,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -660,8 +820,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -670,14 +830,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, err } lowerVD := d.lowerVD - return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: lowerVD, Start: lowerVD, }, size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) @@ -686,20 +846,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", err } lowerVD := d.lowerVD - return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: lowerVD, Start: lowerVD, }, &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { // Verity file system is read-only. return syserror.EROFS } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { // Verity file system is read-only. return syserror.EROFS } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index eedb5f484..fc5eabbca 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -22,16 +22,23 @@ package verity import ( + "fmt" + "strconv" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/merkletree" + "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Name is the default filesystem name. @@ -50,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset" // whole file. For a directory, it's the size of all its children's root hashes. const merkleSizeXattr = "user.merkle.size" -// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes. -const sizeOfInt32 = 4 +// sizeOfStringInt32 is the size for a 32 bit integer stored as string in +// extended attributes. The maximum value of a 32 bit integer is 10 digits. +const sizeOfStringInt32 = 10 // noCrashOnVerificationFailure indicates whether the sandbox should panic // whenever verification fails. If true, an error is returned instead of @@ -66,9 +74,13 @@ var noCrashOnVerificationFailure bool var verityMu sync.RWMutex // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -93,11 +105,13 @@ type filesystem struct { // renameMu synchronizes renaming with non-renaming operations in order // to ensure consistent lock ordering between dentry.dirMu in different // dentries. - renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` } // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable type InternalFilesystemOptions struct { // RootMerkleFileName is the name of the verity root Merkle tree file. RootMerkleFileName string @@ -128,6 +142,16 @@ func (FilesystemType) Name() string { return Name } +// alertIntegrityViolation alerts a violation of integrity, which usually means +// unexpected modification to the file system is detected. In +// noCrashOnVerificationFailure mode, it returns an error, otherwise it panic. +func alertIntegrityViolation(err error, msg string) error { + if noCrashOnVerificationFailure { + return err + } + panic(msg) +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { iopts, ok := opts.InternalData.(InternalFilesystemOptions) @@ -141,6 +165,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // verity, and should not be exposed or connected. mopts := &vfs.MountOptions{ GetFilesystemOptions: iopts.LowerGetFSOptions, + InternalMount: true, } mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts) if err != nil { @@ -197,15 +222,12 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, err } } else if err != nil { - // Failed to get dentry for the root Merkle file. This indicates - // an attack that removed/renamed the root Merkle file, or it's - // never generated. - if noCrashOnVerificationFailure { - fs.vfsfs.DecRef(ctx) - d.DecRef(ctx) - return nil, nil, err - } - panic("Failed to find root Merkle file") + // Failed to get dentry for the root Merkle file. This + // indicates an unexpected modification that removed/renamed + // the root Merkle file, or it's never generated. + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, alertIntegrityViolation(err, "Failed to find root Merkle file") } d.lowerMerkleVD = lowerMerkleVD @@ -243,6 +265,8 @@ func (fs *filesystem) Release(ctx context.Context) { } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -269,7 +293,7 @@ type dentry struct { // and dirents (if not nil) is a cache of dirents as returned by // directoryFDs representing this directory. children is protected by // dirMu. - dirMu sync.Mutex + dirMu sync.Mutex `state:"nosave"` children map[string]*dentry // lowerVD is the VirtualDentry in the underlying file system. @@ -413,6 +437,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) { // FileDescription is a wrapper of the underlying lowerFD, with support to build // Merkle trees through the Linux fs-verity API to verify contents read from // lowerFD. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -471,12 +497,247 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return syserror.EPERM } +// generateMerkle generates a Merkle tree file for fd. If fd points to a file +// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root +// hash of the generated Merkle tree and the data size is returned. +// If fd points to a regular file, the data is the content of the file. If fd +// points to a directory, the data is all root hahes of its children, written +// to the Merkle tree file. +func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { + fdReader := vfs.FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + merkleReader := vfs.FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + merkleWriter := vfs.FileReadWriteSeeker{ + FD: fd.merkleWriter, + Ctx: ctx, + } + var rootHash []byte + var dataSize uint64 + + switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT { + case linux.S_IFREG: + // For a regular file, generate a Merkle tree based on its + // content. + var err error + stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + dataSize = stat.Size + + rootHash, err = merkletree.Generate(&fdReader, int64(dataSize), &merkleReader, &merkleWriter, false /* dataAndTreeInSameFile */) + if err != nil { + return nil, 0, err + } + case linux.S_IFDIR: + // For a directory, generate a Merkle tree based on the root + // hashes of its children that has already been written to the + // Merkle tree file. + merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + dataSize = merkleStat.Size + + rootHash, err = merkletree.Generate(&merkleReader, int64(dataSize), &merkleReader, &merkleWriter, true /* dataAndTreeInSameFile */) + if err != nil { + return nil, 0, err + } + default: + // TODO(b/167728857): Investigate whether and how we should + // enable other types of file. + return nil, 0, syserror.EINVAL + } + return rootHash, dataSize, nil +} + +// enableVerity enables verity features on fd by generating a Merkle tree file +// and stores its root hash in its parent directory's Merkle tree. +func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) { + if !fd.d.fs.allowRuntimeEnable { + return 0, syserror.EPERM + } + + // Lock to prevent other threads performing enable or access the file + // while it's being enabled. + verityMu.Lock() + defer verityMu.Unlock() + + // In allowRuntimeEnable mode, the underlying fd and read/write fd for + // the Merkle tree file should have all been initialized. For any file + // or directory other than the root, the parent Merkle tree file should + // have also been initialized. + if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) { + return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds") + } + + rootHash, dataSize, err := fd.generateMerkle(ctx) + if err != nil { + return 0, err + } + + if fd.parentMerkleWriter != nil { + stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + + // Write the root hash of fd to the parent directory's Merkle + // tree file, as it should be part of the parent Merkle tree + // data. parentMerkleWriter is open with O_APPEND, so it + // should write directly to the end of the file. + if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil { + return 0, err + } + + // Record the offset of the root hash of fd in parent directory's + // Merkle tree file. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleOffsetInParentXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return 0, err + } + } + + // Record the size of the data being hashed for fd. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleSizeXattr, + Value: strconv.Itoa(int(dataSize)), + }); err != nil { + return 0, err + } + fd.d.rootHash = append(fd.d.rootHash, rootHash...) + return 0, nil +} + +// measureVerity returns the root hash of fd, saved in args[2]. +func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + var metadata linux.DigestMetadata + + // If allowRuntimeEnable is true, an empty fd.d.rootHash indicates that + // verity is not enabled for the file. If allowRuntimeEnable is false, + // this is an integrity violation because all files should have verity + // enabled, in which case fd.d.rootHash should be set. + if len(fd.d.rootHash) == 0 { + if fd.d.fs.allowRuntimeEnable { + return 0, syserror.ENODATA + } + return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no root hash found") + } + + // The first part of VerityDigest is the metadata. + if _, err := metadata.CopyIn(t, verityDigest); err != nil { + return 0, err + } + if metadata.DigestSize < uint16(len(fd.d.rootHash)) { + return 0, syserror.EOVERFLOW + } + + // Populate the output digest size, since DigestSize is both input and + // output. + metadata.DigestSize = uint16(len(fd.d.rootHash)) + + // First copy the metadata. + if _, err := metadata.CopyOut(t, verityDigest); err != nil { + return 0, err + } + + // Now copy the root hash bytes to the memory after metadata. + _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.rootHash) + return 0, err +} + +func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) { + f := int32(0) + + // All enabled files should store a root hash. This flag is not settable + // via FS_IOC_SETFLAGS. + if len(fd.d.rootHash) != 0 { + f |= linux.FS_VERITY_FL + } + + t := kernel.TaskFromContext(ctx) + _, err := primitive.CopyInt32Out(t, flags, f) + return 0, err +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FS_IOC_ENABLE_VERITY: + return fd.enableVerity(ctx, uio) + case linux.FS_IOC_MEASURE_VERITY: + return fd.measureVerity(ctx, uio, args[2].Pointer()) + case linux.FS_IOC_GETFLAGS: + return fd.verityFlags(ctx, uio, args[2].Pointer()) + default: + // TODO(b/169682228): Investigate which ioctl commands should + // be allowed. + return 0, syserror.ENOSYS + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // No need to verify if the file is not enabled yet in + // allowRuntimeEnable mode. + if fd.d.fs.allowRuntimeEnable && len(fd.d.rootHash) == 0 { + return fd.lowerFD.PRead(ctx, dst, offset, opts) + } + + // dataSize is the size of the whole file. + dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the xattr does not exist, it + // indicates unexpected modifications to the file system. + if err == syserror.ENODATA { + return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) + } + if err != nil { + return 0, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + size, err := strconv.Atoi(dataSize) + if err != nil { + return 0, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) + } + + dataReader := vfs.FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + + merkleReader := vfs.FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + + n, err := merkletree.Verify(dst.Writer(ctx), &dataReader, &merkleReader, int64(size), offset, dst.NumBytes(), fd.d.rootHash, false /* dataAndTreeInSameFile */) + if err != nil { + return 0, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Verification failed: %v", err)) + } + return n, err +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { - return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) + return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { - return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) + return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go new file mode 100644 index 000000000..9b5641092 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -0,0 +1,349 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "fmt" + "io" + "math/rand" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// rootMerkleFilename is the name of the root Merkle tree file. +const rootMerkleFilename = "root.verity" + +// maxDataSize is the maximum data size written to the file for test. +const maxDataSize = 100000 + +// newVerityRoot creates a new verity mount, and returns the root. The +// underlying file system is tmpfs. If the error is not nil, then cleanup +// should be called when the root is no longer needed. +func newVerityRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { + rand.Seed(time.Now().UnixNano()) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(ctx); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: InternalFilesystemOptions{ + RootMerkleFileName: rootMerkleFilename, + LowerName: "tmpfs", + AllowRuntimeEnable: true, + NoCrashOnVerificationFailure: true, + }, + }, + }) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create verity root mount: %v", err) + } + root := mntns.Root() + return vfsObj, root, func() { + root.DecRef(ctx) + mntns.DecRef(ctx) + }, nil +} + +// newFileFD creates a new file in the verity mount, and returns the FD. The FD +// points to a file that has random data generated. +func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) { + creds := auth.CredentialsFromContext(ctx) + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + + // Create the file in the underlying file system. + lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + return nil, 0, err + } + + // Generate random data to be written to the file. + dataSize := rand.Intn(maxDataSize) + 1 + data := make([]byte, dataSize) + rand.Read(data) + + // Write directly to the underlying FD, since verity FD is read-only. + n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + return nil, 0, err + } + + if n != int64(len(data)) { + return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data)) + } + + lowerFD.DecRef(ctx) + + // Now open the verity file descriptor. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular | mode, + }) + return fd, dataSize, err +} + +// corruptRandomBit randomly flips a bit in the file represented by fd. +func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error { + // Flip a random bit in the underlying file. + randomPos := int64(rand.Intn(size)) + byteToModify := make([]byte, 1) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil { + return fmt.Errorf("lowerFD.PRead failed: %v", err) + } + byteToModify[0] ^= 1 + if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil { + return fmt.Errorf("lowerFD.PWrite failed: %v", err) + } + return nil +} + +// TestOpen ensures that when a file is created, the corresponding Merkle tree +// file and the root Merkle tree file exist. +func TestOpen(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, cleanup, err := newVerityRoot(ctx) + if err != nil { + t.Fatalf("Failed to create new verity root: %v", err) + } + defer cleanup() + + filename := "verity-test-file" + if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil { + t.Fatalf("Failed to create new file fd: %v", err) + } + + // Ensure that the corresponding Merkle tree file is created. + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("Failed to open Merkle tree file %s: %v", merklePrefix+filename, err) + } + + // Ensure the root merkle tree file is created. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + rootMerkleFilename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("Failed to open root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err) + } +} + +// TestUntouchedFileSucceeds ensures that read from an untouched verity file +// succeeds after enabling verity for it. +func TestReadUntouchedFileSucceeds(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, cleanup, err := newVerityRoot(ctx) + if err != nil { + t.Fatalf("Failed to create new verity root: %v", err) + } + defer cleanup() + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("Failed to create new file fd: %v", err) + } + + // Enable verity on the file and confirm a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl failed: %v", err) + } + + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } + + if n != int64(size) { + t.Errorf("fd.PRead got read length %d, want %d", n, size) + } +} + +// TestReopenUntouchedFileSucceeds ensures that reopen an untouched verity file +// succeeds after enabling verity for it. +func TestReopenUntouchedFileSucceeds(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, cleanup, err := newVerityRoot(ctx) + if err != nil { + t.Fatalf("Failed to create new verity root: %v", err) + } + defer cleanup() + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("Failed to create new file fd: %v", err) + } + + // Enable verity on the file and confirms a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl failed: %v", err) + } + + // Ensure reopening the verity enabled file succeeds. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != nil { + t.Errorf("reopen enabled file failed: %v", err) + } +} + +// TestModifiedFileFails ensures that read from a modified verity file fails. +func TestModifiedFileFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, cleanup, err := newVerityRoot(ctx) + if err != nil { + t.Fatalf("Failed to create new verity root: %v", err) + } + defer cleanup() + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("Failed to create new file fd: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl failed: %v", err) + } + + // Open a new lowerFD that's read/writable. + lowerVD := fd.Impl().(*fileDescription).d.lowerVD + + lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("Open lowerFD failed: %v", err) + } + + if err := corruptRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("%v", err) + } + + // Confirm that read from the modified file fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + t.Fatalf("fd.PRead succeeded with modified file") + } +} + +// TestModifiedMerkleFails ensures that read from a verity file fails if the +// corresponding Merkle tree file is modified. +func TestModifiedMerkleFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, cleanup, err := newVerityRoot(ctx) + if err != nil { + t.Fatalf("Failed to create new verity root: %v", err) + } + defer cleanup() + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("Failed to create new file fd: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl failed: %v", err) + } + + // Open a new lowerMerkleFD that's read/writable. + lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD + + lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("Open lowerMerkleFD failed: %v", err) + } + + // Flip a random bit in the Merkle tree file. + stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("Failed to get lowerMerkleFD stat: %v", err) + } + merkleSize := int(stat.Size) + if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil { + t.Fatalf("%v", err) + } + + // Confirm that read from a file with modified Merkle tree fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + fmt.Println(buf) + t.Fatalf("fd.PRead succeeded with modified Merkle file") + } +} |