diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
79 files changed, 1870 insertions, 893 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 48e13613a..84baaac66 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -35,6 +35,7 @@ go_library( "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 903135fae..d5c5aaa8c 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -37,27 +37,51 @@ const Name = "devpts" // FilesystemType implements vfs.FilesystemType. // // +stateify savable -type FilesystemType struct{} +type FilesystemType struct { + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. + initErr error + + // fs backs all mounts of this FilesystemType. root is fs' root. fs and root + // are immutable. + fs *vfs.Filesystem + root *vfs.Dentry +} // Name implements vfs.FilesystemType.Name. -func (FilesystemType) Name() string { +func (*FilesystemType) Name() string { return Name } -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // No data allowed. if opts.Data != "" { return nil, nil, syserror.EINVAL } - fs, root, err := fstype.newFilesystem(vfsObj, creds) - if err != nil { - return nil, nil, err + fstype.initOnce.Do(func() { + fs, root, err := fstype.newFilesystem(vfsObj, creds) + if err != nil { + fstype.initErr = err + return + } + fstype.fs = fs.VFSFilesystem() + fstype.root = root.VFSDentry() + }) + if fstype.initErr != nil { + return nil, nil, fstype.initErr + } + fstype.fs.IncRef() + fstype.root.IncRef() + return fstype.fs, fstype.root, nil +} + +// Release implements vfs.FilesystemType.Release. +func (fstype *FilesystemType) Release(ctx context.Context) { + if fstype.fs != nil { + fstype.root.DecRef(ctx) + fstype.fs.DecRef(ctx) } - return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil } // +stateify savable @@ -69,7 +93,7 @@ type filesystem struct { // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. -func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { +func (fstype *FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err @@ -87,7 +111,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.EnableLeakCheck() - root.dentry.Init(root) + + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) // Construct the pts master inode and dentry. Linux always uses inode // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. @@ -95,15 +121,14 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds root: root, } master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) - master.dentry.Init(master) // Add the master as a child of the root. - links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{ - "ptmx": &master.dentry, + links := root.OrderedChildren.Populate(map[string]kernfs.Inode{ + "ptmx": master, }) root.IncLinks(links) - return fs, &root.dentry, nil + return fs, &rootD, nil } // Release implements vfs.FilesystemImpl.Release. @@ -117,24 +142,19 @@ func (fs *filesystem) Release(ctx context.Context) { // +stateify savable type rootInode struct { implStatFS - kernfs.AlwaysValid + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren rootInodeRefs locks vfs.FileLocks - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - // master is the master pty inode. Immutable. master *masterInode - // root is the root directory inode for this filesystem. Immutable. - root *rootInode - // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -173,21 +193,24 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). replica.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) - replica.dentry.Init(replica) i.replicas[idx] = replica return t, nil } // masterClose is called when the master end of t is closed. -func (i *rootInode) masterClose(t *Terminal) { +func (i *rootInode) masterClose(ctx context.Context, t *Terminal) { i.mu.Lock() defer i.mu.Unlock() // Sanity check that replica with idx exists. - if _, ok := i.replicas[t.n]; !ok { + ri, ok := i.replicas[t.n] + if !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } + + // Drop the ref on replica inode taken during rootInode.allocateTerminal. + ri.DecRef(ctx) delete(i.replicas, t.n) } @@ -203,16 +226,22 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.D } // Lookup implements kernfs.Inode.Lookup. -func (i *rootInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { +func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + // Check if a static entry was looked up. + if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { + return d, nil + } + + // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } i.mu.Lock() defer i.mu.Unlock() - if si, ok := i.replicas[uint32(idx)]; ok { - si.dentry.IncRef() - return &si.dentry, nil + if ri, ok := i.replicas[uint32(idx)]; ok { + ri.IncRef() // This ref is passed to the dentry upon creation via Init. + return ri, nil } return nil, syserror.ENOENT @@ -243,8 +272,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } // DecRef implements kernfs.Inode.DecRef. -func (i *rootInode) DecRef(context.Context) { - i.rootInodeRefs.DecRef(i.Destroy) +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // +stateify savable diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 69c2fe951..fda30fb93 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -42,9 +42,6 @@ type masterInode struct { locks vfs.FileLocks - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - // root is the devpts root inode. root *rootInode } @@ -103,7 +100,7 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (mfd *masterFileDescription) Release(ctx context.Context) { - mfd.inode.root.masterClose(mfd.t) + mfd.inode.root.masterClose(ctx, mfd.t) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index 6515c5536..70c68cf0a 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -41,9 +41,6 @@ type replicaInode struct { locks vfs.FileLocks - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - // root is the devpts root inode. root *rootInode diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 6d1753080..e6fe0fc0d 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -71,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua return fst.fs, fst.root, nil } +// Release implements vfs.FilesystemType.Release. +func (fst *FilesystemType) Release(ctx context.Context) { + if fst.fs != nil { + // Release the original reference obtained when creating the filesystem. + fst.root.DecRef(ctx) + fst.fs.DecRef(ctx) + } +} + // Accessor allows devices to create device special files in devtmpfs. type Accessor struct { vfsObj *vfs.VirtualFilesystem @@ -86,10 +95,13 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth if err != nil { return nil, err } + // Pass a reference on root to the Accessor. + root := mntns.Root() + root.IncRef() return &Accessor{ vfsObj: vfsObj, mntns: mntns, - root: mntns.Root(), + root: root, creds: creds, }, nil } diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 3a38b8bb4..e058eda7a 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -53,6 +53,7 @@ func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.Virtu t.Fatalf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + root.IncRef() devpop := vfs.PathOperation{ Root: root, Start: root, diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index abc610ef3..7b1eec3da 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", @@ -86,9 +88,9 @@ go_test( library = ":ext", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index c349b886e..2ee7cc7ac 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -70,6 +70,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys } root := mntns.Root() + root.IncRef() tearDown := func() { root.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index 8bb104ff0..1165234f9 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -18,7 +18,7 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/syserror" ) @@ -34,19 +34,19 @@ type blockMapFile struct { // directBlks are the direct blocks numbers. The physical blocks pointed by // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]uint32 + directBlks [numDirectBlks]primitive.Uint32 // indirectBlk is the physical block which contains (blkSize/4) direct block // numbers (as uint32 integers). - indirectBlk uint32 + indirectBlk primitive.Uint32 // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect // block numbers (as uint32 integers). - doubleIndirectBlk uint32 + doubleIndirectBlk primitive.Uint32 // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly // indirect block numbers (as uint32 integers). - tripleIndirectBlk uint32 + tripleIndirectBlk primitive.Uint32 // coverage at (i)th index indicates the amount of file data a node at // height (i) covers. Height 0 is the direct block. @@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { } blkMap := file.regFile.inode.diskInode.Data() - binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) - binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk) + for i := 0; i < numDirectBlks; i++ { + file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) + } + file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) + file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) + file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) return file, nil } @@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { switch { case offset < dirBlksEnd: // Direct block. - curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:]) + curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) case offset < indirBlkEnd: // Indirect block. - curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:]) + curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) case offset < doubIndirBlkEnd: // Doubly indirect block. - curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:]) + curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) default: // Triply indirect block. - curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:]) + curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) } read += curR @@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds read := 0 curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { - var childPhyBlk uint32 + var childPhyBlk primitive.Uint32 err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } - n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:]) + n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) read += n if err != nil { return read, err diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 6fa84e7aa..ed98b482e 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -20,7 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) var fileData []byte blkNums := newBlkNumGen() - var data []byte + off := 0 + data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) // Write the direct blocks. for i := 0; i < numDirectBlks; i++ { - curBlkNum := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, curBlkNum) - fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(data[off:]) + off += curBlkNum.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) } // Write to indirect block. - indirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, indirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...) - - // Write to indirect block. - doublyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...) - - // Write to indirect block. - triplyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) + indirectBlk := primitive.Uint32(blkNums.next()) + indirectBlk.MarshalBytes(data[off:]) + off += indirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) + + // Write to double indirect block. + doublyIndirectBlk := primitive.Uint32(blkNums.next()) + doublyIndirectBlk.MarshalBytes(data[off:]) + off += doublyIndirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) + + // Write to triple indirect block. + triplyIndirectBlk := primitive.Uint32(blkNums.next()) + triplyIndirectBlk.MarshalBytes(data[off:]) + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) args := inodeArgs{ fs: &filesystem{ @@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN var fileData []byte for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := blkNums.next() - copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum)) - fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(disk[off : off+4]) + fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) } return fileData } diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 452450d82..0ad79b381 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -16,7 +16,6 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -100,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { } else { curDirent.diskDirent = &disklayout.DirentOld{} } - binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + curDirent.diskDirent.UnmarshalBytes(buf) if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { // Inode number and name length fields being set to 0 is used to indicate diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index 9bd9c76c0..d98a05dd8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -22,10 +22,11 @@ go_library( "superblock_old.go", "test_utils.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", + "//pkg/marshal", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go index ad6f4fef8..0d56ae9da 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // BlockGroup represents a Linux ext block group descriptor. An ext file system // is split into a series of block groups. This provides an access layer to // information needed to access and use a block group. @@ -30,6 +34,8 @@ package disklayout // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { + marshal.Marshallable + // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go index 3e16c76db..a35fa22a0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go @@ -17,6 +17,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. +// +// +marshal type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go index 9a809197a..d54d1d345 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go @@ -18,6 +18,8 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. +// +// +marshal type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go index 0ef4294c0..e4ce484e4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go @@ -21,6 +21,8 @@ import ( // TestBlockGroupSize tests that the block group descriptor structs are of the // correct size. func TestBlockGroupSize(t *testing.T) { - assertSize(t, BlockGroup32Bit{}, 32) - assertSize(t, BlockGroup64Bit{}, 64) + var bgSmall BlockGroup32Bit + assertSize(t, &bgSmall, 32) + var bgBig BlockGroup64Bit + assertSize(t, &bgBig, 64) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go index 417b6cf65..568c8cb4c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go @@ -15,6 +15,7 @@ package disklayout import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fs" ) @@ -51,6 +52,8 @@ var ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. type Dirent interface { + marshal.Marshallable + // Inode returns the absolute inode number of the underlying inode. // Inode number 0 signifies an unused dirent. Inode() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go index 29ae4a5c2..51f9c2946 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go @@ -29,12 +29,14 @@ import ( // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentNew struct { InodeNumber uint32 RecordLength uint16 NameLength uint8 FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentNew implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go index 6fff12a6e..d4b19e086 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go @@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs" // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentOld struct { InodeNumber uint32 RecordLength uint16 NameLength uint16 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentOld implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go index 934919f8a..3486864dc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go @@ -21,6 +21,8 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. func TestDirentSize(t *testing.T) { - assertSize(t, DirentOld{}, uintptr(DirentSize)) - assertSize(t, DirentNew{}, uintptr(DirentSize)) + var dOld DirentOld + assertSize(t, &dOld, DirentSize) + var dNew DirentNew + assertSize(t, &dNew, DirentSize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go index bdf4e2132..0834e9ba8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go @@ -36,8 +36,6 @@ // escape analysis on an unknown implementation at compile time. // // Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. // - All structures on disk are in little-endian order. Only jbd2 (journal) // structures are in big-endian order. // - All OS dependent fields in these structures will be interpretted using diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go index 4110649ab..b13999bfc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // Extents were introduced in ext4 and provide huge performance gains in terms // data locality and reduced metadata block usage. Extents are organized in // extent trees. The root node is contained in inode.BlocksRaw. @@ -64,6 +68,8 @@ type ExtentNode struct { // ExtentEntry represents an extent tree node entry. The entry can either be // an ExtentIdx or Extent itself. This exists to simplify navigation logic. type ExtentEntry interface { + marshal.Marshallable + // FileBlock returns the first file block number covered by this entry. FileBlock() uint32 @@ -75,6 +81,8 @@ type ExtentEntry interface { // tree node begins with this and is followed by `NumEntries` number of: // - Extent if `Depth` == 0 // - ExtentIdx otherwise +// +// +marshal type ExtentHeader struct { // Magic in the extent magic number, must be 0xf30a. Magic uint16 @@ -96,6 +104,8 @@ type ExtentHeader struct { // internal nodes. Sorted in ascending order based on FirstFileBlock since // Linux does a binary search on this. This points to a block containing the // child node. +// +// +marshal type ExtentIdx struct { FirstFileBlock uint32 ChildBlockLo uint32 @@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 { // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a // binary search on this. This points to an array of data blocks containing the // file data. It covers `Length` data blocks starting from `StartBlock`. +// +// +marshal type Extent struct { FirstFileBlock uint32 Length uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go index 8762b90db..c96002e19 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -21,7 +21,10 @@ import ( // TestExtentSize tests that the extent structs are of the correct // size. func TestExtentSize(t *testing.T) { - assertSize(t, ExtentHeader{}, ExtentHeaderSize) - assertSize(t, ExtentIdx{}, ExtentEntrySize) - assertSize(t, Extent{}, ExtentEntrySize) + var h ExtentHeader + assertSize(t, &h, ExtentHeaderSize) + var i ExtentIdx + assertSize(t, &i, ExtentEntrySize) + var e Extent + assertSize(t, &e, ExtentEntrySize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go index 88ae913f5..ef25040a9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go @@ -16,6 +16,7 @@ package disklayout import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) @@ -38,6 +39,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. type Inode interface { + marshal.Marshallable + // Mode returns the linux file mode which is majorly used to extract // information like: // - File permissions (read/write/execute by user/group/others). diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go index 8f9f574ce..a4503f5cf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go @@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time" // are used to provide nanoscond precision. Hence, these timestamps will now // overflow in May 2446. // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +// +// +marshal type InodeNew struct { InodeOld diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go index db25b11b6..e6b28babf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go @@ -30,6 +30,8 @@ const ( // // All fields representing time are in seconds since the epoch. Which means that // they will overflow in January 2038. +// +// +marshal type InodeOld struct { ModeRaw uint16 UIDLo uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go index dd03ee50e..90744e956 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go @@ -24,10 +24,12 @@ import ( // TestInodeSize tests that the inode structs are of the correct size. func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, OldInodeSize) + var iOld InodeOld + assertSize(t, &iOld, OldInodeSize) // This was updated from 156 bytes to 160 bytes in Oct 2015. - assertSize(t, InodeNew{}, 160) + var iNew InodeNew + assertSize(t, &iNew, 160) } // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go index 8bb327006..70948ebe9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + const ( // SbOffset is the absolute offset at which the superblock is placed. SbOffset = 1024 @@ -38,6 +42,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. type SuperBlock interface { + marshal.Marshallable + // InodesCount returns the total number of inodes in this filesystem. InodesCount() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go index 53e515fd3..4dc6080fb 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go @@ -17,6 +17,8 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if // RevLevel = DynamicRev and 64-bit feature is disabled. +// +// +marshal type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go index 7c1053fb4..2c9039327 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go @@ -19,6 +19,8 @@ package disklayout // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. Should only be used when the 64-bit // feature is set. +// +// +marshal type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go index 9221e0251..e4709f23c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go @@ -16,6 +16,8 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct. Should be used only if RevLevel = OldRev. +// +// +marshal type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go index 463b5ba21..b734b6987 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go @@ -21,7 +21,10 @@ import ( // TestSuperBlockSize tests that the superblock structs are of the correct // size. func TestSuperBlockSize(t *testing.T) { - assertSize(t, SuperBlockOld{}, 84) - assertSize(t, SuperBlock32Bit{}, 336) - assertSize(t, SuperBlock64Bit{}, 1024) + var sbOld SuperBlockOld + assertSize(t, &sbOld, 84) + var sb32 SuperBlock32Bit + assertSize(t, &sb32, 336) + var sb64 SuperBlock64Bit + assertSize(t, &sb64, 1024) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go index 9c63f04c0..a4bc08411 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go @@ -18,13 +18,13 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" ) -func assertSize(t *testing.T, v interface{}, want uintptr) { +func assertSize(t *testing.T, v marshal.Marshallable, want int) { t.Helper() - if got := binary.Size(v); got != want { + if got := v.SizeBytes(); got != want { t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) } } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index aca258d40..38fb7962b 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -38,9 +38,6 @@ const Name = "ext" // +stateify savable type FilesystemType struct{} -// Compiles only if FilesystemType implements vfs.FilesystemType. -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // getDeviceFd returns an io.ReaderAt to the underlying device. // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. @@ -101,6 +98,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 0989558cd..d9fd4590c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -82,6 +82,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys } root := mntns.Root() + root.IncRef() tearDown := func() { root.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 04917d762..778460107 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -18,7 +18,6 @@ import ( "io" "sort" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) @@ -60,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) { func (f *extentFile) buildExtTree() error { rootNodeData := f.regFile.inode.diskInode.Data() - binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) + f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. if f.root.Header.NumEntries > 4 { @@ -79,7 +78,7 @@ func (f *extentFile) buildExtTree() error { // Internal node. curEntry = &disklayout.ExtentIdx{} } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) + curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) f.root.Entries[i].Entry = curEntry } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index cd10d46ee..985f76ac0 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -21,7 +21,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] // writeTree writes the tree represented by `root` to the inode and disk. It // also writes random file data on disk. func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + rootData := in.diskInode.Data() + root.Header.MarshalBytes(rootData) + off := root.Header.SizeBytes() for _, ep := range root.Entries { - rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(rootData[off:]) + off += ep.Entry.SizeBytes() } - copy(in.diskInode.Data(), rootData) - var fileData []byte for _, ep := range root.Entries { if root.Header.Height == 0 { @@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl // writeTreeToDisk is the recursive step for writeTree which writes the tree // on the disk only. Also writes random file data on disk. func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] + curNode.Node.Header.MarshalBytes(nodeData) + off := curNode.Node.Header.SizeBytes() for _, ep := range curNode.Node.Entries { - nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(nodeData[off:]) + off += ep.Entry.SizeBytes() } - copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) - var fileData []byte for _, ep := range curNode.Node.Entries { if curNode.Node.Header.Height == 0 { diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go index d8b728f8c..58ef7b9b8 100644 --- a/pkg/sentry/fsimpl/ext/utils.go +++ b/pkg/sentry/fsimpl/ext/utils.go @@ -17,21 +17,21 @@ package ext import ( "io" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // readFromDisk performs a binary read from disk into the given struct from // the absolute offset provided. -func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { - n := binary.Size(v) +func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { + n := v.SizeBytes() buf := make([]byte, n) if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } - binary.Unmarshal(buf, binary.LittleEndian, v) + v.UnmarshalBytes(buf) return nil } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 65786e42a..e39df21c6 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -98,6 +98,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -249,14 +252,12 @@ func (fs *filesystem) Release(ctx context.Context) { // +stateify savable type inode struct { inodeRefs + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink kernfs.OrderedChildren - dentry kernfs.Dentry - // the owning filesystem. fs is immutable. fs *filesystem @@ -284,26 +285,24 @@ type inode struct { } func (fs *filesystem) newRootInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &inode{fs: fs} + i := &inode{fs: fs, nodeID: 1} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.EnableLeakCheck() - i.dentry.Init(i) - i.nodeID = 1 - return &i.dentry + var d kernfs.Dentry + d.Init(&fs.Filesystem, i) + return &d } -func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) *kernfs.Dentry { +func (fs *filesystem) newInode(nodeID uint64, attr linux.FUSEAttr) kernfs.Inode { i := &inode{fs: fs, nodeID: nodeID} creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} i.InodeAttrs.Init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) atomic.StoreUint64(&i.size, attr.Size) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.EnableLeakCheck() - i.dentry.Init(i) - - return &i.dentry + return i } // Open implements kernfs.Inode.Open. @@ -410,23 +409,27 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentr } // Lookup implements kernfs.Inode.Lookup. -func (i *inode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { +func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { in := linux.FUSELookupIn{Name: name} return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) } +// Keep implements kernfs.Inode.Keep. +func (i *inode) Keep() bool { + // Return true so that kernfs keeps the new dentry pointing to this + // inode in the dentry tree. This is needed because inodes created via + // Lookup are not temporary. They might refer to existing files on server + // that can be Unlink'd/Rmdir'd. + return true +} + // IterDirents implements kernfs.Inode.IterDirents. func (*inode) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return offset, nil } -// Valid implements kernfs.Inode.Valid. -func (*inode) Valid(ctx context.Context) bool { - return true -} - // NewFile implements kernfs.Inode.NewFile. -func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) { +func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) @@ -444,7 +447,7 @@ func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) } // NewNode implements kernfs.Inode.NewNode. -func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*kernfs.Dentry, error) { +func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) { in := linux.FUSEMknodIn{ MknodMeta: linux.FUSEMknodMeta{ Mode: uint32(opts.Mode), @@ -457,7 +460,7 @@ func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) } // NewSymlink implements kernfs.Inode.NewSymlink. -func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.Dentry, error) { +func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) { in := linux.FUSESymLinkIn{ Name: name, Target: target, @@ -466,7 +469,7 @@ func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.De } // Unlink implements kernfs.Inode.Unlink. -func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) error { +func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) @@ -482,14 +485,11 @@ func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) e return err } // only return error, discard res. - if err := res.Error(); err != nil { - return err - } - return i.dentry.RemoveChildLocked(name, child) + return res.Error() } // NewDir implements kernfs.Inode.NewDir. -func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) { +func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { in := linux.FUSEMkdirIn{ MkdirMeta: linux.FUSEMkdirMeta{ Mode: uint32(opts.Mode), @@ -501,7 +501,7 @@ func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) } // RmDir implements kernfs.Inode.RmDir. -func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) error { +func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error { fusefs := i.fs task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) @@ -515,16 +515,12 @@ func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) er if err != nil { return err } - if err := res.Error(); err != nil { - return err - } - - return i.dentry.RemoveChildLocked(name, child) + return res.Error() } // newEntry calls FUSE server for entry creation and allocates corresponding entry according to response. // Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. -func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*kernfs.Dentry, error) { +func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) @@ -734,8 +730,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio } // DecRef implements kernfs.Inode.DecRef. -func (i *inode) DecRef(context.Context) { - i.inodeRefs.DecRef(i.Destroy) +func (i *inode) DecRef(ctx context.Context) { + i.inodeRefs.DecRef(func() { i.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 16787116f..ad0afc41b 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -52,6 +52,7 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/p9", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8608471f8..f1dad1b08 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -272,6 +272,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index eeaf6e444..f8b19bae7 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -395,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt) + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { @@ -403,10 +403,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) rw.d.handleMu.RUnlock() return done, err } - // err might have occurred in part of gap.Range() outside - // gapMR. Forget about it for now; if the error matters and - // persists, we'll run into it again in a later iteration of - // this loop. + // err might have occurred in part of gap.Range() outside gapMR + // (in particular, gap.End() might be beyond EOF). Forget about + // it for now; if the error matters and persists, we'll run + // into it again in a later iteration of this loop. } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) @@ -780,7 +780,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab mf := d.fs.mfp.MemoryFile() h := d.readHandleLocked() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt) + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index ffe4ddb32..698e913fe 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -118,7 +118,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) if err != nil { return nil, err } - d.Init(i) + d.Init(&fs.Filesystem, i) // i.open will take a reference on d. defer d.DecRef(ctx) @@ -151,6 +151,9 @@ func (filesystemType) Name() string { return "none" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // NewFilesystem sets up and returns a new hostfs filesystem. // // Note that there should only ever be one instance of host.filesystem, @@ -195,6 +198,7 @@ type inode struct { kernfs.InodeNoStatFS kernfs.InodeNotDirectory kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 131145b85..8a447e29f 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -348,10 +348,10 @@ func (e *SCMConnectedEndpoint) Init() error { func (e *SCMConnectedEndpoint) Release(ctx context.Context) { e.DecRef(func() { e.mu.Lock() + fdnotifier.RemoveFD(int32(e.fd)) if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) } - fdnotifier.RemoveFD(int32(e.fd)) e.destroyLocked() e.mu.Unlock() }) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 5e91e0536..858cc24ce 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -70,6 +70,17 @@ go_template_instance( }, ) +go_template_instance( + name = "synthetic_directory_refs", + out = "synthetic_directory_refs.go", + package = "kernfs", + prefix = "syntheticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "syntheticDirectory", + }, +) + go_library( name = "kernfs", srcs = [ @@ -84,6 +95,7 @@ go_library( "static_directory_refs.go", "symlink.go", "synthetic_directory.go", + "synthetic_directory_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 0a4cd4057..abf1905d6 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -201,12 +201,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { - stat, err := it.Dentry.inode.Stat(ctx, fd.filesystem(), opts) + stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ - Name: it.Name, + Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 5cc1c4281..6426a55f6 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -89,7 +89,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -120,22 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(ctx, &child.vfsd) - fs.deferDecRef(child) // Reference from Lookup. + if child.inode.Keep() { + // Drop the ref owned by kernfs. + fs.deferDecRef(child) + } + vfsObj.InvalidateDentry(ctx, child.VFSDentry()) child = nil } } if child == nil { // Dentry isn't cached; it either doesn't exist or failed revalidation. // Attempt to resolve it via Lookup. - c, err := parent.inode.Lookup(ctx, name) + childInode, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } - // Reference on c (provided by Lookup) will be dropped when the dentry - // fails validation. - parent.InsertChildLocked(name, c) - child = c + var newChild Dentry + newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. + parent.insertChildLocked(name, &newChild) + child = &newChild + + // Drop the ref on newChild. This will cause the dentry to get pruned + // from the dentry tree by the end of current filesystem operation + // (before returning to the VFS layer) if another ref is not picked on + // this dentry. + if !childInode.Keep() { + fs.deferDecRef(&newChild) + } } return child, nil } @@ -191,7 +202,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving } // checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). +// directory parent, then returns rp.Component(). // // Preconditions: // * Filesystem.mu must be locked for at least reading. @@ -298,9 +309,9 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -324,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - child, err := parent.inode.NewLink(ctx, pc, d.inode) + childI, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parent.InsertChildLocked(pc, child) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -338,9 +351,9 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -355,14 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - child, err := parent.inode.NewDir(ctx, pc, opts) + childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { return err } - child = newSyntheticDirectory(rp.Credentials(), opts.Mode) + childI = newSyntheticDirectory(rp.Credentials(), opts.Mode) } - parent.InsertChildLocked(pc, child) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -372,9 +387,9 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -389,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - newD, err := parent.inode.NewNode(ctx, pc, opts) + newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } - parent.InsertChildLocked(pc, newD) + var newD Dentry + newD.Init(fs, newI) + parent.insertChildLocked(pc, &newD) return nil } @@ -409,22 +426,23 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } - d.inode.IncRef() - defer d.inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) - return d.inode.Open(ctx, rp, d, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } // May create new file. @@ -438,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf unlocked = true } } + // Process all to-be-decref'd dentries at the end at once. + // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked + // when this is executed. + defer fs.processDeferredDecRefs(ctx) defer unlock() if rp.Done() { if rp.MustBeDir() { @@ -449,14 +471,16 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - d.inode.IncRef() - defer d.inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() unlock() - return d.inode.Open(ctx, rp, d, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return nil, err } @@ -487,18 +511,23 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child, err := parent.inode.NewFile(ctx, pc, opts) + childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } + var child Dentry + child.Init(fs, childI) // FIXME(gvisor.dev/issue/1193): Race between checking existence with - // fs.stepExistingLocked and parent.InsertChild. If possible, we should hold + // fs.stepExistingLocked and parent.insertChild. If possible, we should hold // dirMu from one to the other. - parent.InsertChild(pc, child) - child.inode.IncRef() - defer child.inode.DecRef(ctx) + parent.insertChild(pc, &child) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, child, opts) + fd, err := child.inode.Open(ctx, rp, &child, opts) + child.DecRef(ctx) + return fd, err } if err != nil { return nil, err @@ -514,7 +543,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -530,18 +559,21 @@ afterTrailingSymlink: if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - child.inode.IncRef() - defer child.inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, child, opts) + fd, err := child.inode.Open(ctx, rp, child, opts) + child.DecRef(ctx) + return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -560,7 +592,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked(ctx) + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this @@ -632,24 +664,27 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - replaced, err := srcDir.inode.Rename(ctx, src.name, pc, src, dstDir) + err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { - fs.deferDecRef(srcDir) - dstDir.IncRef() + fs.deferDecRef(srcDir) // child (src) drops ref on old parent. + dstDir.IncRef() // child (src) takes a ref on the new parent. } src.parent = dstDir src.name = pc if dstDir.children == nil { dstDir.children = make(map[string]*Dentry) } + replaced := dstDir.children[pc] dstDir.children[pc] = src var replaceVFSD *vfs.Dentry if replaced != nil { + // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. + fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() } virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) @@ -659,10 +694,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -691,10 +726,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } - if err := parentDentry.inode.RmDir(ctx, d.name, d); err != nil { + if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -702,9 +740,9 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -717,9 +755,9 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statx{}, err } @@ -729,9 +767,9 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } @@ -744,9 +782,9 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -761,21 +799,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - child, err := parent.inode.NewSymlink(ctx, pc, target) + childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } - parent.InsertChildLocked(pc, child) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -799,10 +839,13 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, d.name, d); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -810,9 +853,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -825,9 +868,9 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -838,9 +881,9 @@ func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -851,9 +894,9 @@ func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -864,9 +907,9 @@ func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -880,3 +923,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } + +func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // The following is equivalent to vd.DecRef(ctx). This is needed + // because if d belongs to this filesystem, we can not DecRef it right + // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we + // defer the DecRef to when locks are dropped. + vd.Mount().DecRef(ctx) + fs.deferDecRef(d) + } else { + vd.DecRef(ctx) + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 49210e748..122b10591 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -34,6 +34,7 @@ import ( // // +stateify savable type InodeNoopRefCount struct { + InodeTemporary } // IncRef implements Inode.IncRef. @@ -57,27 +58,27 @@ func (InodeNoopRefCount) TryIncRef() bool { type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, syserror.EPERM } @@ -88,6 +89,7 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt // // +stateify savable type InodeNotDirectory struct { + InodeAlwaysValid } // HasChildren implements Inode.HasChildren. @@ -96,47 +98,47 @@ func (InodeNotDirectory) HasChildren() bool { } // NewFile implements Inode.NewFile. -func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (InodeNotDirectory) Unlink(context.Context, string, *Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (InodeNotDirectory) RmDir(context.Context, string, *Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (InodeNotDirectory) Rename(context.Context, string, string, *Dentry, *Dentry) (*Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } @@ -145,35 +147,6 @@ func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDiren panic("IterDirents called on non-directory inode") } -// Valid implements Inode.Valid. -func (InodeNotDirectory) Valid(context.Context) bool { - return true -} - -// InodeNoDynamicLookup partially implements the Inode interface, specifically -// the inodeDynamicLookup sub interface. Directory inodes that do not support -// dymanic entries (i.e. entries that are not "hashed" into the -// vfs.Dentry.children) can embed this to provide no-op implementations for -// functions related to dynamic entries. -// -// +stateify savable -type InodeNoDynamicLookup struct{} - -// Lookup implements Inode.Lookup. -func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*Dentry, error) { - return nil, syserror.ENOENT -} - -// IterDirents implements Inode.IterDirents. -func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - return offset, nil -} - -// Valid implements Inode.Valid. -func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { - return true -} - // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. @@ -273,7 +246,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // SetInodeStat sets the corresponding attributes from opts to InodeAttrs. // This function can be used by other kernfs-based filesystem implementation to -// sets the unexported attributes into kernfs.InodeAttrs. +// sets the unexported attributes into InodeAttrs. func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil @@ -344,8 +317,9 @@ func (a *InodeAttrs) DecLinks() { // +stateify savable type slot struct { - Name string - Dentry *Dentry + name string + inode Inode + static bool slotEntry } @@ -361,10 +335,18 @@ type OrderedChildrenOptions struct { } // OrderedChildren partially implements the Inode interface. OrderedChildren can -// be embedded in directory inodes to keep track of the children in the +// be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see -// GenericDirectoryFD. OrderedChildren is not compatible with dynamic -// directories. +// GenericDirectoryFD. +// +// OrderedChildren can represent a node in an Inode tree. The children inodes +// might be directories themselves using OrderedChildren; hence extending the +// tree. The parent inode (OrderedChildren user) holds a ref on all its static +// children. This lets the static inodes outlive their associated dentry. +// While the dentry might have to be regenerated via a Lookup() call, we can +// keep reusing the same static inode. These static children inodes are finally +// DecRef'd when this directory inode is being destroyed. This makes +// OrderedChildren suitable for static directory entries as well. // // Must be initialize with Init before first use. // @@ -388,33 +370,63 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { // Destroy clears the children stored in o. It should be called by structs // embedding OrderedChildren upon destruction, i.e. when their reference count // reaches zero. -func (o *OrderedChildren) Destroy() { +func (o *OrderedChildren) Destroy(ctx context.Context) { o.mu.Lock() defer o.mu.Unlock() + // Drop the ref that o owns on the static inodes it holds. + for _, s := range o.set { + if s.static { + s.inode.DecRef(ctx) + } + } o.order.Reset() o.set = nil } -// Populate inserts children into this OrderedChildren, and d's dentry -// cache. Populate returns the number of directories inserted, which the caller +// Populate inserts static children into this OrderedChildren. +// Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d must represent a directory inode. children must not contain -// any conflicting entries already in o. -func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { +// Precondition: +// * d must represent a directory inode. +// * children must not contain any conflicting entries already in o. +// * Caller must hold a reference on all inodes passed. +// +// Postcondition: Caller's references on inodes are transferred to o. +func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { - if child.isDir() { + if child.Mode().IsDir() { links++ } - if err := o.Insert(name, child); err != nil { - panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + if err := o.insert(name, child, true); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } - d.InsertChild(name, child) } return links } +// Lookup implements Inode.Lookup. +func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { + o.mu.RLock() + defer o.mu.RUnlock() + + s, ok := o.set[name] + if !ok { + return nil, syserror.ENOENT + } + + s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. + return s.inode, nil +} + +// IterDirents implements Inode.IterDirents. +func (o *OrderedChildren) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + // All entries from OrderedChildren have already been handled in + // GenericDirectoryFD.IterDirents. + return offset, nil +} + // HasChildren implements Inode.HasChildren. func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() @@ -422,17 +434,27 @@ func (o *OrderedChildren) HasChildren() bool { return len(o.set) > 0 } -// Insert inserts child into o. This ignores the writability of o, as this is -// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. -func (o *OrderedChildren) Insert(name string, child *Dentry) error { +// Insert inserts a dynamic child into o. This ignores the writability of o, as +// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child Inode) error { + return o.insert(name, child, false) +} + +// insert inserts child into o. +// +// Precondition: Caller must be holding a ref on child if static is true. +// +// Postcondition: Caller's ref on child is transferred to o if static is true. +func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return syserror.EEXIST } s := &slot{ - Name: name, - Dentry: child, + name: name, + inode: child, + static: static, } o.order.PushBack(s) o.set[name] = s @@ -442,44 +464,49 @@ func (o *OrderedChildren) Insert(name string, child *Dentry) error { // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) + } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for writing. -func (o *OrderedChildren) replaceChildLocked(name string, new *Dentry) *Dentry { +func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("replacing a static inode: %v", s.inode)) + } + // Existing slot with given name, simply replace the dentry. - var old *Dentry - old, s.Dentry = s.Dentry, new - return old + s.inode = newI } // No existing slot with given name, create and hash new slot. s := &slot{ - Name: name, - Dentry: new, + name: name, + inode: newI, + static: false, } o.order.PushBack(s) o.set[name] = s - return nil } // Precondition: caller must hold o.mu for reading or writing. -func (o *OrderedChildren) checkExistingLocked(name string, child *Dentry) error { +func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return syserror.ENOENT } - if s.Dentry != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + if s.inode != child { + panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) } return nil } // Unlink implements Inode.Unlink. -func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry) error { +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return syserror.EPERM } @@ -494,8 +521,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry return nil } -// Rmdir implements Inode.Rmdir. -func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *Dentry) error { +// RmDir implements Inode.RmDir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) @@ -517,13 +544,13 @@ func (renameAcrossDifferentImplementationsError) Error() string { // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. -func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (*Dentry, error) { - dst, ok := dstDir.inode.(interface{}).(*OrderedChildren) +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { + dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return nil, renameAcrossDifferentImplementationsError{} + return renameAcrossDifferentImplementationsError{} } if !o.writable || !dst.writable { - return nil, syserror.EPERM + return syserror.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any @@ -536,12 +563,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c defer dst.mu.Unlock() } if err := o.checkExistingLocked(oldname, child); err != nil { - return nil, err + return err } // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. - replaced := dst.replaceChildLocked(newname, child) - return replaced, nil + dst.replaceChildLocked(ctx, newname, child) + return nil } // nthLocked returns an iterator to the nth child tracked by this object. The @@ -576,11 +603,12 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, // // +stateify savable type StaticDirectory struct { + InodeAlwaysValid InodeAttrs InodeDirectoryNoNewChildren - InodeNoDynamicLookup InodeNoStatFS InodeNotSymlink + InodeTemporary OrderedChildren StaticDirectoryRefs @@ -591,19 +619,16 @@ type StaticDirectory struct { var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry, fdOpts GenericDirectoryFDOptions) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts) inode.EnableLeakCheck() - dentry := &Dentry{} - dentry.Init(inode) - inode.OrderedChildren.Init(OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, children) + links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) - return dentry + return inode } // Init initializes StaticDirectory. @@ -615,7 +640,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3 s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode.Open. +// Open implements Inode.Open. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { @@ -624,26 +649,36 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *De return fd.VFSFileDescription(), nil } -// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } -// DecRef implements kernfs.Inode.DecRef. -func (s *StaticDirectory) DecRef(context.Context) { - s.StaticDirectoryRefs.DecRef(s.Destroy) +// DecRef implements Inode.DecRef. +func (s *StaticDirectory) DecRef(ctx context.Context) { + s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) } -// AlwaysValid partially implements kernfs.inodeDynamicLookup. +// InodeAlwaysValid partially implements Inode. // // +stateify savable -type AlwaysValid struct{} +type InodeAlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup.Valid. -func (*AlwaysValid) Valid(context.Context) bool { +// Valid implements Inode.Valid. +func (*InodeAlwaysValid) Valid(context.Context) bool { return true } +// InodeTemporary partially implements Inode. +// +// +stateify savable +type InodeTemporary struct{} + +// Keep implements Inode.Keep. +func (*InodeTemporary) Keep() bool { + return false +} + // InodeNoStatFS partially implements the Inode interface, where the client // filesystem doesn't support statfs(2). // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 6d3d79333..606081e68 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -29,12 +29,16 @@ // // Reference Model: // -// Kernfs dentries represents named pointers to inodes. Dentries and inodes have +// Kernfs dentries represents named pointers to inodes. Kernfs is solely +// reponsible for maintaining and modifying its dentry tree; inode +// implementations can not access the tree. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a -// reference on the inode it points to. Multiple dentries can point to the same -// inode (for example, in the case of hardlinks). File descriptors hold a -// reference to the dentry they're opened on. +// reference on the inode it points to (although that might not be the only +// reference on the inode). Due to this inodes can outlive the dentries that +// point to them. Multiple dentries can point to the same inode (for example, +// in the case of hardlinks). File descriptors hold a reference to the dentry +// they're opened on. // // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries require holding Filesystem.mu for writing. To @@ -47,8 +51,8 @@ // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu -// kernfs.Filesystem.droppedDentriesMu // (inode implementation locks, if any) +// kernfs.Filesystem.droppedDentriesMu package kernfs import ( @@ -60,7 +64,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory @@ -95,7 +98,7 @@ type Filesystem struct { // example: // // fs.mu.RLock() - // fs.mu.processDeferredDecRefs() + // defer fs.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) @@ -108,8 +111,7 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. -// -// Precondition: d must not already be pending destruction. +// This may be called while Filesystem.mu or Dentry.dirMu is locked. func (fs *Filesystem) deferDecRef(d *Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) @@ -118,17 +120,14 @@ func (fs *Filesystem) deferDecRef(d *Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. +// +// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.mu.Lock() - fs.processDeferredDecRefsLocked(ctx) - fs.mu.Unlock() -} - -// Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef(ctx) + // Defer the DecRef call so that we are not holding droppedDentriesMu + // when DecRef is called. + defer d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -157,17 +156,19 @@ const ( // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of -// a mounted filesystem tree. Kernfs doesn't cache dentries once all references -// to them are removed. Dentries hold a single reference to the inode they point +// a mounted filesystem tree. Kernfs drops dentries once all references to them +// are dropped. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. // // +stateify savable type Dentry struct { + vfsd vfs.Dentry DentryRefs - vfsd vfs.Dentry + // fs is the owning filesystem. fs is immutable. + fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -192,8 +193,9 @@ type Dentry struct { // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. -func (d *Dentry) Init(inode Inode) { +func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.vfsd.Init(d) + d.fs = fs d.inode = inode ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { @@ -222,14 +224,28 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - // Before the destructor is called, Dentry must be removed from VFS' dentry cache. + decRefParent := false + d.fs.mu.Lock() d.DentryRefs.DecRef(func() { d.inode.DecRef(ctx) // IncRef from Init. d.inode = nil if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + // We will DecRef d.parent once all locks are dropped. + decRefParent = true + d.parent.dirMu.Lock() + // Remove d from parent.children. It might already have been + // removed due to invalidation. + if _, ok := d.parent.children[d.name]; ok { + delete(d.parent.children, d.name) + d.fs.VFSFilesystem().VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) + } + d.parent.dirMu.Unlock() } }) + d.fs.mu.Unlock() + if decRefParent { + d.parent.DecRef(ctx) // IncRef from Dentry.insertChild. + } } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. @@ -247,26 +263,26 @@ func (d *Dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. func (d *Dentry) OnZeroWatches(context.Context) {} -// InsertChild inserts child into the vfs dentry cache with the given name under +// insertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on its // own isn't sufficient to insert a child into a directory. // // Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *Dentry) { +func (d *Dentry) insertChild(name string, child *Dentry) { d.dirMu.Lock() - d.InsertChildLocked(name, child) + d.insertChildLocked(name, child) d.dirMu.Unlock() } -// InsertChildLocked is equivalent to InsertChild, with additional +// insertChildLocked is equivalent to insertChild, with additional // preconditions. // // Preconditions: // * d must represent a directory inode. // * d.dirMu must be locked. -func (d *Dentry) InsertChildLocked(name string, child *Dentry) { +func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { - panic(fmt.Sprintf("InsertChildLocked called on non-directory Dentry: %+v.", d)) + panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent = d @@ -277,36 +293,6 @@ func (d *Dentry) InsertChildLocked(name string, child *Dentry) { d.children[name] = child } -// RemoveChild removes child from the vfs dentry cache. This does not update the -// directory inode or modify the inode to be unlinked. So calling this on its own -// isn't sufficient to remove a child from a directory. -// -// Precondition: d must represent a directory inode. -func (d *Dentry) RemoveChild(name string, child *Dentry) error { - d.dirMu.Lock() - defer d.dirMu.Unlock() - return d.RemoveChildLocked(name, child) -} - -// RemoveChildLocked is equivalent to RemoveChild, with additional -// preconditions. -// -// Precondition: d.dirMu must be locked. -func (d *Dentry) RemoveChildLocked(name string, child *Dentry) error { - if !d.isDir() { - panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d)) - } - c, ok := d.children[name] - if !ok { - return syserror.ENOENT - } - if c != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child)) - } - delete(d.children, name) - return nil -} - // Inode returns the dentry's inode. func (d *Dentry) Inode() Inode { return d.inode @@ -348,11 +334,6 @@ type Inode interface { // a blanket implementation for all non-directory inodes. inodeDirectory - // Method for inodes that represent dynamic directories and their - // children. InodeNoDynamicLookup provides a blanket implementation for all - // non-dynamic-directory inodes. - inodeDynamicLookup - // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the // dentry for its lifetime. @@ -365,6 +346,14 @@ type Inode interface { // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem // doesn't support statfs(2), this should return ENOSYS. StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) + + // Keep indicates whether the dentry created after Inode.Lookup should be + // kept in the kernfs dentry tree. + Keep() bool + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool } type inodeRefs interface { @@ -397,8 +386,8 @@ type inodeMetadata interface { // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { - // The New{File,Dir,Node,Symlink} methods below should return a new inode - // hashed into this inode. + // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode + // that will be hashed into the dentry tree. // // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different @@ -409,60 +398,54 @@ type inodeDirectory interface { HasChildren() bool // NewFile creates a new regular file inode. - NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) // NewDir creates a new directory inode. - NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. - NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) + NewLink(ctx context.Context, name string, target Inode) (Inode, error) // NewSymlink creates a new symbolic link inode. - NewSymlink(ctx context.Context, name, target string) (*Dentry, error) + NewSymlink(ctx context.Context, name, target string) (Inode, error) // NewNode creates a new filesystem node for a mknod syscall. - NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) // Unlink removes a child dentry from this directory inode. - Unlink(ctx context.Context, name string, child *Dentry) error + Unlink(ctx context.Context, name string, child Inode) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. - RmDir(ctx context.Context, name string, child *Dentry) error + RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source - // directory. If Rename replaces a dentry in the destination directory, it - // should return the replaced dentry or nil otherwise. + // directory. // // Precondition: Caller must serialize concurrent calls to Rename. - Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (replaced *Dentry, err error) -} + Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error -type inodeDynamicLookup interface { - // Lookup should return an appropriate dentry if name should resolve to a - // child of this dynamic directory inode. This gives the directory an - // opportunity on every lookup to resolve additional entries that aren't - // hashed into the directory. This is only called when the inode is a - // directory. If the inode is not a directory, or if the directory only - // contains a static set of children, the implementer can unconditionally - // return an appropriate error (ENOTDIR and ENOENT respectively). + // Lookup should return an appropriate inode if name should resolve to a + // child of this directory inode. This gives the directory an opportunity + // on every lookup to resolve additional entries. This is only called when + // the inode is a directory. // - // The child returned by Lookup will be hashed into the VFS dentry tree. Its - // lifetime can be controlled by the filesystem implementation with an - // appropriate implementation of Valid. + // The child returned by Lookup will be hashed into the VFS dentry tree, + // atleast for the duration of the current FS operation. // - // Lookup returns the child with an extra reference and the caller owns this - // reference. - Lookup(ctx context.Context, name string) (*Dentry, error) - - // Valid should return true if this inode is still valid, or needs to - // be resolved again by a call to Lookup. - Valid(ctx context.Context) bool + // Lookup must return the child with an extra reference whose ownership is + // transferred to the dentry that is created to point to that inode. If + // Inode.Keep returns false, that new dentry will be dropped at the end of + // the current filesystem operation (before returning back to the VFS + // layer) if no other ref is picked on that dentry. If Inode.Keep returns + // true, then the dentry will be cached into the dentry tree until it is + // Unlink'd or RmDir'd. + Lookup(ctx context.Context, name string) (Inode, error) // IterDirents is used to iterate over dynamically created entries. It invokes // cb on each entry in the directory represented by the Inode. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index e413242dc..82fa19c03 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file." // RootDentryFn is a generator function for creating the root dentry of a test // filesystem. See newTestSystem. -type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry +type RootDentryFn func(*auth.Credentials, *filesystem) kernfs.Inode // newTestSystem sets up a minimal environment for running a test, including an // instance of a test filesystem. Tests can control the contents of the @@ -72,14 +72,11 @@ type file struct { content string } -func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { +func (fs *filesystem) newFile(creds *auth.Credentials, content string) kernfs.Inode { f := &file{} f.content = content f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) - - d := &kernfs.Dentry{} - d.Init(f) - return d + return f } func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { @@ -98,27 +95,23 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S type readonlyDir struct { readonlyDirRefs attrs + kernfs.InodeAlwaysValid kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNoDynamicLookup kernfs.InodeNoStatFS kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - - dentry kernfs.Dentry } -func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &readonlyDir{} dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) dir.EnableLeakCheck() - dir.dentry.Init(dir) - - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { @@ -131,35 +124,33 @@ func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernf return fd.VFSFileDescription(), nil } -func (d *readonlyDir) DecRef(context.Context) { - d.readonlyDirRefs.DecRef(d.Destroy) +func (d *readonlyDir) DecRef(ctx context.Context) { + d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) }) } type dir struct { dirRefs attrs - kernfs.InodeNoDynamicLookup + kernfs.InodeAlwaysValid kernfs.InodeNotSymlink - kernfs.OrderedChildren kernfs.InodeNoStatFS + kernfs.InodeTemporary + kernfs.OrderedChildren locks vfs.FileLocks - fs *filesystem - dentry kernfs.Dentry + fs *filesystem } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &dir{} dir.fs = fs dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) dir.EnableLeakCheck() - dir.dentry.Init(dir) - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { @@ -172,11 +163,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry return fd.VFSFileDescription(), nil } -func (d *dir) DecRef(context.Context) { - d.dirRefs.DecRef(d.Destroy) +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) } -func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) { +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) dir := d.fs.newDir(creds, opts.Mode, nil) if err := d.OrderedChildren.Insert(name, dir); err != nil { @@ -187,7 +178,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (* return dir, nil } -func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) { +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) f := d.fs.newFile(creds, "") if err := d.OrderedChildren.Insert(name, f); err != nil { @@ -197,15 +188,15 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (* return f, nil } -func (*dir) NewLink(context.Context, string, kernfs.Inode) (*kernfs.Dentry, error) { +func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewSymlink(context.Context, string, string) (*kernfs.Dentry, error) { +func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*kernfs.Dentry, error) { +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { return nil, syserror.EPERM } @@ -213,18 +204,22 @@ func (fsType) Name() string { return "kernfs" } +func (fsType) Release(ctx context.Context) {} + func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} fs.VFSFilesystem().Init(vfsObj, &fst, fs) root := fst.rootFn(creds, fs) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var d kernfs.Dentry + d.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), d.VFSDentry(), nil } // -------------------- Remainder of the file are test cases -------------------- func TestBasic(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ -233,8 +228,8 @@ func TestBasic(t *testing.T) { } func TestMkdirGetDentry(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -248,8 +243,8 @@ func TestMkdirGetDentry(t *testing.T) { } func TestReadStaticFile(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ -274,8 +269,8 @@ func TestReadStaticFile(t *testing.T) { } func TestCreateNewFileInStaticDir(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -301,7 +296,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } func TestDirFDReadWrite(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { return fs.newReadonlyDir(creds, 0755, nil) }) defer sys.Destroy() @@ -325,11 +320,11 @@ func TestDirFDReadWrite(t *testing.T) { } func TestDirFDIterDirents(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ // Fill root with nodes backed by various inode implementations. "dir1": fs.newReadonlyDir(creds, 0755, nil), - "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir2": fs.newDir(creds, 0755, map[string]kernfs.Inode{ "dir3": fs.newDir(creds, 0755, nil), }), "file1": fs.newFile(creds, staticFileContent), diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 58a93eaac..934cc6c9e 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -38,13 +38,10 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode { inode := &StaticSymlink{} inode.Init(creds, devMajor, devMinor, ino, target) - - d := &Dentry{} - d.Init(inode) - return d + return inode } // Init initializes the instance. diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go index ea7f073eb..d0ed17b18 100644 --- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -29,24 +29,22 @@ import ( // // +stateify savable type syntheticDirectory struct { + InodeAlwaysValid InodeAttrs InodeNoStatFS - InodeNoopRefCount - InodeNoDynamicLookup InodeNotSymlink OrderedChildren + syntheticDirectoryRefs locks vfs.FileLocks } var _ Inode = (*syntheticDirectory)(nil) -func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *Dentry { +func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) Inode { inode := &syntheticDirectory{} inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm) - d := &Dentry{} - d.Init(inode) - return d + return inode } func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { @@ -69,34 +67,46 @@ func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, } // NewFile implements Inode.NewFile. -func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) { +func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) { +func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { if !opts.ForSyntheticMountpoint { return nil, syserror.EPERM } - subdird := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) - if err := dir.OrderedChildren.Insert(name, subdird); err != nil { - subdird.DecRef(ctx) + subdirI := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) + if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { + subdirI.DecRef(ctx) return nil, err } - return subdird, nil + return subdirI, nil } // NewLink implements Inode.NewLink. -func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) { +func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*Dentry, error) { +func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) { +func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { return nil, syserror.EPERM } + +// DecRef implements Inode.DecRef. +func (dir *syntheticDirectory) DecRef(ctx context.Context) { + dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) }) +} + +// Keep implements Inode.Keep. This is redundant because inodes will never be +// created via Lookup and inodes are always valid. Makes sense to return true +// because these inodes are not temporary and should only be removed on RmDir. +func (dir *syntheticDirectory) Keep() bool { + return true +} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index dfbccd05f..e5f506d2e 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -60,6 +60,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to // FilesystemType.GetFilesystem. // diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 4e2da4810..e44b79b68 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -39,6 +39,9 @@ func (filesystemType) Name() string { return "pipefs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("pipefs.filesystemType.GetFilesystem should never be called") @@ -165,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry - d.Init(inode) + d.Init(&fs.Filesystem, inode) defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 05d7948ea..fd70a07de 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -34,13 +34,14 @@ const Name = "proc" // +stateify savable type FilesystemType struct{} -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // +stateify savable type filesystem struct { kernfs.Filesystem @@ -73,7 +74,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := procfs.newTasksInode(k, pidns, cgroups) + inode := procfs.newTasksInode(k, pidns, cgroups) + var dentry kernfs.Dentry + dentry.Init(&procfs.Filesystem, inode) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } @@ -94,12 +97,9 @@ type dynamicInode interface { Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (fs *filesystem) newInode(creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm) + return inode } // +stateify savable @@ -114,8 +114,8 @@ func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } -func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { - return kernfs.NewStaticDir(creds, devMajor, devMinor, ino, perm, children, kernfs.GenericDirectoryFDOptions{ +func (fs *filesystem) newStaticDir(creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode { + return kernfs.NewStaticDir(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 47ecd941c..bad2fab4f 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -32,10 +32,11 @@ import ( // +stateify savable type subtasksInode struct { implStatFS - kernfs.AlwaysValid + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren subtasksInodeRefs @@ -49,7 +50,7 @@ type subtasksInode struct { var _ kernfs.Inode = (*subtasksInode)(nil) -func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, @@ -62,14 +63,11 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} - dentry := &kernfs.Dentry{} - dentry.Init(inode) - - return dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup.Lookup. -func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -82,10 +80,10 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } - return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers), nil + return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. +// IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { @@ -186,6 +184,6 @@ func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credential } // DecRef implements kernfs.Inode.DecRef. -func (i *subtasksInode) DecRef(context.Context) { - i.subtasksInodeRefs.DecRef(i.Destroy) +func (i *subtasksInode) DecRef(ctx context.Context) { + i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index a7cd6f57e..b63a4eca0 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -35,8 +35,8 @@ type taskInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren taskInodeRefs @@ -47,41 +47,44 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { - // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. - contents := map[string]*kernfs.Dentry{ - "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), - "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + + contents := map[string]kernfs.Inode{ + "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), "comm": fs.newComm(task, fs.NextIno(), 0444), "cwd": fs.newCwdSymlink(task, fs.NextIno()), - "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), "exe": fs.newExeSymlink(task, fs.NextIno()), "fd": fs.newFDDirInode(task), "fdinfo": fs.newFDInfoDirInode(task), - "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}), "net": fs.newTaskNetDir(task), - "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), - "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), - "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} @@ -90,17 +93,15 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} - dentry := &kernfs.Dentry{} - dentry.Init(inode) taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := taskInode.OrderedChildren.Populate(dentry, contents) + links := taskInode.OrderedChildren.Populate(contents) taskInode.IncLinks(links) - return dentry + return inode, nil } -// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long +// Valid implements kernfs.Inode.Valid. This inode remains valid as long // as the task is still running. When it's dead, another tasks with the same // PID could replace it. func (i *taskInode) Valid(ctx context.Context) bool { @@ -124,8 +125,8 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v } // DecRef implements kernfs.Inode.DecRef. -func (i *taskInode) DecRef(context.Context) { - i.taskInodeRefs.DecRef(i.Destroy) +func (i *taskInode) DecRef(ctx context.Context) { + i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // taskOwnedInode implements kernfs.Inode and overrides inode owner with task @@ -141,34 +142,23 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return &taskOwnedInode{Inode: inode, owner: task} } -func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { - dir := &kernfs.StaticDirectory{} - +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ - SeekEnd: kernfs.SeekEndZero, - }) - dir.EnableLeakCheck() - - inode := &taskOwnedInode{Inode: dir, owner: task} - d := &kernfs.Dentry{} - d.Init(inode) + fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero} + dir := kernfs.NewStaticDir(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) - dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := dir.OrderedChildren.Populate(d, children) - dir.IncLinks(links) + return &taskOwnedInode{Inode: dir, owner: task} +} - return d +func (i *taskOwnedInode) Valid(ctx context.Context) bool { + return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx) } // Stat implements kernfs.Inode.Stat. diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 0866cea2b..2c80ac5c2 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -63,7 +63,7 @@ type fdDir struct { produceSymlink bool } -// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. +// IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { @@ -109,16 +109,17 @@ type fdDirInode struct { fdDir fdDirInodeRefs implStatFS - kernfs.AlwaysValid + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren } var _ kernfs.Inode = (*fdDirInode)(nil) -func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode { inode := &fdDirInode{ fdDir: fdDir{ fs: fs, @@ -128,16 +129,17 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.EnableLeakCheck() - - dentry := &kernfs.Dentry{} - dentry.Init(inode) inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + return inode +} - return dentry +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *fdDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return i.fdDir.IterDirents(ctx, cb, offset, relOffset) } -// Lookup implements kernfs.inodeDynamicLookup.Lookup. -func (i *fdDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -183,8 +185,8 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia } // DecRef implements kernfs.Inode.DecRef. -func (i *fdDirInode) DecRef(context.Context) { - i.fdDirInodeRefs.DecRef(i.Destroy) +func (i *fdDirInode) DecRef(ctx context.Context) { + i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. @@ -202,16 +204,13 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ task: task, fd: fd, } inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { @@ -236,6 +235,11 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen return vd, "", nil } +// Valid implements kernfs.Inode.Valid. +func (s *fdSymlink) Valid(ctx context.Context) bool { + return taskFDExists(ctx, s.task, s.fd) +} + // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. // // +stateify savable @@ -243,16 +247,17 @@ type fdInfoDirInode struct { fdDir fdInfoDirInodeRefs implStatFS - kernfs.AlwaysValid + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren } var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode { inode := &fdInfoDirInode{ fdDir: fdDir{ fs: fs, @@ -261,16 +266,12 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.EnableLeakCheck() - - dentry := &kernfs.Dentry{} - dentry.Init(inode) inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - - return dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup.Lookup. -func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -283,7 +284,12 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*kernfs.Dentr task: i.task, fd: fd, } - return i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data), nil + return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil +} + +// IterDirents implements Inode.IterDirents. +func (i *fdInfoDirInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + return i.fdDir.IterDirents(ctx, cb, offset, relOffset) } // Open implements kernfs.Inode.Open. @@ -298,8 +304,8 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *ker } // DecRef implements kernfs.Inode.DecRef. -func (i *fdInfoDirInode) DecRef(context.Context) { - i.fdInfoDirInodeRefs.DecRef(i.Destroy) +func (i *fdInfoDirInode) DecRef(ctx context.Context) { + i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. @@ -328,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } + +// Valid implements kernfs.Inode.Valid. +func (d *fdInfoData) Valid(ctx context.Context) bool { + return taskFDExists(ctx, d.task, d.fd) +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 3fbf081a6..79f8b7e9f 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -247,13 +247,10 @@ type commInode struct { task *kernel.Task } -func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { inode := &commInode{task: task} inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { @@ -658,13 +655,10 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode { inode := &exeSymlink{task: task} inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } // Readlink implements kernfs.Inode.Readlink. @@ -737,13 +731,10 @@ type cwdSymlink struct { var _ kernfs.Inode = (*cwdSymlink)(nil) -func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode { inode := &cwdSymlink{task: task} inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + return inode } // Readlink implements kernfs.Inode.Readlink. @@ -851,7 +842,7 @@ type namespaceSymlink struct { task *kernel.Task } -func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -862,9 +853,7 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return taskInode } // Readlink implements kernfs.Inode.Readlink. @@ -882,11 +871,12 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir } // Create a synthetic inode to represent the namespace. + fs := mnt.Filesystem().Impl().(*filesystem) dentry := &kernfs.Dentry{} - dentry.Init(&namespaceInode{}) + dentry.Init(&fs.Filesystem, &namespaceInode{}) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) - vd.IncRef() - dentry.DecRef(ctx) + // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. + mnt.IncRef() return vd, "", nil } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index e7f748655..3425e8698 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,12 +37,12 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) - var contents map[string]*kernfs.Dentry + var contents map[string]kernfs.Inode if stack := task.NetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" @@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. - contents = map[string]*kernfs.Dentry{ - "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), - "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), + contents = map[string]kernfs.Inode{ + "dev": fs.newInode(root, 0444, &netDevData{stack: stack}), + "snmp": fs.newInode(root, 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. - "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), - "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), - "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), - "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), - "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newInode(root, 0444, newStaticFile(arp)), + "netlink": fs.newInode(root, 0444, newStaticFile(netlink)), + "netstat": fs.newInode(root, 0444, &netStatData{}), + "packet": fs.newInode(root, 0444, newStaticFile(packet)), + "protocols": fs.newInode(root, 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), - "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), - "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newInode(root, 0444, newStaticFile(psched)), + "ptype": fs.newInode(root, 0444, newStaticFile(ptype)), + "route": fs.newInode(root, 0444, &netRouteData{stack: stack}), + "tcp": fs.newInode(root, 0444, &netTCPData{kernel: k}), + "udp": fs.newInode(root, 0444, &netUDPData{kernel: k}), + "unix": fs.newInode(root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) - contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newInode(root, 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newInode(root, 0444, newStaticFile("")) + contents["tcp6"] = fs.newInode(root, 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newInode(root, 0444, newStaticFile(upd6)) } } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index d8f5dd509..3259c3732 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -38,10 +38,11 @@ const ( // +stateify savable type tasksInode struct { implStatFS - kernfs.AlwaysValid + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren tasksInodeRefs @@ -52,8 +53,6 @@ type tasksInode struct { // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. - selfSymlink *kernfs.Dentry - threadSelfSymlink *kernfs.Dentry // cgroupControllers is a map of controller name to directory in the // cgroup hierarchy. These controllers are immutable and will be listed @@ -63,52 +62,53 @@ type tasksInode struct { var _ kernfs.Inode = (*tasksInode)(nil) -func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { +func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode { root := auth.NewRootCredentials(pidns.UserNamespace()) - contents := map[string]*kernfs.Dentry{ - "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), - "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}), - "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}), + contents := map[string]kernfs.Inode{ + "cpuinfo": fs.newInode(root, 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": fs.newInode(root, 0444, &filesystemsData{}), + "loadavg": fs.newInode(root, 0444, &loadavgData{}), "sys": fs.newSysDir(root, k), - "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}), + "meminfo": fs.newInode(root, 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), - "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}), - "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}), - "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}), + "stat": fs.newInode(root, 0444, &statData{}), + "uptime": fs.newInode(root, 0444, &uptimeData{}), + "version": fs.newInode(root, 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, fs: fs, - selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns), - threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns), cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.EnableLeakCheck() - dentry := &kernfs.Dentry{} - dentry.Init(inode) - inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, contents) + links := inode.OrderedChildren.Populate(contents) inode.IncLinks(links) - return inode, dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup.Lookup. -func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, error) { - // Try to lookup a corresponding task. +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + // Check if a static entry was looked up. + if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { + return d, nil + } + + // Not a static entry. Try to lookup a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) if err != nil { + root := auth.NewRootCredentials(i.pidns.UserNamespace()) // If it failed to parse, check if it's one of the special handled files. switch name { case selfName: - return i.selfSymlink, nil + return i.newSelfSymlink(root), nil case threadSelfName: - return i.threadSelfSymlink, nil + return i.newThreadSelfSymlink(root), nil } return nil, syserror.ENOENT } @@ -118,10 +118,10 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*kernfs.Dentry, e return nil, syserror.ENOENT } - return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers), nil + return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup.IterDirents. +// IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 @@ -229,8 +229,8 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St } // DecRef implements kernfs.Inode.DecRef. -func (i *tasksInode) DecRef(context.Context) { - i.tasksInodeRefs.DecRef(i.Destroy) +func (i *tasksInode) DecRef(ctx context.Context) { + i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // staticFileSetStat implements a special static file that allows inode diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index f268c59b0..07c27cdd9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -43,13 +43,10 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &selfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newSelfSymlink(creds *auth.Credentials) kernfs.Inode { + inode := &selfSymlink{pidns: i.pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { @@ -87,13 +84,10 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newThreadSelfSymlink(creds *auth.Credentials) kernfs.Inode { + inode := &threadSelfSymlink{pidns: i.pidns} + inode.Init(creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 3312b0418..95420368d 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -40,93 +40,93 @@ const ( ) // newSysDir returns the dentry corresponding to /proc/sys directory. -func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { - return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), - "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), - "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), - "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), +func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { + return fs.newStaticDir(root, map[string]kernfs.Inode{ + "kernel": fs.newStaticDir(root, map[string]kernfs.Inode{ + "hostname": fs.newInode(root, 0444, &hostnameData{}), + "shmall": fs.newInode(root, 0444, shmData(linux.SHMALL)), + "shmmax": fs.newInode(root, 0444, shmData(linux.SHMMAX)), + "shmmni": fs.newInode(root, 0444, shmData(linux.SHMMNI)), }), - "vm": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), - "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), + "vm": fs.newStaticDir(root, map[string]kernfs.Inode{ + "mmap_min_addr": fs.newInode(root, 0444, &mmapMinAddrData{k: k}), + "overcommit_memory": fs.newInode(root, 0444, newStaticFile("0\n")), }), "net": fs.newSysNetDir(root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. -func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { - var contents map[string]*kernfs.Dentry +func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { + var contents map[string]kernfs.Inode // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { - contents = map[string]*kernfs.Dentry{ - "ipv4": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}), - "tcp_rmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}), - "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), - "tcp_wmem": fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}), - "ip_forward": fs.newDentry(root, fs.NextIno(), 0444, &ipForwarding{stack: stack}), + contents = map[string]kernfs.Inode{ + "ipv4": fs.newStaticDir(root, map[string]kernfs.Inode{ + "tcp_recovery": fs.newInode(root, 0644, &tcpRecoveryData{stack: stack}), + "tcp_rmem": fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), + "tcp_sack": fs.newInode(root, 0644, &tcpSackData{stack: stack}), + "tcp_wmem": fs.newInode(root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), + "ip_forward": fs.newInode(root, 0444, &ipForwarding{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). - "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")), - "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")), - "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "ip_local_port_range": fs.newInode(root, 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": fs.newInode(root, 0444, newStaticFile("")), + "ipfrag_time": fs.newInode(root, 0444, newStaticFile("30")), + "ip_nonlocal_bind": fs.newInode(root, 0444, newStaticFile("0")), + "ip_no_pmtu_disc": fs.newInode(root, 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. - "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), - "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), + "tcp_allowed_congestion_control": fs.newInode(root, 0444, newStaticFile("")), + "tcp_available_congestion_control": fs.newInode(root, 0444, newStaticFile("reno")), + "tcp_congestion_control": fs.newInode(root, 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. - "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newInode(root, 0444, newStaticFile("1280")), + "tcp_dsack": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fack": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fastopen": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newInode(root, 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newInode(root, 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newInode(root, 0444, newStaticFile("0")), + "tcp_retries1": fs.newInode(root, 0444, newStaticFile("3")), + "tcp_retries2": fs.newInode(root, 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newInode(root, 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newInode(root, 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newInode(root, 0444, newStaticFile("3")), + "tcp_timestamps": fs.newInode(root, 0444, newStaticFile("1")), }), - "core": newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), - "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), - "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "core": fs.newStaticDir(root, map[string]kernfs.Inode{ + "default_qdisc": fs.newInode(root, 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newInode(root, 0444, newStaticFile("10")), + "message_cost": fs.newInode(root, 0444, newStaticFile("5")), + "optmem_max": fs.newInode(root, 0444, newStaticFile("0")), + "rmem_default": fs.newInode(root, 0444, newStaticFile("212992")), + "rmem_max": fs.newInode(root, 0444, newStaticFile("212992")), + "somaxconn": fs.newInode(root, 0444, newStaticFile("128")), + "wmem_default": fs.newInode(root, 0444, newStaticFile("212992")), + "wmem_max": fs.newInode(root, 0444, newStaticFile("212992")), }), } } - return newStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) + return fs.newStaticDir(root, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6975af5a7..2582ababd 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -109,9 +109,12 @@ func setup(t *testing.T) *testutil.System { if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } + root := mntns.Root() + root.IncRef() + defer root.DecRef(ctx) pop := &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { @@ -119,8 +122,8 @@ func setup(t *testing.T) *testutil.System { } pop = &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } mntOpts := &vfs.MountOptions{ diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index 067c1657f..adb610213 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/kernel", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index bf11b425a..10f1452ef 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -16,7 +16,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -95,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -105,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. + err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 29e5371d6..cf91ea36c 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -46,6 +46,9 @@ func (filesystemType) Name() string { return "sockfs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // +stateify savable type filesystem struct { kernfs.Filesystem @@ -114,6 +117,6 @@ func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} - d.Init(i) + d.Init(&fs.Filesystem, i) return d.VFSDentry() } diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index b75d70ae6..94366d429 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -27,12 +27,10 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) *kernfs.Dentry { +func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode { k := &kcovInode{} k.InodeAttrs.Init(creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600) - d := &kernfs.Dentry{} - d.Init(k) - return d + return k } // kcovInode implements kernfs.Inode. @@ -104,7 +102,7 @@ func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) erro func (fd *kcovFD) Release(ctx context.Context) { // kcov instances have reference counts in Linux, but this seems sufficient // for our purposes. - fd.kcov.Reset() + fd.kcov.Clear() } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1568c581f..1ad679830 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -52,6 +52,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -64,15 +67,15 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + root := fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "block": fs.newDir(creds, defaultSysDirMode, nil), "bus": fs.newDir(creds, defaultSysDirMode, nil), - "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "class": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "power_supply": fs.newDir(creds, defaultSysDirMode, nil), }), "dev": fs.newDir(creds, defaultSysDirMode, nil), - "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "devices": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ + "system": fs.newDir(creds, defaultSysDirMode, map[string]kernfs.Inode{ "cpu": cpuDir(ctx, fs, creds), }), }), @@ -82,13 +85,15 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt "module": fs.newDir(creds, defaultSysDirMode, nil), "power": fs.newDir(creds, defaultSysDirMode, nil), }) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), rootD.VFSDentry(), nil } -func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry { +func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() - children := map[string]*kernfs.Dentry{ + children := map[string]kernfs.Inode{ "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), @@ -99,14 +104,14 @@ func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernf return fs.newDir(creds, defaultSysDirMode, children) } -func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry { +func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { // If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs // should be mounted at debug/, but for our purposes, it is sufficient to // keep it in sys. - var children map[string]*kernfs.Dentry + var children map[string]kernfs.Inode if coverage.KcovAvailable() { - children = map[string]*kernfs.Dentry{ - "debug": fs.newDir(creds, linux.FileMode(0700), map[string]*kernfs.Dentry{ + children = map[string]kernfs.Inode{ + "debug": fs.newDir(creds, linux.FileMode(0700), map[string]kernfs.Inode{ "kcov": fs.newKcovFile(ctx, creds), }), } @@ -125,27 +130,23 @@ func (fs *filesystem) Release(ctx context.Context) { // +stateify savable type dir struct { dirRefs + kernfs.InodeAlwaysValid kernfs.InodeAttrs - kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - - dentry kernfs.Dentry } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { d := &dir{} d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.EnableLeakCheck() - d.dentry.Init(d) - - d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) - - return &d.dentry + d.IncLinks(d.OrderedChildren.Populate(contents)) + return d } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. @@ -165,8 +166,8 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry } // DecRef implements kernfs.Inode.DecRef. -func (d *dir) DecRef(context.Context) { - d.dirRefs.DecRef(d.Destroy) +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. @@ -190,12 +191,10 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { c := &cpuFile{maxCores: maxCores} c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) - d := &kernfs.Dentry{} - d.Init(c) - return d + return c } // +stateify savable diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 568132121..1a8525b06 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -46,16 +46,18 @@ type System struct { // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on MntNs, whose ownership +// Precondition: Caller must hold a reference on mns, whose ownership // is transferred to the new System. func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { + root := mns.Root() + root.IncRef() s := &System{ t: t, Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, MntNs: mns, - Root: mns.Root(), + Root: root, } return s } @@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error { defer d.mu.Unlock() dirent, ok := d.dirents[name] if !ok { - return fmt.Errorf("No dirent named %q found", name) + return fmt.Errorf("no dirent named %q found", name) } if dirent.Type != typ { - return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) + return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) } return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 5209a17af..3cc63e732 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create nested directories with given depth. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) vd := root vd.IncRef() @@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create the mount point. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index be29a2363..2f856ce36 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -165,6 +165,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create the pipe. root := mntns.Root() + root.IncRef() pop := vfs.PathOperation{ Root: root, Start: root, diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index a199eb33d..ce4e3eda7 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -293,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index cefec8fde..e2a0aac69 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -74,6 +74,8 @@ type filesystem struct { mu sync.RWMutex `state:"nosave"` nextInoMinusOne uint64 // accessed using atomic memory operations + + root *dentry } // Name implements vfs.FilesystemType.Name. @@ -81,6 +83,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // FilesystemOpts is used to pass configuration data to tmpfs. // // +stateify savable @@ -194,6 +199,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } + fs.root = root return &fs.vfsfs, &root.vfsd, nil } @@ -205,6 +211,37 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.mu.Lock() + if fs.root.inode.isDir() { + fs.root.releaseChildrenLocked(ctx) + } + fs.mu.Unlock() +} + +// releaseChildrenLocked is called on the mount point by filesystem.Release() to +// destroy all objects in the mount. It performs a depth-first walk of the +// filesystem and "unlinks" everything by decrementing link counts +// appropriately. There should be no open file descriptors when this is called, +// so each inode should only have one outstanding reference that is removed once +// its link count hits zero. +// +// Note that we do not update filesystem state precisely while tearing down (for +// instance, the child maps are ignored)--we only care to remove all remaining +// references so that every filesystem object gets destroyed. Also note that we +// do not need to trigger DecRef on the mount point itself or any child mount; +// these are taken care of by the destructor of the enclosing MountNamespace. +// +// Precondition: filesystem.mu is held. +func (d *dentry) releaseChildrenLocked(ctx context.Context) { + dir := d.inode.impl.(*directory) + for _, child := range dir.childMap { + if child.inode.isDir() { + child.releaseChildrenLocked(ctx) + child.inode.decLinksLocked(ctx) // link for child/. + dir.inode.decLinksLocked(ctx) // link for child/.. + } + child.inode.decLinksLocked(ctx) // link for child + } } // immutable diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index 99c8e3c0f..fc5323abc 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -46,6 +46,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + root.IncRef() return vfsObj, root, func() { root.DecRef(ctx) mntns.DecRef(ctx) diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index bc8e38431..0ca750281 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") licenses(["notice"]) @@ -26,3 +26,22 @@ go_library( "//pkg/usermem", ], ) + +go_test( + name = "verity_test", + srcs = [ + "verity_test.go", + ], + library = ":verity", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 26b117ca4..3b3c8725f 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -20,6 +20,7 @@ import ( "io" "strconv" "strings" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -155,11 +156,10 @@ afterSymlink: return child, nil } -// verifyChild verifies the root hash of child against the already verified -// root hash of the parent to ensure the child is expected. verifyChild -// triggers a sentry panic if unexpected modifications to the file system are -// detected. In noCrashOnVerificationFailure mode it returns a syserror -// instead. +// verifyChild verifies the hash of child against the already verified hash of +// the parent to ensure the child is expected. verifyChild triggers a sentry +// panic if unexpected modifications to the file system are detected. In +// noCrashOnVerificationFailure mode it returns a syserror instead. // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. // TODO(b/166474175): Investigate all possible errors returned in this // function, and make sure we differentiate all errors that indicate unexpected @@ -174,12 +174,12 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de return nil, err } - verityMu.RLock() - defer verityMu.RUnlock() + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() // Read the offset of the child from the extended attributes of the // corresponding Merkle tree file. - // This is the offset of the root hash for child in its parent's Merkle - // tree file. + // This is the offset of the hash for child in its parent's Merkle tree + // file. off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ Root: child.lowerMerkleVD, Start: child.lowerMerkleVD, @@ -204,7 +204,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) } - // Open parent Merkle tree file to read and verify child's root hash. + // Open parent Merkle tree file to read and verify child's hash. parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ Root: parent.lowerMerkleVD, Start: parent.lowerMerkleVD, @@ -223,7 +223,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // dataSize is the size of raw data for the Merkle tree. For a file, // dataSize is the size of the whole file. For a directory, dataSize is - // the size of all its children's root hashes. + // the size of all its children's hashes. dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, Size: sizeOfStringInt32, @@ -251,32 +251,152 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de Ctx: ctx, } + parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + }, &vfs.StatOptions{}) + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(err, fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err)) + } + if err != nil { + return nil, err + } + // Since we are verifying against a directory Merkle tree, buf should - // contain the root hash of the children in the parent Merkle tree when + // contain the hash of the children in the parent Merkle tree when // Verify returns with success. var buf bytes.Buffer - if _, err := merkletree.Verify(&buf, &fdReader, &fdReader, int64(parentSize), int64(offset), int64(merkletree.DigestSize()), parent.rootHash, true /* dataAndTreeInSameFile */); err != nil && err != io.EOF { + if _, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: &buf, + File: &fdReader, + Tree: &fdReader, + Size: int64(parentSize), + Name: parent.name, + Mode: uint32(parentStat.Mode), + UID: parentStat.UID, + GID: parentStat.GID, + ReadOffset: int64(offset), + ReadSize: int64(merkletree.DigestSize()), + Expected: parent.hash, + DataAndTreeInSameFile: true, + }); err != nil && err != io.EOF { return nil, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification for %s failed: %v", childPath, err)) } - // Cache child root hash when it's verified the first time. - if len(child.rootHash) == 0 { - child.rootHash = buf.Bytes() + // Cache child hash when it's verified the first time. + if len(child.hash) == 0 { + child.hash = buf.Bytes() } return child, nil } +// verifyStat verifies the stat against the verified hash. The mode/uid/gid of +// the file is cached after verified. +func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + + fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err == syserror.ENOENT { + return alertIntegrityViolation(err, fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err)) + } + if err != nil { + return err + } + + merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(err, fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err)) + } + if err != nil { + return err + } + + size, err := strconv.Atoi(merkleSize) + if err != nil { + return alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + } + + fdReader := vfs.FileReadWriteSeeker{ + FD: fd, + Ctx: ctx, + } + + var buf bytes.Buffer + params := &merkletree.VerifyParams{ + Out: &buf, + Tree: &fdReader, + Size: int64(size), + Name: d.name, + Mode: uint32(stat.Mode), + UID: stat.UID, + GID: stat.GID, + ReadOffset: 0, + // Set read size to 0 so only the metadata is verified. + ReadSize: 0, + Expected: d.hash, + DataAndTreeInSameFile: false, + } + if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { + params.DataAndTreeInSameFile = true + } + + if _, err := merkletree.Verify(params); err != nil && err != io.EOF { + return alertIntegrityViolation(err, fmt.Sprintf("Verification stat for %s failed: %v", childPath, err)) + } + d.mode = uint32(stat.Mode) + d.uid = stat.UID + d.gid = stat.GID + return nil +} + // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, ok := parent.children[name]; ok { // If enabling verification on files/directories is not allowed // during runtime, all cached children are already verified. If // runtime enable is allowed and the parent directory is - // enabled, we should verify the child root hash here because - // it may be cached before enabled. - if fs.allowRuntimeEnable && len(parent.rootHash) != 0 { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { - return nil, err + // enabled, we should verify the child hash here because it may + // be cached before enabled. + if fs.allowRuntimeEnable { + if isEnabled(parent) { + if _, err := fs.verifyChild(ctx, parent, child); err != nil { + return nil, err + } + } + if isEnabled(child) { + vfsObj := fs.vfsfs.VirtualFilesystem() + mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerVD, + Start: child.lowerVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + return nil, err + } + if err := fs.verifyStat(ctx, child, stat); err != nil { + return nil, err + } } } return child, nil @@ -360,9 +480,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // file open and ready to use. // This may cause empty and unused Merkle tree files in // allowRuntimeEnable mode, if they are never enabled. This - // does not affect verification, as we rely on cached root hash - // to decide whether to perform verification, not the existence - // of the Merkle tree file. Also, those Merkle tree files are + // does not affect verification, as we rely on cached hash to + // decide whether to perform verification, not the existence of + // the Merkle tree file. Also, those Merkle tree files are // always hidden and cannot be accessed by verity fs users. if fs.allowRuntimeEnable { childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ @@ -426,20 +546,25 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, child.parent = parent child.name = name - // TODO(b/162788573): Verify child metadata. child.mode = uint32(stat.Mode) child.uid = stat.UID child.gid = stat.GID - // Verify child root hash. This should always be performed unless in + // Verify child hash. This should always be performed unless in // allowRuntimeEnable mode and the parent directory hasn't been enabled // yet. - if !(fs.allowRuntimeEnable && len(parent.rootHash) == 0) { + if isEnabled(parent) { if _, err := fs.verifyChild(ctx, parent, child); err != nil { child.destroyLocked(ctx) return nil, err } } + if isEnabled(child) { + if err := fs.verifyStat(ctx, child, stat); err != nil { + child.destroyLocked(ctx) + return nil, err + } + } return child, nil } @@ -693,22 +818,24 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // be called if a verity FD is created successfully. defer merkleWriter.DecRef(ctx) - parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.parent.lowerMerkleVD, - Start: d.parent.lowerMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_WRONLY | linux.O_APPEND, - }) - if err != nil { - if err == syserror.ENOENT { - parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) - return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + if d.parent != nil { + parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.parent.lowerMerkleVD, + Start: d.parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) + return nil, alertIntegrityViolation(err, fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + } + return nil, err } - return nil, err + // parentMerkleWriter is cleaned up if any error occurs. IncRef + // will be called if a verity FD is created successfully. + defer parentMerkleWriter.DecRef(ctx) } - // parentMerkleWriter is cleaned up if any error occurs. IncRef - // will be called if a verity FD is created successfully. - defer parentMerkleWriter.DecRef(ctx) } fd := &fileDescription{ @@ -769,6 +896,8 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } // StatAt implements vfs.FilesystemImpl.StatAt. +// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should +// be verified. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() @@ -786,6 +915,11 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + if isEnabled(d) { + if err := fs.verifyStat(ctx, d, stat); err != nil { + return linux.Statx{}, err + } + } return stat, nil } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 3129f290d..70034280b 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -49,12 +49,12 @@ const Name = "verity" const merklePrefix = ".merkle.verity." // merkleoffsetInParentXattr is the extended attribute name specifying the -// offset of child root hash in its parent's Merkle tree. +// offset of child hash in its parent's Merkle tree. const merkleOffsetInParentXattr = "user.merkle.offset" // merkleSizeXattr is the extended attribute name specifying the size of data // hashed by the corresponding Merkle tree. For a file, it's the size of the -// whole file. For a directory, it's the size of all its children's root hashes. +// whole file. For a directory, it's the size of all its children's hashes. const merkleSizeXattr = "user.merkle.size" // sizeOfStringInt32 is the size for a 32 bit integer stored as string in @@ -68,11 +68,6 @@ const sizeOfStringInt32 = 10 // flag. var noCrashOnVerificationFailure bool -// verityMu synchronizes enabling verity files, protects files or directories -// from being enabled by different threads simultaneously. It also ensures that -// verity does not access files that are being enabled. -var verityMu sync.RWMutex - // FilesystemType implements vfs.FilesystemType. // // +stateify savable @@ -106,6 +101,17 @@ type filesystem struct { // to ensure consistent lock ordering between dentry.dirMu in different // dentries. renameMu sync.RWMutex `state:"nosave"` + + // verityMu synchronizes enabling verity files, protects files or + // directories from being enabled by different threads simultaneously. + // It also ensures that verity does not access files that are being + // enabled. + // + // Also, the directory Merkle trees depends on the generated trees of + // its children. So they shouldn't be enabled the same time. This lock + // is for the whole file system to ensure that no more than one file is + // enabled the same time. + verityMu sync.RWMutex } // InternalFilesystemOptions may be passed as @@ -142,6 +148,17 @@ func (FilesystemType) Name() string { return Name } +// isEnabled checks whether the target is enabled with verity features. It +// should always be true if runtime enable is not allowed. In runtime enable +// mode, it returns true if the target has been enabled with +// ioctl(FS_IOC_ENABLE_VERITY). +func isEnabled(d *dentry) bool { + return !d.fs.allowRuntimeEnable || len(d.hash) != 0 +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // alertIntegrityViolation alerts a violation of integrity, which usually means // unexpected modification to the file system is detected. In // noCrashOnVerificationFailure mode, it returns an error, otherwise it panic. @@ -245,13 +262,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, err } - // TODO(b/162788573): Verify Metadata. d.mode = uint32(stat.Mode) d.uid = stat.UID d.gid = stat.GID + d.hash = make([]byte, len(iopts.RootHash)) - d.rootHash = make([]byte, len(iopts.RootHash)) - copy(d.rootHash, iopts.RootHash) + if !fs.allowRuntimeEnable { + if err := fs.verifyStat(ctx, d, stat); err != nil { + return nil, nil, err + } + } + + copy(d.hash, iopts.RootHash) d.vfsd.Init(d) fs.rootDentry = d @@ -303,8 +325,8 @@ type dentry struct { // in the underlying file system. lowerMerkleVD vfs.VirtualDentry - // rootHash is the rootHash for the current file or directory. - rootHash []byte + // hash is the calculated hash for the current file or directory. + hash []byte } // newDentry creates a new dentry representing the given verity file. The @@ -488,6 +510,11 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu if err != nil { return linux.Statx{}, err } + if isEnabled(fd.d) { + if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil { + return linux.Statx{}, err + } + } return stat, nil } @@ -498,11 +525,11 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // generateMerkle generates a Merkle tree file for fd. If fd points to a file -// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The root -// hash of the generated Merkle tree and the data size is returned. -// If fd points to a regular file, the data is the content of the file. If fd -// points to a directory, the data is all root hahes of its children, written -// to the Merkle tree file. +// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash +// of the generated Merkle tree and the data size is returned. If fd points to +// a regular file, the data is the content of the file. If fd points to a +// directory, the data is all hahes of its children, written to the Merkle tree +// file. func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { fdReader := vfs.FileReadWriteSeeker{ FD: fd.lowerFD, @@ -516,8 +543,10 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, FD: fd.merkleWriter, Ctx: ctx, } - var rootHash []byte - var dataSize uint64 + params := &merkletree.GenerateParams{ + TreeReader: &merkleReader, + TreeWriter: &merkleWriter, + } switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT { case linux.S_IFREG: @@ -528,75 +557,90 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, if err != nil { return nil, 0, err } - dataSize = stat.Size - rootHash, err = merkletree.Generate(&fdReader, int64(dataSize), &merkleReader, &merkleWriter, false /* dataAndTreeInSameFile */) - if err != nil { - return nil, 0, err - } + params.File = &fdReader + params.Size = int64(stat.Size) + params.Name = fd.d.name + params.Mode = uint32(stat.Mode) + params.UID = stat.UID + params.GID = stat.GID + params.DataAndTreeInSameFile = false case linux.S_IFDIR: - // For a directory, generate a Merkle tree based on the root - // hashes of its children that has already been written to the - // Merkle tree file. + // For a directory, generate a Merkle tree based on the hashes + // of its children that has already been written to the Merkle + // tree file. merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{}) if err != nil { return nil, 0, err } - dataSize = merkleStat.Size - rootHash, err = merkletree.Generate(&merkleReader, int64(dataSize), &merkleReader, &merkleWriter, true /* dataAndTreeInSameFile */) + params.Size = int64(merkleStat.Size) + + stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{}) if err != nil { return nil, 0, err } + + params.File = &merkleReader + params.Name = fd.d.name + params.Mode = uint32(stat.Mode) + params.UID = stat.UID + params.GID = stat.GID + params.DataAndTreeInSameFile = true default: // TODO(b/167728857): Investigate whether and how we should // enable other types of file. return nil, 0, syserror.EINVAL } - return rootHash, dataSize, nil + hash, err := merkletree.Generate(params) + return hash, uint64(params.Size), err } // enableVerity enables verity features on fd by generating a Merkle tree file -// and stores its root hash in its parent directory's Merkle tree. -func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { +// and stores its hash in its parent directory's Merkle tree. +func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) { if !fd.d.fs.allowRuntimeEnable { return 0, syserror.EPERM } - // Lock to prevent other threads performing enable or access the file - // while it's being enabled. - verityMu.Lock() - defer verityMu.Unlock() + fd.d.fs.verityMu.Lock() + defer fd.d.fs.verityMu.Unlock() - if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || fd.parentMerkleWriter == nil { + // In allowRuntimeEnable mode, the underlying fd and read/write fd for + // the Merkle tree file should have all been initialized. For any file + // or directory other than the root, the parent Merkle tree file should + // have also been initialized. + if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) { return 0, alertIntegrityViolation(syserror.EIO, "Unexpected verity fd: missing expected underlying fds") } - rootHash, dataSize, err := fd.generateMerkle(ctx) + hash, dataSize, err := fd.generateMerkle(ctx) if err != nil { return 0, err } - stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) - if err != nil { - return 0, err - } + if fd.parentMerkleWriter != nil { + stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } - // Write the root hash of fd to the parent directory's Merkle tree - // file, as it should be part of the parent Merkle tree data. - // parentMerkleWriter is open with O_APPEND, so it should write - // directly to the end of the file. - if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(rootHash), vfs.WriteOptions{}); err != nil { - return 0, err - } + // Write the hash of fd to the parent directory's Merkle tree + // file, as it should be part of the parent Merkle tree data. + // parentMerkleWriter is open with O_APPEND, so it should write + // directly to the end of the file. + if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil { + return 0, err + } - // Record the offset of the root hash of fd in parent directory's - // Merkle tree file. - if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ - Name: merkleOffsetInParentXattr, - Value: strconv.Itoa(int(stat.Size)), - }); err != nil { - return 0, err + // Record the offset of the hash of fd in parent directory's + // Merkle tree file. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleOffsetInParentXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return 0, err + } } // Record the size of the data being hashed for fd. @@ -606,22 +650,59 @@ func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO, arg }); err != nil { return 0, err } - fd.d.rootHash = append(fd.d.rootHash, rootHash...) + fd.d.hash = append(fd.d.hash, hash...) return 0, nil } -func (fd *fileDescription) getFlags(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { +// measureVerity returns the hash of fd, saved in verityDigest. +func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + var metadata linux.DigestMetadata + + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that + // verity is not enabled for the file. If allowRuntimeEnable is false, + // this is an integrity violation because all files should have verity + // enabled, in which case fd.d.hash should be set. + if len(fd.d.hash) == 0 { + if fd.d.fs.allowRuntimeEnable { + return 0, syserror.ENODATA + } + return 0, alertIntegrityViolation(syserror.ENODATA, "Ioctl measureVerity: no hash found") + } + + // The first part of VerityDigest is the metadata. + if _, err := metadata.CopyIn(t, verityDigest); err != nil { + return 0, err + } + if metadata.DigestSize < uint16(len(fd.d.hash)) { + return 0, syserror.EOVERFLOW + } + + // Populate the output digest size, since DigestSize is both input and + // output. + metadata.DigestSize = uint16(len(fd.d.hash)) + + // First copy the metadata. + if _, err := metadata.CopyOut(t, verityDigest); err != nil { + return 0, err + } + + // Now copy the root hash bytes to the memory after metadata. + _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash) + return 0, err +} + +func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) { f := int32(0) - // All enabled files should store a root hash. This flag is not settable - // via FS_IOC_SETFLAGS. - if len(fd.d.rootHash) != 0 { + // All enabled files should store a hash. This flag is not settable via + // FS_IOC_SETFLAGS. + if len(fd.d.hash) != 0 { f |= linux.FS_VERITY_FL } t := kernel.TaskFromContext(ctx) - addr := args[2].Pointer() - _, err := primitive.CopyInt32Out(t, addr, f) + _, err := primitive.CopyInt32Out(t, flags, f) return 0, err } @@ -629,11 +710,15 @@ func (fd *fileDescription) getFlags(ctx context.Context, uio usermem.IO, args ar func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FS_IOC_ENABLE_VERITY: - return fd.enableVerity(ctx, uio, args) + return fd.enableVerity(ctx, uio) + case linux.FS_IOC_MEASURE_VERITY: + return fd.measureVerity(ctx, uio, args[2].Pointer()) case linux.FS_IOC_GETFLAGS: - return fd.getFlags(ctx, uio, args) + return fd.verityFlags(ctx, uio, args[2].Pointer()) default: - return fd.lowerFD.Ioctl(ctx, uio, args) + // TODO(b/169682228): Investigate which ioctl commands should + // be allowed. + return 0, syserror.ENOSYS } } @@ -641,10 +726,12 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // No need to verify if the file is not enabled yet in // allowRuntimeEnable mode. - if fd.d.fs.allowRuntimeEnable && len(fd.d.rootHash) == 0 { + if !isEnabled(fd.d) { return fd.lowerFD.PRead(ctx, dst, offset, opts) } + fd.d.fs.verityMu.RLock() + defer fd.d.fs.verityMu.RUnlock() // dataSize is the size of the whole file. dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, @@ -678,9 +765,22 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Ctx: ctx, } - n, err := merkletree.Verify(dst.Writer(ctx), &dataReader, &merkleReader, int64(size), offset, dst.NumBytes(), fd.d.rootHash, false /* dataAndTreeInSameFile */) + n, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: dst.Writer(ctx), + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + ReadOffset: offset, + ReadSize: dst.NumBytes(), + Expected: fd.d.hash, + DataAndTreeInSameFile: false, + }) if err != nil { - return 0, alertIntegrityViolation(syserror.EINVAL, fmt.Sprintf("Verification failed: %v", err)) + return 0, alertIntegrityViolation(syserror.EIO, fmt.Sprintf("Verification failed: %v", err)) } return n, err } diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go new file mode 100644 index 000000000..e301d35f5 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -0,0 +1,491 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "fmt" + "io" + "math/rand" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// rootMerkleFilename is the name of the root Merkle tree file. +const rootMerkleFilename = "root.verity" + +// maxDataSize is the maximum data size written to the file for test. +const maxDataSize = 100000 + +// newVerityRoot creates a new verity mount, and returns the root. The +// underlying file system is tmpfs. If the error is not nil, then cleanup +// should be called when the root is no longer needed. +func newVerityRoot(ctx context.Context, t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, error) { + rand.Seed(time.Now().UnixNano()) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(ctx); err != nil { + return nil, vfs.VirtualDentry{}, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: InternalFilesystemOptions{ + RootMerkleFileName: rootMerkleFilename, + LowerName: "tmpfs", + AllowRuntimeEnable: true, + NoCrashOnVerificationFailure: true, + }, + }, + }) + if err != nil { + return nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace: %v", err) + } + root := mntns.Root() + root.IncRef() + t.Helper() + t.Cleanup(func() { + root.DecRef(ctx) + mntns.DecRef(ctx) + }) + return vfsObj, root, nil +} + +// newFileFD creates a new file in the verity mount, and returns the FD. The FD +// points to a file that has random data generated. +func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) { + creds := auth.CredentialsFromContext(ctx) + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + + // Create the file in the underlying file system. + lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + return nil, 0, err + } + + // Generate random data to be written to the file. + dataSize := rand.Intn(maxDataSize) + 1 + data := make([]byte, dataSize) + rand.Read(data) + + // Write directly to the underlying FD, since verity FD is read-only. + n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + return nil, 0, err + } + + if n != int64(len(data)) { + return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data)) + } + + lowerFD.DecRef(ctx) + + // Now open the verity file descriptor. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular | mode, + }) + return fd, dataSize, err +} + +// corruptRandomBit randomly flips a bit in the file represented by fd. +func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error { + // Flip a random bit in the underlying file. + randomPos := int64(rand.Intn(size)) + byteToModify := make([]byte, 1) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil { + return fmt.Errorf("lowerFD.PRead: %v", err) + } + byteToModify[0] ^= 1 + if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil { + return fmt.Errorf("lowerFD.PWrite: %v", err) + } + return nil +} + +// TestOpen ensures that when a file is created, the corresponding Merkle tree +// file and the root Merkle tree file exist. +func TestOpen(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Ensure that the corresponding Merkle tree file is created. + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err) + } + + // Ensure the root merkle tree file is created. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + rootMerkleFilename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err) + } +} + +// TestUnmodifiedFileSucceeds ensures that read from an untouched verity file +// succeeds after enabling verity for it. +func TestReadUnmodifiedFileSucceeds(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirm a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.PRead: %v", err) + } + + if n != int64(size) { + t.Errorf("fd.PRead got read length %d, want %d", n, size) + } +} + +// TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file +// succeeds after enabling verity for it. +func TestReopenUnmodifiedFileSucceeds(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirms a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Ensure reopening the verity enabled file succeeds. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != nil { + t.Errorf("reopen enabled file failed: %v", err) + } +} + +// TestModifiedFileFails ensures that read from a modified verity file fails. +func TestModifiedFileFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerFD that's read/writable. + lowerVD := fd.Impl().(*fileDescription).d.lowerVD + + lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if err := corruptRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + // Confirm that read from the modified file fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + t.Fatalf("fd.PRead succeeded with modified file") + } +} + +// TestModifiedMerkleFails ensures that read from a verity file fails if the +// corresponding Merkle tree file is modified. +func TestModifiedMerkleFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerMerkleFD that's read/writable. + lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD + + lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + // Flip a random bit in the Merkle tree file. + stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat: %v", err) + } + merkleSize := int(stat.Size) + if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + // Confirm that read from a file with modified Merkle tree fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + fmt.Println(buf) + t.Fatalf("fd.PRead succeeded with modified Merkle file") + } +} + +// TestModifiedParentMerkleFails ensures that open a verity enabled file in a +// verity enabled directory fails if the hashes related to the target file in +// the parent Merkle tree file is modified. +func TestModifiedParentMerkleFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Enable verity on the parent directory. + parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerMerkleFD that's read/writable. + parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD + + parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: parentLowerMerkleVD, + Start: parentLowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + // Flip a random bit in the parent Merkle tree file. + // This parent directory contains only one child, so any random + // modification in the parent Merkle tree should cause verification + // failure when opening the child file. + stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat: %v", err) + } + parentMerkleSize := int(stat.Size) + if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + parentLowerMerkleFD.DecRef(ctx) + + // Ensure reopening the verity enabled file fails. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err == nil { + t.Errorf("OpenAt file with modified parent Merkle succeeded") + } +} + +// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file +// succeeds after enabling verity for it. +func TestUnmodifiedStatSucceeds(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirms stat succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("fd.Ioctl: %v", err) + } + + if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil { + t.Errorf("fd.Stat: %v", err) + } +} + +// TestModifiedStatFails checks that getting stat for a file with modified stat +// should fail. +func TestModifiedStatFails(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj, root, err := newVerityRoot(ctx, t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("fd.Ioctl: %v", err) + } + + lowerFD := fd.Impl().(*fileDescription).lowerFD + // Change the stat of the underlying file, and check that stat fails. + if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: uint32(linux.STATX_MODE), + Mode: 0777, + }, + }); err != nil { + t.Fatalf("lowerFD.SetStat: %v", err) + } + + if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil { + t.Errorf("fd.Stat succeeded when it should fail") + } +} |