diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode_new.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode_old.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode_test.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/superblock_old.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext.go | 48 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/extent_test.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/inode.go | 71 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/utils.go | 6 |
8 files changed, 110 insertions, 29 deletions
diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fs/ext/disklayout/inode_new.go index 4f5348372..8f9f574ce 100644 --- a/pkg/sentry/fs/ext/disklayout/inode_new.go +++ b/pkg/sentry/fs/ext/disklayout/inode_new.go @@ -62,7 +62,7 @@ func (in *InodeNew) Size() uint64 { // InodeSize implements Inode.InodeSize. func (in *InodeNew) InodeSize() uint16 { - return oldInodeSize + in.ExtraInodeSize + return OldInodeSize + in.ExtraInodeSize } // ChangeTime implements Inode.ChangeTime. diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go index 7d7cc9143..db25b11b6 100644 --- a/pkg/sentry/fs/ext/disklayout/inode_old.go +++ b/pkg/sentry/fs/ext/disklayout/inode_old.go @@ -21,8 +21,8 @@ import ( ) const ( - // oldInodeSize is the inode size in ext2/ext3. - oldInodeSize = 128 + // OldInodeSize is the inode size in ext2/ext3. + OldInodeSize = 128 ) // InodeOld implements Inode interface. It emulates ext2/ext3 inode struct. @@ -85,7 +85,7 @@ func (in *InodeOld) Size() uint64 { } // InodeSize implements Inode.InodeSize. -func (in *InodeOld) InodeSize() uint16 { return oldInodeSize } +func (in *InodeOld) InodeSize() uint16 { return OldInodeSize } // AccessTime implements Inode.AccessTime. func (in *InodeOld) AccessTime() time.Time { diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fs/ext/disklayout/inode_test.go index 9cae9e4f0..dd03ee50e 100644 --- a/pkg/sentry/fs/ext/disklayout/inode_test.go +++ b/pkg/sentry/fs/ext/disklayout/inode_test.go @@ -24,7 +24,7 @@ import ( // TestInodeSize tests that the inode structs are of the correct size. func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, oldInodeSize) + assertSize(t, InodeOld{}, OldInodeSize) // This was updated from 156 bytes to 160 bytes in Oct 2015. 
assertSize(t, InodeNew{}, 160) diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go index aada8b550..5a64aaaa1 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go @@ -81,7 +81,7 @@ func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterS func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw } // InodeSize implements SuperBlock.InodeSize. -func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize } +func (sb *SuperBlockOld) InodeSize() uint16 { return OldInodeSize } // InodesPerGroup implements SuperBlock.InodesPerGroup. func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw } diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index 8bc591c8b..7f4287b01 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -26,21 +26,29 @@ import ( // filesystem implements vfs.FilesystemImpl. type filesystem struct { - // dev is the ReadSeeker for the underlying fs device and is protected by mu. - dev io.ReadSeeker + // mu serializes changes to the Dentry tree and the usage of the read seeker. + mu sync.Mutex - // mu synchronizes the usage of dev. The ext filesystems take locality into - // condsideration, i.e. data blocks of a file will tend to be placed close - // together. On a spinning disk, locality reduces the amount of movement of - // the head hence speeding up IO operations. On an SSD there are no moving - // parts but locality increases the size of each transer request. Hence, - // having mutual exclusion on the read seeker while reading a file *should* - // help in achieving the intended performance gains. + // dev is the ReadSeeker for the underlying fs device. It is protected by mu. + // + // The ext filesystems aim to maximize locality, i.e. place all the data + // blocks of a file close together. 
On a spinning disk, locality reduces the + // amount of movement of the head hence speeding up IO operations. On an SSD + // there are no moving parts but locality increases the size of each transfer + // request. Hence, having mutual exclusion on the read seeker while reading a + // file *should* help in achieving the intended performance gains. // // Note: This synchronization was not coupled with the ReadSeeker itself // because we want to synchronize across read/seek operations for the // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. - mu sync.Mutex + dev io.ReadSeeker + + // inodeCache maps absolute inode numbers to the corresponding Inode struct. + // Inodes should be removed from this once their reference count hits 0. + // + // Protected by mu because every addition and removal from this corresponds to + // a change in the dentry tree. + inodeCache map[uint32]*inode // sb represents the filesystem superblock. Immutable after initialization. sb disklayout.SuperBlock @@ -52,7 +60,7 @@ type filesystem struct { // newFilesystem is the filesystem constructor. func newFilesystem(dev io.ReadSeeker) (*filesystem, error) { - fs := filesystem{dev: dev} + fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} var err error fs.sb, err = readSuperBlock(dev) @@ -73,3 +81,21 @@ func newFilesystem(dev io.ReadSeeker) (*filesystem, error) { return &fs, nil } + +// getOrCreateInode gets the inode corresponding to the inode number passed in. +// It creates a new one with the given inode number if one does not exist. +// +// Preconditions: must be holding fs.mu. 
+func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) { + if in, ok := fs.inodeCache[inodeNum]; ok { + return in, nil + } + + in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum) + if err != nil { + return nil, err + } + + fs.inodeCache[inodeNum] = in + return in, nil +} diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go index 9e55187e5..b3f342c8e 100644 --- a/pkg/sentry/fs/ext/extent_test.go +++ b/pkg/sentry/fs/ext/extent_test.go @@ -146,7 +146,7 @@ func TestExtentTree(t *testing.T) { } opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{}) - if diff := cmp.Diff(&mockInode.root, node0, opt); diff != "" { + if diff := cmp.Diff(mockInode.root, node0, opt); diff != "" { t.Errorf("extent tree mismatch (-want +got):\n%s", diff) } } diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go index 5bf9dbfa3..df1ea0bda 100644 --- a/pkg/sentry/fs/ext/inode.go +++ b/pkg/sentry/fs/ext/inode.go @@ -28,12 +28,16 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 + // inodeNum is the inode number of this inode on disk. This is used to + // identify inodes within the ext filesystem. + inodeNum uint32 + // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode // root is the root extent node. This lives in the 60 byte diskInode.Blocks(). - // Immutable. - root disklayout.ExtentNode + // Immutable. Nil if the inode does not use extents. + root *disklayout.ExtentNode } // incRef increments the inode ref count. @@ -54,20 +58,71 @@ func (in *inode) tryIncRef() bool { } } -// decRef decrements the inode ref count. -func (in *inode) decRef() { - if refs := atomic.AddInt64(&in.refs, -1); refs < 0 { +// decRef decrements the inode ref count and releases the inode resources if +// the ref count hits 0. +// +// Preconditions: Must have locked fs.mu. 
+func (in *inode) decRef(fs *filesystem) { + if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { + delete(fs.inodeCache, in.inodeNum) + } else if refs < 0 { panic("ext.inode.decRef() called without holding a reference") } } +// newInode is the inode constructor. Reads the inode off disk. Identifies +// inodes based on the absolute inode number on disk. +// +// Preconditions: Must hold the mutex of the filesystem containing dev. +func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) { + if inodeNum == 0 { + panic("inode number 0 on ext filesystems is not possible") + } + + in := &inode{refs: 1, inodeNum: inodeNum} + inodeRecordSize := sb.InodeSize() + if inodeRecordSize == disklayout.OldInodeSize { + in.diskInode = &disklayout.InodeOld{} + } else { + in.diskInode = &disklayout.InodeNew{} + } + + // Calculate where the inode is actually placed. + inodesPerGrp := sb.InodesPerGroup() + blkSize := sb.BlockSize() + inodeTableOff := bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize + inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp)) + + // Read it from disk and figure out which type of inode this is. + if err := readFromDisk(dev, int64(inodeOff), in.diskInode); err != nil { + return nil, err + } + + if in.diskInode.Flags().Extents { + in.buildExtTree(dev, blkSize) + } + + return in, nil +} + +// getBGNum returns the block group number that a given inode belongs to. +func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { + return (inodeNum - 1) / inodesPerGrp +} + +// getBGOff returns the offset at which the given inode lives in the block +// group's inode table, i.e. the index of the inode in the inode table. +func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 { + return (inodeNum - 1) % inodesPerGrp +} + // buildExtTree builds the extent tree by reading it from disk by doing // running a simple DFS. 
It first reads the root node from the inode struct in // memory. Then it recursively builds the rest of the tree by reading it off // disk. // // Preconditions: -// - Must have mutual exclusion on device fd. +// - Must hold the mutex of the filesystem containing dev. // - Inode flag InExtents must be set. func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { rootNodeData := in.diskInode.Data() @@ -106,7 +161,7 @@ func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { } } - in.root = disklayout.ExtentNode{rootHeader, rootEntries} + in.root = &disklayout.ExtentNode{rootHeader, rootEntries} return nil } @@ -114,7 +169,7 @@ func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { // builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to // by the ExtentEntry. // -// Preconditions: Must have mutual exclusion on device fd. +// Preconditions: Must hold the mutex of the filesystem containing dev. func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { var header disklayout.ExtentHeader off := entry.PhysicalBlock() * blkSize diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go index 71e46b2c4..3472c5fa8 100644 --- a/pkg/sentry/fs/ext/utils.go +++ b/pkg/sentry/fs/ext/utils.go @@ -28,7 +28,7 @@ import ( // All disk reads should use this helper so we avoid reading from stale // previously used offsets. This function forces the offset parameter. // -// Precondition: Must have mutual exclusion on device fd. +// Precondition: Must hold the mutex of the filesystem containing dev. func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { if _, err := dev.Seek(abOff, io.SeekStart); err != nil { return syserror.EIO @@ -45,7 +45,7 @@ func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { // device. There are three versions of the superblock. 
This function identifies // and returns the correct version. // -// Precondition: Must have mutual exclusion on device fd. +// Precondition: Must hold the mutex of the filesystem containing dev. func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { @@ -87,7 +87,7 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 { // readBlockGroups reads the block group descriptor table from block group 0 in // the underlying device. // -// Precondition: Must have mutual exclusion on device fd. +// Precondition: Must hold the mutex of the filesystem containing dev. func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { bgCount := blockGroupsCount(sb) bgdSize := uint64(sb.BgDescSize()) |