diff options
-rw-r--r-- | pkg/sentry/fs/ext/BUILD | 18 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/dentry.go | 23 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode.go | 11 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode_old.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext.go | 10 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/extent_test.go | 184 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/inode.go | 154 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/utils.go | 6 |
8 files changed, 398 insertions, 14 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index b386f31c8..3ba278e08 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -1,18 +1,34 @@ package(licenses = ["notice"]) -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ext", srcs = [ + "dentry.go", "ext.go", + "inode.go", "utils.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/sentry/fs/ext/disklayout", "//pkg/syserror", ], ) + +go_test( + name = "ext_test", + size = "small", + srcs = ["extent_test.go"], + embed = [":ext"], + deps = [ + "//pkg/binary", + "//pkg/sentry/fs/ext/disklayout", + "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + ], +) diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go new file mode 100644 index 000000000..71cd217df --- /dev/null +++ b/pkg/sentry/fs/ext/dentry.go @@ -0,0 +1,23 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +// dentry implements vfs.DentryImpl. +type dentry struct { + // inode is the inode represented by this dentry. Multiple Dentries may + // share a single non-directory Inode (with hard links). inode is + // immutable. + inode *inode +} diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go index b48001910..9ab9a4988 100644 --- a/pkg/sentry/fs/ext/disklayout/inode.go +++ b/pkg/sentry/fs/ext/disklayout/inode.go @@ -107,16 +107,17 @@ type Inode interface { // Flags returns InodeFlags which represents the inode flags. Flags() InodeFlags - // Blocks returns the underlying inode.i_block array. This field is special - // and is used to store various kinds of things depending on the filesystem - // version and inode type. + // Data returns the underlying inode.i_block array as a slice so it's + // modifiable. This field is special and is used to store various kinds of + // things depending on the filesystem version and inode type. The underlying + // field name in Linux is a little misleading. // - In ext2/ext3, it contains the block map. - // - In ext4, it contains the extent tree. + // - In ext4, it contains the extent tree root node. // - For inline files, it contains the file contents. // - For symlinks, it contains the link path (if it fits here). // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block. - Blocks() [60]byte + Data() []byte } // Inode flags. This is not comprehensive and flags which were not used in diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go index dc4c9d8e4..7d7cc9143 100644 --- a/pkg/sentry/fs/ext/disklayout/inode_old.go +++ b/pkg/sentry/fs/ext/disklayout/inode_old.go @@ -47,7 +47,7 @@ type InodeOld struct { BlocksCountLo uint32 FlagsRaw uint32 VersionLo uint32 // This is OS dependent. - BlocksRaw [60]byte + DataRaw [60]byte Generation uint32 FileACLLo uint32 SizeHi uint32 @@ -113,5 +113,5 @@ func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw } // Flags implements Inode.Flags. func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) } -// Blocks implements Inode.Blocks. -func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw } +// Data implements Inode.Data. +func (in *InodeOld) Data() []byte { return in.DataRaw[:] } diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index 9d33be08f..8bc591c8b 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// Filesystem implements vfs.FilesystemImpl. -type Filesystem struct { +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { // dev is the ReadSeeker for the underlying fs device and is protected by mu. dev io.ReadSeeker @@ -50,9 +50,9 @@ type Filesystem struct { bgs []disklayout.BlockGroup } -// newFilesystem is the Filesystem constructor. -func newFilesystem(dev io.ReadSeeker) (*Filesystem, error) { - fs := Filesystem{dev: dev} +// newFilesystem is the filesystem constructor. +func newFilesystem(dev io.ReadSeeker) (*filesystem, error) { + fs := filesystem{dev: dev} var err error fs.sb, err = readSuperBlock(dev) diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go new file mode 100644 index 000000000..c8c52f836 --- /dev/null +++ b/pkg/sentry/fs/ext/extent_test.go @@ -0,0 +1,184 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "bytes" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" +) + +// TestExtentTree tests the extent tree building logic. +// +// Test tree: +// 0.{Head}[Idx][Idx] +// / \ +// / \ +// 1.{Head}[Ext][Ext] 2.{Head}[Idx] +// / | \ +// [Phy] [Phy, Phy] 3.{Head}[Ext] +// | +// [Phy, Phy, Phy] +// +// Legend: +// - Head = ExtentHeader +// - Idx = ExtentIdx +// - Ext = Extent +// - Phy = Physical Block +// +// Please note that ext4 might not construct extent trees looking like this. +// This is purely for testing the tree traversal logic. +func TestExtentTree(t *testing.T) { + blkSize := uint64(64) // No block has more than 1 header + 4 entries. + mockDisk := make([]byte, blkSize*10) + mockInode := &inode{diskInode: &disklayout.InodeNew{}} + + node3 := &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 1, + MaxEntries: 4, + Height: 0, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.Extent{ + FirstFileBlock: 3, + Length: 3, + StartBlockLo: 6, + }, + Node: nil, + }, + }, + } + + node2 := &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 1, + MaxEntries: 4, + Height: 1, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 3, + ChildBlockLo: 2, + }, + Node: node3, + }, + }, + } + + node1 := &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 2, + MaxEntries: 4, + Height: 0, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.Extent{ + FirstFileBlock: 0, + Length: 1, + StartBlockLo: 3, + }, + Node: nil, + }, + { + Entry: &disklayout.Extent{ + FirstFileBlock: 1, + Length: 2, + StartBlockLo: 4, + }, + Node: nil, + }, + }, + } + + node0 := &disklayout.ExtentNode{ + Header: disklayout.ExtentHeader{ + Magic: disklayout.ExtentMagic, + NumEntries: 2, + MaxEntries: 4, + Height: 2, + }, + Entries: []disklayout.ExtentEntryPair{ + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 0, + ChildBlockLo: 0, + }, + Node: node1, + }, + { + Entry: &disklayout.ExtentIdx{ + FirstFileBlock: 3, + ChildBlockLo: 1, + }, + Node: node2, + }, + }, + } + + writeTree(mockInode, mockDisk, node0, blkSize) + + r := bytes.NewReader(mockDisk) + if err := mockInode.buildExtTree(r, blkSize); err != nil { + t.Fatalf("inode.buildExtTree failed: %v", err) + } + + if diff := cmp.Diff(&mockInode.root, node0, cmpopts.IgnoreUnexported(disklayout.ExtentNode{})); diff != "" { + t.Errorf("extent tree mismatch (-want +got):\n%s", diff) + } +} + +// writeTree writes the tree represented by `root` to the inode and disk passed. +func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) { + rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + for _, ep := range root.Entries { + rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + } + + copy(in.diskInode.Data(), rootData) + + if root.Header.Height > 0 { + for _, ep := range root.Entries { + writeTreeToDisk(disk, ep, blkSize) + } + } +} + +// writeTreeToDisk is the recursive step for writeTree which writes the tree +// on the disk only. +func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) { + nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + for _, ep := range curNode.Node.Entries { + nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + } + + copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData) + + if curNode.Node.Header.Height > 0 { + for _, ep := range curNode.Node.Entries { + writeTreeToDisk(disk, ep, blkSize) + } + } +} diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go new file mode 100644 index 000000000..5bf9dbfa3 --- /dev/null +++ b/pkg/sentry/fs/ext/inode.go @@ -0,0 +1,154 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/syserror" +) + +// inode represents an ext inode. +type inode struct { + // refs is a reference count. refs is accessed using atomic memory operations. + refs int64 + + // diskInode gives us access to the inode struct on disk. Immutable. + diskInode disklayout.Inode + + // root is the root extent node. This lives in the 60 byte diskInode.Blocks(). + // Immutable. + root disklayout.ExtentNode +} + +// incRef increments the inode ref count. +func (in *inode) incRef() { + atomic.AddInt64(&in.refs, 1) +} + +// tryIncRef tries to increment the ref count. Returns true if successful. +func (in *inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&in.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) { + return true + } + } +} + +// decRef decrements the inode ref count. +func (in *inode) decRef() { + if refs := atomic.AddInt64(&in.refs, -1); refs < 0 { + panic("ext.inode.decRef() called without holding a reference") + } +} + +// buildExtTree builds the extent tree by reading it from disk by doing +// running a simple DFS. It first reads the root node from the inode struct in +// memory. Then it recursively builds the rest of the tree by reading it off +// disk. +// +// Preconditions: +// - Must have mutual exclusion on device fd. +// - Inode flag InExtents must be set. +func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { + rootNodeData := in.diskInode.Data() + + var rootHeader disklayout.ExtentHeader + binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader) + + // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. + if rootHeader.NumEntries > 4 { + // read(2) specifies that EINVAL should be returned if the file is unsuitable + // for reading. + return syserror.EINVAL + } + + rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries) + for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + var curEntry disklayout.ExtentEntry + if rootHeader.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry) + rootEntries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. + if rootHeader.Height > 0 { + for i := uint16(0); i < rootHeader.NumEntries; i++ { + var err error + if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil { + return err + } + } + } + + in.root = disklayout.ExtentNode{rootHeader, rootEntries} + return nil +} + +// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively +// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to +// by the ExtentEntry. +// +// Preconditions: Must have mutual exclusion on device fd. +func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { + var header disklayout.ExtentHeader + off := entry.PhysicalBlock() * blkSize + if err := readFromDisk(dev, int64(off), &header); err != nil { + return nil, err + } + + entries := make([]disklayout.ExtentEntryPair, header.NumEntries) + for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + var curEntry disklayout.ExtentEntry + if header.Height == 0 { + // Leaf node. + curEntry = &disklayout.Extent{} + } else { + // Internal node. + curEntry = &disklayout.ExtentIdx{} + } + + if err := readFromDisk(dev, int64(off), curEntry); err != nil { + return nil, err + } + entries[i].Entry = curEntry + } + + // If this node is internal, perform DFS. + if header.Height > 0 { + for i := uint16(0); i < header.NumEntries; i++ { + var err error + entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize) + if err != nil { + return nil, err + } + } + } + + return &disklayout.ExtentNode{header, entries}, nil +} diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go index 861ef9a73..71e46b2c4 100644 --- a/pkg/sentry/fs/ext/utils.go +++ b/pkg/sentry/fs/ext/utils.go @@ -27,6 +27,8 @@ import ( // // All disk reads should use this helper so we avoid reading from stale // previously used offsets. This function forces the offset parameter. +// +// Precondition: Must have mutual exclusion on device fd. func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { if _, err := dev.Seek(abOff, io.SeekStart); err != nil { return syserror.EIO @@ -42,6 +44,8 @@ func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { // readSuperBlock reads the SuperBlock from block group 0 in the underlying // device. There are three versions of the superblock. This function identifies // and returns the correct version. +// +// Precondition: Must have mutual exclusion on device fd. func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { @@ -82,6 +86,8 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 { // readBlockGroups reads the block group descriptor table from block group 0 in // the underlying device. +// +// Precondition: Must have mutual exclusion on device fd. func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { bgCount := blockGroupsCount(sb) bgdSize := uint64(sb.BgDescSize()) |