From bd7708956f0c9e00e88eff9b35b22eea75e71f5f Mon Sep 17 00:00:00 2001
From: Ayush Ranjan <ayushranjan@google.com>
Date: Tue, 23 Jul 2019 15:50:35 -0700
Subject: ext: Added extent tree building logic.

PiperOrigin-RevId: 259628657
---
 pkg/sentry/fs/ext/BUILD                   |  18 ++-
 pkg/sentry/fs/ext/dentry.go               |  23 ++++
 pkg/sentry/fs/ext/disklayout/inode.go     |  11 +-
 pkg/sentry/fs/ext/disklayout/inode_old.go |   6 +-
 pkg/sentry/fs/ext/ext.go                  |  10 +-
 pkg/sentry/fs/ext/extent_test.go          | 184 ++++++++++++++++++++++++++++++
 pkg/sentry/fs/ext/inode.go                | 154 +++++++++++++++++++++++++
 pkg/sentry/fs/ext/utils.go                |   6 +
 8 files changed, 398 insertions(+), 14 deletions(-)
 create mode 100644 pkg/sentry/fs/ext/dentry.go
 create mode 100644 pkg/sentry/fs/ext/extent_test.go
 create mode 100644 pkg/sentry/fs/ext/inode.go

diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index b386f31c8..3ba278e08 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -1,18 +1,34 @@
 package(licenses = ["notice"])
 
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
 
 go_library(
     name = "ext",
     srcs = [
+        "dentry.go",
         "ext.go",
+        "inode.go",
         "utils.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/binary",
         "//pkg/sentry/fs/ext/disklayout",
         "//pkg/syserror",
     ],
 )
+
+go_test(
+    name = "ext_test",
+    size = "small",
+    srcs = ["extent_test.go"],
+    embed = [":ext"],
+    deps = [
+        "//pkg/binary",
+        "//pkg/sentry/fs/ext/disklayout",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
new file mode 100644
index 000000000..71cd217df
--- /dev/null
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	// inode is the inode represented by this dentry. Multiple Dentries may
+	// share a single non-directory Inode (with hard links). inode is
+	// immutable.
+	inode *inode
+}
diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go
index b48001910..9ab9a4988 100644
--- a/pkg/sentry/fs/ext/disklayout/inode.go
+++ b/pkg/sentry/fs/ext/disklayout/inode.go
@@ -107,16 +107,17 @@ type Inode interface {
 	// Flags returns InodeFlags which represents the inode flags.
 	Flags() InodeFlags
 
-	// Blocks returns the underlying inode.i_block array. This field is special
-	// and is used to store various kinds of things depending on the filesystem
-	// version and inode type.
+	// Data returns the underlying inode.i_block array as a slice so it's
+	// modifiable. This field is special and is used to store various kinds of
+	// things depending on the filesystem version and inode type. The underlying
+	// field name in Linux is a little misleading.
 	//   - In ext2/ext3, it contains the block map.
-	//   - In ext4, it contains the extent tree.
+	//   - In ext4, it contains the extent tree root node.
 	//   - For inline files, it contains the file contents.
 	//   - For symlinks, it contains the link path (if it fits here).
 	//
 	// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block.
-	Blocks() [60]byte
+	Data() []byte
 }
 
 // Inode flags. This is not comprehensive and flags which were not used in
diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go
index dc4c9d8e4..7d7cc9143 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fs/ext/disklayout/inode_old.go
@@ -47,7 +47,7 @@ type InodeOld struct {
 	BlocksCountLo uint32
 	FlagsRaw      uint32
 	VersionLo     uint32 // This is OS dependent.
-	BlocksRaw     [60]byte
+	DataRaw       [60]byte
 	Generation    uint32
 	FileACLLo     uint32
 	SizeHi        uint32
@@ -113,5 +113,5 @@ func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw }
 // Flags implements Inode.Flags.
 func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) }
 
-// Blocks implements Inode.Blocks.
-func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw }
+// Data implements Inode.Data.
+func (in *InodeOld) Data() []byte { return in.DataRaw[:] }
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index 9d33be08f..8bc591c8b 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -24,8 +24,8 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Filesystem implements vfs.FilesystemImpl.
-type Filesystem struct {
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
 	// dev is the ReadSeeker for the underlying fs device and is protected by mu.
 	dev io.ReadSeeker
 
@@ -50,9 +50,9 @@ type Filesystem struct {
 	bgs []disklayout.BlockGroup
 }
 
-// newFilesystem is the Filesystem constructor.
-func newFilesystem(dev io.ReadSeeker) (*Filesystem, error) {
-	fs := Filesystem{dev: dev}
+// newFilesystem is the filesystem constructor.
+func newFilesystem(dev io.ReadSeeker) (*filesystem, error) {
+	fs := filesystem{dev: dev}
 	var err error
 
 	fs.sb, err = readSuperBlock(dev)
diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go
new file mode 100644
index 000000000..c8c52f836
--- /dev/null
+++ b/pkg/sentry/fs/ext/extent_test.go
@@ -0,0 +1,184 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+)
+
+// TestExtentTree tests the extent tree building logic.
+//
+// Test tree:
+//            0.{Head}[Idx][Idx]
+//                     /     \
+//                    /       \
+//      1.{Head}[Ext][Ext]  2.{Head}[Idx]
+//               /    |               \
+//           [Phy]  [Phy, Phy]    3.{Head}[Ext]
+//                                          |
+//                                    [Phy, Phy, Phy]
+//
+// Legend:
+//   - Head = ExtentHeader
+//   - Idx  = ExtentIdx
+//   - Ext  = Extent
+//   - Phy  = Physical Block
+//
+// Please note that ext4 might not construct extent trees looking like this.
+// This is purely for testing the tree traversal logic.
+func TestExtentTree(t *testing.T) {
+	blkSize := uint64(64) // No block has more than 1 header + 4 entries.
+	mockDisk := make([]byte, blkSize*10)
+	mockInode := &inode{diskInode: &disklayout.InodeNew{}}
+
+	node3 := &disklayout.ExtentNode{
+		Header: disklayout.ExtentHeader{
+			Magic:      disklayout.ExtentMagic,
+			NumEntries: 1,
+			MaxEntries: 4,
+			Height:     0,
+		},
+		Entries: []disklayout.ExtentEntryPair{
+			{
+				Entry: &disklayout.Extent{
+					FirstFileBlock: 3,
+					Length:         3,
+					StartBlockLo:   6,
+				},
+				Node: nil,
+			},
+		},
+	}
+
+	node2 := &disklayout.ExtentNode{
+		Header: disklayout.ExtentHeader{
+			Magic:      disklayout.ExtentMagic,
+			NumEntries: 1,
+			MaxEntries: 4,
+			Height:     1,
+		},
+		Entries: []disklayout.ExtentEntryPair{
+			{
+				Entry: &disklayout.ExtentIdx{
+					FirstFileBlock: 3,
+					ChildBlockLo:   2,
+				},
+				Node: node3,
+			},
+		},
+	}
+
+	node1 := &disklayout.ExtentNode{
+		Header: disklayout.ExtentHeader{
+			Magic:      disklayout.ExtentMagic,
+			NumEntries: 2,
+			MaxEntries: 4,
+			Height:     0,
+		},
+		Entries: []disklayout.ExtentEntryPair{
+			{
+				Entry: &disklayout.Extent{
+					FirstFileBlock: 0,
+					Length:         1,
+					StartBlockLo:   3,
+				},
+				Node: nil,
+			},
+			{
+				Entry: &disklayout.Extent{
+					FirstFileBlock: 1,
+					Length:         2,
+					StartBlockLo:   4,
+				},
+				Node: nil,
+			},
+		},
+	}
+
+	node0 := &disklayout.ExtentNode{
+		Header: disklayout.ExtentHeader{
+			Magic:      disklayout.ExtentMagic,
+			NumEntries: 2,
+			MaxEntries: 4,
+			Height:     2,
+		},
+		Entries: []disklayout.ExtentEntryPair{
+			{
+				Entry: &disklayout.ExtentIdx{
+					FirstFileBlock: 0,
+					ChildBlockLo:   0,
+				},
+				Node: node1,
+			},
+			{
+				Entry: &disklayout.ExtentIdx{
+					FirstFileBlock: 3,
+					ChildBlockLo:   1,
+				},
+				Node: node2,
+			},
+		},
+	}
+
+	writeTree(mockInode, mockDisk, node0, blkSize)
+
+	r := bytes.NewReader(mockDisk)
+	if err := mockInode.buildExtTree(r, blkSize); err != nil {
+		t.Fatalf("inode.buildExtTree failed: %v", err)
+	}
+
+	if diff := cmp.Diff(&mockInode.root, node0, cmpopts.IgnoreUnexported(disklayout.ExtentNode{})); diff != "" {
+		t.Errorf("extent tree mismatch (-want +got):\n%s", diff)
+	}
+}
+
+// writeTree writes the tree represented by `root` to the inode and disk passed.
+func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) {
+	rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
+	for _, ep := range root.Entries {
+		rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
+	}
+
+	copy(in.diskInode.Data(), rootData)
+
+	if root.Header.Height > 0 {
+		for _, ep := range root.Entries {
+			writeTreeToDisk(disk, ep, blkSize)
+		}
+	}
+}
+
+// writeTreeToDisk is the recursive step for writeTree which writes the tree
+// on the disk only.
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) {
+	nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
+	for _, ep := range curNode.Node.Entries {
+		nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
+	}
+
+	copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData)
+
+	if curNode.Node.Header.Height > 0 {
+		for _, ep := range curNode.Node.Entries {
+			writeTreeToDisk(disk, ep, blkSize)
+		}
+	}
+}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
new file mode 100644
index 000000000..5bf9dbfa3
--- /dev/null
+++ b/pkg/sentry/fs/ext/inode.go
@@ -0,0 +1,154 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// inode represents an ext inode.
+type inode struct {
+	// refs is a reference count. refs is accessed using atomic memory operations.
+	refs int64
+
+	// diskInode gives us access to the inode struct on disk. Immutable.
+	diskInode disklayout.Inode
+
+	// root is the root extent node. This lives in the 60 byte diskInode.Blocks().
+	// Immutable.
+	root disklayout.ExtentNode
+}
+
+// incRef increments the inode ref count.
+func (in *inode) incRef() {
+	atomic.AddInt64(&in.refs, 1)
+}
+
+// tryIncRef tries to increment the ref count. Returns true if successful.
+func (in *inode) tryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&in.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// decRef decrements the inode ref count.
+func (in *inode) decRef() {
+	if refs := atomic.AddInt64(&in.refs, -1); refs < 0 {
+		panic("ext.inode.decRef() called without holding a reference")
+	}
+}
+
+// buildExtTree builds the extent tree by reading it from disk by doing
+// running a simple DFS. It first reads the root node from the inode struct in
+// memory. Then it recursively builds the rest of the tree by reading it off
+// disk.
+//
+// Preconditions:
+//   - Must have mutual exclusion on device fd.
+//   - Inode flag InExtents must be set.
+func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
+	rootNodeData := in.diskInode.Data()
+
+	var rootHeader disklayout.ExtentHeader
+	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader)
+
+	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
+	if rootHeader.NumEntries > 4 {
+		// read(2) specifies that EINVAL should be returned if the file is unsuitable
+		// for reading.
+		return syserror.EINVAL
+	}
+
+	rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries)
+	for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+		var curEntry disklayout.ExtentEntry
+		if rootHeader.Height == 0 {
+			// Leaf node.
+			curEntry = &disklayout.Extent{}
+		} else {
+			// Internal node.
+			curEntry = &disklayout.ExtentIdx{}
+		}
+		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+		rootEntries[i].Entry = curEntry
+	}
+
+	// If this node is internal, perform DFS.
+	if rootHeader.Height > 0 {
+		for i := uint16(0); i < rootHeader.NumEntries; i++ {
+			var err error
+			if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil {
+				return err
+			}
+		}
+	}
+
+	in.root = disklayout.ExtentNode{rootHeader, rootEntries}
+	return nil
+}
+
+// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
+// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
+// by the ExtentEntry.
+//
+// Preconditions: Must have mutual exclusion on device fd.
+func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
+	var header disklayout.ExtentHeader
+	off := entry.PhysicalBlock() * blkSize
+	if err := readFromDisk(dev, int64(off), &header); err != nil {
+		return nil, err
+	}
+
+	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
+	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+		var curEntry disklayout.ExtentEntry
+		if header.Height == 0 {
+			// Leaf node.
+			curEntry = &disklayout.Extent{}
+		} else {
+			// Internal node.
+			curEntry = &disklayout.ExtentIdx{}
+		}
+
+		if err := readFromDisk(dev, int64(off), curEntry); err != nil {
+			return nil, err
+		}
+		entries[i].Entry = curEntry
+	}
+
+	// If this node is internal, perform DFS.
+	if header.Height > 0 {
+		for i := uint16(0); i < header.NumEntries; i++ {
+			var err error
+			entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	return &disklayout.ExtentNode{header, entries}, nil
+}
diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go
index 861ef9a73..71e46b2c4 100644
--- a/pkg/sentry/fs/ext/utils.go
+++ b/pkg/sentry/fs/ext/utils.go
@@ -27,6 +27,8 @@ import (
 //
 // All disk reads should use this helper so we avoid reading from stale
 // previously used offsets. This function forces the offset parameter.
+//
+// Precondition: Must have mutual exclusion on device fd.
 func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
 	if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
 		return syserror.EIO
@@ -42,6 +44,8 @@ func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
 // readSuperBlock reads the SuperBlock from block group 0 in the underlying
 // device. There are three versions of the superblock. This function identifies
 // and returns the correct version.
+//
+// Precondition: Must have mutual exclusion on device fd.
 func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
 	var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
 	if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
@@ -82,6 +86,8 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
 
 // readBlockGroups reads the block group descriptor table from block group 0 in
 // the underlying device.
+//
+// Precondition: Must have mutual exclusion on device fd.
 func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
 	bgCount := blockGroupsCount(sb)
 	bgdSize := uint64(sb.BgDescSize())
-- 
cgit v1.2.3