summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/ext
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs/ext')
-rw-r--r--pkg/sentry/fs/ext/BUILD11
-rw-r--r--pkg/sentry/fs/ext/disklayout/BUILD46
-rw-r--r--pkg/sentry/fs/ext/disklayout/block_group.go127
-rw-r--r--pkg/sentry/fs/ext/disklayout/block_group_32.go72
-rw-r--r--pkg/sentry/fs/ext/disklayout/block_group_64.go93
-rw-r--r--pkg/sentry/fs/ext/disklayout/block_group_test.go26
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent.go69
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent_new.go61
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent_old.go50
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent_test.go28
-rw-r--r--pkg/sentry/fs/ext/disklayout/disklayout.go50
-rw-r--r--pkg/sentry/fs/ext/disklayout/inode.go267
-rw-r--r--pkg/sentry/fs/ext/disklayout/inode_new.go96
-rw-r--r--pkg/sentry/fs/ext/disklayout/inode_old.go117
-rw-r--r--pkg/sentry/fs/ext/disklayout/inode_test.go222
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock.go468
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock_32.go75
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock_64.go94
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock_old.go102
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock_test.go27
-rw-r--r--pkg/sentry/fs/ext/disklayout/test_utils.go30
-rw-r--r--pkg/sentry/fs/ext/ext.go49
22 files changed, 2180 insertions, 0 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
new file mode 100644
index 000000000..3c2a02eac
--- /dev/null
+++ b/pkg/sentry/fs/ext/BUILD
@@ -0,0 +1,11 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+# Go library for the ext package: built from ext.go and depending on the
+# on-disk structure definitions in the disklayout package.
+go_library(
+ name = "ext",
+ srcs = ["ext.go"],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext",
+ visibility = ["//pkg/sentry:internal"],
+ deps = ["//pkg/sentry/fs/ext/disklayout"],
+)
diff --git a/pkg/sentry/fs/ext/disklayout/BUILD b/pkg/sentry/fs/ext/disklayout/BUILD
new file mode 100644
index 000000000..e4cb26645
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/BUILD
@@ -0,0 +1,46 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+# Library holding the ext on-disk structures (block groups, dirents, inodes
+# and superblocks) together with their per-version implementations.
+#
+# NOTE(review): test_utils.go is listed in the library srcs rather than in the
+# test target below. The tests call assertSize(t, ...) from it, so it
+# presumably imports "testing" into a non-test library -- confirm whether it
+# should live in a test-only target instead.
+go_library(
+ name = "disklayout",
+ srcs = [
+ "block_group.go",
+ "block_group_32.go",
+ "block_group_64.go",
+ "dirent.go",
+ "dirent_new.go",
+ "dirent_old.go",
+ "disklayout.go",
+ "inode.go",
+ "inode_new.go",
+ "inode_old.go",
+ "superblock.go",
+ "superblock_32.go",
+ "superblock_64.go",
+ "superblock_old.go",
+ "test_utils.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ ],
+)
+
+# Unit tests for the structures above. The library is embedded so the tests
+# are compiled into the same package as the code under test.
+go_test(
+ name = "disklayout_test",
+ size = "small",
+ srcs = [
+ "block_group_test.go",
+ "dirent_test.go",
+ "inode_test.go",
+ "superblock_test.go",
+ ],
+ embed = [":disklayout"],
+ deps = ["//pkg/sentry/kernel/time"],
+)
diff --git a/pkg/sentry/fs/ext/disklayout/block_group.go b/pkg/sentry/fs/ext/disklayout/block_group.go
new file mode 100644
index 000000000..32ea3d97d
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/block_group.go
@@ -0,0 +1,127 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// BlockGroup represents a Linux ext block group descriptor. An ext file system
+// is split into a series of block groups. This provides an access layer to
+// information needed to access and use a block group.
+//
+// BlockGroup32Bit and BlockGroup64Bit are the implementations provided by this
+// package. Every method is a parameterless getter; values that are split into
+// low and high halves on disk are returned already combined.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
+type BlockGroup interface {
+ // InodeTable returns the absolute block number of the block containing the
+ // inode table. This points to an array of Inode structs. Inode tables are
+ // statically allocated at mkfs time. The superblock records the number of
+ // inodes per group (length of this table) and the size of each inode struct.
+ InodeTable() uint64
+
+ // BlockBitmap returns the absolute block number of the block containing the
+ // block bitmap. This bitmap tracks the usage of data blocks within this block
+ // group and has its own checksum.
+ BlockBitmap() uint64
+
+ // InodeBitmap returns the absolute block number of the block containing the
+ // inode bitmap. This bitmap tracks the usage of this group's inode table
+ // entries and has its own checksum.
+ InodeBitmap() uint64
+
+ // ExclusionBitmap returns the absolute block number of the snapshot exclusion
+ // bitmap.
+ ExclusionBitmap() uint64
+
+ // FreeBlocksCount returns the number of free blocks in the group.
+ FreeBlocksCount() uint32
+
+ // FreeInodesCount returns the number of free inodes in the group.
+ FreeInodesCount() uint32
+
+ // DirectoryCount returns the number of inodes that represent directories
+ // under this block group.
+ DirectoryCount() uint32
+
+ // UnusedInodeCount returns the number of unused inodes beyond the last used
+ // inode in this group's inode table. As a result, we need not scan past the
+ // (InodesPerGroup - UnusedInodeCount())th entry in the inode table.
+ UnusedInodeCount() uint32
+
+ // BlockBitmapChecksum returns the block bitmap checksum. This is calculated
+ // using crc32c(FS UUID + group number + entire bitmap).
+ BlockBitmapChecksum() uint32
+
+ // InodeBitmapChecksum returns the inode bitmap checksum. This is calculated
+ // using crc32c(FS UUID + group number + entire bitmap).
+ InodeBitmapChecksum() uint32
+
+ // Checksum returns this block group's checksum.
+ //
+ // If SbMetadataCsum feature is set:
+ // - checksum is crc32c(FS UUID + group number + group descriptor
+ // structure) & 0xFFFF.
+ //
+ // If SbGdtCsum feature is set:
+ // - checksum is crc16(FS UUID + group number + group descriptor
+ // structure).
+ //
+ // SbMetadataCsum and SbGdtCsum should not be both set.
+ // If they are, Linux warns and asks to run fsck.
+ Checksum() uint16
+
+ // Flags returns BGFlags which represents the block group flags.
+ Flags() BGFlags
+}
+
+// These are the different block group flags. Each constant is a single-bit
+// mask within the 16-bit raw flags value; BGFlags.ToInt and BGFlagsFromInt
+// convert between that raw value and the BGFlags struct.
+const (
+ // BgInodeUninit indicates that inode table and bitmap are not initialized.
+ BgInodeUninit uint16 = 0x1
+
+ // BgBlockUninit indicates that block bitmap is not initialized.
+ BgBlockUninit uint16 = 0x2
+
+ // BgInodeZeroed indicates that inode table is zeroed.
+ BgInodeZeroed uint16 = 0x4
+)
+
+// BGFlags represents all the different combinations of block group flags.
+// Each field mirrors one of the Bg* bit masks: it is true when that bit is set
+// in the raw 16-bit flags value.
+type BGFlags struct {
+ InodeUninit bool // Mirrors BgInodeUninit.
+ BlockUninit bool // Mirrors BgBlockUninit.
+ InodeZeroed bool // Mirrors BgInodeZeroed.
+}
+
+// ToInt converts a BGFlags struct back to its 16-bit representation.
+//
+// The result is the bitwise OR of the Bg* masks whose corresponding field is
+// true; no other bits are ever set.
+func (f BGFlags) ToInt() uint16 {
+ var res uint16
+
+ if f.InodeUninit {
+ res |= BgInodeUninit
+ }
+ if f.BlockUninit {
+ res |= BgBlockUninit
+ }
+ if f.InodeZeroed {
+ res |= BgInodeZeroed
+ }
+
+ return res
+}
+
+// BGFlagsFromInt converts the 16-bit flag representation to a BGFlags struct.
+//
+// Only the BgInodeUninit, BgBlockUninit and BgInodeZeroed bits are inspected;
+// any other bits set in flags are dropped, so converting back with ToInt does
+// not necessarily reproduce the input.
+func BGFlagsFromInt(flags uint16) BGFlags {
+ return BGFlags{
+ InodeUninit: flags&BgInodeUninit > 0,
+ BlockUninit: flags&BgBlockUninit > 0,
+ InodeZeroed: flags&BgInodeZeroed > 0,
+ }
+}
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_32.go b/pkg/sentry/fs/ext/disklayout/block_group_32.go
new file mode 100644
index 000000000..3e16c76db
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/block_group_32.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// BlockGroup32Bit emulates the first half of struct ext4_group_desc in
+// fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
+// 32-bit ext4 filesystems. It implements BlockGroup interface.
+//
+// The fields add up to 32 bytes (see TestBlockGroupSize). Only the lower bits
+// of each value are stored here, so the accessors below simply widen each
+// field to the interface's return type.
+type BlockGroup32Bit struct {
+ BlockBitmapLo uint32
+ InodeBitmapLo uint32
+ InodeTableLo uint32
+ FreeBlocksCountLo uint16
+ FreeInodesCountLo uint16
+ UsedDirsCountLo uint16
+ FlagsRaw uint16
+ ExcludeBitmapLo uint32
+ BlockBitmapChecksumLo uint16
+ InodeBitmapChecksumLo uint16
+ ItableUnusedLo uint16
+ ChecksumRaw uint16
+}
+
+// Compiles only if BlockGroup32Bit implements BlockGroup.
+var _ BlockGroup = (*BlockGroup32Bit)(nil)
+
+// InodeTable implements BlockGroup.InodeTable.
+func (bg *BlockGroup32Bit) InodeTable() uint64 { return uint64(bg.InodeTableLo) }
+
+// BlockBitmap implements BlockGroup.BlockBitmap.
+func (bg *BlockGroup32Bit) BlockBitmap() uint64 { return uint64(bg.BlockBitmapLo) }
+
+// InodeBitmap implements BlockGroup.InodeBitmap.
+func (bg *BlockGroup32Bit) InodeBitmap() uint64 { return uint64(bg.InodeBitmapLo) }
+
+// ExclusionBitmap implements BlockGroup.ExclusionBitmap.
+func (bg *BlockGroup32Bit) ExclusionBitmap() uint64 { return uint64(bg.ExcludeBitmapLo) }
+
+// FreeBlocksCount implements BlockGroup.FreeBlocksCount.
+func (bg *BlockGroup32Bit) FreeBlocksCount() uint32 { return uint32(bg.FreeBlocksCountLo) }
+
+// FreeInodesCount implements BlockGroup.FreeInodesCount.
+func (bg *BlockGroup32Bit) FreeInodesCount() uint32 { return uint32(bg.FreeInodesCountLo) }
+
+// DirectoryCount implements BlockGroup.DirectoryCount.
+func (bg *BlockGroup32Bit) DirectoryCount() uint32 { return uint32(bg.UsedDirsCountLo) }
+
+// UnusedInodeCount implements BlockGroup.UnusedInodeCount.
+func (bg *BlockGroup32Bit) UnusedInodeCount() uint32 { return uint32(bg.ItableUnusedLo) }
+
+// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum.
+func (bg *BlockGroup32Bit) BlockBitmapChecksum() uint32 { return uint32(bg.BlockBitmapChecksumLo) }
+
+// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum.
+func (bg *BlockGroup32Bit) InodeBitmapChecksum() uint32 { return uint32(bg.InodeBitmapChecksumLo) }
+
+// Checksum implements BlockGroup.Checksum. The raw 16-bit value is returned
+// unmodified.
+func (bg *BlockGroup32Bit) Checksum() uint16 { return bg.ChecksumRaw }
+
+// Flags implements BlockGroup.Flags. The raw 16-bit value is decoded with
+// BGFlagsFromInt.
+func (bg *BlockGroup32Bit) Flags() BGFlags { return BGFlagsFromInt(bg.FlagsRaw) }
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_64.go b/pkg/sentry/fs/ext/disklayout/block_group_64.go
new file mode 100644
index 000000000..9a809197a
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/block_group_64.go
@@ -0,0 +1,93 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// BlockGroup64Bit emulates struct ext4_group_desc in fs/ext4/ext4.h.
+// It is the block group descriptor struct for 64-bit ext4 filesystems.
+// It implements BlockGroup interface. It is an extension of the 32-bit
+// version of BlockGroup.
+//
+// The embedded 32-byte struct plus the fields below add up to 64 bytes (see
+// TestBlockGroupSize).
+type BlockGroup64Bit struct {
+ // We embed the 32-bit struct here because 64-bit version is just an extension
+ // of the 32-bit version.
+ BlockGroup32Bit
+
+ // 64-bit specific fields. Each Hi field holds the upper bits of the value
+ // whose lower bits live in the matching Lo field of BlockGroup32Bit.
+ BlockBitmapHi uint32
+ InodeBitmapHi uint32
+ InodeTableHi uint32
+ FreeBlocksCountHi uint16
+ FreeInodesCountHi uint16
+ UsedDirsCountHi uint16
+ ItableUnusedHi uint16
+ ExcludeBitmapHi uint32
+ BlockBitmapChecksumHi uint16
+ InodeBitmapChecksumHi uint16
+ _ uint32 // Padding to 64 bytes.
+}
+
+// Compiles only if BlockGroup64Bit implements BlockGroup.
+var _ BlockGroup = (*BlockGroup64Bit)(nil)
+
+// Methods to override. Checksum() and Flags() are not overridden; they are
+// promoted from the embedded BlockGroup32Bit because those values have no Hi
+// part. Every override below shifts the Hi field into the upper half of the
+// result and ORs in the Lo field.
+
+// InodeTable implements BlockGroup.InodeTable.
+func (bg *BlockGroup64Bit) InodeTable() uint64 {
+ return (uint64(bg.InodeTableHi) << 32) | uint64(bg.InodeTableLo)
+}
+
+// BlockBitmap implements BlockGroup.BlockBitmap.
+func (bg *BlockGroup64Bit) BlockBitmap() uint64 {
+ return (uint64(bg.BlockBitmapHi) << 32) | uint64(bg.BlockBitmapLo)
+}
+
+// InodeBitmap implements BlockGroup.InodeBitmap.
+func (bg *BlockGroup64Bit) InodeBitmap() uint64 {
+ return (uint64(bg.InodeBitmapHi) << 32) | uint64(bg.InodeBitmapLo)
+}
+
+// ExclusionBitmap implements BlockGroup.ExclusionBitmap.
+func (bg *BlockGroup64Bit) ExclusionBitmap() uint64 {
+ return (uint64(bg.ExcludeBitmapHi) << 32) | uint64(bg.ExcludeBitmapLo)
+}
+
+// FreeBlocksCount implements BlockGroup.FreeBlocksCount.
+func (bg *BlockGroup64Bit) FreeBlocksCount() uint32 {
+ return (uint32(bg.FreeBlocksCountHi) << 16) | uint32(bg.FreeBlocksCountLo)
+}
+
+// FreeInodesCount implements BlockGroup.FreeInodesCount.
+func (bg *BlockGroup64Bit) FreeInodesCount() uint32 {
+ return (uint32(bg.FreeInodesCountHi) << 16) | uint32(bg.FreeInodesCountLo)
+}
+
+// DirectoryCount implements BlockGroup.DirectoryCount.
+func (bg *BlockGroup64Bit) DirectoryCount() uint32 {
+ return (uint32(bg.UsedDirsCountHi) << 16) | uint32(bg.UsedDirsCountLo)
+}
+
+// UnusedInodeCount implements BlockGroup.UnusedInodeCount.
+func (bg *BlockGroup64Bit) UnusedInodeCount() uint32 {
+ return (uint32(bg.ItableUnusedHi) << 16) | uint32(bg.ItableUnusedLo)
+}
+
+// BlockBitmapChecksum implements BlockGroup.BlockBitmapChecksum.
+func (bg *BlockGroup64Bit) BlockBitmapChecksum() uint32 {
+ return (uint32(bg.BlockBitmapChecksumHi) << 16) | uint32(bg.BlockBitmapChecksumLo)
+}
+
+// InodeBitmapChecksum implements BlockGroup.InodeBitmapChecksum.
+func (bg *BlockGroup64Bit) InodeBitmapChecksum() uint32 {
+ return (uint32(bg.InodeBitmapChecksumHi) << 16) | uint32(bg.InodeBitmapChecksumLo)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_test.go b/pkg/sentry/fs/ext/disklayout/block_group_test.go
new file mode 100644
index 000000000..0ef4294c0
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/block_group_test.go
@@ -0,0 +1,26 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "testing"
+)
+
+// TestBlockGroupSize tests that the block group descriptor structs are of the
+// correct size: 32 bytes for the 32-bit descriptor and 64 bytes for the 64-bit
+// one.
+//
+// assertSize is a helper defined outside this file; presumably it fails the
+// test when the struct's size differs from the given byte count -- confirm
+// against its definition.
+func TestBlockGroupSize(t *testing.T) {
+ assertSize(t, BlockGroup32Bit{}, 32)
+ assertSize(t, BlockGroup64Bit{}, 64)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go
new file mode 100644
index 000000000..685bf57b8
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/dirent.go
@@ -0,0 +1,69 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+const (
+ // MaxFileName is the maximum length of an ext fs file's name. It is also
+ // the size of the fixed FileNameRaw array in the dirent structs.
+ MaxFileName = 255
+)
+
+var (
+ // inodeTypeByFileType maps ext4 file types to vfs inode types.
+ //
+ // The keys are the raw file type codes as stored in DirentNew.FileTypeRaw.
+ // A code that is missing from this map makes DirentNew.FileType panic.
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#ftype.
+ inodeTypeByFileType = map[uint8]fs.InodeType{
+ 0: fs.Anonymous,
+ 1: fs.RegularFile,
+ 2: fs.Directory,
+ 3: fs.CharacterDevice,
+ 4: fs.BlockDevice,
+ 5: fs.Pipe,
+ 6: fs.Socket,
+ 7: fs.Symlink,
+ }
+)
+
+// The Dirent interface should be implemented by structs representing ext
+// directory entries. These are for the linear classical directories which
+// just store a list of dirent structs. A directory is a series of data blocks
+// where each data block contains a linear array of dirents. The last entry
+// of the block has a record size that takes it to the end of the block. The
+// end of the directory is when you read dirInode.Size() bytes from the blocks.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories.
+type Dirent interface {
+ // Inode returns the absolute inode number of the underlying inode.
+ // Inode number 0 signifies an unused dirent.
+ Inode() uint32
+
+ // RecordSize returns the record length of this dirent on disk. The next
+ // dirent in the dirent list should be read after these many bytes from
+ // the current dirent. Must be a multiple of 4.
+ RecordSize() uint16
+
+ // FileName returns the name of the file. Can be at most 255 bytes in length.
+ FileName() string
+
+ // FileType returns the inode type of the underlying inode. This is a
+ // performance hack so that we do not have to read the underlying inode struct
+ // to know the type of inode. This will only work when the SbDirentFileType
+ // feature is set. If not, the second returned value will be false indicating
+ // that user code has to use the inode mode to extract the file type.
+ FileType() (fs.InodeType, bool)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fs/ext/disklayout/dirent_new.go
new file mode 100644
index 000000000..29ae4a5c2
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/dirent_new.go
@@ -0,0 +1,61 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+// DirentNew represents the ext4 directory entry struct. This emulates Linux's
+// ext4_dir_entry_2 struct. The FileName can not be more than 255 bytes so we
+// only need 8 bits to store the NameLength. As a result, NameLength has been
+// shortened and the other 8 bits are used to encode the file type. Use the
+// FileTypeRaw field only if the SbDirentFileType feature is set.
+//
+// Note: This struct can be of variable size on disk. The one described below
+// is of maximum size and the FileName beyond NameLength bytes might contain
+// garbage.
+type DirentNew struct {
+ InodeNumber uint32
+ RecordLength uint16
+ NameLength uint8
+ FileTypeRaw uint8
+ FileNameRaw [MaxFileName]byte
+}
+
+// Compiles only if DirentNew implements Dirent.
+var _ Dirent = (*DirentNew)(nil)
+
+// Inode implements Dirent.Inode.
+func (d *DirentNew) Inode() uint32 { return d.InodeNumber }
+
+// RecordSize implements Dirent.RecordSize.
+func (d *DirentNew) RecordSize() uint16 { return d.RecordLength }
+
+// FileName implements Dirent.FileName.
+//
+// NameLength is a uint8, so it can never exceed the MaxFileName (255) bytes
+// held by FileNameRaw and the slice below is always in range.
+func (d *DirentNew) FileName() string {
+ return string(d.FileNameRaw[:d.NameLength])
+}
+
+// FileType implements Dirent.FileType.
+//
+// This method does not check whether the SbDirentFileType feature is set; it
+// always decodes FileTypeRaw, so callers must verify the feature themselves
+// before trusting the result. The second return value is always true here. A
+// FileTypeRaw value that is not a key of inodeTypeByFileType causes a panic.
+func (d *DirentNew) FileType() (fs.InodeType, bool) {
+ if inodeType, ok := inodeTypeByFileType[d.FileTypeRaw]; ok {
+ return inodeType, true
+ }
+
+ panic(fmt.Sprintf("unknown file type %v", d.FileTypeRaw))
+}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go
new file mode 100644
index 000000000..2e0f9c812
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go
@@ -0,0 +1,50 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import "gvisor.dev/gvisor/pkg/sentry/fs"
+
+// DirentOld represents the old directory entry struct which does not contain
+// the file type. This emulates Linux's ext4_dir_entry struct. This is used in
+// ext2, ext3 and sometimes in ext4.
+//
+// Note: This struct can be of variable size on disk. The one described below
+// is of maximum size and the FileName beyond NameLength bytes might contain
+// garbage.
+type DirentOld struct {
+	InodeNumber  uint32
+	RecordLength uint16
+	NameLength   uint16
+	FileNameRaw  [MaxFileName]byte
+}
+
+// Compiles only if DirentOld implements Dirent.
+var _ Dirent = (*DirentOld)(nil)
+
+// Inode implements Dirent.Inode.
+func (d *DirentOld) Inode() uint32 { return d.InodeNumber }
+
+// RecordSize implements Dirent.RecordSize.
+func (d *DirentOld) RecordSize() uint16 { return d.RecordLength }
+
+// FileName implements Dirent.FileName.
+//
+// NameLength is a full 16-bit value read from disk, so a corrupt entry can
+// claim more bytes than FileNameRaw holds. The length is clamped to the size
+// of FileNameRaw so that such an entry yields a truncated name instead of a
+// slice-bounds panic.
+func (d *DirentOld) FileName() string {
+	n := int(d.NameLength)
+	if n > len(d.FileNameRaw) {
+		n = len(d.FileNameRaw)
+	}
+	return string(d.FileNameRaw[:n])
+}
+
+// FileType implements Dirent.FileType.
+//
+// The old dirent format carries no file type, so this always returns
+// (fs.Anonymous, false); callers have to derive the type from the inode mode.
+func (d *DirentOld) FileType() (fs.InodeType, bool) {
+	return fs.Anonymous, false
+}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go
new file mode 100644
index 000000000..cc6dff2c9
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "testing"
+)
+
+// TestDirentSize tests that the dirent structs are of the correct
+// size.
+//
+// Both layouts are expected to occupy 263 bytes: 4 (inode number) + 2 (record
+// length) + 2 (name length, or name length plus file type) + 255 (file name).
+//
+// assertSize is a helper defined outside this file; presumably it fails the
+// test when the struct's size differs from the given byte count -- confirm
+// against its definition.
+func TestDirentSize(t *testing.T) {
+ want := uintptr(263)
+
+ assertSize(t, DirentOld{}, want)
+ assertSize(t, DirentNew{}, want)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fs/ext/disklayout/disklayout.go
new file mode 100644
index 000000000..bdf4e2132
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/disklayout.go
@@ -0,0 +1,50 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package disklayout provides Linux ext file system's disk level structures
+// which can be directly read into from the underlying device. Structs aim to
+// emulate structures `exactly` how they are laid out on disk.
+//
+// This library aims to be compatible with all ext(2/3/4) systems so it
+// provides a generic interface for all major structures and various
+// implementations (for different versions). The user code is responsible for
+// using appropriate implementations based on the underlying device.
+//
+// Interfacing all major structures here serves a few purposes:
+// - Abstracts away the complexity of the underlying structure from client
+// code. The client only has to figure out versioning on set up and then
+// can use these as black boxes and pass it higher up the stack.
+// - Having pointer receivers forces the user to use pointers to these
+// heavy structs. Hence, prevents the client code from unintentionally
+// copying these by value while passing the interface around.
+// - Version-based implementation selection is resolved on set up hence
+// avoiding per call overhead of choosing implementation.
+// - All interface methods are pretty light weight (do not take in any
+// parameters by design). Passing pointer arguments to interface methods
+// can lead to heap allocation as the compiler won't be able to perform
+// escape analysis on an unknown implementation at compile time.
+//
+// Notes:
+// - All fields in these structs are exported because binary.Read would
+// panic otherwise.
+// - All structures on disk are in little-endian order. Only jbd2 (journal)
+// structures are in big-endian order.
+// - All OS dependent fields in these structures will be interpreted using
+// the Linux version of that field.
+// - The suffix `Lo` in field names stands for lower bits of that field.
+// - The suffix `Hi` in field names stands for upper bits of that field.
+// - The suffix `Raw` has been added to indicate that the field is not split
+// into Lo and Hi fields and also to resolve name collision with the
+// respective interface.
+package disklayout
diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go
new file mode 100644
index 000000000..b48001910
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/inode.go
@@ -0,0 +1,267 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+)
+
+// The Inode interface must be implemented by structs representing ext inodes.
+// The inode stores all the metadata pertaining to the file (except for the
+// file name which is held by the directory entry). It does NOT expose all
+// fields and should be extended if need be.
+//
+// Some file systems (e.g. FAT) use the directory entry to store all this
+// information. Ext file systems do not so that they can support hard links.
+// However, ext4 cheats a little bit and duplicates the file type in the
+// directory entry for performance gains.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
+type Inode interface {
+ // Mode returns the Linux file mode which is mainly used to extract
+ // information like:
+ // - File permissions (read/write/execute by user/group/others).
+ // - Sticky, set UID and GID bits.
+ // - File type.
+ //
+ // Masks to extract this information are provided in pkg/abi/linux/file.go.
+ Mode() linux.FileMode
+
+ // UID returns the owner UID.
+ UID() auth.KUID
+
+ // GID returns the owner GID.
+ GID() auth.KGID
+
+ // Size returns the size of the file in bytes.
+ Size() uint64
+
+ // InodeSize returns the size of this inode struct in bytes.
+ // In ext2 and ext3, the inode struct and inode disk record size was fixed at
+ // 128 bytes. Ext4 makes it possible for the inode struct to be bigger.
+ // However, accessing any field beyond the 128 bytes marker must be verified
+ // using this method.
+ InodeSize() uint16
+
+ // AccessTime returns the last access time. Shows when the file was last read.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the extended attribute value checksum.
+ AccessTime() time.Time
+
+ // ChangeTime returns the last change time. Shows when the file metadata
+ // (like permissions) was last changed.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because the
+ // underlying field is used to store the lower 32 bits of the attribute
+ // value's reference count.
+ ChangeTime() time.Time
+
+ // ModificationTime returns the last modification time. Shows when the file
+ // content was last modified.
+ //
+ // If InExtendedAttr is set, then this should NOT be used because
+ // the underlying field contains the number of the inode that owns the
+ // extended attribute.
+ ModificationTime() time.Time
+
+ // DeletionTime returns the deletion time. Inodes are marked as deleted by
+ // writing to the underlying field. FS tools can restore files until they are
+ // actually overwritten.
+ DeletionTime() time.Time
+
+ // LinksCount returns the number of hard links to this inode.
+ //
+ // Normally there is an upper limit on the number of hard links:
+ // - ext2/ext3 = 32,000
+ // - ext4 = 65,000
+ //
+ // This implies that an ext4 directory cannot have more than 64,998
+ // subdirectories because each subdirectory will have a hard link to the
+ // directory via the `..` entry. The directory has hard link via the `.` entry
+ // of its own. And finally the inode is initiated with 1 hard link (itself).
+ //
+ // The underlying value is reset to 1 if all the following hold:
+ // - Inode is a directory.
+ // - SbDirNlink is enabled.
+ // - Number of hard links is incremented past 64,999.
+ // Hard link value of 1 for a directory would indicate that the number of hard
+ // links is unknown because a directory has at least 2 hard links (itself
+ // and `.` entry).
+ LinksCount() uint16
+
+ // Flags returns InodeFlags which represents the inode flags.
+ Flags() InodeFlags
+
+ // Blocks returns the underlying inode.i_block array. This field is special
+ // and is used to store various kinds of things depending on the filesystem
+ // version and inode type.
+ // - In ext2/ext3, it contains the block map.
+ // - In ext4, it contains the extent tree.
+ // - For inline files, it contains the file contents.
+ // - For symlinks, it contains the link path (if it fits here).
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#the-contents-of-inode-i-block.
+ Blocks() [60]byte
+}
+
+// Inode flags. This is not comprehensive and flags which were not used in
+// the Linux kernel have been excluded.
+const (
+ // InSync indicates that all writes to the file must be synchronous.
+ InSync = 0x8
+
+ // InImmutable indicates that this file is immutable.
+ InImmutable = 0x10
+
+ // InAppend indicates that this file can only be appended to.
+ InAppend = 0x20
+
+ // InNoDump indicates that the dump(1) utility should not dump this file.
+ InNoDump = 0x40
+
+ // InNoAccessTime indicates that the access time of this inode must not be
+ // updated.
+ InNoAccessTime = 0x80
+
+ // InIndex indicates that this directory has hashed indexes.
+ InIndex = 0x1000
+
+ // InJournalData indicates that file data must always be written through a
+ // journal device.
+ InJournalData = 0x4000
+
+ // InDirSync indicates that all the directory entry data must be written
+ // synchronously.
+ InDirSync = 0x10000
+
+ // InTopDir indicates that this inode is at the top of the directory hierarchy.
+ InTopDir = 0x20000
+
+ // InHugeFile indicates that this is a huge file.
+ InHugeFile = 0x40000
+
+ // InExtents indicates that this inode uses extents.
+ InExtents = 0x80000
+
+ // InExtendedAttr indicates that this inode stores a large extended attribute
+ // value in its data blocks.
+ InExtendedAttr = 0x200000
+
+ // InInline indicates that this inode has inline data.
+ InInline = 0x10000000
+
+ // InReserved indicates that this inode is reserved for the ext4 library.
+ InReserved = 0x80000000
+)
+
+// InodeFlags represents all possible combinations of inode flags. Each field
+// mirrors the In* bit mask of the same name, giving a friendlier interface.
+type InodeFlags struct {
+ Sync bool
+ Immutable bool
+ Append bool
+ NoDump bool
+ NoAccessTime bool
+ Index bool
+ JournalData bool
+ DirSync bool
+ TopDir bool
+ HugeFile bool
+ Extents bool
+ ExtendedAttr bool
+ Inline bool
+ Reserved bool
+}
+
+// ToInt packs the inode flags back into their 32-bit bit mask representation.
+func (f InodeFlags) ToInt() uint32 {
+ var res uint32
+
+ if f.Sync {
+ res |= InSync
+ }
+ if f.Immutable {
+ res |= InImmutable
+ }
+ if f.Append {
+ res |= InAppend
+ }
+ if f.NoDump {
+ res |= InNoDump
+ }
+ if f.NoAccessTime {
+ res |= InNoAccessTime
+ }
+ if f.Index {
+ res |= InIndex
+ }
+ if f.JournalData {
+ res |= InJournalData
+ }
+ if f.DirSync {
+ res |= InDirSync
+ }
+ if f.TopDir {
+ res |= InTopDir
+ }
+ if f.HugeFile {
+ res |= InHugeFile
+ }
+ if f.Extents {
+ res |= InExtents
+ }
+ if f.ExtendedAttr {
+ res |= InExtendedAttr
+ }
+ if f.Inline {
+ res |= InInline
+ }
+ if f.Reserved {
+ res |= InReserved
+ }
+
+ return res
+}
+
+// InodeFlagsFromInt converts the integer representation of inode flags to
+// an InodeFlags struct. Bits without a matching In* mask are dropped.
+func InodeFlagsFromInt(f uint32) InodeFlags {
+ return InodeFlags{
+ Sync: f&InSync > 0,
+ Immutable: f&InImmutable > 0,
+ Append: f&InAppend > 0,
+ NoDump: f&InNoDump > 0,
+ NoAccessTime: f&InNoAccessTime > 0,
+ Index: f&InIndex > 0,
+ JournalData: f&InJournalData > 0,
+ DirSync: f&InDirSync > 0,
+ TopDir: f&InTopDir > 0,
+ HugeFile: f&InHugeFile > 0,
+ Extents: f&InExtents > 0,
+ ExtendedAttr: f&InExtendedAttr > 0,
+ Inline: f&InInline > 0,
+ Reserved: f&InReserved > 0,
+ }
+}
+
+// These masks define which inode flags users can view/modify. The rest of the
+// flags are for internal kernel usage only.
+const (
+ InUserReadFlagMask = 0x4BDFFF
+ InUserWriteFlagMask = 0x4B80FF
+)
diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fs/ext/disklayout/inode_new.go
new file mode 100644
index 000000000..4f5348372
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/inode_new.go
@@ -0,0 +1,96 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+
+// InodeNew represents ext4 inode structure which can be bigger than
+// oldInodeSize. The actual size of this struct should be determined using
+// InodeNew.ExtraInodeSize. Accessing any field here should be verified with the
+// actual size. The extra space between the end of the inode struct and end of
+// the inode record can be used to store extended attr.
+//
+// If the TimeExtra fields are in scope, the lower 2 bits of those are used
+// to extend their counterpart to be 34 bits wide; the remaining (upper) 30 bits
+// are used to provide nanosecond precision. Hence, these timestamps will now
+// overflow in May 2446.
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+type InodeNew struct {
+ InodeOld
+
+ ExtraInodeSize uint16
+ ChecksumHi uint16
+ ChangeTimeExtra uint32
+ ModificationTimeExtra uint32
+ AccessTimeExtra uint32
+ CreationTime uint32
+ CreationTimeExtra uint32
+ VersionHi uint32
+ ProjectID uint32
+}
+
+// Compile-time check that *InodeNew implements Inode.
+var _ Inode = (*InodeNew)(nil)
+
+// fromExtraTime decodes the extra time and constructs the kernel time struct
+// with nanosecond precision. lo is the signed 32-bit seconds field.
+func fromExtraTime(lo int32, extra uint32) time.Time {
+ // Low 2 bits of extra extend the seconds; the upper 30 bits are nanoseconds.
+ seconds := (int64(extra&0x3) << 32) + int64(lo)
+ nanoseconds := int64(extra >> 2)
+ return time.FromUnix(seconds, nanoseconds)
+}
+
+// Only override methods which change due to ext4 specific fields.
+
+// Size implements Inode.Size by joining SizeHi and SizeLo into 64 bits.
+func (in *InodeNew) Size() uint64 {
+ return (uint64(in.SizeHi) << 32) | uint64(in.SizeLo)
+}
+
+// InodeSize implements Inode.InodeSize as oldInodeSize + ExtraInodeSize.
+func (in *InodeNew) InodeSize() uint16 {
+ return oldInodeSize + in.ExtraInodeSize
+}
+
+// ChangeTime implements Inode.ChangeTime.
+func (in *InodeNew) ChangeTime() time.Time {
+ // Apply new timestamp logic if in.ChangeTimeExtra is in scope; otherwise
+ // fall back to the InodeOld implementation below.
+ if in.ExtraInodeSize >= 8 {
+ return fromExtraTime(in.ChangeTimeRaw, in.ChangeTimeExtra)
+ }
+
+ return in.InodeOld.ChangeTime()
+}
+
+// ModificationTime implements Inode.ModificationTime.
+func (in *InodeNew) ModificationTime() time.Time {
+ // Apply new timestamp logic if in.ModificationTimeExtra is in scope.
+ if in.ExtraInodeSize >= 12 {
+ return fromExtraTime(in.ModificationTimeRaw, in.ModificationTimeExtra)
+ }
+
+ return in.InodeOld.ModificationTime()
+}
+
+// AccessTime implements Inode.AccessTime.
+func (in *InodeNew) AccessTime() time.Time {
+ // Apply new timestamp logic if in.AccessTimeExtra is in scope.
+ if in.ExtraInodeSize >= 16 {
+ return fromExtraTime(in.AccessTimeRaw, in.AccessTimeExtra)
+ }
+
+ return in.InodeOld.AccessTime()
+}
diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fs/ext/disklayout/inode_old.go
new file mode 100644
index 000000000..dc4c9d8e4
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/inode_old.go
@@ -0,0 +1,117 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+)
+
+const (
+ // oldInodeSize is the inode size (in bytes) in ext2/ext3.
+ oldInodeSize = 128
+)
+
+// InodeOld implements Inode interface. It emulates ext2/ext3 inode struct.
+// Inode struct size and record size are both 128 bytes for this.
+//
+// All fields representing time are in seconds since the epoch, which means that
+// they will overflow in January 2038.
+type InodeOld struct {
+ ModeRaw uint16
+ UIDLo uint16
+ SizeLo uint32
+
+ // The time fields are signed integers because they could be negative to
+ // represent time before the epoch.
+ AccessTimeRaw int32
+ ChangeTimeRaw int32
+ ModificationTimeRaw int32
+ DeletionTimeRaw int32
+
+ GIDLo uint16
+ LinksCountRaw uint16
+ BlocksCountLo uint32
+ FlagsRaw uint32
+ VersionLo uint32 // This is OS dependent.
+ BlocksRaw [60]byte
+ Generation uint32
+ FileACLLo uint32
+ SizeHi uint32
+ ObsoFaddr uint32
+
+ // OS dependent fields have been inlined here.
+ BlocksCountHi uint16
+ FileACLHi uint16
+ UIDHi uint16
+ GIDHi uint16
+ ChecksumLo uint16
+ _ uint16
+}
+
+// Compile-time check that *InodeOld implements Inode.
+var _ Inode = (*InodeOld)(nil)
+
+// Mode implements Inode.Mode by converting ModeRaw to linux.FileMode.
+func (in *InodeOld) Mode() linux.FileMode { return linux.FileMode(in.ModeRaw) }
+
+// UID implements Inode.UID by joining UIDHi (upper 16 bits) and UIDLo.
+func (in *InodeOld) UID() auth.KUID {
+ return auth.KUID((uint32(in.UIDHi) << 16) | uint32(in.UIDLo))
+}
+
+// GID implements Inode.GID by joining GIDHi (upper 16 bits) and GIDLo.
+func (in *InodeOld) GID() auth.KGID {
+ return auth.KGID((uint32(in.GIDHi) << 16) | uint32(in.GIDLo))
+}
+
+// Size implements Inode.Size. Only SizeLo contributes to the result.
+func (in *InodeOld) Size() uint64 {
+ // In ext2/ext3, in.SizeHi did not exist, it was instead named in.DirACL.
+ return uint64(in.SizeLo)
+}
+
+// InodeSize implements Inode.InodeSize. It always returns oldInodeSize.
+func (in *InodeOld) InodeSize() uint16 { return oldInodeSize }
+
+// AccessTime implements Inode.AccessTime with whole-second precision.
+func (in *InodeOld) AccessTime() time.Time {
+ return time.FromUnix(int64(in.AccessTimeRaw), 0)
+}
+
+// ChangeTime implements Inode.ChangeTime with whole-second precision.
+func (in *InodeOld) ChangeTime() time.Time {
+ return time.FromUnix(int64(in.ChangeTimeRaw), 0)
+}
+
+// ModificationTime implements Inode.ModificationTime (whole seconds only).
+func (in *InodeOld) ModificationTime() time.Time {
+ return time.FromUnix(int64(in.ModificationTimeRaw), 0)
+}
+
+// DeletionTime implements Inode.DeletionTime with whole-second precision.
+func (in *InodeOld) DeletionTime() time.Time {
+ return time.FromUnix(int64(in.DeletionTimeRaw), 0)
+}
+
+// LinksCount implements Inode.LinksCount by returning LinksCountRaw.
+func (in *InodeOld) LinksCount() uint16 { return in.LinksCountRaw }
+
+// Flags implements Inode.Flags by decoding FlagsRaw with InodeFlagsFromInt.
+func (in *InodeOld) Flags() InodeFlags { return InodeFlagsFromInt(in.FlagsRaw) }
+
+// Blocks implements Inode.Blocks. The BlocksRaw array is returned by value.
+func (in *InodeOld) Blocks() [60]byte { return in.BlocksRaw }
diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fs/ext/disklayout/inode_test.go
new file mode 100644
index 000000000..9cae9e4f0
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/inode_test.go
@@ -0,0 +1,222 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "fmt"
+ "strconv"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+)
+
+// TestInodeSize tests that the inode structs are of the expected size.
+func TestInodeSize(t *testing.T) {
+ assertSize(t, InodeOld{}, oldInodeSize)
+
+ // This was updated from 156 bytes to 160 bytes in Oct 2015.
+ assertSize(t, InodeNew{}, 160)
+}
+
+// TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in
+// ext4 inode structs is decoded correctly by fromExtraTime.
+//
+// These tests are derived from the table under https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+func TestTimestampSeconds(t *testing.T) {
+ type timestampTest struct {
+ // msbSet tells if the most significant bit of InodeOld.[X]TimeRaw is set.
+ // If this is set then the 32-bit time is negative.
+ msbSet bool
+
+ // lowerBound tells if we should take the lowest possible value of
+ // InodeOld.[X]TimeRaw while satisfying test.msbSet condition. If set to
+ // false it tells to take the highest possible value.
+ lowerBound bool
+
+ // extraBits is InodeNew.[X]TimeExtra.
+ extraBits uint32
+
+ // want is the kernel time struct that is expected.
+ want time.Time
+ }
+
+ tests := []timestampTest{
+ // 1901-12-13
+ {
+ msbSet: true,
+ lowerBound: true,
+ extraBits: 0,
+ want: time.FromUnix(int64(-0x80000000), 0),
+ },
+
+ // 1969-12-31
+ {
+ msbSet: true,
+ lowerBound: false,
+ extraBits: 0,
+ want: time.FromUnix(int64(-1), 0),
+ },
+
+ // 1970-01-01
+ {
+ msbSet: false,
+ lowerBound: true,
+ extraBits: 0,
+ want: time.FromUnix(int64(0), 0),
+ },
+
+ // 2038-01-19
+ {
+ msbSet: false,
+ lowerBound: false,
+ extraBits: 0,
+ want: time.FromUnix(int64(0x7fffffff), 0),
+ },
+
+ // 2038-01-19
+ {
+ msbSet: true,
+ lowerBound: true,
+ extraBits: 1,
+ want: time.FromUnix(int64(0x80000000), 0),
+ },
+
+ // 2106-02-07
+ {
+ msbSet: true,
+ lowerBound: false,
+ extraBits: 1,
+ want: time.FromUnix(int64(0xffffffff), 0),
+ },
+
+ // 2106-02-07
+ {
+ msbSet: false,
+ lowerBound: true,
+ extraBits: 1,
+ want: time.FromUnix(int64(0x100000000), 0),
+ },
+
+ // 2174-02-25
+ {
+ msbSet: false,
+ lowerBound: false,
+ extraBits: 1,
+ want: time.FromUnix(int64(0x17fffffff), 0),
+ },
+
+ // 2174-02-25
+ {
+ msbSet: true,
+ lowerBound: true,
+ extraBits: 2,
+ want: time.FromUnix(int64(0x180000000), 0),
+ },
+
+ // 2242-03-16
+ {
+ msbSet: true,
+ lowerBound: false,
+ extraBits: 2,
+ want: time.FromUnix(int64(0x1ffffffff), 0),
+ },
+
+ // 2242-03-16
+ {
+ msbSet: false,
+ lowerBound: true,
+ extraBits: 2,
+ want: time.FromUnix(int64(0x200000000), 0),
+ },
+
+ // 2310-04-04
+ {
+ msbSet: false,
+ lowerBound: false,
+ extraBits: 2,
+ want: time.FromUnix(int64(0x27fffffff), 0),
+ },
+
+ // 2310-04-04
+ {
+ msbSet: true,
+ lowerBound: true,
+ extraBits: 3,
+ want: time.FromUnix(int64(0x280000000), 0),
+ },
+
+ // 2378-04-22
+ {
+ msbSet: true,
+ lowerBound: false,
+ extraBits: 3,
+ want: time.FromUnix(int64(0x2ffffffff), 0),
+ },
+
+ // 2378-04-22
+ {
+ msbSet: false,
+ lowerBound: true,
+ extraBits: 3,
+ want: time.FromUnix(int64(0x300000000), 0),
+ },
+
+ // 2446-05-10
+ {
+ msbSet: false,
+ lowerBound: false,
+ extraBits: 3,
+ want: time.FromUnix(int64(0x37fffffff), 0),
+ },
+ }
+
+ lowerMSB0 := int32(0) // binary: 00000000 00000000 00000000 00000000
+ upperMSB0 := int32(0x7fffffff) // binary: 01111111 11111111 11111111 11111111
+ lowerMSB1 := int32(-0x80000000) // binary: 10000000 00000000 00000000 00000000
+ upperMSB1 := int32(-1) // binary: 11111111 11111111 11111111 11111111
+
+ get32BitTime := func(test timestampTest) int32 {
+ if test.msbSet {
+ if test.lowerBound {
+ return lowerMSB1
+ }
+
+ return upperMSB1
+ }
+
+ if test.lowerBound {
+ return lowerMSB0
+ }
+
+ return upperMSB0
+ }
+
+ getTestName := func(test timestampTest) string {
+ return fmt.Sprintf(
+ "Tests time decoding with epoch bits 0b%s and 32-bit raw time: MSB set=%t, lower bound=%t",
+ strconv.FormatInt(int64(test.extraBits), 2),
+ test.msbSet,
+ test.lowerBound,
+ )
+ }
+
+ for _, test := range tests {
+ t.Run(getTestName(test), func(t *testing.T) {
+ if got := fromExtraTime(get32BitTime(test), test.extraBits); got != test.want {
+ t.Errorf("Expected: %v, Got: %v", test.want, got)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go
new file mode 100644
index 000000000..e4b8f46fb
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/superblock.go
@@ -0,0 +1,468 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// SuperBlock should be implemented by structs representing the ext superblock.
+// The superblock holds a lot of information about the enclosing filesystem.
+// This interface aims to provide access methods to important information held
+// by the superblock. It does NOT expose all fields of the superblock, only the
+// ones necessary. This can be expanded when need be.
+//
+// Location and replication:
+// - The superblock is located at offset 1024 in block group 0.
+// - Redundant copies of the superblock and group descriptors are kept in
+// all groups if SbSparse feature flag is NOT set. If it is set, the
+// replicas only exist in groups whose group number is either 0 or a
+// power of 3, 5, or 7.
+// - There is also a sparse superblock feature v2 in which there are just
+// two replicas saved in the block groups pointed by sb.s_backup_bgs.
+//
+// Replicas should eventually be updated if the superblock is updated.
+//
+// See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block.
+type SuperBlock interface {
+ // InodesCount returns the total number of inodes in this filesystem.
+ InodesCount() uint32
+
+ // BlocksCount returns the total number of data blocks in this filesystem.
+ BlocksCount() uint64
+
+ // FreeBlocksCount returns the number of free blocks in this filesystem.
+ FreeBlocksCount() uint64
+
+ // FreeInodesCount returns the number of free inodes in this filesystem.
+ FreeInodesCount() uint32
+
+ // MountCount returns the number of mounts since the last fsck.
+ MountCount() uint16
+
+ // MaxMountCount returns the number of mounts allowed beyond which a fsck is
+ // needed.
+ MaxMountCount() uint16
+
+ // FirstDataBlock returns the absolute block number of the first data block,
+ // which contains the super block itself.
+ //
+ // If the filesystem has 1kb data blocks then this should return 1. For all
+ // other configurations, this typically returns 0.
+ //
+ // The first block group descriptor is in (FirstDataBlock() + 1)th block.
+ FirstDataBlock() uint32
+
+ // BlockSize returns the size of one data block in this filesystem.
+ // This can be calculated by 2^(10 + sb.s_log_block_size). This ensures that
+ // the smallest block size is 1kb.
+ BlockSize() uint64
+
+ // BlocksPerGroup returns the number of data blocks in a block group.
+ BlocksPerGroup() uint32
+
+ // ClusterSize returns block cluster size (set during mkfs time by admin).
+ // This can be calculated by 2^(10 + sb.s_log_cluster_size). This ensures that
+ // the smallest cluster size is 1kb.
+ //
+ // sb.s_log_cluster_size must equal sb.s_log_block_size if bigalloc feature
+ // is NOT set and consequently BlockSize() = ClusterSize() in that case.
+ ClusterSize() uint64
+
+ // ClustersPerGroup returns:
+ // - number of clusters per group if bigalloc is enabled.
+ // - BlocksPerGroup() otherwise.
+ ClustersPerGroup() uint32
+
+ // InodeSize returns the size of the inode disk record in bytes. Use this
+ // to iterate over inode arrays on disk.
+ //
+ // In ext2 and ext3:
+ // - Each inode had a disk record of 128 bytes.
+ // - The inode struct size was fixed at 128 bytes.
+ //
+ // In ext4 it is possible to allocate larger on-disk inodes:
+ // - Inode disk record size = sb.s_inode_size (function return value).
+ // = 256 (default)
+ // - Inode struct size = 128 + inode.i_extra_isize.
+ // = 128 + 32 = 160 (default)
+ InodeSize() uint16
+
+ // InodesPerGroup returns the number of inodes in a block group.
+ InodesPerGroup() uint32
+
+ // BgDescSize returns the size of the block group descriptor struct.
+ //
+ // In ext2, ext3, ext4 (without 64-bit feature), the block group descriptor
+ // is only 32 bytes long.
+ // In ext4 with 64-bit feature, the block group descriptor expands to AT LEAST
+ // 64 bytes. It might be bigger than that.
+ BgDescSize() uint16
+
+ // CompatibleFeatures returns the CompatFeatures struct which holds all the
+ // compatible features this fs supports.
+ CompatibleFeatures() CompatFeatures
+
+ // IncompatibleFeatures returns the IncompatFeatures struct which holds all
+ // the incompatible features this fs supports.
+ IncompatibleFeatures() IncompatFeatures
+
+ // ReadOnlyCompatibleFeatures returns the RoCompatFeatures struct which holds
+ // all the readonly compatible features this fs supports.
+ ReadOnlyCompatibleFeatures() RoCompatFeatures
+
+ // Magic returns the magic signature which must be 0xef53.
+ Magic() uint16
+
+ // Revision returns the superblock revision. Superblock struct fields from
+ // offset 0x54 till 0x150 should only be used if superblock has DynamicRev.
+ Revision() SbRevision
+}
+
+// SbRevision is the type for superblock revisions.
+type SbRevision int
+
+// Superblock revisions.
+const (
+ // OldRev is the good old (original) format.
+ OldRev SbRevision = 0
+
+ // DynamicRev is the v2 format with dynamic inode sizes.
+ DynamicRev SbRevision = 1
+)
+
+// Superblock compatible features.
+// This is not exhaustive; unused features are not listed.
+const (
+ // SbDirPrealloc indicates directory preallocation.
+ SbDirPrealloc = 0x1
+
+ // SbHasJournal indicates the presence of a journal. jbd2 should only work
+ // with this being set.
+ SbHasJournal = 0x4
+
+ // SbExtAttr indicates extended attributes support.
+ SbExtAttr = 0x8
+
+ // SbResizeInode indicates that the fs has reserved GDT blocks (right after
+ // group descriptors) for fs expansion.
+ SbResizeInode = 0x10
+
+ // SbDirIndex indicates that the fs has directory indices.
+ SbDirIndex = 0x20
+
+ // SbSparseV2 stands for Sparse superblock version 2.
+ SbSparseV2 = 0x200
+)
+
+// CompatFeatures represents a superblock's compatible feature set. If the
+// kernel does not understand any of these features, it can still read/write
+// to this fs.
+type CompatFeatures struct {
+ DirPrealloc bool
+ HasJournal bool
+ ExtAttr bool
+ ResizeInode bool
+ DirIndex bool
+ SparseV2 bool
+}
+
+// ToInt packs superblock compatible features back into a 32-bit bit mask.
+func (f CompatFeatures) ToInt() uint32 {
+ var res uint32
+
+ if f.DirPrealloc {
+ res |= SbDirPrealloc
+ }
+ if f.HasJournal {
+ res |= SbHasJournal
+ }
+ if f.ExtAttr {
+ res |= SbExtAttr
+ }
+ if f.ResizeInode {
+ res |= SbResizeInode
+ }
+ if f.DirIndex {
+ res |= SbDirIndex
+ }
+ if f.SparseV2 {
+ res |= SbSparseV2
+ }
+
+ return res
+}
+
+// CompatFeaturesFromInt converts the integer representation of superblock
+// compatible features to a CompatFeatures struct. Unlisted bits are dropped.
+func CompatFeaturesFromInt(f uint32) CompatFeatures {
+ return CompatFeatures{
+ DirPrealloc: f&SbDirPrealloc > 0,
+ HasJournal: f&SbHasJournal > 0,
+ ExtAttr: f&SbExtAttr > 0,
+ ResizeInode: f&SbResizeInode > 0,
+ DirIndex: f&SbDirIndex > 0,
+ SparseV2: f&SbSparseV2 > 0,
+ }
+}
+
+// Superblock incompatible features.
+// This is not exhaustive; unused features are not listed.
+const (
+ // SbDirentFileType indicates that directory entries record the file type.
+ // We should use struct ext4_dir_entry_2 for dirents then.
+ SbDirentFileType = 0x2
+
+ // SbRecovery indicates that the filesystem needs recovery.
+ SbRecovery = 0x4
+
+ // SbJournalDev indicates that the filesystem has a separate journal device.
+ SbJournalDev = 0x8
+
+ // SbMetaBG indicates that the filesystem is using Meta block groups. Moves
+ // the group descriptors from the congested first block group into the first
+ // group of each metablock group to increase the maximum block groups limit
+ // and hence support much larger filesystems.
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#meta-block-groups.
+ SbMetaBG = 0x10
+
+ // SbExtents indicates that the filesystem uses extents. Must be set in ext4
+ // filesystems.
+ SbExtents = 0x40
+
+ // SbIs64Bit indicates that this filesystem addresses blocks with 64-bits.
+ // Hence can support 2^64 data blocks.
+ SbIs64Bit = 0x80
+
+ // SbMMP indicates that this filesystem has multiple mount protection.
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#multiple-mount-protection.
+ SbMMP = 0x100
+
+ // SbFlexBg indicates that this filesystem has flexible block groups. Several
+ // block groups are tied into one logical block group so that all the metadata
+ // for the block groups (bitmaps and inode tables) are close together for
+ // faster loading. Consequently, large files will be continuous on disk.
+ // However, this does not affect the placement of redundant superblocks and
+ // group descriptors.
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#flexible-block-groups.
+ SbFlexBg = 0x200
+
+ // SbLargeDir indicates that large directories are enabled. Directory htree
+ // can be 3 levels deep. Directory htrees can be 2 levels deep otherwise.
+ SbLargeDir = 0x4000
+
+ // SbInlineData allows inline data in inodes for really small files.
+ SbInlineData = 0x8000
+
+ // SbEncrypted indicates that this fs contains encrypted inodes.
+ SbEncrypted = 0x10000
+)
+
+// IncompatFeatures represents a superblock's incompatible feature set. If the
+// kernel does not understand any of these features, it should refuse to mount.
+type IncompatFeatures struct {
+ DirentFileType bool
+ Recovery bool
+ JournalDev bool
+ MetaBG bool
+ Extents bool
+ Is64Bit bool
+ MMP bool
+ FlexBg bool
+ LargeDir bool
+ InlineData bool
+ Encrypted bool
+}
+
+// ToInt packs superblock incompatible features back into a 32-bit bit mask.
+func (f IncompatFeatures) ToInt() uint32 {
+ var res uint32
+
+ if f.DirentFileType {
+ res |= SbDirentFileType
+ }
+ if f.Recovery {
+ res |= SbRecovery
+ }
+ if f.JournalDev {
+ res |= SbJournalDev
+ }
+ if f.MetaBG {
+ res |= SbMetaBG
+ }
+ if f.Extents {
+ res |= SbExtents
+ }
+ if f.Is64Bit {
+ res |= SbIs64Bit
+ }
+ if f.MMP {
+ res |= SbMMP
+ }
+ if f.FlexBg {
+ res |= SbFlexBg
+ }
+ if f.LargeDir {
+ res |= SbLargeDir
+ }
+ if f.InlineData {
+ res |= SbInlineData
+ }
+ if f.Encrypted {
+ res |= SbEncrypted
+ }
+
+ return res
+}
+
+// IncompatFeaturesFromInt converts the integer representation of superblock
+// incompatible features to an IncompatFeatures struct.
+func IncompatFeaturesFromInt(f uint32) IncompatFeatures {
+ return IncompatFeatures{
+ DirentFileType: f&SbDirentFileType > 0,
+ Recovery: f&SbRecovery > 0,
+ JournalDev: f&SbJournalDev > 0,
+ MetaBG: f&SbMetaBG > 0,
+ Extents: f&SbExtents > 0,
+ Is64Bit: f&SbIs64Bit > 0,
+ MMP: f&SbMMP > 0,
+ FlexBg: f&SbFlexBg > 0,
+ LargeDir: f&SbLargeDir > 0,
+ InlineData: f&SbInlineData > 0,
+ Encrypted: f&SbEncrypted > 0,
+ }
+}
+
+// Superblock readonly compatible features.
+// This is not exhaustive; unused features are not listed.
+const (
+ // SbSparse indicates sparse superblocks. Only groups with number either 0 or
+ // a power of 3, 5, or 7 will have redundant copies of the superblock and
+ // group descriptors.
+ SbSparse = 0x1
+
+ // SbLargeFile indicates that this fs has been used to store a file >= 2GiB.
+ SbLargeFile = 0x2
+
+ // SbHugeFile indicates that this fs contains files whose sizes are
+ // represented in units of logical blocks, not 512-byte sectors.
+ SbHugeFile = 0x8
+
+ // SbGdtCsum indicates that group descriptors have checksums.
+ SbGdtCsum = 0x10
+
+ // SbDirNlink indicates that the new subdirectory limit is 64,999. Ext3 has a
+ // 32,000 subdirectory limit.
+ SbDirNlink = 0x20
+
+ // SbExtraIsize indicates that large inodes exist on this filesystem.
+ SbExtraIsize = 0x40
+
+ // SbHasSnapshot indicates the existence of a snapshot.
+ SbHasSnapshot = 0x80
+
+ // SbQuota enables usage tracking for all quota types.
+ SbQuota = 0x100
+
+ // SbBigalloc maps to the bigalloc feature. When set, the minimum allocation
+ // unit becomes a cluster rather than a data block. Then block bitmaps track
+ // clusters, not data blocks.
+ //
+ // See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#bigalloc.
+ SbBigalloc = 0x200
+
+ // SbMetadataCsum indicates that the fs supports metadata checksumming.
+ SbMetadataCsum = 0x400
+
+ // SbReadOnly marks this filesystem as readonly. Should refuse to mount in
+ // read/write mode.
+ SbReadOnly = 0x1000
+)
+
+// RoCompatFeatures represents a superblock's readonly compatible feature set.
+// If the kernel does not understand any of these features, it can still mount
+// readonly. But if the user wants to mount read/write, the kernel should
+// refuse to mount.
+type RoCompatFeatures struct {
+ Sparse bool
+ LargeFile bool
+ HugeFile bool
+ GdtCsum bool
+ DirNlink bool
+ ExtraIsize bool
+ HasSnapshot bool
+ Quota bool
+ Bigalloc bool
+ MetadataCsum bool
+ ReadOnly bool
+}
+
+// ToInt packs superblock readonly compatible features into a 32-bit bit mask.
+func (f RoCompatFeatures) ToInt() uint32 {
+ var res uint32
+
+ if f.Sparse {
+ res |= SbSparse
+ }
+ if f.LargeFile {
+ res |= SbLargeFile
+ }
+ if f.HugeFile {
+ res |= SbHugeFile
+ }
+ if f.GdtCsum {
+ res |= SbGdtCsum
+ }
+ if f.DirNlink {
+ res |= SbDirNlink
+ }
+ if f.ExtraIsize {
+ res |= SbExtraIsize
+ }
+ if f.HasSnapshot {
+ res |= SbHasSnapshot
+ }
+ if f.Quota {
+ res |= SbQuota
+ }
+ if f.Bigalloc {
+ res |= SbBigalloc
+ }
+ if f.MetadataCsum {
+ res |= SbMetadataCsum
+ }
+ if f.ReadOnly {
+ res |= SbReadOnly
+ }
+
+ return res
+}
+
+// RoCompatFeaturesFromInt converts the integer representation of superblock
+// readonly compatible features to a RoCompatFeatures struct.
+func RoCompatFeaturesFromInt(f uint32) RoCompatFeatures {
+ return RoCompatFeatures{
+ Sparse: f&SbSparse > 0,
+ LargeFile: f&SbLargeFile > 0,
+ HugeFile: f&SbHugeFile > 0,
+ GdtCsum: f&SbGdtCsum > 0,
+ DirNlink: f&SbDirNlink > 0,
+ ExtraIsize: f&SbExtraIsize > 0,
+ HasSnapshot: f&SbHasSnapshot > 0,
+ Quota: f&SbQuota > 0,
+ Bigalloc: f&SbBigalloc > 0,
+ MetadataCsum: f&SbMetadataCsum > 0,
+ ReadOnly: f&SbReadOnly > 0,
+ }
+}
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go
new file mode 100644
index 000000000..587e4afaa
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
+// the ext4_super_block struct in fs/ext4/ext4.h.
+type SuperBlock32Bit struct {
+ // We embed the old superblock struct here because the 32-bit version is just
+ // an extension of the old version.
+ SuperBlockOld
+
+ FirstInode uint32
+ InodeSizeRaw uint16
+ BlockGroupNumber uint16
+ FeatureCompat uint32
+ FeatureIncompat uint32
+ FeatureRoCompat uint32
+ UUID [16]byte
+ VolumeName [16]byte
+ LastMounted [64]byte
+ AlgoUsageBitmap uint32
+ PreallocBlocks uint8
+ PreallocDirBlocks uint8
+ ReservedGdtBlocks uint16
+ JournalUUID [16]byte
+ JournalInum uint32
+ JournalDev uint32
+ LastOrphan uint32
+ HashSeed [4]uint32
+ DefaultHashVersion uint8
+ JnlBackupType uint8
+ BgDescSizeRaw uint16
+ DefaultMountOpts uint32
+ FirstMetaBg uint32
+ MkfsTime uint32
+ JnlBlocks [17]uint32
+}
+
+// Compiles only if SuperBlock32Bit implements SuperBlock.
+var _ SuperBlock = (*SuperBlock32Bit)(nil)
+
+// Only override methods which change based on the additional fields above.
+// Not overriding SuperBlock.BgDescSize because it would still return 32 here.
+
+// InodeSize implements SuperBlock.InodeSize.
+func (sb *SuperBlock32Bit) InodeSize() uint16 {
+ return sb.InodeSizeRaw
+}
+
+// CompatibleFeatures implements SuperBlock.CompatibleFeatures.
+func (sb *SuperBlock32Bit) CompatibleFeatures() CompatFeatures {
+ return CompatFeaturesFromInt(sb.FeatureCompat)
+}
+
+// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures.
+func (sb *SuperBlock32Bit) IncompatibleFeatures() IncompatFeatures {
+ return IncompatFeaturesFromInt(sb.FeatureIncompat)
+}
+
+// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures.
+func (sb *SuperBlock32Bit) ReadOnlyCompatibleFeatures() RoCompatFeatures {
+ return RoCompatFeaturesFromInt(sb.FeatureRoCompat)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go
new file mode 100644
index 000000000..a2c2278fb
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go
@@ -0,0 +1,94 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of
+// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly
+// 1024 bytes (smallest possible block size) and hence the superblock always
+// fits in no more than one data block.
+type SuperBlock64Bit struct {
+ // We embed the 32-bit struct here because 64-bit version is just an extension
+ // of the 32-bit version.
+ SuperBlock32Bit
+
+ BlocksCountHi uint32
+ ReservedBlocksCountHi uint32
+ FreeBlocksCountHi uint32
+ MinInodeSize uint16
+ WantInodeSize uint16
+ Flags uint32
+ RaidStride uint16
+ MmpInterval uint16
+ MmpBlock uint64
+ RaidStripeWidth uint32
+ LogGroupsPerFlex uint8
+ ChecksumType uint8
+ _ uint16
+ KbytesWritten uint64
+ SnapshotInum uint32
+ SnapshotID uint32
+ SnapshotRsrvBlocksCount uint64
+ SnapshotList uint32
+ ErrorCount uint32
+ FirstErrorTime uint32
+ FirstErrorInode uint32
+ FirstErrorBlock uint64
+ FirstErrorFunction [32]byte
+ FirstErrorLine uint32
+ LastErrorTime uint32
+ LastErrorInode uint32
+ LastErrorLine uint32
+ LastErrorBlock uint64
+ LastErrorFunction [32]byte
+ MountOpts [64]byte
+ UserQuotaInum uint32
+ GroupQuotaInum uint32
+ OverheadBlocks uint32
+ BackupBgs [2]uint32
+ EncryptAlgos [4]uint8
+ EncryptPwSalt [16]uint8
+ LostFoundInode uint32
+ ProjectQuotaInode uint32
+ ChecksumSeed uint32
+ WtimeHi uint8
+ MtimeHi uint8
+ MkfsTimeHi uint8
+ LastCheckHi uint8
+ FirstErrorTimeHi uint8
+ LastErrorTimeHi uint8
+ _ [2]uint8
+ Encoding uint16
+ EncodingFlags uint16
+ _ [95]uint32
+ Checksum uint32
+}
+
+// Compiles only if SuperBlock64Bit implements SuperBlock.
+var _ SuperBlock = (*SuperBlock64Bit)(nil)
+
+// Only override methods which change based on the 64-bit feature.
+
+// BlocksCount implements SuperBlock.BlocksCount.
+func (sb *SuperBlock64Bit) BlocksCount() uint64 {
+ return (uint64(sb.BlocksCountHi) << 32) | uint64(sb.BlocksCountLo)
+}
+
+// FreeBlocksCount implements SuperBlock.FreeBlocksCount.
+func (sb *SuperBlock64Bit) FreeBlocksCount() uint64 {
+ return (uint64(sb.FreeBlocksCountHi) << 32) | uint64(sb.FreeBlocksCountLo)
+}
+
+// BgDescSize implements SuperBlock.BgDescSize.
+func (sb *SuperBlock64Bit) BgDescSize() uint16 { return sb.BgDescSizeRaw }
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go
new file mode 100644
index 000000000..c74953610
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go
@@ -0,0 +1,102 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+// SuperBlockOld implements SuperBlock and represents the old version of the
+// superblock struct in ext2 and ext3 systems.
+type SuperBlockOld struct {
+ InodesCountRaw uint32
+ BlocksCountLo uint32
+ ReservedBlocksCount uint32
+ FreeBlocksCountLo uint32
+ FreeInodesCountRaw uint32
+ FirstDataBlockRaw uint32
+ LogBlockSize uint32
+ LogClusterSize uint32
+ BlocksPerGroupRaw uint32
+ ClustersPerGroupRaw uint32
+ InodesPerGroupRaw uint32
+ Mtime uint32
+ Wtime uint32
+ MountCountRaw uint16
+ MaxMountCountRaw uint16
+ MagicRaw uint16
+ State uint16
+ Errors uint16
+ MinorRevLevel uint16
+ LastCheck uint32
+ CheckInterval uint32
+ CreatorOS uint32
+ RevLevel uint32
+ DefResUID uint16
+ DefResGID uint16
+}
+
+// InodesCount implements SuperBlock.InodesCount.
+func (sb *SuperBlockOld) InodesCount() uint32 { return sb.InodesCountRaw }
+
+// BlocksCount implements SuperBlock.BlocksCount.
+func (sb *SuperBlockOld) BlocksCount() uint64 { return uint64(sb.BlocksCountLo) }
+
+// FreeBlocksCount implements SuperBlock.FreeBlocksCount.
+func (sb *SuperBlockOld) FreeBlocksCount() uint64 { return uint64(sb.FreeBlocksCountLo) }
+
+// FreeInodesCount implements SuperBlock.FreeInodesCount.
+func (sb *SuperBlockOld) FreeInodesCount() uint32 { return sb.FreeInodesCountRaw }
+
+// MountCount implements SuperBlock.MountCount.
+func (sb *SuperBlockOld) MountCount() uint16 { return sb.MountCountRaw }
+
+// MaxMountCount implements SuperBlock.MaxMountCount.
+func (sb *SuperBlockOld) MaxMountCount() uint16 { return sb.MaxMountCountRaw }
+
+// FirstDataBlock implements SuperBlock.FirstDataBlock.
+func (sb *SuperBlockOld) FirstDataBlock() uint32 { return sb.FirstDataBlockRaw }
+
+// BlockSize implements SuperBlock.BlockSize.
+func (sb *SuperBlockOld) BlockSize() uint64 { return 1 << (10 + sb.LogBlockSize) }
+
+// BlocksPerGroup implements SuperBlock.BlocksPerGroup.
+func (sb *SuperBlockOld) BlocksPerGroup() uint32 { return sb.BlocksPerGroupRaw }
+
+// ClusterSize implements SuperBlock.ClusterSize.
+func (sb *SuperBlockOld) ClusterSize() uint64 { return 1 << (10 + sb.LogClusterSize) }
+
+// ClustersPerGroup implements SuperBlock.ClustersPerGroup.
+func (sb *SuperBlockOld) ClustersPerGroup() uint32 { return sb.ClustersPerGroupRaw }
+
+// InodeSize implements SuperBlock.InodeSize.
+func (sb *SuperBlockOld) InodeSize() uint16 { return oldInodeSize }
+
+// InodesPerGroup implements SuperBlock.InodesPerGroup.
+func (sb *SuperBlockOld) InodesPerGroup() uint32 { return sb.InodesPerGroupRaw }
+
+// BgDescSize implements SuperBlock.BgDescSize.
+func (sb *SuperBlockOld) BgDescSize() uint16 { return 32 }
+
+// CompatibleFeatures implements SuperBlock.CompatibleFeatures.
+func (sb *SuperBlockOld) CompatibleFeatures() CompatFeatures { return CompatFeatures{} }
+
+// IncompatibleFeatures implements SuperBlock.IncompatibleFeatures.
+func (sb *SuperBlockOld) IncompatibleFeatures() IncompatFeatures { return IncompatFeatures{} }
+
+// ReadOnlyCompatibleFeatures implements SuperBlock.ReadOnlyCompatibleFeatures.
+func (sb *SuperBlockOld) ReadOnlyCompatibleFeatures() RoCompatFeatures { return RoCompatFeatures{} }
+
+// Magic implements SuperBlock.Magic.
+func (sb *SuperBlockOld) Magic() uint16 { return sb.MagicRaw }
+
+// Revision implements SuperBlock.Revision.
+func (sb *SuperBlockOld) Revision() SbRevision { return SbRevision(sb.RevLevel) }
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_test.go b/pkg/sentry/fs/ext/disklayout/superblock_test.go
new file mode 100644
index 000000000..463b5ba21
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/superblock_test.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "testing"
+)
+
+// TestSuperBlockSize tests that the superblock structs are of the correct
+// size.
+func TestSuperBlockSize(t *testing.T) {
+ assertSize(t, SuperBlockOld{}, 84)
+ assertSize(t, SuperBlock32Bit{}, 336)
+ assertSize(t, SuperBlock64Bit{}, 1024)
+}
diff --git a/pkg/sentry/fs/ext/disklayout/test_utils.go b/pkg/sentry/fs/ext/disklayout/test_utils.go
new file mode 100644
index 000000000..9c63f04c0
--- /dev/null
+++ b/pkg/sentry/fs/ext/disklayout/test_utils.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package disklayout
+
+import (
+ "reflect"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/binary"
+)
+
+func assertSize(t *testing.T, v interface{}, want uintptr) {
+ t.Helper()
+
+ if got := binary.Size(v); got != want {
+ t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got)
+ }
+}
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
new file mode 100644
index 000000000..7602e2bf0
--- /dev/null
+++ b/pkg/sentry/fs/ext/ext.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ext implements readonly ext(2/3/4) filesystems.
+package ext
+
+import (
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+)
+
+// Filesystem implements vfs.FilesystemImpl.
+type Filesystem struct {
+ // dev is the ReadSeeker for the underlying fs device and is protected by mu.
+ dev io.ReadSeeker
+
+ // mu synchronizes the usage of dev. The ext filesystems take locality into
+ // consideration, i.e. data blocks of a file will tend to be placed close
+ // together. On a spinning disk, locality reduces the amount of movement of
+ // the head hence speeding up IO operations. On an SSD there are no moving
+ // parts but locality increases the size of each transfer request. Hence,
+ // having mutual exclusion on the read seeker while reading a file *should*
+ // help in achieving the intended performance gains.
+ //
+ // Note: This synchronization was not coupled with the ReadSeeker itself
+ // because we want to synchronize across read/seek operations for the
+ // performance gains mentioned above. Helps enforce one-file-at-a-time IO.
+ mu sync.Mutex
+
+ // sb represents the filesystem superblock. Immutable after initialization.
+ sb disklayout.SuperBlock
+
+ // bgs represents all the block group descriptors for the filesystem.
+ // Immutable after initialization.
+ bgs []disklayout.BlockGroup
+}