summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- pkg/sentry/fs/ext/BUILD 22
-rw-r--r-- pkg/sentry/fs/ext/dentry.go 33
-rw-r--r-- pkg/sentry/fs/ext/disklayout/dirent_old.go 3
-rw-r--r-- pkg/sentry/fs/ext/disklayout/inode.go 6
-rw-r--r-- pkg/sentry/fs/ext/disklayout/superblock_32.go 3
-rw-r--r-- pkg/sentry/fs/ext/disklayout/superblock_64.go 3
-rw-r--r-- pkg/sentry/fs/ext/disklayout/superblock_old.go 2
-rw-r--r-- pkg/sentry/fs/ext/ext.go 102
-rw-r--r-- pkg/sentry/fs/ext/ext_test.go 407
-rw-r--r-- pkg/sentry/fs/ext/filesystem.go 137
-rw-r--r-- pkg/sentry/fs/inode_overlay.go 6
-rw-r--r-- pkg/sentry/kernel/kernel.go 5
-rw-r--r-- pkg/sentry/kernel/threads.go 12
-rw-r--r-- pkg/sentry/socket/epsocket/stack.go 4
14 files changed, 674 insertions, 71 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index 3ba278e08..2c15875f5 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -7,6 +7,7 @@ go_library(
srcs = [
"dentry.go",
"ext.go",
+ "filesystem.go",
"inode.go",
"utils.go",
],
@@ -15,7 +16,10 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/sentry/context",
"//pkg/sentry/fs/ext/disklayout",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
"//pkg/syserror",
],
)
@@ -23,11 +27,27 @@ go_library(
go_test(
name = "ext_test",
size = "small",
- srcs = ["extent_test.go"],
+ srcs = [
+ "ext_test.go",
+ "extent_test.go",
+ ],
+ data = [
+ "//pkg/sentry/fs/ext:assets/bigfile.txt",
+ "//pkg/sentry/fs/ext:assets/file.txt",
+ "//pkg/sentry/fs/ext:assets/tiny.ext2",
+ "//pkg/sentry/fs/ext:assets/tiny.ext3",
+ "//pkg/sentry/fs/ext:assets/tiny.ext4",
+ ],
embed = [":ext"],
deps = [
+ "//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
"//pkg/sentry/fs/ext/disklayout",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//runsc/test/testutil",
"@com_github_google_go-cmp//cmp:go_default_library",
"@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
],
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
index 71cd217df..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -14,10 +14,43 @@
package ext
+import (
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
// dentry implements vfs.DentryImpl.
type dentry struct {
+ vfsd vfs.Dentry
+
// inode is the inode represented by this dentry. Multiple Dentries may
// share a single non-directory Inode (with hard links). inode is
// immutable.
inode *inode
}
+
+// Compiles only if dentry implements vfs.DentryImpl.
+var _ vfs.DentryImpl = (*dentry)(nil)
+
+// newDentry is the dentry constructor.
+func newDentry(in *inode) *dentry {
+ d := &dentry{
+ inode: in,
+ }
+ d.vfsd.Init(d)
+ return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+ d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+ return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
+ d.inode.decRef(vfsfs.Impl().(*filesystem))
+}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go
index 2e0f9c812..6fff12a6e 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go
@@ -17,8 +17,7 @@ package disklayout
import "gvisor.dev/gvisor/pkg/sentry/fs"
// DirentOld represents the old directory entry struct which does not contain
-// the file type. This emulates Linux's ext4_dir_entry struct. This is used in
-// ext2, ext3 and sometimes in ext4.
+// the file type. This emulates Linux's ext4_dir_entry struct.
//
// Note: This struct can be of variable size on disk. The one described below
// is of maximum size and the FileName beyond NameLength bytes might contain
diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go
index 9ab9a4988..88ae913f5 100644
--- a/pkg/sentry/fs/ext/disklayout/inode.go
+++ b/pkg/sentry/fs/ext/disklayout/inode.go
@@ -20,6 +20,12 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
)
+// Special inodes. See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#special-inodes.
+const (
+ // RootDirInode is the inode number of the root directory inode.
+ RootDirInode = 2
+)
+
// The Inode interface must be implemented by structs representing ext inodes.
// The inode stores all the metadata pertaining to the file (except for the
// file name which is held by the directory entry). It does NOT expose all
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go
index 587e4afaa..53e515fd3 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go
@@ -15,7 +15,8 @@
package disklayout
// SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
-// the ext4_super_block struct in fs/ext4/ext4.h.
+// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if
+// RevLevel = DynamicRev and 64-bit feature is disabled.
type SuperBlock32Bit struct {
// We embed the old superblock struct here because the 32-bit version is just
// an extension of the old version.
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go
index a2c2278fb..7c1053fb4 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go
@@ -17,7 +17,8 @@ package disklayout
// SuperBlock64Bit implements SuperBlock and represents the 64-bit version of
// the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly
// 1024 bytes (smallest possible block size) and hence the superblock always
-// fits in no more than one data block.
+// fits in no more than one data block. Should only be used when the 64-bit
+// feature is set.
type SuperBlock64Bit struct {
// We embed the 32-bit struct here because 64-bit version is just an extension
// of the 32-bit version.
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go
index 5a64aaaa1..9221e0251 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go
@@ -15,7 +15,7 @@
package disklayout
// SuperBlockOld implements SuperBlock and represents the old version of the
-// superblock struct in ext2 and ext3 systems.
+// superblock struct. Should be used only if RevLevel = OldRev.
type SuperBlockOld struct {
InodesCountRaw uint32
BlocksCountLo uint32
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index 7f4287b01..10e235fb1 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -16,86 +16,82 @@
package ext
import (
+ "errors"
+ "fmt"
"io"
- "sync"
+ "os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
- // mu serializes changes to the Dentry tree and the usage of the read seeker.
- mu sync.Mutex
-
- // dev is the ReadSeeker for the underlying fs device. It is protected by mu.
- //
- // The ext filesystems aim to maximize locality, i.e. place all the data
- // blocks of a file close together. On a spinning disk, locality reduces the
- // amount of movement of the head hence speeding up IO operations. On an SSD
- // there are no moving parts but locality increases the size of each transer
- // request. Hence, having mutual exclusion on the read seeker while reading a
- // file *should* help in achieving the intended performance gains.
- //
- // Note: This synchronization was not coupled with the ReadSeeker itself
- // because we want to synchronize across read/seek operations for the
- // performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
- dev io.ReadSeeker
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// Compiles only if filesystemType implements vfs.FilesystemType.
+var _ vfs.FilesystemType = (*filesystemType)(nil)
+
+// getDeviceFd returns the read seeker to the underlying device.
+// Currently there are two ways of mounting an ext(2/3/4) fs:
+// 1. Specify a mount with our internal special MountType in the OCI spec.
+// 2. Expose the device to the container and mount it from application layer.
+func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
+ if opts.InternalData == nil {
+ // User mount call.
+ // TODO(b/134676337): Open the device specified by `source` and return that.
+ panic("unimplemented")
+ }
- // inodeCache maps absolute inode numbers to the corresponding Inode struct.
- // Inodes should be removed from this once their reference count hits 0.
- //
- // Protected by mu because every addition and removal from this corresponds to
- // a change in the dentry tree.
- inodeCache map[uint32]*inode
+ // NewFilesystem call originated from within the sentry.
+ fd, ok := opts.InternalData.(uintptr)
+ if !ok {
+ return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
+ }
- // sb represents the filesystem superblock. Immutable after initialization.
- sb disklayout.SuperBlock
+ // We do not close this file because that would close the underlying device
+ // file descriptor (which is required for reading the fs from disk).
+ // TODO(b/134676337): Use pkg/fd instead.
+ deviceFile := os.NewFile(fd, source)
+ if deviceFile == nil {
+ return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
+ }
- // bgs represents all the block group descriptors for the filesystem.
- // Immutable after initialization.
- bgs []disklayout.BlockGroup
+ return deviceFile, nil
}
-// newFilesystem is the filesystem constructor.
-func newFilesystem(dev io.ReadSeeker) (*filesystem, error) {
- fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
- var err error
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ dev, err := getDeviceFd(source, opts)
+ if err != nil {
+ return nil, nil, err
+ }
+ fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
+ fs.vfsfs.Init(&fs)
fs.sb, err = readSuperBlock(dev)
if err != nil {
- return nil, err
+ return nil, nil, err
}
if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
// mount(2) specifies that EINVAL should be returned if the superblock is
// invalid.
- return nil, syserror.EINVAL
+ return nil, nil, syserror.EINVAL
}
fs.bgs, err = readBlockGroups(dev, fs.sb)
if err != nil {
- return nil, err
- }
-
- return &fs, nil
-}
-
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
-// It creates a new one with the given inode number if one does not exist.
-//
-// Preconditions: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) {
- if in, ok := fs.inodeCache[inodeNum]; ok {
- return in, nil
+ return nil, nil, err
}
- in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum)
+ rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode)
if err != nil {
- return nil, err
+ return nil, nil, err
}
- fs.inodeCache[inodeNum] = in
- return in, nil
+ return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
}
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
new file mode 100644
index 000000000..ee7f7907c
--- /dev/null
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -0,0 +1,407 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+
+ "gvisor.dev/gvisor/runsc/test/testutil"
+)
+
+const (
+ assetsDir = "pkg/sentry/fs/ext/assets"
+)
+
+var (
+ ext2ImagePath = path.Join(assetsDir, "tiny.ext2")
+ ext3ImagePath = path.Join(assetsDir, "tiny.ext3")
+ ext4ImagePath = path.Join(assetsDir, "tiny.ext4")
+)
+
+func beginning(_ uint64) uint64 {
+ return 0
+}
+
+func middle(i uint64) uint64 {
+ return i / 2
+}
+
+func end(i uint64) uint64 {
+ return i
+}
+
+// setUp opens imagePath as an ext Filesystem and returns all necessary
+// elements required to run tests. If error is nil, it also returns a tear
+// down function which must be called after the test is run for clean up.
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
+ localImagePath, err := testutil.FindFile(imagePath)
+ if err != nil {
+ return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
+ }
+
+ f, err := os.Open(localImagePath)
+ if err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ // Mount the ext4 fs and retrieve the inode structure for the file.
+ mockCtx := contexttest.Context(t)
+ fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
+ if err != nil {
+ f.Close()
+ return nil, nil, nil, nil, err
+ }
+
+ tearDown := func() {
+ if err := f.Close(); err != nil {
+ t.Fatalf("tearDown failed: %v", err)
+ }
+ }
+ return mockCtx, fs, d, tearDown, nil
+}
+
+// TestRootDir mounts each test image via setUp and checks the on-disk
+// properties of the root directory inode behind the returned root dentry.
+func TestRootDir(t *testing.T) {
+	type inodeProps struct {
+		Mode      linux.FileMode
+		UID       auth.KUID
+		GID       auth.KGID
+		Size      uint64
+		InodeSize uint16
+		Links     uint16
+		Flags     disklayout.InodeFlags
+	}
+
+	type rootDirTest struct {
+		name      string
+		image     string
+		wantInode inodeProps
+	}
+
+	tests := []rootDirTest{
+		{
+			name:  "ext4 root dir",
+			image: ext4ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+				Flags:     disklayout.InodeFlags{Extents: true},
+			},
+		},
+		{
+			name:  "ext3 root dir",
+			image: ext3ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+			},
+		},
+		{
+			name:  "ext2 root dir",
+			image: ext2ImagePath,
+			wantInode: inodeProps{
+				Mode:      linux.ModeDirectory | 0755,
+				Size:      0x400,
+				InodeSize: 0x80,
+				Links:     3,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			_, _, vfsd, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			d, ok := vfsd.Impl().(*dentry)
+			if !ok {
+				t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
+			}
+
+			// Offload inode contents into local structs for comparison.
+			gotInode := inodeProps{
+				Mode:      d.inode.diskInode.Mode(),
+				UID:       d.inode.diskInode.UID(),
+				GID:       d.inode.diskInode.GID(),
+				Size:      d.inode.diskInode.Size(),
+				InodeSize: d.inode.diskInode.InodeSize(),
+				Links:     d.inode.diskInode.LinksCount(),
+				Flags:     d.inode.diskInode.Flags(),
+			}
+			// cmp.Diff(x, y) reports -x/+y, so want goes first to match the label.
+			if diff := cmp.Diff(test.wantInode, gotInode); diff != "" {
+				t.Errorf("inode mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestFilesystemInit mounts each test image via setUp and checks that the
+// superblock and block group descriptors were read in with the expected values.
+func TestFilesystemInit(t *testing.T) {
+	// sb only contains the immutable properties of the superblock.
+	type sb struct {
+		InodesCount      uint32
+		BlocksCount      uint64
+		MaxMountCount    uint16
+		FirstDataBlock   uint32
+		BlockSize        uint64
+		BlocksPerGroup   uint32
+		ClusterSize      uint64
+		ClustersPerGroup uint32
+		InodeSize        uint16
+		InodesPerGroup   uint32
+		BgDescSize       uint16
+		Magic            uint16
+		Revision         disklayout.SbRevision
+		CompatFeatures   disklayout.CompatFeatures
+		IncompatFeatures disklayout.IncompatFeatures
+		RoCompatFeatures disklayout.RoCompatFeatures
+	}
+
+	// bg only contains the immutable properties of the block group descriptor.
+	type bg struct {
+		InodeTable      uint64
+		BlockBitmap     uint64
+		InodeBitmap     uint64
+		ExclusionBitmap uint64
+		Flags           disklayout.BGFlags
+	}
+
+	type fsInitTest struct {
+		name    string
+		image   string
+		wantSb  sb
+		wantBgs []bg
+	}
+
+	tests := []fsInitTest{
+		{
+			name:  "ext4 filesystem init",
+			image: ext4ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x40,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+					Extents:        true,
+					Is64Bit:        true,
+					FlexBg:         true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:       true,
+					LargeFile:    true,
+					HugeFile:     true,
+					DirNlink:     true,
+					ExtraIsize:   true,
+					MetadataCsum: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x23,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x13,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+		{
+			name:  "ext3 filesystem init",
+			image: ext3ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x20,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:    true,
+					LargeFile: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x5,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x4,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+		{
+			name:  "ext2 filesystem init",
+			image: ext2ImagePath,
+			wantSb: sb{
+				InodesCount:      0x10,
+				BlocksCount:      0x40,
+				MaxMountCount:    0xffff,
+				FirstDataBlock:   0x1,
+				BlockSize:        0x400,
+				BlocksPerGroup:   0x2000,
+				ClusterSize:      0x400,
+				ClustersPerGroup: 0x2000,
+				InodeSize:        0x80,
+				InodesPerGroup:   0x10,
+				BgDescSize:       0x20,
+				Magic:            linux.EXT_SUPER_MAGIC,
+				Revision:         disklayout.DynamicRev,
+				CompatFeatures: disklayout.CompatFeatures{
+					ExtAttr:     true,
+					ResizeInode: true,
+					DirIndex:    true,
+				},
+				IncompatFeatures: disklayout.IncompatFeatures{
+					DirentFileType: true,
+				},
+				RoCompatFeatures: disklayout.RoCompatFeatures{
+					Sparse:    true,
+					LargeFile: true,
+				},
+			},
+			wantBgs: []bg{
+				{
+					InodeTable:  0x5,
+					BlockBitmap: 0x3,
+					InodeBitmap: 0x4,
+					Flags: disklayout.BGFlags{
+						InodeZeroed: true,
+					},
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			_, vfsfs, _, tearDown, err := setUp(t, test.image)
+			if err != nil {
+				t.Fatalf("setUp failed: %v", err)
+			}
+			defer tearDown()
+
+			fs, ok := vfsfs.Impl().(*filesystem)
+			if !ok {
+				t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
+			}
+
+			// Offload superblock and block group descriptors contents into
+			// local structs for comparison.
+			totalFreeInodes := uint32(0)
+			totalFreeBlocks := uint64(0)
+			gotSb := sb{
+				InodesCount:      fs.sb.InodesCount(),
+				BlocksCount:      fs.sb.BlocksCount(),
+				MaxMountCount:    fs.sb.MaxMountCount(),
+				FirstDataBlock:   fs.sb.FirstDataBlock(),
+				BlockSize:        fs.sb.BlockSize(),
+				BlocksPerGroup:   fs.sb.BlocksPerGroup(),
+				ClusterSize:      fs.sb.ClusterSize(),
+				ClustersPerGroup: fs.sb.ClustersPerGroup(),
+				InodeSize:        fs.sb.InodeSize(),
+				InodesPerGroup:   fs.sb.InodesPerGroup(),
+				BgDescSize:       fs.sb.BgDescSize(),
+				Magic:            fs.sb.Magic(),
+				Revision:         fs.sb.Revision(),
+				CompatFeatures:   fs.sb.CompatibleFeatures(),
+				IncompatFeatures: fs.sb.IncompatibleFeatures(),
+				RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(),
+			}
+			gotNumBgs := len(fs.bgs)
+			gotBgs := make([]bg, gotNumBgs)
+			for i := 0; i < gotNumBgs; i++ {
+				gotBgs[i].InodeTable = fs.bgs[i].InodeTable()
+				gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap()
+				gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap()
+				gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap()
+				gotBgs[i].Flags = fs.bgs[i].Flags()
+
+				totalFreeInodes += fs.bgs[i].FreeInodesCount()
+				totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount())
+			}
+			// cmp.Diff(x, y) reports -x/+y, so want goes first to match the labels.
+			if diff := cmp.Diff(test.wantSb, gotSb); diff != "" {
+				t.Errorf("superblock mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(test.wantBgs, gotBgs); diff != "" {
+				t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff)
+			}
+			// The superblock's free totals must equal the sums over the block groups.
+			if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" {
+				t.Errorf("total free inodes mismatch (-want +got):\n%s", diff)
+			}
+
+			if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" {
+				t.Errorf("total free blocks mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
new file mode 100644
index 000000000..7150e75a5
--- /dev/null
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -0,0 +1,137 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ // TODO(b/134676337): Remove when all methods have been implemented.
+ vfs.FilesystemImpl
+
+ vfsfs vfs.Filesystem
+
+ // mu serializes changes to the Dentry tree and the usage of the read seeker.
+ mu sync.Mutex
+
+ // dev is the ReadSeeker for the underlying fs device. It is protected by mu.
+ //
+ // The ext filesystems aim to maximize locality, i.e. place all the data
+ // blocks of a file close together. On a spinning disk, locality reduces the
+ // amount of movement of the head hence speeding up IO operations. On an SSD
+ // there are no moving parts but locality increases the size of each transfer
+ // request. Hence, having mutual exclusion on the read seeker while reading a
+ // file *should* help in achieving the intended performance gains.
+ //
+ // Note: This synchronization was not coupled with the ReadSeeker itself
+ // because we want to synchronize across read/seek operations for the
+ // performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
+ dev io.ReadSeeker
+
+ // inodeCache maps absolute inode numbers to the corresponding Inode struct.
+ // Inodes should be removed from this once their reference count hits 0.
+ //
+ // Protected by mu because every addition and removal from this corresponds to
+ // a change in the dentry tree.
+ inodeCache map[uint32]*inode
+
+ // sb represents the filesystem superblock. Immutable after initialization.
+ sb disklayout.SuperBlock
+
+ // bgs represents all the block group descriptors for the filesystem.
+ // Immutable after initialization.
+ bgs []disklayout.BlockGroup
+}
+
+// Compiles only if filesystem implements vfs.FilesystemImpl.
+var _ vfs.FilesystemImpl = (*filesystem)(nil)
+
+// getOrCreateInode gets the inode corresponding to the inode number passed in.
+// It creates a new one with the given inode number if one does not exist.
+//
+// Preconditions: must be holding fs.mu.
+func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) {
+ if in, ok := fs.inodeCache[inodeNum]; ok {
+ return in, nil
+ }
+
+ in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum)
+ if err != nil {
+ return nil, err
+ }
+
+ fs.inodeCache[inodeNum] = in
+ return in, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // This is a readonly filesystem for now.
+ return nil
+}
+
+// The vfs.FilesystemImpl functions below return EROFS because their respective
+// man pages say that EROFS must be returned if the path resolves to a file on
+// a read-only filesystem.
+
+// TODO(b/134676337): Implement path traversal and return EROFS only if the
+// path resolves to a Dentry within ext fs.
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return syserror.EROFS
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+ return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ return syserror.EROFS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ return syserror.EROFS
+}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index e0602da17..246b97161 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -571,12 +571,6 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error {
if o.upper != nil {
err = o.upper.check(ctx, p)
} else {
- if p.Write {
- // Since writes will be redirected to the upper filesystem, the lower
- // filesystem need not be writable, but must be readable for copy-up.
- p.Write = false
- p.Read = true
- }
err = o.lower.check(ctx, p)
}
o.copyMu.RUnlock()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 70f5a3f0b..4c2d48e65 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -240,6 +240,9 @@ type InitKernelArgs struct {
// RootAbstractSocketNamespace is the root Abstract Socket namespace.
RootAbstractSocketNamespace *AbstractSocketNamespace
+
+ // PIDNamespace is the root PID namespace.
+ PIDNamespace *PIDNamespace
}
// Init initialize the Kernel with no tasks.
@@ -262,7 +265,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.featureSet = args.FeatureSet
k.timekeeper = args.Timekeeper
- k.tasks = newTaskSet()
+ k.tasks = newTaskSet(args.PIDNamespace)
k.rootUserNamespace = args.RootUserNamespace
k.rootUTSNamespace = args.RootUTSNamespace
k.rootIPCNamespace = args.RootIPCNamespace
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index b21b182fc..8267929a6 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -90,9 +90,9 @@ type TaskSet struct {
}
// newTaskSet returns a new, empty TaskSet.
-func newTaskSet() *TaskSet {
- ts := &TaskSet{}
- ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace())
+func newTaskSet(pidns *PIDNamespace) *TaskSet {
+ ts := &TaskSet{Root: pidns}
+ pidns.owner = ts
return ts
}
@@ -186,6 +186,12 @@ func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespa
}
}
+// NewRootPIDNamespace creates the root PID namespace. 'owner' is not available
+// yet when root namespace is created and must be set by caller.
+func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace {
+ return newPIDNamespace(nil, nil, userns)
+}
+
// NewChild returns a new, empty PID namespace that is a child of ns. Authority
// over the new PID namespace is controlled by userns.
func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go
index 7eef19f74..8fe489c0e 100644
--- a/pkg/sentry/socket/epsocket/stack.go
+++ b/pkg/sentry/socket/epsocket/stack.go
@@ -75,8 +75,8 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
addrs = append(addrs, inet.InterfaceAddr{
Family: family,
- PrefixLen: uint8(len(a.Address) * 8),
- Addr: []byte(a.Address),
+ PrefixLen: uint8(a.AddressWithPrefix.PrefixLen),
+ Addr: []byte(a.AddressWithPrefix.Address),
// TODO(b/68878065): Other fields.
})
}