diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/fs/ext/BUILD | 22 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/dentry.go | 33 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/dirent_old.go | 3 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/inode.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/superblock_32.go | 3 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/superblock_64.go | 3 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/disklayout/superblock_old.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext.go | 102 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext_test.go | 407 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/filesystem.go | 137 | ||||
-rw-r--r-- | pkg/sentry/fs/inode_overlay.go | 6 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 5 | ||||
-rw-r--r-- | pkg/sentry/kernel/threads.go | 12 | ||||
-rw-r--r-- | pkg/sentry/socket/epsocket/stack.go | 4 |
14 files changed, 674 insertions, 71 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index 3ba278e08..2c15875f5 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "dentry.go", "ext.go", + "filesystem.go", "inode.go", "utils.go", ], @@ -15,7 +16,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/sentry/context", "//pkg/sentry/fs/ext/disklayout", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", "//pkg/syserror", ], ) @@ -23,11 +27,27 @@ go_library( go_test( name = "ext_test", size = "small", - srcs = ["extent_test.go"], + srcs = [ + "ext_test.go", + "extent_test.go", + ], + data = [ + "//pkg/sentry/fs/ext:assets/bigfile.txt", + "//pkg/sentry/fs/ext:assets/file.txt", + "//pkg/sentry/fs/ext:assets/tiny.ext2", + "//pkg/sentry/fs/ext:assets/tiny.ext3", + "//pkg/sentry/fs/ext:assets/tiny.ext4", + ], embed = [":ext"], deps = [ + "//pkg/abi/linux", "//pkg/binary", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", "//pkg/sentry/fs/ext/disklayout", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//runsc/test/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", ], diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go index 71cd217df..054fb42b6 100644 --- a/pkg/sentry/fs/ext/dentry.go +++ b/pkg/sentry/fs/ext/dentry.go @@ -14,10 +14,43 @@ package ext +import ( + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + // dentry implements vfs.DentryImpl. type dentry struct { + vfsd vfs.Dentry + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory Inode (with hard links). inode is // immutable. inode *inode } + +// Compiles only if dentry implements vfs.DentryImpl. +var _ vfs.DentryImpl = (*dentry)(nil) + +// newDentry is the dentry constructor. 
+func newDentry(in *inode) *dentry { + d := &dentry{ + inode: in, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { + d.inode.decRef(vfsfs.Impl().(*filesystem)) +} diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fs/ext/disklayout/dirent_old.go index 2e0f9c812..6fff12a6e 100644 --- a/pkg/sentry/fs/ext/disklayout/dirent_old.go +++ b/pkg/sentry/fs/ext/disklayout/dirent_old.go @@ -17,8 +17,7 @@ package disklayout import "gvisor.dev/gvisor/pkg/sentry/fs" // DirentOld represents the old directory entry struct which does not contain -// the file type. This emulates Linux's ext4_dir_entry struct. This is used in -// ext2, ext3 and sometimes in ext4. +// the file type. This emulates Linux's ext4_dir_entry struct. // // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fs/ext/disklayout/inode.go index 9ab9a4988..88ae913f5 100644 --- a/pkg/sentry/fs/ext/disklayout/inode.go +++ b/pkg/sentry/fs/ext/disklayout/inode.go @@ -20,6 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) +// Special inodes. See https://www.kernel.org/doc/html/latest/filesystems/ext4/overview.html#special-inodes. +const ( + // RootDirInode is the inode number of the root directory inode. + RootDirInode = 2 +) + // The Inode interface must be implemented by structs representing ext inodes. // The inode stores all the metadata pertaining to the file (except for the // file name which is held by the directory entry). 
It does NOT expose all diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fs/ext/disklayout/superblock_32.go index 587e4afaa..53e515fd3 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock_32.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_32.go @@ -15,7 +15,8 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of -// the ext4_super_block struct in fs/ext4/ext4.h. +// the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if +// RevLevel = DynamicRev and 64-bit feature is disabled. type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fs/ext/disklayout/superblock_64.go index a2c2278fb..7c1053fb4 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock_64.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_64.go @@ -17,7 +17,8 @@ package disklayout // SuperBlock64Bit implements SuperBlock and represents the 64-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. This sums up to be exactly // 1024 bytes (smallest possible block size) and hence the superblock always -// fits in no more than one data block. +// fits in no more than one data block. Should only be used when the 64-bit +// feature is set. type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fs/ext/disklayout/superblock_old.go index 5a64aaaa1..9221e0251 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fs/ext/disklayout/superblock_old.go @@ -15,7 +15,7 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the -// superblock struct in ext2 and ext3 systems. +// superblock struct. Should be used only if RevLevel = OldRev. 
type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index 7f4287b01..10e235fb1 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -16,86 +16,82 @@ package ext import ( + "errors" + "fmt" "io" - "sync" + "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - // mu serializes changes to the Dentry tree and the usage of the read seeker. - mu sync.Mutex - - // dev is the ReadSeeker for the underlying fs device. It is protected by mu. - // - // The ext filesystems aim to maximize locality, i.e. place all the data - // blocks of a file close together. On a spinning disk, locality reduces the - // amount of movement of the head hence speeding up IO operations. On an SSD - // there are no moving parts but locality increases the size of each transer - // request. Hence, having mutual exclusion on the read seeker while reading a - // file *should* help in achieving the intended performance gains. - // - // Note: This synchronization was not coupled with the ReadSeeker itself - // because we want to synchronize across read/seek operations for the - // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. - dev io.ReadSeeker +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// Compiles only if filesystemType implements vfs.FilesystemType. +var _ vfs.FilesystemType = (*filesystemType)(nil) + +// getDeviceFd returns the read seeker to the underlying device. +// Currently there are two ways of mounting an ext(2/3/4) fs: +// 1. Specify a mount with our internal special MountType in the OCI spec. +// 2. 
Expose the device to the container and mount it from application layer. +func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) { + if opts.InternalData == nil { + // User mount call. + // TODO(b/134676337): Open the device specified by `source` and return that. + panic("unimplemented") + } - // inodeCache maps absolute inode numbers to the corresponding Inode struct. - // Inodes should be removed from this once their reference count hits 0. - // - // Protected by mu because every addition and removal from this corresponds to - // a change in the dentry tree. - inodeCache map[uint32]*inode + // NewFilesystem call originated from within the sentry. + fd, ok := opts.InternalData.(uintptr) + if !ok { + return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device") + } - // sb represents the filesystem superblock. Immutable after initialization. - sb disklayout.SuperBlock + // We do not close this file because that would close the underlying device + // file descriptor (which is required for reading the fs from disk). + // TODO(b/134676337): Use pkg/fd instead. + deviceFile := os.NewFile(fd, source) + if deviceFile == nil { + return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd) + } - // bgs represents all the block group descriptors for the filesystem. - // Immutable after initialization. - bgs []disklayout.BlockGroup + return deviceFile, nil } -// newFilesystem is the filesystem constructor. -func newFilesystem(dev io.ReadSeeker) (*filesystem, error) { - fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} - var err error +// NewFilesystem implements vfs.FilesystemType.NewFilesystem. 
+func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + dev, err := getDeviceFd(source, opts) + if err != nil { + return nil, nil, err + } + fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} + fs.vfsfs.Init(&fs) fs.sb, err = readSuperBlock(dev) if err != nil { - return nil, err + return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. - return nil, syserror.EINVAL + return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { - return nil, err - } - - return &fs, nil -} - -// getOrCreateInode gets the inode corresponding to the inode number passed in. -// It creates a new one with the given inode number if one does not exist. -// -// Preconditions: must be holding fs.mu. -func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) { - if in, ok := fs.inodeCache[inodeNum]; ok { - return in, nil + return nil, nil, err } - in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum) + rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode) if err != nil { - return nil, err + return nil, nil, err } - fs.inodeCache[inodeNum] = in - return in, nil + return &fs.vfsfs, &newDentry(rootInode).vfsd, nil } diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go new file mode 100644 index 000000000..ee7f7907c --- /dev/null +++ b/pkg/sentry/fs/ext/ext_test.go @@ -0,0 +1,407 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "fmt" + "os" + "path" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + + "gvisor.dev/gvisor/runsc/test/testutil" +) + +const ( + assetsDir = "pkg/sentry/fs/ext/assets" +) + +var ( + ext2ImagePath = path.Join(assetsDir, "tiny.ext2") + ext3ImagePath = path.Join(assetsDir, "tiny.ext3") + ext4ImagePath = path.Join(assetsDir, "tiny.ext4") +) + +func beginning(_ uint64) uint64 { + return 0 +} + +func middle(i uint64) uint64 { + return i / 2 +} + +func end(i uint64) uint64 { + return i +} + +// setUp opens imagePath as an ext Filesystem and returns all necessary +// elements required to run tests. If error is nil, it also returns a tear +// down function which must be called after the test is run for clean up. +func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) { + localImagePath, err := testutil.FindFile(imagePath) + if err != nil { + return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) + } + + f, err := os.Open(localImagePath) + if err != nil { + return nil, nil, nil, nil, err + } + + // Mount the ext4 fs and retrieve the inode structure for the file. 
+ mockCtx := contexttest.Context(t) + fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()}) + if err != nil { + f.Close() + return nil, nil, nil, nil, err + } + + tearDown := func() { + if err := f.Close(); err != nil { + t.Fatalf("tearDown failed: %v", err) + } + } + return mockCtx, fs, d, tearDown, nil +} + +// TestRootDir tests that the root directory inode is correctly initialized and +// returned from setUp. +func TestRootDir(t *testing.T) { + type inodeProps struct { + Mode linux.FileMode + UID auth.KUID + GID auth.KGID + Size uint64 + InodeSize uint16 + Links uint16 + Flags disklayout.InodeFlags + } + + type rootDirTest struct { + name string + image string + wantInode inodeProps + } + + tests := []rootDirTest{ + { + name: "ext4 root dir", + image: ext4ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + Flags: disklayout.InodeFlags{Extents: true}, + }, + }, + { + name: "ext3 root dir", + image: ext3ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + }, + }, + { + name: "ext2 root dir", + image: ext2ImagePath, + wantInode: inodeProps{ + Mode: linux.ModeDirectory | 0755, + Size: 0x400, + InodeSize: 0x80, + Links: 3, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + _, _, vfsd, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + d, ok := vfsd.Impl().(*dentry) + if !ok { + t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl()) + } + + // Offload inode contents into local structs for comparison. 
+ gotInode := inodeProps{ + Mode: d.inode.diskInode.Mode(), + UID: d.inode.diskInode.UID(), + GID: d.inode.diskInode.GID(), + Size: d.inode.diskInode.Size(), + InodeSize: d.inode.diskInode.InodeSize(), + Links: d.inode.diskInode.LinksCount(), + Flags: d.inode.diskInode.Flags(), + } + + if diff := cmp.Diff(gotInode, test.wantInode); diff != "" { + t.Errorf("inode mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// TestFilesystemInit tests that the filesystem superblock and block group +// descriptors are correctly read in and initialized. +func TestFilesystemInit(t *testing.T) { + // sb only contains the immutable properties of the superblock. + type sb struct { + InodesCount uint32 + BlocksCount uint64 + MaxMountCount uint16 + FirstDataBlock uint32 + BlockSize uint64 + BlocksPerGroup uint32 + ClusterSize uint64 + ClustersPerGroup uint32 + InodeSize uint16 + InodesPerGroup uint32 + BgDescSize uint16 + Magic uint16 + Revision disklayout.SbRevision + CompatFeatures disklayout.CompatFeatures + IncompatFeatures disklayout.IncompatFeatures + RoCompatFeatures disklayout.RoCompatFeatures + } + + // bg only contains the immutable properties of the block group descriptor. 
+ type bg struct { + InodeTable uint64 + BlockBitmap uint64 + InodeBitmap uint64 + ExclusionBitmap uint64 + Flags disklayout.BGFlags + } + + type fsInitTest struct { + name string + image string + wantSb sb + wantBgs []bg + } + + tests := []fsInitTest{ + { + name: "ext4 filesystem init", + image: ext4ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x40, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + Extents: true, + Is64Bit: true, + FlexBg: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + HugeFile: true, + DirNlink: true, + ExtraIsize: true, + MetadataCsum: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x23, + BlockBitmap: 0x3, + InodeBitmap: 0x13, + Flags: disklayout.BGFlags{ + InodeZeroed: true, + }, + }, + }, + }, + { + name: "ext3 filesystem init", + image: ext3ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x20, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x5, + BlockBitmap: 0x3, + InodeBitmap: 0x4, + Flags: disklayout.BGFlags{ + InodeZeroed: true, 
+ }, + }, + }, + }, + { + name: "ext2 filesystem init", + image: ext2ImagePath, + wantSb: sb{ + InodesCount: 0x10, + BlocksCount: 0x40, + MaxMountCount: 0xffff, + FirstDataBlock: 0x1, + BlockSize: 0x400, + BlocksPerGroup: 0x2000, + ClusterSize: 0x400, + ClustersPerGroup: 0x2000, + InodeSize: 0x80, + InodesPerGroup: 0x10, + BgDescSize: 0x20, + Magic: linux.EXT_SUPER_MAGIC, + Revision: disklayout.DynamicRev, + CompatFeatures: disklayout.CompatFeatures{ + ExtAttr: true, + ResizeInode: true, + DirIndex: true, + }, + IncompatFeatures: disklayout.IncompatFeatures{ + DirentFileType: true, + }, + RoCompatFeatures: disklayout.RoCompatFeatures{ + Sparse: true, + LargeFile: true, + }, + }, + wantBgs: []bg{ + { + InodeTable: 0x5, + BlockBitmap: 0x3, + InodeBitmap: 0x4, + Flags: disklayout.BGFlags{ + InodeZeroed: true, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + _, vfsfs, _, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fs, ok := vfsfs.Impl().(*filesystem) + if !ok { + t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl()) + } + + // Offload superblock and block group descriptors contents into + // local structs for comparison. 
+ totalFreeInodes := uint32(0) + totalFreeBlocks := uint64(0) + gotSb := sb{ + InodesCount: fs.sb.InodesCount(), + BlocksCount: fs.sb.BlocksCount(), + MaxMountCount: fs.sb.MaxMountCount(), + FirstDataBlock: fs.sb.FirstDataBlock(), + BlockSize: fs.sb.BlockSize(), + BlocksPerGroup: fs.sb.BlocksPerGroup(), + ClusterSize: fs.sb.ClusterSize(), + ClustersPerGroup: fs.sb.ClustersPerGroup(), + InodeSize: fs.sb.InodeSize(), + InodesPerGroup: fs.sb.InodesPerGroup(), + BgDescSize: fs.sb.BgDescSize(), + Magic: fs.sb.Magic(), + Revision: fs.sb.Revision(), + CompatFeatures: fs.sb.CompatibleFeatures(), + IncompatFeatures: fs.sb.IncompatibleFeatures(), + RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(), + } + gotNumBgs := len(fs.bgs) + gotBgs := make([]bg, gotNumBgs) + for i := 0; i < gotNumBgs; i++ { + gotBgs[i].InodeTable = fs.bgs[i].InodeTable() + gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap() + gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap() + gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap() + gotBgs[i].Flags = fs.bgs[i].Flags() + + totalFreeInodes += fs.bgs[i].FreeInodesCount() + totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount()) + } + + if diff := cmp.Diff(gotSb, test.wantSb); diff != "" { + t.Errorf("superblock mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" { + t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" { + t.Errorf("total free inodes mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" { + t.Errorf("total free blocks mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go new file mode 100644 index 000000000..7150e75a5 --- /dev/null +++ b/pkg/sentry/fs/ext/filesystem.go @@ -0,0 +1,137 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "io" + "sync" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + // TODO(b/134676337): Remove when all methods have been implemented. + vfs.FilesystemImpl + + vfsfs vfs.Filesystem + + // mu serializes changes to the Dentry tree and the usage of the read seeker. + mu sync.Mutex + + // dev is the ReadSeeker for the underlying fs device. It is protected by mu. + // + // The ext filesystems aim to maximize locality, i.e. place all the data + // blocks of a file close together. On a spinning disk, locality reduces the + // amount of movement of the head hence speeding up IO operations. On an SSD + // there are no moving parts but locality increases the size of each transfer + // request. Hence, having mutual exclusion on the read seeker while reading a + // file *should* help in achieving the intended performance gains. + // + // Note: This synchronization was not coupled with the ReadSeeker itself + // because we want to synchronize across read/seek operations for the + // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. + dev io.ReadSeeker + + // inodeCache maps absolute inode numbers to the corresponding Inode struct. 
+ // Inodes should be removed from this once their reference count hits 0. + // + // Protected by mu because every addition and removal from this corresponds to + // a change in the dentry tree. + inodeCache map[uint32]*inode + + // sb represents the filesystem superblock. Immutable after initialization. + sb disklayout.SuperBlock + + // bgs represents all the block group descriptors for the filesystem. + // Immutable after initialization. + bgs []disklayout.BlockGroup +} + +// Compiles only if filesystem implements vfs.FilesystemImpl. +var _ vfs.FilesystemImpl = (*filesystem)(nil) + +// getOrCreateInode gets the inode corresponding to the inode number passed in. +// It creates a new one with the given inode number if one does not exist. +// +// Preconditions: must be holding fs.mu. +func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) { + if in, ok := fs.inodeCache[inodeNum]; ok { + return in, nil + } + + in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum) + if err != nil { + return nil, err + } + + fs.inodeCache[inodeNum] = in + return in, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // This is a readonly filesystem for now. + return nil +} + +// The vfs.FilesystemImpl functions below return EROFS because their respective +// man pages say that EROFS must be returned if the path resolves to a file on +// a read-only filesystem. + +// TODO(b/134676337): Implement path traversal and return EROFS only if the +// path resolves to a Dentry within ext fs. + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return syserror.EROFS +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. 
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return syserror.EROFS +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + return syserror.EROFS +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return syserror.EROFS +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + return syserror.EROFS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return syserror.EROFS +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return syserror.EROFS +} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index e0602da17..246b97161 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -571,12 +571,6 @@ func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { if o.upper != nil { err = o.upper.check(ctx, p) } else { - if p.Write { - // Since writes will be redirected to the upper filesystem, the lower - // filesystem need not be writable, but must be readable for copy-up. - p.Write = false - p.Read = true - } err = o.lower.check(ctx, p) } o.copyMu.RUnlock() diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 70f5a3f0b..4c2d48e65 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -240,6 +240,9 @@ type InitKernelArgs struct { // RootAbstractSocketNamespace is the root Abstract Socket namespace. 
RootAbstractSocketNamespace *AbstractSocketNamespace + + // PIDNamespace is the root PID namespace. + PIDNamespace *PIDNamespace } // Init initialize the Kernel with no tasks. @@ -262,7 +265,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.featureSet = args.FeatureSet k.timekeeper = args.Timekeeper - k.tasks = newTaskSet() + k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index b21b182fc..8267929a6 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -90,9 +90,9 @@ type TaskSet struct { } // newTaskSet returns a new, empty TaskSet. -func newTaskSet() *TaskSet { - ts := &TaskSet{} - ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace()) +func newTaskSet(pidns *PIDNamespace) *TaskSet { + ts := &TaskSet{Root: pidns} + pidns.owner = ts return ts } @@ -186,6 +186,12 @@ func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespa } } +// NewRootPIDNamespace creates the root PID namespace. 'owner' is not available +// yet when root namespace is created and must be set by caller. +func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(nil, nil, userns) +} + // NewChild returns a new, empty PID namespace that is a child of ns. Authority // over the new PID namespace is controlled by userns. 
func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 7eef19f74..8fe489c0e 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -75,8 +75,8 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { addrs = append(addrs, inet.InterfaceAddr{ Family: family, - PrefixLen: uint8(len(a.Address) * 8), - Addr: []byte(a.Address), + PrefixLen: uint8(a.AddressWithPrefix.PrefixLen), + Addr: []byte(a.AddressWithPrefix.Address), // TODO(b/68878065): Other fields. }) } |