diff options
-rw-r--r-- | pkg/sentry/fs/ext/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/block_map_file.go | 8 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext.go | 23 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/ext_test.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/extent_file.go | 25 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/extent_test.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/filesystem.go | 26 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/inline_file.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/inode.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/regular_file.go | 9 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/symlink.go | 4 | ||||
-rw-r--r-- | pkg/sentry/fs/ext/utils.go | 43 |
12 files changed, 55 insertions, 96 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index a89dc5166..8158aa522 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -37,6 +37,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/fd", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/ext/disklayout", diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go index eb0b35e36..9aabbd145 100644 --- a/pkg/sentry/fs/ext/block_map_file.go +++ b/pkg/sentry/fs/ext/block_map_file.go @@ -16,6 +16,7 @@ package ext import ( "io" + "sync" "gvisor.dev/gvisor/pkg/binary" ) @@ -25,10 +26,13 @@ import ( type blockMapFile struct { regFile regularFile + // mu serializes changes to fileToPhysBlks. + mu sync.RWMutex + // fileToPhysBlks maps the file block numbers to the physical block numbers. // the physical block number for the (i)th file block is stored in the (i)th // index. This is initialized (at max) with the first 12 entries. The rest - // have to be read in from disk when required. + // have to be read in from disk when required. Protected by mu. fileToPhysBlks []uint32 } @@ -36,7 +40,7 @@ type blockMapFile struct { var _ fileReader = (*blockMapFile)(nil) // Read implements fileReader.getFileReader. -func (f *blockMapFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader { +func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader { panic("unimplemented") } diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index 2380f15da..d303dd122 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -19,9 +19,9 @@ import ( "errors" "fmt" "io" - "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -35,11 +35,11 @@ type filesystemType struct{} // Compiles only if filesystemType implements vfs.FilesystemType. var _ vfs.FilesystemType = (*filesystemType)(nil) -// getDeviceFd returns the read seeker to the underlying device. +// getDeviceFd returns an io.ReaderAt to the underlying device. // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. // 2. Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) { +func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) { if opts.InternalData == nil { // User mount call. // TODO(b/134676337): Open the device specified by `source` and return that. @@ -47,20 +47,19 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, e } // NewFilesystem call originated from within the sentry. - fd, ok := opts.InternalData.(uintptr) + devFd, ok := opts.InternalData.(int) if !ok { - return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device") + return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") } - // We do not close this file because that would close the underlying device - // file descriptor (which is required for reading the fs from disk). - // TODO(b/134676337): Use pkg/fd instead. - deviceFile := os.NewFile(fd, source) - if deviceFile == nil { - return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd) + if devFd < 0 { + return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd) } - return deviceFile, nil + // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership + // of the file descriptor and hence will not close it when it is garbage + // collected. + return fd.NewReadWriter(devFd), nil } // NewFilesystem implements vfs.FilesystemType.NewFilesystem. diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go index ee7f7907c..18764e92a 100644 --- a/pkg/sentry/fs/ext/ext_test.go +++ b/pkg/sentry/fs/ext/ext_test.go @@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v // Mount the ext4 fs and retrieve the inode structure for the file. mockCtx := contexttest.Context(t) - fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()}) + fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go index 882300d96..aa4102dbb 100644 --- a/pkg/sentry/fs/ext/extent_file.go +++ b/pkg/sentry/fs/ext/extent_file.go @@ -36,7 +36,7 @@ type extentFile struct { var _ fileReader = (*extentFile)(nil) // Read implements fileReader.getFileReader. -func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader { +func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader { return &extentReader{ dev: dev, file: f, @@ -47,10 +47,8 @@ func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uin // newExtentFile is the extent file constructor. It reads the entire extent // tree into memory. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. // TODO(b/134676337): Build extent tree on demand to reduce memory usage. -func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*extentFile, error) { +func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) { file := &extentFile{regFile: regFile} file.regFile.impl = file err := file.buildExtTree(dev, blkSize) @@ -65,10 +63,8 @@ func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*ext // memory. Then it recursively builds the rest of the tree by reading it off // disk. // -// Preconditions: -// - Must hold the mutex of the filesystem containing dev. -// - Inode flag InExtents must be set. -func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { +// Precondition: inode flag InExtents must be set. +func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error { rootNodeData := f.regFile.inode.diskInode.Data() binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header) @@ -110,9 +106,7 @@ func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error { // buildExtTreeFromDisk reads the extent tree nodes from disk and recursively // builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to // by the ExtentEntry. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { +func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) { var header disklayout.ExtentHeader off := entry.PhysicalBlock() * blkSize err := readFromDisk(dev, int64(off), &header) @@ -155,7 +149,7 @@ func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSi // extentReader implements io.Reader which can traverse the extent tree and // read file data. This is not thread safe. type extentReader struct { - dev io.ReadSeeker + dev io.ReaderAt file *extentFile fileOff uint64 // Represents the current file offset being read from. blkSize uint64 @@ -247,9 +241,12 @@ func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, e toRead = len(dst) } - n, err := readFull(r.dev, int64(readStart), dst[:toRead]) + n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart)) r.fileOff += uint64(n) - return n, err + if n < toRead { + return n, syserror.EIO + } + return n, nil } // fileBlock returns the file block number we are currently reading. diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go index fb7921add..dff401114 100644 --- a/pkg/sentry/fs/ext/extent_test.go +++ b/pkg/sentry/fs/ext/extent_test.go @@ -201,7 +201,7 @@ func TestBuildExtentTree(t *testing.T) { // extentTreeSetUp writes the passed extent tree to a mock disk as an extent // tree. It also constucts a mock extent file with the same tree built in it. // It also writes random data file data and returns it. -func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReadSeeker, *extentFile, []byte) { +func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) { t.Helper() mockDisk := make([]byte, mockExtentBlkSize*10) diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go index 32ca11026..12aeb5dac 100644 --- a/pkg/sentry/fs/ext/filesystem.go +++ b/pkg/sentry/fs/ext/filesystem.go @@ -31,22 +31,16 @@ type filesystem struct { vfsfs vfs.Filesystem - // mu serializes changes to the Dentry tree and the usage of the read seeker. - mu sync.Mutex + // mu serializes changes to the Dentry tree. + mu sync.RWMutex - // dev is the ReadSeeker for the underlying fs device. It is protected by mu. - // - // The ext filesystems aim to maximize locality, i.e. place all the data - // blocks of a file close together. On a spinning disk, locality reduces the - // amount of movement of the head hence speeding up IO operations. On an SSD - // there are no moving parts but locality increases the size of each transer - // request. Hence, having mutual exclusion on the read seeker while reading a - // file *should* help in achieving the intended performance gains. - // - // Note: This synchronization was not coupled with the ReadSeeker itself - // because we want to synchronize across read/seek operations for the - // performance gains mentioned above. Helps enforcing one-file-at-a-time IO. - dev io.ReadSeeker + // dev is the io.ReaderAt for the underlying fs device. It does not require + // protection because io.ReaderAt permits concurrent read calls to it. It + // translates to the pread syscall which passes on the read request directly + // to the device driver. Device drivers are intelligent in serving multiple + // concurrent read requests in the optimal order (taking locality into + // consideration). + dev io.ReaderAt // inodeCache maps absolute inode numbers to the corresponding Inode struct. // Inodes should be removed from this once their reference count hits 0. @@ -69,7 +63,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // getOrCreateInode gets the inode corresponding to the inode number passed in. // It creates a new one with the given inode number if one does not exist. // -// Preconditions: must be holding fs.mu. +// Precondition: must be holding fs.mu. func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) { if in, ok := fs.inodeCache[inodeNum]; ok { return in, nil diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go index 8f1395567..b9adfe548 100644 --- a/pkg/sentry/fs/ext/inline_file.go +++ b/pkg/sentry/fs/ext/inline_file.go @@ -28,7 +28,7 @@ type inlineFile struct { var _ fileReader = (*inlineFile)(nil) // getFileReader implements fileReader.getFileReader. -func (f *inlineFile) getFileReader(_ io.ReadSeeker, _ uint64, offset uint64) io.Reader { +func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader { diskInode := f.regFile.inode.diskInode return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]} } diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go index 7d2a445fb..00e022953 100644 --- a/pkg/sentry/fs/ext/inode.go +++ b/pkg/sentry/fs/ext/inode.go @@ -75,7 +75,7 @@ func (in *inode) tryIncRef() bool { // decRef decrements the inode ref count and releases the inode resources if // the ref count hits 0. // -// Preconditions: Must have locked fs.mu. +// Precondition: Must have locked fs.mu. func (in *inode) decRef(fs *filesystem) { if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { delete(fs.inodeCache, in.inodeNum) @@ -86,9 +86,7 @@ func (in *inode) decRef(fs *filesystem) { // newInode is the inode constructor. Reads the inode off disk. Identifies // inodes based on the absolute inode number on disk. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func newInode(ctx context.Context, dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) { +func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) { if inodeNum == 0 { panic("inode number 0 on ext filesystems is not possible") } diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go index 9bf39acfe..b48f61795 100644 --- a/pkg/sentry/fs/ext/regular_file.go +++ b/pkg/sentry/fs/ext/regular_file.go @@ -29,10 +29,7 @@ type fileReader interface { // // This reader is not meant to be retained across Read operations as it needs // to be reinitialized with the correct offset for every Read. - // - // Precondition: Must hold the mutex of the filesystem containing dev while - // using the Reader. - getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader + getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader } // regularFile represents a regular file's inode. This too follows the @@ -48,9 +45,7 @@ type regularFile struct { // newRegularFile is the regularFile constructor. It figures out what kind of // file this is and initializes the fileReader. -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func newRegularFile(dev io.ReadSeeker, blkSize uint64, inode inode) (*regularFile, error) { +func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) { regFile := regularFile{ inode: inode, } diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go index 0ed67c0e4..6a55c1a7b 100644 --- a/pkg/sentry/fs/ext/symlink.go +++ b/pkg/sentry/fs/ext/symlink.go @@ -28,9 +28,7 @@ type symlink struct { // newSymlink is the symlink constructor. It reads out the symlink target from // the inode (however it might have been stored). -// -// Preconditions: Must hold the mutex of the filesystem containing dev. -func newSymlink(dev io.ReadSeeker, blkSize uint64, inode inode) (*symlink, error) { +func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) { var file *symlink var link []byte diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go index 7c33919e0..3d89d664d 100644 --- a/pkg/sentry/fs/ext/utils.go +++ b/pkg/sentry/fs/ext/utils.go @@ -15,55 +15,30 @@ package ext import ( - "encoding/binary" "io" + "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // readFromDisk performs a binary read from disk into the given struct from // the absolute offset provided. -// -// All disk reads should use this helper so we avoid reading from stale -// previously used offsets. This function forces the offset parameter. -// -// Precondition: Must hold the mutex of the filesystem containing dev. -func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error { - if _, err := dev.Seek(abOff, io.SeekStart); err != nil { - return syserror.EIO - } - - if err := binary.Read(dev, binary.LittleEndian, v); err != nil { +func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { + n := binary.Size(v) + buf := make([]byte, n) + if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } + binary.Unmarshal(buf, binary.LittleEndian, v) return nil } -// readFull is a wrapper around io.ReadFull which enforces the absolute offset -// parameter so that we can ensure that we do not perform incorrect reads from -// stale previously used offsets. -// -// Precondition: Must hold the mutex of the filesystem containing dev. -func readFull(dev io.ReadSeeker, abOff int64, dst []byte) (int, error) { - if _, err := dev.Seek(abOff, io.SeekStart); err != nil { - return 0, syserror.EIO - } - - n, err := io.ReadFull(dev, dst) - if err != nil { - err = syserror.EIO - } - return n, err -} - // readSuperBlock reads the SuperBlock from block group 0 in the underlying // device. There are three versions of the superblock. This function identifies // and returns the correct version. -// -// Precondition: Must hold the mutex of the filesystem containing dev. -func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) { +func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) { var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{} if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil { return nil, err @@ -98,9 +73,7 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 { // readBlockGroups reads the block group descriptor table from block group 0 in // the underlying device. -// -// Precondition: Must hold the mutex of the filesystem containing dev. -func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { +func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) { bgCount := blockGroupsCount(sb) bgdSize := uint64(sb.BgDescSize()) is64Bit := sb.IncompatibleFeatures().Is64Bit |