summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/fs/ext/BUILD1
-rw-r--r--pkg/sentry/fs/ext/block_map_file.go8
-rw-r--r--pkg/sentry/fs/ext/ext.go23
-rw-r--r--pkg/sentry/fs/ext/ext_test.go2
-rw-r--r--pkg/sentry/fs/ext/extent_file.go25
-rw-r--r--pkg/sentry/fs/ext/extent_test.go2
-rw-r--r--pkg/sentry/fs/ext/filesystem.go26
-rw-r--r--pkg/sentry/fs/ext/inline_file.go2
-rw-r--r--pkg/sentry/fs/ext/inode.go6
-rw-r--r--pkg/sentry/fs/ext/regular_file.go9
-rw-r--r--pkg/sentry/fs/ext/symlink.go4
-rw-r--r--pkg/sentry/fs/ext/utils.go43
12 files changed, 55 insertions, 96 deletions
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index a89dc5166..8158aa522 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -37,6 +37,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/fd",
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",
diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go
index eb0b35e36..9aabbd145 100644
--- a/pkg/sentry/fs/ext/block_map_file.go
+++ b/pkg/sentry/fs/ext/block_map_file.go
@@ -16,6 +16,7 @@ package ext
import (
"io"
+ "sync"
"gvisor.dev/gvisor/pkg/binary"
)
@@ -25,10 +26,13 @@ import (
type blockMapFile struct {
regFile regularFile
+ // mu serializes changes to fileToPhysBlks.
+ mu sync.RWMutex
+
// fileToPhysBlks maps the file block numbers to the physical block numbers.
// the physical block number for the (i)th file block is stored in the (i)th
// index. This is initialized (at max) with the first 12 entries. The rest
- // have to be read in from disk when required.
+ // have to be read in from disk when required. Protected by mu.
fileToPhysBlks []uint32
}
@@ -36,7 +40,7 @@ type blockMapFile struct {
var _ fileReader = (*blockMapFile)(nil)
// Read implements fileReader.getFileReader.
-func (f *blockMapFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
+func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
panic("unimplemented")
}
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index 2380f15da..d303dd122 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -19,9 +19,9 @@ import (
"errors"
"fmt"
"io"
- "os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -35,11 +35,11 @@ type filesystemType struct{}
// Compiles only if filesystemType implements vfs.FilesystemType.
var _ vfs.FilesystemType = (*filesystemType)(nil)
-// getDeviceFd returns the read seeker to the underlying device.
+// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
// 2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
+func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
if opts.InternalData == nil {
// User mount call.
// TODO(b/134676337): Open the device specified by `source` and return that.
@@ -47,20 +47,19 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, e
}
// NewFilesystem call originated from within the sentry.
- fd, ok := opts.InternalData.(uintptr)
+ devFd, ok := opts.InternalData.(int)
if !ok {
- return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
+ return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
}
- // We do not close this file because that would close the underlying device
- // file descriptor (which is required for reading the fs from disk).
- // TODO(b/134676337): Use pkg/fd instead.
- deviceFile := os.NewFile(fd, source)
- if deviceFile == nil {
- return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
+ if devFd < 0 {
+ return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
}
- return deviceFile, nil
+ // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
+ // of the file descriptor and hence will not close it when it is garbage
+ // collected.
+ return fd.NewReadWriter(devFd), nil
}
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
index ee7f7907c..18764e92a 100644
--- a/pkg/sentry/fs/ext/ext_test.go
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
// Mount the ext4 fs and retrieve the inode structure for the file.
mockCtx := contexttest.Context(t)
- fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
+ fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go
index 882300d96..aa4102dbb 100644
--- a/pkg/sentry/fs/ext/extent_file.go
+++ b/pkg/sentry/fs/ext/extent_file.go
@@ -36,7 +36,7 @@ type extentFile struct {
var _ fileReader = (*extentFile)(nil)
// Read implements fileReader.getFileReader.
-func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
+func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
return &extentReader{
dev: dev,
file: f,
@@ -47,10 +47,8 @@ func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uin
// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*extentFile, error) {
+func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) {
file := &extentFile{regFile: regFile}
file.regFile.impl = file
err := file.buildExtTree(dev, blkSize)
@@ -65,10 +63,8 @@ func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*ext
// memory. Then it recursively builds the rest of the tree by reading it off
// disk.
//
-// Preconditions:
-// - Must hold the mutex of the filesystem containing dev.
-// - Inode flag InExtents must be set.
-func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
+// Precondition: inode flag InExtents must be set.
+func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error {
rootNodeData := f.regFile.inode.diskInode.Data()
binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
@@ -110,9 +106,7 @@ func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
// by the ExtentEntry.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
+func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
var header disklayout.ExtentHeader
off := entry.PhysicalBlock() * blkSize
err := readFromDisk(dev, int64(off), &header)
@@ -155,7 +149,7 @@ func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSi
// extentReader implements io.Reader which can traverse the extent tree and
// read file data. This is not thread safe.
type extentReader struct {
- dev io.ReadSeeker
+ dev io.ReaderAt
file *extentFile
fileOff uint64 // Represents the current file offset being read from.
blkSize uint64
@@ -247,9 +241,12 @@ func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, e
toRead = len(dst)
}
- n, err := readFull(r.dev, int64(readStart), dst[:toRead])
+ n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart))
r.fileOff += uint64(n)
- return n, err
+ if n < toRead {
+ return n, syserror.EIO
+ }
+ return n, nil
}
// fileBlock returns the file block number we are currently reading.
diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go
index fb7921add..dff401114 100644
--- a/pkg/sentry/fs/ext/extent_test.go
+++ b/pkg/sentry/fs/ext/extent_test.go
@@ -201,7 +201,7 @@ func TestBuildExtentTree(t *testing.T) {
// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
// tree. It also constucts a mock extent file with the same tree built in it.
// It also writes random data file data and returns it.
-func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReadSeeker, *extentFile, []byte) {
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) {
t.Helper()
mockDisk := make([]byte, mockExtentBlkSize*10)
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
index 32ca11026..12aeb5dac 100644
--- a/pkg/sentry/fs/ext/filesystem.go
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -31,22 +31,16 @@ type filesystem struct {
vfsfs vfs.Filesystem
- // mu serializes changes to the Dentry tree and the usage of the read seeker.
- mu sync.Mutex
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
- // dev is the ReadSeeker for the underlying fs device. It is protected by mu.
- //
- // The ext filesystems aim to maximize locality, i.e. place all the data
- // blocks of a file close together. On a spinning disk, locality reduces the
- // amount of movement of the head hence speeding up IO operations. On an SSD
- // there are no moving parts but locality increases the size of each transer
- // request. Hence, having mutual exclusion on the read seeker while reading a
- // file *should* help in achieving the intended performance gains.
- //
- // Note: This synchronization was not coupled with the ReadSeeker itself
- // because we want to synchronize across read/seek operations for the
- // performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
- dev io.ReadSeeker
+ // dev is the io.ReaderAt for the underlying fs device. It does not require
+ // protection because io.ReaderAt permits concurrent read calls to it. It
+ // translates to the pread syscall which passes on the read request directly
+ // to the device driver. Device drivers are intelligent in serving multiple
+ // concurrent read requests in the optimal order (taking locality into
+ // consideration).
+ dev io.ReaderAt
// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
@@ -69,7 +63,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// getOrCreateInode gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
//
-// Preconditions: must be holding fs.mu.
+// Precondition: must be holding fs.mu.
func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go
index 8f1395567..b9adfe548 100644
--- a/pkg/sentry/fs/ext/inline_file.go
+++ b/pkg/sentry/fs/ext/inline_file.go
@@ -28,7 +28,7 @@ type inlineFile struct {
var _ fileReader = (*inlineFile)(nil)
// getFileReader implements fileReader.getFileReader.
-func (f *inlineFile) getFileReader(_ io.ReadSeeker, _ uint64, offset uint64) io.Reader {
+func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader {
diskInode := f.regFile.inode.diskInode
return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]}
}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
index 7d2a445fb..00e022953 100644
--- a/pkg/sentry/fs/ext/inode.go
+++ b/pkg/sentry/fs/ext/inode.go
@@ -75,7 +75,7 @@ func (in *inode) tryIncRef() bool {
// decRef decrements the inode ref count and releases the inode resources if
// the ref count hits 0.
//
-// Preconditions: Must have locked fs.mu.
+// Precondition: Must have locked fs.mu.
func (in *inode) decRef(fs *filesystem) {
if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
delete(fs.inodeCache, in.inodeNum)
@@ -86,9 +86,7 @@ func (in *inode) decRef(fs *filesystem) {
// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newInode(ctx context.Context, dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
+func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}
diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go
index 9bf39acfe..b48f61795 100644
--- a/pkg/sentry/fs/ext/regular_file.go
+++ b/pkg/sentry/fs/ext/regular_file.go
@@ -29,10 +29,7 @@ type fileReader interface {
//
// This reader is not meant to be retained across Read operations as it needs
// to be reinitialized with the correct offset for every Read.
- //
- // Precondition: Must hold the mutex of the filesystem containing dev while
- // using the Reader.
- getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader
+ getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader
}
// regularFile represents a regular file's inode. This too follows the
@@ -48,9 +45,7 @@ type regularFile struct {
// newRegularFile is the regularFile constructor. It figures out what kind of
// file this is and initializes the fileReader.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newRegularFile(dev io.ReadSeeker, blkSize uint64, inode inode) (*regularFile, error) {
+func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) {
regFile := regularFile{
inode: inode,
}
diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go
index 0ed67c0e4..6a55c1a7b 100644
--- a/pkg/sentry/fs/ext/symlink.go
+++ b/pkg/sentry/fs/ext/symlink.go
@@ -28,9 +28,7 @@ type symlink struct {
// newSymlink is the symlink constructor. It reads out the symlink target from
// the inode (however it might have been stored).
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newSymlink(dev io.ReadSeeker, blkSize uint64, inode inode) (*symlink, error) {
+func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) {
var file *symlink
var link []byte
diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go
index 7c33919e0..3d89d664d 100644
--- a/pkg/sentry/fs/ext/utils.go
+++ b/pkg/sentry/fs/ext/utils.go
@@ -15,55 +15,30 @@
package ext
import (
- "encoding/binary"
"io"
+ "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
-//
-// All disk reads should use this helper so we avoid reading from stale
-// previously used offsets. This function forces the offset parameter.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
- if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
- return syserror.EIO
- }
-
- if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
+func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
+ n := binary.Size(v)
+ buf := make([]byte, n)
+ if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
+ binary.Unmarshal(buf, binary.LittleEndian, v)
return nil
}
-// readFull is a wrapper around io.ReadFull which enforces the absolute offset
-// parameter so that we can ensure that we do not perform incorrect reads from
-// stale previously used offsets.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readFull(dev io.ReadSeeker, abOff int64, dst []byte) (int, error) {
- if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
- return 0, syserror.EIO
- }
-
- n, err := io.ReadFull(dev, dst)
- if err != nil {
- err = syserror.EIO
- }
- return n, err
-}
-
// readSuperBlock reads the SuperBlock from block group 0 in the underlying
// device. There are three versions of the superblock. This function identifies
// and returns the correct version.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
+func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
return nil, err
@@ -98,9 +73,7 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
// readBlockGroups reads the block group descriptor table from block group 0 in
// the underlying device.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
+func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
bgCount := blockGroupsCount(sb)
bgdSize := uint64(sb.BgDescSize())
is64Bit := sb.IncompatibleFeatures().Is64Bit