From 1c9781a4edce5fa9688f868149a2506f2ec5fa86 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 7 Aug 2019 14:22:19 -0700 Subject: ext: vfs.FileDescriptionImpl and vfs.FilesystemImpl implementations. - This also gets rid of pipes for now because pipe does not have vfs2 specific support yet. - Added file path resolution logic. - Fixes testing infrastructure. - Does not include unit tests yet. PiperOrigin-RevId: 262213950 --- pkg/sentry/fs/attr.go | 44 ++++ pkg/sentry/fs/ext/BUILD | 21 +- pkg/sentry/fs/ext/README.md | 117 ++++++++++ pkg/sentry/fs/ext/block_map_file.go | 9 +- pkg/sentry/fs/ext/dentry.go | 2 - pkg/sentry/fs/ext/directory.go | 288 ++++++++++++++++++++++- pkg/sentry/fs/ext/disklayout/dirent.go | 3 + pkg/sentry/fs/ext/disklayout/dirent_test.go | 6 +- pkg/sentry/fs/ext/disklayout/superblock.go | 2 +- pkg/sentry/fs/ext/ext.go | 41 +++- pkg/sentry/fs/ext/ext_test.go | 30 ++- pkg/sentry/fs/ext/extent_file.go | 6 +- pkg/sentry/fs/ext/file_description.go | 110 +++++++++ pkg/sentry/fs/ext/filesystem.go | 341 ++++++++++++++++++++++++++-- pkg/sentry/fs/ext/inline_file.go | 55 ----- pkg/sentry/fs/ext/inode.go | 78 ++++++- pkg/sentry/fs/ext/named_pipe.go | 40 ---- pkg/sentry/fs/ext/regular_file.go | 112 ++++++++- pkg/sentry/fs/ext/symlink.go | 61 ++++- pkg/sentry/safemem/io.go | 55 ++++- pkg/sentry/syscalls/linux/sys_getdents.go | 24 +- 21 files changed, 1250 insertions(+), 195 deletions(-) create mode 100644 pkg/sentry/fs/ext/README.md create mode 100644 pkg/sentry/fs/ext/file_description.go delete mode 100644 pkg/sentry/fs/ext/inline_file.go delete mode 100644 pkg/sentry/fs/ext/named_pipe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 9fc6a5bc2..4f3d6410e 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 { } } +// ToDirentType converts an InodeType to a linux dirent type field. 
+func ToDirentType(nodeType InodeType) uint8 { + switch nodeType { + case RegularFile, SpecialFile: + return linux.DT_REG + case Symlink: + return linux.DT_LNK + case Directory, SpecialDirectory: + return linux.DT_DIR + case Pipe: + return linux.DT_FIFO + case CharacterDevice: + return linux.DT_CHR + case BlockDevice: + return linux.DT_BLK + case Socket: + return linux.DT_SOCK + default: + return linux.DT_UNKNOWN + } +} + +// ToInodeType converts a linux file type to InodeType. +func ToInodeType(linuxFileType linux.FileMode) InodeType { + switch linuxFileType { + case linux.ModeRegular: + return RegularFile + case linux.ModeDirectory: + return Directory + case linux.ModeSymlink: + return Symlink + case linux.ModeNamedPipe: + return Pipe + case linux.ModeCharacterDevice: + return CharacterDevice + case linux.ModeBlockDevice: + return BlockDevice + case linux.ModeSocket: + return Socket + default: + panic(fmt.Sprintf("unknown file mode: %d", linuxFileType)) + } +} + // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. 
// diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index e3d617576..c6168da0a 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( - name = "dentry_list", - out = "dentry_list.go", + name = "dirent_list", + out = "dirent_list.go", package = "ext", - prefix = "dentry", + prefix = "dirent", template = "//pkg/ilist:generic_list", types = { - "Element": "*dentry", - "Linker": "*dentry", + "Element": "*dirent", + "Linker": "*dirent", }, ) @@ -20,14 +20,13 @@ go_library( srcs = [ "block_map_file.go", "dentry.go", - "dentry_list.go", "directory.go", + "dirent_list.go", "ext.go", "extent_file.go", + "file_description.go", "filesystem.go", - "inline_file.go", "inode.go", - "named_pipe.go", "regular_file.go", "symlink.go", "utils.go", @@ -38,15 +37,19 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/fd", + "//pkg/log", + "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/ext/disklayout", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", "//pkg/sentry/safemem", + "//pkg/sentry/syscalls/linux", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/ext/README.md b/pkg/sentry/fs/ext/README.md new file mode 100644 index 000000000..e212717aa --- /dev/null +++ b/pkg/sentry/fs/ext/README.md @@ -0,0 +1,117 @@ +## EXT(2/3/4) File System + +This is a filesystem driver which supports ext2, ext3 and ext4 filesystems. +Linux has specialized drivers for each variant but none which supports all. This +library takes advantage of ext's backward compatibility and understands the +internal organization of on-disk structures to support all variants. + +This driver implementation diverges from the Linux implementations in being more +forgiving about versioning. 
For instance, if a filesystem contains both extent +based inodes and classical block map based inodes, this driver will not complain +and will interpret them both correctly, whereas in Linux this would be an issue. This +blurs the line between the three ext fs variants. + +Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has +been superseded by ext4, which brings large performance gains. Thus it is recommended to +upgrade older filesystem images to ext4 using e2fsprogs for better performance. + +### Read Only + +This driver currently only allows read-only operations. A lot of the design +decisions are based on this feature. There are plans to implement write (the +process for which is documented in the future work section). + +### Performance + +One of the biggest wins about this driver is that it directly talks to the +underlying block device (or whatever persistent storage is being used), instead +of making expensive RPCs to a gofer. + +Another advantage is that ext fs supports fast concurrent reads. Currently the +device is represented using an `io.ReaderAt` which allows for concurrent reads. +All reads are directly passed to the device driver which intelligently serves +the read requests in the optimal order. There is no congestion due to locking +while reading at the filesystem level. + +Reads are optimized further in the way file data is transferred over to user +memory. Ext fs directly copies over file data from disk into user memory with no +additional allocations on the way. We can only get faster by preloading file +data into memory (see future work section). + +The internal structures used to represent files, inodes and file descriptors use +a lot of inheritance. With the level of indirection that an interface adds with +an internal pointer, it can quickly fragment a structure across memory. As this +runs alongside a full-blown kernel (which is memory intensive), having a +fragmented struct might hurt performance. 
Hence these internal structures, +though interfaced, are tightly packed in memory using the same inheritance +pattern that pkg/sentry/vfs uses. The pkg/sentry/fs/ext/disklayout package makes +an exception to this pattern for reasons documented in the package. + +### Security + +This driver also intends to help sandbox the container better by reducing the +surface of the host kernel that the application touches. It prevents the +application from exploiting vulnerabilities in the host filesystem driver. All +`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly +passed to the device driver in the kernel. Hence this reduces the surface for +attack. + +The application cannot affect any host filesystems other than the one passed +via block device by the user. + +### Future Work + +#### Write + +To support write operations we would need to modify the block device underneath. +Currently, the driver does not modify the device at all, not even for updating +the access times for reads. Modifying the filesystem incorrectly can corrupt it +and render it unreadable for other correct ext(x) drivers. Hence caution must be +maintained while modifying metadata structures. + +Ext4 specifically is built for performance and has added a lot of complexity as +to how metadata structures are modified. For instance, files are organized +via an extent tree which must be balanced, and file data blocks must be placed in +the same extent as much as possible to increase locality. Such properties must +be maintained while modifying the tree. + +Ext filesystems boast a lot about locality, which plays a big role in them being +performant. The block allocation algorithm in Linux does a good job in keeping +related data together. This behavior must be maintained as much as possible, +else we might end up degrading the filesystem performance over time. + +Ext4 also supports a wide variety of features which are specialized for varying +use cases. 
Implementing all of them can get difficult very quickly. + +Ext(x) checksums all its metadata structures to check for corruption, so +modification of any metadata struct must be accompanied by re-checksumming the +struct. Linux filesystem drivers also order on-disk updates intelligently to not +corrupt the filesystem and also remain performant. The in-memory metadata +structures must be kept in sync with what is on disk. + +There is also replication of some important structures across the filesystem. +All replicas must be updated when their original copy is updated. There is also +provisioning for snapshotting which must be kept in mind, although it should not +affect this implementation unless we allow users to create filesystem snapshots. + +Ext4 also introduced journaling (jbd2). The journal must be updated +appropriately. + +#### Performance + +To improve performance we should implement a buffer cache, and optionally, read +ahead for small files. While doing so we must also keep in mind the memory usage +and have a reasonable cap on how much file data we want to hold in memory. + +#### Features + +Our current implementation will work with most ext4 filesystems for read-only +purposes. 
However, the following features are not supported yet: + +- Journal +- Snapshotting +- Extended Attributes +- Hash Tree Directories +- Meta Block Groups +- Multiple Mount Protection +- Bigalloc diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go index f30c3a174..cea89bcd9 100644 --- a/pkg/sentry/fs/ext/block_map_file.go +++ b/pkg/sentry/fs/ext/block_map_file.go @@ -85,7 +85,8 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { } offset := uint64(off) - if offset >= f.regFile.inode.diskInode.Size() { + size := f.regFile.inode.diskInode.Size() + if offset >= size { return 0, io.EOF } @@ -104,6 +105,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { read := 0 toRead := len(dst) + if uint64(toRead)+offset > size { + toRead = int(size - offset) + } for read < toRead { var err error var curR int @@ -131,6 +135,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { } } + if read < len(dst) { + return read, io.EOF + } return read, nil } diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go index 19c9b3b2d..054fb42b6 100644 --- a/pkg/sentry/fs/ext/dentry.go +++ b/pkg/sentry/fs/ext/dentry.go @@ -26,8 +26,6 @@ type dentry struct { // share a single non-directory Inode (with hard links). inode is // immutable. inode *inode - // dentryEntry links Dentries into their parent directory.childList. - dentryEntry } // Compiles only if dentry implements vfs.DentryImpl. 
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go index ab2b59e44..f896dbe1d 100644 --- a/pkg/sentry/fs/ext/directory.go +++ b/pkg/sentry/fs/ext/directory.go @@ -14,23 +14,293 @@ package ext +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + // directory represents a directory inode. It holds the childList in memory. type directory struct { inode inode - // childList is a list containing (1) child Dentries and (2) fake Dentries - // (with inode == nil) that represent the iteration position of + // mu serializes the changes to childList. + // Lock Order (outermost locks must be taken first): + // directory.mu + // filesystem.mu + mu sync.Mutex + + // childList is a list containing (1) child dirents and (2) fake dirents + // (with diskDirent == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is immutable. - childList dentryList + // efficiently. childList is protected by mu. + childList direntList - // TODO(b/134676337): Add directory navigators. + // childMap maps the child's filename to the dirent structure stored in + // childList. This adds some data replication but helps in faster path + // traversal. For consistency, key == childMap[key].diskDirent.FileName(). + // Immutable. + childMap map[string]*dirent } // newDirectroy is the directory constructor. -func newDirectroy(inode inode) *directory { - // TODO(b/134676337): initialize childList. 
- file := &directory{inode: inode} +func newDirectroy(inode inode, newDirent bool) (*directory, error) { + file := &directory{inode: inode, childMap: make(map[string]*dirent)} file.inode.impl = file - return file + + // Initialize childList by reading dirents from the underlying file. + if inode.diskInode.Flags().Index { + // TODO(b/134676337): Support hash tree directories. Currently only the '.' + // and '..' entries are read in. + + // Users cannot navigate this hash tree directory yet. + log.Warningf("hash tree directory being used which is unsupported") + return file, nil + } + + // The dirents are organized in a linear array in the file data. + // Extract the file data and decode the dirents. + regFile, err := newRegularFile(inode) + if err != nil { + return nil, err + } + + // buf is used as scratch space for reading in dirents from disk and + // unmarshalling them into dirent structs. + buf := make([]byte, disklayout.DirentSize) + size := inode.diskInode.Size() + for off, inc := uint64(0), uint64(0); off < size; off += inc { + toRead := size - off + if toRead > disklayout.DirentSize { + toRead = disklayout.DirentSize + } + if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead { + return nil, err + } + + var curDirent dirent + if newDirent { + curDirent.diskDirent = &disklayout.DirentNew{} + } else { + curDirent.diskDirent = &disklayout.DirentOld{} + } + binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + + if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { + // Inode number and name length fields being set to 0 is used to indicate + // an unused dirent. + file.childList.PushBack(&curDirent) + file.childMap[curDirent.diskDirent.FileName()] = &curDirent + } + + // The next dirent is placed exactly after this dirent record on disk. 
+ inc = uint64(curDirent.diskDirent.RecordSize()) + } + + return file, nil +} + +func (i *inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +// dirent is the directory.childList node. +type dirent struct { + diskDirent disklayout.Dirent + + // direntEntry links dirents into their parent directory.childList. + direntEntry +} + +// directoryFD represents a directory file description. It implements +// vfs.FileDescriptionImpl. +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by directory.mu. + iter *dirent + off int64 +} + +// Compiles only if directoryFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter == nil { + return + } + + dir := fd.inode().impl.(*directory) + dir.mu.Lock() + dir.childList.Remove(fd.iter) + dir.mu.Unlock() + fd.iter = nil +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + extfs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Ensure that fd.iter exists and is not linked into dir.childList. + var child *dirent + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &dirent{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for ; child != nil; child = child.Next() { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + childType, ok := child.diskDirent.FileType() + if !ok { + // We will need to read the inode off disk. Do not increment + // ref count here because this inode is not being added to the + // dentry tree. 
+ extfs.mu.Lock() + childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode()) + extfs.mu.Unlock() + if err != nil { + // Usage of the file description after the error is + // undefined. This implementation would continue reading + // from the next dirent. + fd.off++ + dir.childList.InsertAfter(child, fd.iter) + return err + } + childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) + } + + if !cb.Handle(vfs.Dirent{ + Name: child.diskDirent.FileName(), + Type: fs.ToDirentType(childType), + Ino: uint64(child.diskDirent.Inode()), + Off: fd.off, + }) { + dir.childList.InsertBefore(child, fd.iter) + return nil + } + fd.off++ + } + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if whence != linux.SEEK_SET && whence != linux.SEEK_CUR { + return 0, syserror.EINVAL + } + + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Find resulting offset. + offset += fd.off + + if offset < 0 { + // lseek(2) specifies that EINVAL should be returned if the resulting offset + // is negative. + return 0, syserror.EINVAL + } + + n := int64(len(dir.childMap)) + realWantOff := offset + if realWantOff > n { + realWantOff = n + } + realCurOff := fd.off + if realCurOff > n { + realCurOff = n + } + + // Ensure that fd.iter exists and is linked into dir.childList so we can + // intelligently seek from the optimal position. + if fd.iter == nil { + fd.iter = &dirent{} + dir.childList.PushFront(fd.iter) + } + + // Guess that iterating from the current position is optimal. + child := fd.iter + diff := realWantOff - realCurOff // Shows direction and magnitude of travel. + + // See if starting from the beginning or end is better. + abDiff := diff + if diff < 0 { + abDiff = -diff + } + if abDiff > realWantOff { + // Starting from the beginning is best. 
+ child = dir.childList.Front() + diff = realWantOff + } else if abDiff > (n - realWantOff) { + // Starting from the end is best. + child = dir.childList.Back() + // (n - 1) because the last non-nil dirent represents the (n-1)th offset. + diff = realWantOff - (n - 1) + } + + for child != nil { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + if diff == 0 { + if child != fd.iter { + dir.childList.Remove(fd.iter) + dir.childList.InsertBefore(child, fd.iter) + } + + fd.off = offset + return offset, nil + } + + if diff < 0 { + diff++ + child = child.Prev() + } else { + diff-- + child = child.Next() + } + continue + } + + if diff < 0 { + child = child.Prev() + } else { + child = child.Next() + } + } + + // Reaching here indicates that the offset is beyond the end of the childList. + dir.childList.Remove(fd.iter) + dir.childList.PushBack(fd.iter) + fd.off = offset + return offset, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + // mmap(2) specifies that EACCES should be returned for non-regular file fds. + return syserror.EACCES } diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go index 685bf57b8..417b6cf65 100644 --- a/pkg/sentry/fs/ext/disklayout/dirent.go +++ b/pkg/sentry/fs/ext/disklayout/dirent.go @@ -21,6 +21,9 @@ import ( const ( // MaxFileName is the maximum length of an ext fs file's name. MaxFileName = 255 + + // DirentSize is the size of ext dirent structures. + DirentSize = 263 ) var ( diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go index cc6dff2c9..934919f8a 100644 --- a/pkg/sentry/fs/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go @@ -21,8 +21,6 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. 
func TestDirentSize(t *testing.T) { - want := uintptr(263) - - assertSize(t, DirentOld{}, want) - assertSize(t, DirentNew{}, want) + assertSize(t, DirentOld{}, uintptr(DirentSize)) + assertSize(t, DirentNew{}, uintptr(DirentSize)) } diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go index 7a337a5e0..8bb327006 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock.go +++ b/pkg/sentry/fs/ext/disklayout/superblock.go @@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures { // This is not exhaustive, unused features are not listed. const ( // SbDirentFileType indicates that directory entries record the file type. - // We should use struct ext4_dir_entry_2 for dirents then. + // We should use struct DirentNew for dirents then. SbDirentFileType = 0x2 // SbRecovery indicates that the filesystem needs recovery. diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index d303dd122..c3e2c9efb 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -62,8 +63,40 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, err return fd.NewReadWriter(devFd), nil } +// isCompatible checks if the superblock has feature sets which are compatible. +// We only need to check the superblock incompatible feature set since we are +// mounting readonly. We will also need to check readonly compatible feature +// set when mounting for read/write. +func isCompatible(sb disklayout.SuperBlock) bool { + // Please note that what is being checked is limited based on the fact that we + // are mounting readonly and that we are not journaling. When mounting + // read/write or with a journal, this must be reevaluated. 
+ incompatFeatures := sb.IncompatibleFeatures() + if incompatFeatures.MetaBG { + log.Warningf("ext fs: meta block groups are not supported") + return false + } + if incompatFeatures.MMP { + log.Warningf("ext fs: multiple mount protection is not supported") + return false + } + if incompatFeatures.Encrypted { + log.Warningf("ext fs: encrypted inodes not supported") + return false + } + if incompatFeatures.InlineData { + log.Warningf("ext fs: inline files not supported") + return false + } + return true +} + // NewFilesystem implements vfs.FilesystemType.NewFilesystem. func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // TODO(b/134676337): Ensure that the user is mounting readonly. If not, + // EACCESS should be returned according to mount(2). Filesystem independent + // flags (like readonly) are currently not available in pkg/sentry/vfs. + dev, err := getDeviceFd(source, opts) if err != nil { return nil, nil, err @@ -82,15 +115,21 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred return nil, nil, syserror.EINVAL } + // Refuse to mount if the filesystem is incompatible. + if !isCompatible(fs.sb) { + return nil, nil, syserror.EINVAL + } + fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { return nil, nil, err } - rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode) + rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { return nil, nil, err } + rootInode.incRef() return &fs.vfsfs, &newDentry(rootInode).vfsd, nil } diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go index 6396886cc..6517e7ea5 100644 --- a/pkg/sentry/fs/ext/ext_test.go +++ b/pkg/sentry/fs/ext/ext_test.go @@ -44,7 +44,7 @@ var ( // setUp opens imagePath as an ext Filesystem and returns all necessary // elements required to run tests. 
If error is non-nil, it also returns a tear // down function which must be called after the test is run for clean up. -func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) { +func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { localImagePath, err := testutil.FindFile(imagePath) if err != nil { return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) @@ -55,20 +55,28 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v return nil, nil, nil, nil, err } - // Mount the ext4 fs and retrieve the inode structure for the file. - mockCtx := contexttest.Context(t) - fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("extfs", filesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err } + root := mntns.Root() + tearDown := func() { + root.DecRef() + if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) } } - return mockCtx, fs, d, tearDown, nil + return ctx, vfsObj, &root, tearDown, nil } // TestRootDir tests that the root directory inode is correctly initialized and @@ -126,15 +134,15 @@ func TestRootDir(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - _, _, vfsd, tearDown, err := setUp(t, test.image) + _, _, vd, tearDown, err := setUp(t, test.image) if err != nil { t.Fatalf("setUp failed: %v", err) } defer tearDown() - d, ok := vfsd.Impl().(*dentry) + d, ok := vd.Dentry().Impl().(*dentry) if !ok { - t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl()) + 
t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl()) } // Offload inode contents into local structs for comparison. @@ -329,15 +337,15 @@ func TestFilesystemInit(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - _, vfsfs, _, tearDown, err := setUp(t, test.image) + _, _, vd, tearDown, err := setUp(t, test.image) if err != nil { t.Fatalf("setUp failed: %v", err) } defer tearDown() - fs, ok := vfsfs.Impl().(*filesystem) + fs, ok := vd.Mount().Filesystem().Impl().(*filesystem) if !ok { - t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl()) + t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl()) } // Offload superblock and block group descriptors contents into diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go index 44fb9c01f..1b9bf449b 100644 --- a/pkg/sentry/fs/ext/extent_file.go +++ b/pkg/sentry/fs/ext/extent_file.go @@ -150,7 +150,11 @@ func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) { return 0, io.EOF } - return f.read(&f.root, uint64(off), dst) + n, err := f.read(&f.root, uint64(off), dst) + if n < len(dst) && err == nil { + err = io.EOF + } + return n, err } // read is the recursive step of extentFile.ReadAt which traverses the extent diff --git a/pkg/sentry/fs/ext/file_description.go b/pkg/sentry/fs/ext/file_description.go new file mode 100644 index 000000000..d244cf1e7 --- /dev/null +++ b/pkg/sentry/fs/ext/file_description.go @@ -0,0 +1,110 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileDescription is embedded by ext implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + + // flags is the same as vfs.OpenOptions.Flags which are passed to + // vfs.FilesystemImpl.OpenAt. + // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2), + // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set. + // Only close(2), fstat(2), fstatfs(2) should work. + flags uint32 +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) inode() *inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *fileDescription) OnClose() error { return nil } + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. 
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// StatFS implements vfs.FileDescriptionImpl.StatFS. +func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + var stat linux.Statfs + fd.filesystem().statTo(&stat) + return stat, nil +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) EventUnregister(e *waiter.Entry) {} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *fileDescription) Sync(ctx context.Context) error { + return nil +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // ioctl(2) specifies that ENOTTY must be returned if the file descriptor is + // not associated with a character special device (which is unimplemented). 
+ return 0, syserror.ENOTTY +} diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go index 45b43b9e2..e08839f48 100644 --- a/pkg/sentry/fs/ext/filesystem.go +++ b/pkg/sentry/fs/ext/filesystem.go @@ -15,20 +15,27 @@ package ext import ( + "errors" "io" "sync" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) +var ( + // errResolveDirent indicates that the vfs.ResolvingPath.Component() does + // not exist on the dentry tree but does exist on disk. So it has to be read in + // using the in-memory dirent and added to the dentry tree. Usually indicates + // the need to lock filesystem.mu for writing. + errResolveDirent = errors.New("resolve path component using dirent") +) + // filesystem implements vfs.FilesystemImpl. type filesystem struct { - // TODO(b/134676337): Remove when all methods have been implemented. - vfs.FilesystemImpl - vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. @@ -44,8 +51,8 @@ type filesystem struct { // inodeCache maps absolute inode numbers to the corresponding Inode struct. // Inodes should be removed from this once their reference count hits 0. // - // Protected by mu because every addition and removal from this corresponds to - // a change in the dentry tree. + // Protected by mu because most additions (see IterDirents) and all removals + // from this corresponds to a change in the dentry tree. inodeCache map[uint32]*inode // sb represents the filesystem superblock. Immutable after initialization. @@ -59,16 +66,172 @@ type filesystem struct { // Compiles only if filesystem implements vfs.FilesystemImpl. var _ vfs.FilesystemImpl = (*filesystem)(nil) -// getOrCreateInode gets the inode corresponding to the inode number passed in. +// stepLocked resolves rp.Component() in parent directory vfsd. 
The write +// parameter passed tells if the caller has acquired filesystem.mu for writing +// or not. If set to true, an existing inode on disk can be added to the dentry +// tree if not present already. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). +// - inode == vfsd.Impl().(*Dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, nil, err + } + + for { + nextVFSD, err := rp.ResolveComponent(vfsd) + if err != nil { + return nil, nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is not the sole source of truth for extfs, if it's + // not in the Dentry tree, it might need to be pulled from disk. + childDirent, ok := inode.impl.(*directory).childMap[rp.Component()] + if !ok { + // The underlying inode does not exist on disk. + return nil, nil, syserror.ENOENT + } + + if !write { + // filesystem.mu must be held for writing to add to the dentry tree. + return nil, nil, errResolveDirent + } + + // Create and add the component's dirent to the dentry tree. + fs := rp.Mount().Filesystem().Impl().(*filesystem) + childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) + if err != nil { + return nil, nil, err + } + // incRef because this is being added to the dentry tree. + childInode.incRef() + child := newDentry(childInode) + vfsd.InsertChild(&child.vfsd, rp.Component()) + + // Continue as usual now that nextVFSD is not nil. 
+ nextVFSD = &child.vfsd + } + nextInode := nextVFSD.Impl().(*dentry).inode + if nextInode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil { + return nil, nil, err + } + continue + } + rp.Advance() + return nextVFSD, nextInode, nil + } +} + +// walkLocked resolves rp to an existing file. The write parameter +// passed tells if the caller has acquired filesystem.mu for writing or not. +// If set to true, additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Done() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if rp.MustBeDir() && !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walkParentLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. The write parameter passed tells if the +// caller has acquired filesystem.mu for writing or not. If set to true, +// additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). 
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walk resolves rp to an existing file. If parent is set to true, it resolves +// the rp till the parent of the last component which should be an existing +// directory. If parent is false then resolves rp entirely. Attempts to resolve +// the path as far as it can with a read lock and upgrades the lock if needed. +func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { + var ( + vfsd *vfs.Dentry + inode *inode + err error + ) + + // Try walking with the hopes that all dentries have already been pulled out + // of disk. This reduces congestion (allows concurrent walks). + fs.mu.RLock() + if parent { + vfsd, inode, err = walkParentLocked(rp, false) + } else { + vfsd, inode, err = walkLocked(rp, false) + } + fs.mu.RUnlock() + + if err == errResolveDirent { + // Upgrade lock and continue walking. Lock upgrading in the middle of the + // walk is fine as this is a read only filesystem. + fs.mu.Lock() + if parent { + vfsd, inode, err = walkParentLocked(rp, true) + } else { + vfsd, inode, err = walkLocked(rp, true) + } + fs.mu.Unlock() + } + + return vfsd, inode, err +} + +// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. // It creates a new one with the given inode number if one does not exist. +// The caller must increment the ref count if adding this to the dentry tree. // -// Precondition: must be holding fs.mu. -func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) { +// Precondition: must be holding fs.mu for writing. 
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { if in, ok := fs.inodeCache[inodeNum]; ok { return in, nil } - in, err := newInode(ctx, fs, inodeNum) + in, err := newInode(fs, inodeNum) if err != nil { return nil, err } @@ -77,10 +240,92 @@ func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*i return in, nil } -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +// statTo writes the statfs fields to the output parameter. +func (fs *filesystem) statTo(stat *linux.Statfs) { + stat.Type = uint64(fs.sb.Magic()) + stat.BlockSize = int64(fs.sb.BlockSize()) + stat.Blocks = fs.sb.BlocksCount() + stat.BlocksFree = fs.sb.FreeBlocksCount() + stat.BlocksAvailable = fs.sb.FreeBlocksCount() + stat.Files = uint64(fs.sb.InodesCount()) + stat.FilesFree = uint64(fs.sb.FreeInodesCount()) + stat.NameLength = disklayout.MaxFileName + stat.FragmentSize = int64(fs.sb.BlockSize()) + // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + + inode.incRef() + return vfsd, nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + // EROFS is returned if write access is needed. 
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { + return nil, syserror.EROFS + } + return inode.open(rp, vfsd, opts.Flags) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return "", err + } + symlink, ok := inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + inode.statTo(&stat) + return stat, nil } +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + if _, _, err := fs.walk(rp, false); err != nil { + return linux.Statfs{}, err + } + + var stat linux.Statfs + fs.statTo(&stat) + return stat, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() {} + // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // This is a readonly filesystem for now. @@ -89,42 +334,110 @@ func (fs *filesystem) Sync(ctx context.Context) error { // The vfs.FilesystemImpl functions below return EROFS because their respective // man pages say that EROFS must be returned if the path resolves to a file on -// a read-only filesystem. +// this read-only filesystem. -// TODO(b/134676337): Implement path traversal and return EROFS only if the -// path resolves to a Dentry within ext fs. +// LinkAt implements vfs.FilesystemImpl.LinkAt. 
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + + return syserror.EROFS +} // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + return syserror.EROFS } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + return syserror.EROFS } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + if rp.Done() { + return syserror.ENOENT + } + + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if !inode.isDir() { + return syserror.ENOTDIR + } + return syserror.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.EROFS } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + return syserror.EROFS } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if inode.isDir() { + return syserror.EISDIR + } + return syserror.EROFS } diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go deleted file mode 100644 index 67a538ba0..000000000 --- a/pkg/sentry/fs/ext/inline_file.go +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" -) - -// inlineFile is a type of regular file. All the data here is stored in the -// inode.Data() array. -type inlineFile struct { - regFile regularFile -} - -// Compiles only if inlineFile implements io.ReaderAt. -var _ io.ReaderAt = (*inlineFile)(nil) - -// newInlineFile is the inlineFile constructor. -func newInlineFile(regFile regularFile) *inlineFile { - file := &inlineFile{regFile: regFile} - file.regFile.impl = file - return file -} - -// ReadAt implements io.ReaderAt.ReadAt. 
-func (f *inlineFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - size := f.regFile.inode.diskInode.Size() - if uint64(off) >= size { - return 0, io.EOF - } - - to := uint64(off) + uint64(len(dst)) - if to > size { - to = size - } - - n := copy(dst, f.regFile.inode.diskInode.Data()[off:to]) - return n, nil -} diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go index 364980e4c..178bd6376 100644 --- a/pkg/sentry/fs/ext/inode.go +++ b/pkg/sentry/fs/ext/inode.go @@ -15,12 +15,14 @@ package ext import ( + "fmt" "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,13 +33,11 @@ import ( // // Implementations: // inode -- -// |-- pipe // |-- dir // |-- symlink // |-- regular-- // |-- extent file // |-- block map file -// |-- inline file type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 @@ -92,7 +92,7 @@ func (in *inode) decRef(fs *filesystem) { // newInode is the inode constructor. Reads the inode off disk. Identifies // inodes based on the absolute inode number on disk. -func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, error) { +func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { if inodeNum == 0 { panic("inode number 0 on ext filesystems is not possible") } @@ -117,7 +117,6 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err // Build the inode based on its type. 
inode := inode{ - refs: 1, inodeNum: inodeNum, dev: fs.dev, blkSize: blkSize, @@ -138,15 +137,76 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err } return &f.inode, nil case linux.ModeDirectory: - return &newDirectroy(inode).inode, nil - case linux.ModeNamedPipe: - return &newNamedPipe(ctx, inode).inode, nil + f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType) + if err != nil { + return nil, err + } + return &f.inode, nil default: - // TODO(b/134676337): Return appropriate errors for sockets and devices. + // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices. return nil, syserror.EINVAL } } +// open creates and returns a file description for the dentry passed in. +func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if err := in.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + switch in.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.flags = flags + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. This check is not necessary for a read + // only filesystem but will be required when write is implemented. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + fd.flags = flags + return &fd.vfsfd, nil + case *symlink: + if flags&linux.O_PATH == 0 { + // Can't open symlinks without O_PATH. 
+ return nil, syserror.ELOOP + } + var fd symlinkFD + fd.flags = flags + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + return &fd.vfsfd, nil + default: + panic(fmt.Sprintf("unknown inode type: %T", in.impl)) + } +} + +func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID()) +} + +// statTo writes the statx fields to the output parameter. +func (in *inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME + stat.Blksize = uint32(in.blkSize) + stat.Mode = uint16(in.diskInode.Mode()) + stat.Nlink = uint32(in.diskInode.LinksCount()) + stat.UID = uint32(in.diskInode.UID()) + stat.GID = uint32(in.diskInode.GID()) + stat.Ino = uint64(in.inodeNum) + stat.Size = in.diskInode.Size() + stat.Atime = in.diskInode.AccessTime().StatxTimestamp() + stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() + stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks + // (including metadata blocks) required to represent this file. +} + // getBGNum returns the block group number that a given inode belongs to. func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { return (inodeNum - 1) / inodesPerGrp diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go deleted file mode 100644 index 0f3af1b53..000000000 --- a/pkg/sentry/fs/ext/named_pipe.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -// namedPipe represents a named pipe inode. It is currently just a wrapper -// around pkg/sentry/kernel/pipe. -type namedPipe struct { - inode inode - - p *pipe.Pipe - inodeOps fs.InodeOperations -} - -// newNamedPipe is the namedPipe constructor. -func newNamedPipe(ctx context.Context, inode inode) *namedPipe { - file := &namedPipe{inode: inode} - file.inode.impl = file - file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) - file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p) - return file -} diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go index fb1bd38ef..ffc76ba5b 100644 --- a/pkg/sentry/fs/ext/regular_file.go +++ b/pkg/sentry/fs/ext/regular_file.go @@ -16,6 +16,15 @@ package ext import ( "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // regularFile represents a regular file's inode. This too follows the @@ -26,6 +35,9 @@ type regularFile struct { // This is immutable. The first field of fileReader implementations must be // regularFile to ensure temporality. 
+ // io.ReaderAt is more strict than io.Reader in the sense that a partial read + // is always accompanied by an error. If a read spans past the end of file, a + // partial read (within file range) is done and io.EOF is returned. impl io.ReaderAt } @@ -48,16 +60,6 @@ func newRegularFile(inode inode) (*regularFile, error) { return &file.regFile, nil } - if inodeFlags.Inline { - if inode.diskInode.Size() > 60 { - panic("ext fs: inline file larger than 60 bytes") - } - - file := newInlineFile(regFile) - file.regFile.inode.impl = &file.regFile - return &file.regFile, nil - } - file, err := newBlockMapFile(regFile) if err != nil { return nil, err @@ -66,6 +68,92 @@ func newRegularFile(inode inode) (*regularFile, error) { return &file.regFile, nil } -func (f *regularFile) blksUsed(blkSize uint64) uint64 { - return (f.inode.diskInode.Size() + blkSize - 1) / blkSize +func (in *inode) isRegular() bool { + _, ok := in.impl.(*regularFile) + return ok +} + +// regularFileFD represents a regular file description. It implements +// vfs.FileDescriptionImpl. +type regularFileFD struct { + fileDescription + + // off is the file offset. Updates to off are made with offMu held. + off int64 + + // offMu serializes operations that may mutate off. + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + safeReader := safemem.FromIOReaderAt{ + ReaderAt: fd.inode().impl.(*regularFile).impl, + Offset: offset, + } + + // Copies data from disk directly into usermem without any intermediate + // allocations (if dst is converted into BlockSeq such that it does not need + // safe copying). + return dst.CopyOutFrom(ctx, safeReader) +} + +// Read implements vfs.FileDescriptionImpl.Read. 
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // write(2) specifies that EBADF must be returned if the fd is not open for + // writing. + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(fd.inode().diskInode.Size()) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + // TODO(b/134676337): Implement mmap(2). 
+ return syserror.ENODEV } diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go index 9f498d989..e06548a98 100644 --- a/pkg/sentry/fs/ext/symlink.go +++ b/pkg/sentry/fs/ext/symlink.go @@ -15,6 +15,10 @@ package ext import ( + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -43,8 +47,8 @@ func newSymlink(inode inode) (*symlink, error) { } link = make([]byte, size) - if n, _ := regFile.impl.ReadAt(link, 0); uint64(n) < size { - return nil, syserror.EIO + if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size { + return nil, err } } @@ -52,3 +56,56 @@ func newSymlink(inode inode) (*symlink, error) { file.inode.impl = file return file, nil } + +func (in *inode) isSymlink() bool { + _, ok := in.impl.(*symlink) + return ok +} + +// symlinkFD represents a symlink file description and implements +// vfs.FileDescriptionImpl. It may only be used if the open options contain +// O_PATH. For this reason most of the functions return EBADF. +type symlinkFD struct { + fileDescription +} + +// Compiles only if symlinkFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *symlinkFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. 
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.EBADF +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + return syserror.EBADF +} diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index 5c3d73eb7..f039a5c34 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) { } // FromIOReader implements Reader for an io.Reader by repeatedly invoking -// io.Reader.Read until it returns an error or partial read. +// io.Reader.Read until it returns an error or partial read. This is not +// thread-safe. // // FromIOReader will return a successful partial read iff Reader.Read does so. type FromIOReader struct { @@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { return wbn, buf, rerr } +// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly +// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial +// read indicates an error. This is not thread-safe. +type FromIOReaderAt struct { + ReaderAt io.ReaderAt + Offset int64 +} + +// ReadToBlocks implements Reader.ReadToBlocks. 
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) + r.Offset += int64(n) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) + r.Offset += int64(rn) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + // FromIOWriter implements Writer for an io.Writer by repeatedly invoking // io.Writer.Write until it returns an error or partial write. // diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 63e2c5a5d..912cbe4ff 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent Ino: attr.InodeID, Off: offset, }, - Typ: toType(attr.Type), + Typ: fs.ToDirentType(attr.Type), }, Name: []byte(name), } @@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint { return uint(binary.Size(d.Hdr)) + a.Width() } -// toType converts an fs.InodeOperationsInfo to a linux dirent typ field. 
-func toType(nodeType fs.InodeType) uint8 { - switch nodeType { - case fs.RegularFile, fs.SpecialFile: - return linux.DT_REG - case fs.Symlink: - return linux.DT_LNK - case fs.Directory, fs.SpecialDirectory: - return linux.DT_DIR - case fs.Pipe: - return linux.DT_FIFO - case fs.CharacterDevice: - return linux.DT_CHR - case fs.BlockDevice: - return linux.DT_BLK - case fs.Socket: - return linux.DT_SOCK - default: - return linux.DT_UNKNOWN - } -} - // padRec pads the name field until the rec length is a multiple of the width, // which must be a power of 2. It returns the padded rec length. func (d *dirent) padRec(width int) uint16 { -- cgit v1.2.3