summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/fs/attr.go44
-rw-r--r--pkg/sentry/fs/ext/BUILD23
-rw-r--r--pkg/sentry/fs/ext/README.md117
-rw-r--r--pkg/sentry/fs/ext/block_map_file.go9
-rw-r--r--pkg/sentry/fs/ext/dentry.go2
-rw-r--r--pkg/sentry/fs/ext/directory.go290
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent.go3
-rw-r--r--pkg/sentry/fs/ext/disklayout/dirent_test.go6
-rw-r--r--pkg/sentry/fs/ext/disklayout/superblock.go2
-rw-r--r--pkg/sentry/fs/ext/ext.go41
-rw-r--r--pkg/sentry/fs/ext/ext_test.go544
-rw-r--r--pkg/sentry/fs/ext/extent_file.go6
-rw-r--r--pkg/sentry/fs/ext/file_description.go110
-rw-r--r--pkg/sentry/fs/ext/filesystem.go341
-rw-r--r--pkg/sentry/fs/ext/inline_file.go55
-rw-r--r--pkg/sentry/fs/ext/inode.go78
-rw-r--r--pkg/sentry/fs/ext/named_pipe.go40
-rw-r--r--pkg/sentry/fs/ext/regular_file.go112
-rw-r--r--pkg/sentry/fs/ext/symlink.go61
-rw-r--r--pkg/sentry/fsimpl/memfs/BUILD4
-rw-r--r--pkg/sentry/fsimpl/memfs/directory.go55
-rw-r--r--pkg/sentry/fsimpl/memfs/filesystem.go68
-rw-r--r--pkg/sentry/fsimpl/memfs/memfs.go92
-rw-r--r--pkg/sentry/fsimpl/memfs/regular_file.go6
-rw-r--r--pkg/sentry/fsimpl/memfs/symlink.go4
-rw-r--r--pkg/sentry/safemem/io.go55
-rw-r--r--pkg/sentry/socket/rpcinet/notifier/BUILD2
-rw-r--r--pkg/sentry/syscalls/linux/sys_getdents.go24
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go8
29 files changed, 1897 insertions, 305 deletions
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 9fc6a5bc2..4f3d6410e 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 {
}
}
+// ToDirentType converts an InodeType to the matching linux.DT_* dirent type field; types without a mapping yield linux.DT_UNKNOWN.
+func ToDirentType(nodeType InodeType) uint8 {
+ switch nodeType {
+ case RegularFile, SpecialFile:
+ return linux.DT_REG
+ case Symlink:
+ return linux.DT_LNK
+ case Directory, SpecialDirectory:
+ return linux.DT_DIR
+ case Pipe:
+ return linux.DT_FIFO
+ case CharacterDevice:
+ return linux.DT_CHR
+ case BlockDevice:
+ return linux.DT_BLK
+ case Socket:
+ return linux.DT_SOCK
+ default:
+ return linux.DT_UNKNOWN
+ }
+}
+
+// ToInodeType converts a linux file type to InodeType. It panics if linuxFileType is not one of the file types handled below.
+func ToInodeType(linuxFileType linux.FileMode) InodeType {
+ switch linuxFileType {
+ case linux.ModeRegular:
+ return RegularFile
+ case linux.ModeDirectory:
+ return Directory
+ case linux.ModeSymlink:
+ return Symlink
+ case linux.ModeNamedPipe:
+ return Pipe
+ case linux.ModeCharacterDevice:
+ return CharacterDevice
+ case linux.ModeBlockDevice:
+ return BlockDevice
+ case linux.ModeSocket:
+ return Socket
+ default:
+ panic(fmt.Sprintf("unknown file mode: %d", linuxFileType))
+ }
+}
+
// StableAttr contains Inode attributes that will be stable throughout the
// lifetime of the Inode.
//
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index e3d617576..a3b1e4bd6 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
load("//tools/go_generics:defs.bzl", "go_template_instance")
go_template_instance(
- name = "dentry_list",
- out = "dentry_list.go",
+ name = "dirent_list",
+ out = "dirent_list.go",
package = "ext",
- prefix = "dentry",
+ prefix = "dirent",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*dentry",
- "Linker": "*dentry",
+ "Element": "*dirent",
+ "Linker": "*dirent",
},
)
@@ -20,14 +20,13 @@ go_library(
srcs = [
"block_map_file.go",
"dentry.go",
- "dentry_list.go",
"directory.go",
+ "dirent_list.go",
"ext.go",
"extent_file.go",
+ "file_description.go",
"filesystem.go",
- "inline_file.go",
"inode.go",
- "named_pipe.go",
"regular_file.go",
"symlink.go",
"utils.go",
@@ -38,15 +37,19 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/arch",
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/memmap",
"//pkg/sentry/safemem",
+ "//pkg/sentry/syscalls/linux",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
"//pkg/syserror",
+ "//pkg/waiter",
],
)
@@ -73,7 +76,9 @@ go_test(
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fs/ext/disklayout",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/syserror",
"//runsc/test/testutil",
"@com_github_google_go-cmp//cmp:go_default_library",
"@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
diff --git a/pkg/sentry/fs/ext/README.md b/pkg/sentry/fs/ext/README.md
new file mode 100644
index 000000000..e212717aa
--- /dev/null
+++ b/pkg/sentry/fs/ext/README.md
@@ -0,0 +1,117 @@
+## EXT(2/3/4) File System
+
+This is a filesystem driver which supports ext2, ext3 and ext4 filesystems.
+Linux has specialized drivers for each variant but none which supports all. This
+library takes advantage of ext's backward compatibility and understands the
+internal organization of on-disk structures to support all variants.
+
+This driver implementation diverges from the Linux implementations in being more
+forgiving about versioning. For instance, if a filesystem contains both extent
+based inodes and classical block map based inodes, this driver will not complain
+and interpret them both correctly, whereas in Linux this would be an issue. This
+blurs the line between the three ext fs variants.
+
+Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has
+been superseded by ext4 with large performance gains. Thus it is recommended to
+upgrade older filesystem images to ext4 using e2fsprogs for better performance.
+
+### Read Only
+
+This driver currently only allows read only operations. A lot of the design
+decisions are based on this feature. There are plans to implement write (the
+process for which is documented in the future work section).
+
+### Performance
+
+One of the biggest wins about this driver is that it directly talks to the
+underlying block device (or whatever persistent storage is being used), instead
+of making expensive RPCs to a gofer.
+
+Another advantage is that ext fs supports fast concurrent reads. Currently the
+device is represented using an `io.ReaderAt` which allows for concurrent reads.
+All reads are directly passed to the device driver which intelligently serves
+the read requests in the optimal order. There is no congestion due to locking
+while reading in the filesystem level.
+
+Reads are optimized further in the way file data is transferred over to user
+memory. Ext fs directly copies over file data from disk into user memory with no
+additional allocations on the way. We can only get faster by preloading file
+data into memory (see future work section).
+
+The internal structures used to represent files, inodes and file descriptors use
+a lot of inheritance. With the level of indirection that an interface adds with
+an internal pointer, it can quickly fragment a structure across memory. As this
+runs alongside a full-blown kernel (which is memory intensive), having a
+fragmented struct might hurt performance. Hence these internal structures,
+though interfaced, are tightly packed in memory using the same inheritance
+pattern that pkg/sentry/vfs uses. The pkg/sentry/fs/ext/disklayout package makes
+an exception to this pattern for reasons documented in the package.
+
+### Security
+
+This driver also intends to help sandbox the container better by reducing the
+surface of the host kernel that the application touches. It prevents the
+application from exploiting vulnerabilities in the host filesystem driver. All
+`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly
+passed to the device driver in the kernel. Hence this reduces the surface for
+attack.
+
+The application cannot affect any host filesystems other than the one passed
+via block device by the user.
+
+### Future Work
+
+#### Write
+
+To support write operations we would need to modify the block device underneath.
+Currently, the driver does not modify the device at all, not even for updating
+the access times for reads. Modifying the filesystem incorrectly can corrupt it
+and render it unreadable for other correct ext(x) drivers. Hence caution must be
+maintained while modifying metadata structures.
+
+Ext4 specifically is built for performance and has added a lot of complexity as
+to how metadata structures are modified. For instance, files are organized
+via an extent tree which must be balanced, and file data blocks must be placed in
+the same extent as much as possible to increase locality. Such properties must
+be maintained while modifying the tree.
+
+Ext filesystems boast a lot about locality, which plays a big role in them being
+performant. The block allocation algorithm in Linux does a good job in keeping
+related data together. This behavior must be maintained as much as possible,
+else we might end up degrading the filesystem performance over time.
+
+Ext4 also supports a wide variety of features which are specialized for varying
+use cases. Implementing all of them can get difficult very quickly.
+
+Ext(x) checksums all its metadata structures to check for corruption, so
+modification of any metadata struct must correspond with re-checksumming the
+struct. Linux filesystem drivers also order on-disk updates intelligently to not
+corrupt the filesystem and also remain performant. The in-memory metadata
+structures must be kept in sync with what is on disk.
+
+There is also replication of some important structures across the filesystem.
+All replicas must be updated when their original copy is updated. There is also
+provisioning for snapshotting which must be kept in mind, although it should not
+affect this implementation unless we allow users to create filesystem snapshots.
+
+Ext4 also introduced journaling (jbd2). The journal must be updated
+appropriately.
+
+#### Performance
+
+To improve performance we should implement a buffer cache, and optionally, read
+ahead for small files. While doing so we must also keep in mind the memory usage
+and have a reasonable cap on how much file data we want to hold in memory.
+
+#### Features
+
+Our current implementation will work with most ext4 filesystems for read-only
+purposes. However, the following features are not supported yet:
+
+- Journal
+- Snapshotting
+- Extended Attributes
+- Hash Tree Directories
+- Meta Block Groups
+- Multiple Mount Protection
+- Bigalloc
diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go
index f30c3a174..cea89bcd9 100644
--- a/pkg/sentry/fs/ext/block_map_file.go
+++ b/pkg/sentry/fs/ext/block_map_file.go
@@ -85,7 +85,8 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
}
offset := uint64(off)
- if offset >= f.regFile.inode.diskInode.Size() {
+ size := f.regFile.inode.diskInode.Size()
+ if offset >= size {
return 0, io.EOF
}
@@ -104,6 +105,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
read := 0
toRead := len(dst)
+ if uint64(toRead)+offset > size {
+ toRead = int(size - offset)
+ }
for read < toRead {
var err error
var curR int
@@ -131,6 +135,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
}
}
+ if read < len(dst) {
+ return read, io.EOF
+ }
return read, nil
}
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
index 19c9b3b2d..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -26,8 +26,6 @@ type dentry struct {
// share a single non-directory Inode (with hard links). inode is
// immutable.
inode *inode
- // dentryEntry links Dentries into their parent directory.childList.
- dentryEntry
}
// Compiles only if dentry implements vfs.DentryImpl.
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go
index ab2b59e44..1ba8bf54c 100644
--- a/pkg/sentry/fs/ext/directory.go
+++ b/pkg/sentry/fs/ext/directory.go
@@ -14,23 +14,295 @@
package ext
+import (
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
// directory represents a directory inode. It holds the childList in memory.
type directory struct {
inode inode
- // childList is a list containing (1) child Dentries and (2) fake Dentries
- // (with inode == nil) that represent the iteration position of
+ // mu serializes the changes to childList.
+ // Lock Order (outermost locks must be taken first):
+ // directory.mu
+ // filesystem.mu
+ mu sync.Mutex
+
+ // childList is a list containing (1) child dirents and (2) fake dirents
+ // (with diskDirent == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is immutable.
- childList dentryList
+ // efficiently. childList is protected by mu.
+ childList direntList
- // TODO(b/134676337): Add directory navigators.
+ // childMap maps the child's filename to the dirent structure stored in
+ // childList. This adds some data replication but helps in faster path
+ // traversal. For consistency, key == childMap[key].diskDirent.FileName().
+ // Immutable.
+ childMap map[string]*dirent
}
// newDirectroy is the directory constructor.
-func newDirectroy(inode inode) *directory {
- // TODO(b/134676337): initialize childList.
- file := &directory{inode: inode}
+//
+// It reads the directory's file data and decodes every in-use dirent into
+// childList and childMap. newDirent selects the on-disk dirent layout:
+// disklayout.DirentNew when true, disklayout.DirentOld otherwise. Hash tree
+// directories are not supported yet and are returned without any children.
+// An error is returned if the file data cannot be read or if a dirent reports
+// a zero record size.
+func newDirectroy(inode inode, newDirent bool) (*directory, error) {
+ file := &directory{inode: inode, childMap: make(map[string]*dirent)}
file.inode.impl = file
- return file
+
+ // Initialize childList by reading dirents from the underlying file.
+ if inode.diskInode.Flags().Index {
+ // TODO(b/134676337): Support hash tree directories. Currently only the '.'
+ // and '..' entries are read in.
+
+ // Users cannot navigate this hash tree directory yet.
+ log.Warningf("hash tree directory being used which is unsupported")
+ return file, nil
+ }
+
+ // The dirents are organized in a linear array in the file data.
+ // Extract the file data and decode the dirents.
+ regFile, err := newRegularFile(inode)
+ if err != nil {
+ return nil, err
+ }
+
+ // buf is used as scratch space for reading in dirents from disk and
+ // unmarshalling them into dirent structs.
+ buf := make([]byte, disklayout.DirentSize)
+ size := inode.diskInode.Size()
+ for off, inc := uint64(0), uint64(0); off < size; off += inc {
+ toRead := size - off
+ if toRead > disklayout.DirentSize {
+ toRead = disklayout.DirentSize
+ }
+ if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead {
+ return nil, err
+ }
+
+ var curDirent dirent
+ if newDirent {
+ curDirent.diskDirent = &disklayout.DirentNew{}
+ } else {
+ curDirent.diskDirent = &disklayout.DirentOld{}
+ }
+ binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+
+ if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
+ // Inode number and name length fields being set to 0 is used to indicate
+ // an unused dirent.
+ file.childList.PushBack(&curDirent)
+ file.childMap[curDirent.diskDirent.FileName()] = &curDirent
+ }
+
+ // The next dirent is placed exactly after this dirent record on disk.
+ inc = uint64(curDirent.diskDirent.RecordSize())
+ if inc == 0 {
+ // A zero record size would never advance off, so this loop would spin
+ // forever. Treat it as on-disk corruption instead.
+ return nil, syserror.EIO
+ }
+ }
+
+ return file, nil
+}
+
+func (i *inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// dirent is the directory.childList node.
+type dirent struct {
+ diskDirent disklayout.Dirent
+
+ // direntEntry links dirents into their parent directory.childList.
+ direntEntry
+}
+
+// directoryFD represents a directory file description. It implements
+// vfs.FileDescriptionImpl.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by directory.mu.
+ iter *dirent
+ off int64
+}
+
+// Compiles only if directoryFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release. It unlinks fd.iter, if set, from the directory's childList under directory.mu.
+func (fd *directoryFD) Release() {
+ if fd.iter == nil {
+ return
+ }
+
+ dir := fd.inode().impl.(*directory)
+ dir.mu.Lock()
+ dir.childList.Remove(fd.iter)
+ dir.mu.Unlock()
+ fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. It hands each remaining child dirent to cb, stops early when cb.Handle returns false, and re-links fd.iter into childList to mark where the next call resumes.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ extfs := fd.filesystem()
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Ensure that fd.iter exists and is not linked into dir.childList.
+ var child *dirent
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ fd.iter = &dirent{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ dir.childList.Remove(fd.iter)
+ }
+ for ; child != nil; child = child.Next() {
+ // Skip other directoryFD iterators; only real children have a non-nil diskDirent.
+ if child.diskDirent != nil {
+ childType, ok := child.diskDirent.FileType()
+ if !ok {
+ // We will need to read the inode off disk. Do not increment
+ // ref count here because this inode is not being added to the
+ // dentry tree.
+ extfs.mu.Lock()
+ childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode())
+ extfs.mu.Unlock()
+ if err != nil {
+ // Usage of the file description after the error is
+ // undefined. This implementation would continue reading
+ // from the next dirent.
+ fd.off++
+ dir.childList.InsertAfter(child, fd.iter)
+ return err
+ }
+ childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
+ }
+
+ if !cb.Handle(vfs.Dirent{
+ Name: child.diskDirent.FileName(),
+ Type: fs.ToDirentType(childType),
+ Ino: uint64(child.diskDirent.Inode()),
+ Off: fd.off,
+ }) {
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ }
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. Only SEEK_SET and SEEK_CUR are accepted; any other whence, or a negative resulting offset, yields EINVAL.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if whence != linux.SEEK_SET && whence != linux.SEEK_CUR {
+ return 0, syserror.EINVAL
+ }
+
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Find resulting offset.
+ if whence == linux.SEEK_CUR {
+ offset += fd.off
+ }
+
+ if offset < 0 {
+ // lseek(2) specifies that EINVAL should be returned if the resulting offset
+ // is negative.
+ return 0, syserror.EINVAL
+ }
+
+ n := int64(len(dir.childMap))
+ realWantOff := offset
+ if realWantOff > n {
+ realWantOff = n
+ }
+ realCurOff := fd.off
+ if realCurOff > n {
+ realCurOff = n
+ }
+
+ // Ensure that fd.iter exists and is linked into dir.childList so we can
+ // intelligently seek from the optimal position.
+ if fd.iter == nil {
+ fd.iter = &dirent{}
+ dir.childList.PushFront(fd.iter)
+ }
+
+ // Guess that iterating from the current position is optimal.
+ child := fd.iter
+ diff := realWantOff - realCurOff // Shows direction and magnitude of travel.
+
+ // See if starting from the beginning or end is better.
+ abDiff := diff
+ if diff < 0 {
+ abDiff = -diff
+ }
+ if abDiff > realWantOff {
+ // Starting from the beginning is best.
+ child = dir.childList.Front()
+ diff = realWantOff
+ } else if abDiff > (n - realWantOff) {
+ // Starting from the end is best.
+ child = dir.childList.Back()
+ // (n - 1) because the last non-nil dirent represents the (n-1)th offset.
+ diff = realWantOff - (n - 1)
+ }
+
+ for child != nil {
+ // Skip other directoryFD iterators; only real children have a non-nil diskDirent.
+ if child.diskDirent != nil {
+ if diff == 0 {
+ if child != fd.iter {
+ dir.childList.Remove(fd.iter)
+ dir.childList.InsertBefore(child, fd.iter)
+ }
+
+ fd.off = offset
+ return offset, nil
+ }
+
+ if diff < 0 {
+ diff++
+ child = child.Prev()
+ } else {
+ diff--
+ child = child.Next()
+ }
+ continue
+ }
+
+ if diff < 0 {
+ child = child.Prev()
+ } else {
+ child = child.Next()
+ }
+ }
+
+ // Reaching here indicates that the offset is beyond the end of the childList.
+ dir.childList.Remove(fd.iter)
+ dir.childList.PushBack(fd.iter)
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ // mmap(2) specifies that EACCES should be returned for non-regular file fds.
+ return syserror.EACCES
}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go
index 685bf57b8..417b6cf65 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent.go
@@ -21,6 +21,9 @@ import (
const (
// MaxFileName is the maximum length of an ext fs file's name.
MaxFileName = 255
+
+ // DirentSize is the size of ext dirent structures.
+ DirentSize = 263
)
var (
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go
index cc6dff2c9..934919f8a 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go
@@ -21,8 +21,6 @@ import (
// TestDirentSize tests that the dirent structs are of the correct
// size.
func TestDirentSize(t *testing.T) {
- want := uintptr(263)
-
- assertSize(t, DirentOld{}, want)
- assertSize(t, DirentNew{}, want)
+ assertSize(t, DirentOld{}, uintptr(DirentSize))
+ assertSize(t, DirentNew{}, uintptr(DirentSize))
}
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go
index 7a337a5e0..8bb327006 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock.go
@@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures {
// This is not exhaustive, unused features are not listed.
const (
// SbDirentFileType indicates that directory entries record the file type.
- // We should use struct ext4_dir_entry_2 for dirents then.
+ // We should use struct DirentNew for dirents then.
SbDirentFileType = 0x2
// SbRecovery indicates that the filesystem needs recovery.
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index d303dd122..c3e2c9efb 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -62,8 +63,40 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, err
return fd.NewReadWriter(devFd), nil
}
+// isCompatible checks if the superblock has feature sets which are compatible.
+// We only need to check the superblock incompatible feature set since we are
+// mounting readonly. We will also need to check readonly compatible feature
+// set when mounting for read/write.
+func isCompatible(sb disklayout.SuperBlock) bool {
+ // Please note that what is being checked is limited based on the fact that we
+ // are mounting readonly and that we are not journaling. When mounting
+ // read/write or with a journal, this must be reevaluated.
+ incompatFeatures := sb.IncompatibleFeatures()
+ if incompatFeatures.MetaBG {
+ log.Warningf("ext fs: meta block groups are not supported")
+ return false
+ }
+ if incompatFeatures.MMP {
+ log.Warningf("ext fs: multiple mount protection is not supported")
+ return false
+ }
+ if incompatFeatures.Encrypted {
+ log.Warningf("ext fs: encrypted inodes not supported")
+ return false
+ }
+ if incompatFeatures.InlineData {
+ log.Warningf("ext fs: inline files not supported")
+ return false
+ }
+ return true
+}
+
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ // TODO(b/134676337): Ensure that the user is mounting readonly. If not,
+ // EACCES should be returned according to mount(2). Filesystem independent
+ // flags (like readonly) are currently not available in pkg/sentry/vfs.
+
dev, err := getDeviceFd(source, opts)
if err != nil {
return nil, nil, err
@@ -82,15 +115,21 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred
return nil, nil, syserror.EINVAL
}
+ // Refuse to mount if the filesystem is incompatible.
+ if !isCompatible(fs.sb) {
+ return nil, nil, syserror.EINVAL
+ }
+
fs.bgs, err = readBlockGroups(dev, fs.sb)
if err != nil {
return nil, nil, err
}
- rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode)
+ rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
if err != nil {
return nil, nil, err
}
+ rootInode.incRef()
return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
}
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
index 6396886cc..270f38074 100644
--- a/pkg/sentry/fs/ext/ext_test.go
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -16,17 +16,22 @@ package ext
import (
"fmt"
+ "io"
"os"
"path"
+ "sort"
"testing"
"github.com/google/go-cmp/cmp"
+ "github.com/google/go-cmp/cmp/cmpopts"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/test/testutil"
)
@@ -44,7 +49,7 @@ var (
// setUp opens imagePath as an ext Filesystem and returns all necessary
// elements required to run tests. If error is nil, it also returns a tear
// down function which must be called after the test is run for clean up.
-func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
localImagePath, err := testutil.FindFile(imagePath)
if err != nil {
return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
@@ -55,20 +60,537 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
return nil, nil, nil, nil, err
}
- // Mount the ext4 fs and retrieve the inode structure for the file.
- mockCtx := contexttest.Context(t)
- fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+ ctx := contexttest.Context(t)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("extfs", filesystemType{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err
}
+ root := mntns.Root()
+
tearDown := func() {
+ root.DecRef()
+
if err := f.Close(); err != nil {
t.Fatalf("tearDown failed: %v", err)
}
}
- return mockCtx, fs, d, tearDown, nil
+ return ctx, vfsObj, &root, tearDown, nil
+}
+
+// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and
+// vfs.FilesystemImpl.StatFSAt which are not implemented in
+// vfs.VirtualFilesystem yet.
+
+// TestSeek tests vfs.FileDescriptionImpl.Seek functionality.
+func TestSeek(t *testing.T) {
+ type seekTest struct {
+ name string
+ image string
+ path string
+ }
+
+ tests := []seekTest{
+ {
+ name: "ext4 root dir seek",
+ image: ext4ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext3 root dir seek",
+ image: ext3ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext2 root dir seek",
+ image: ext2ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext4 reg file seek",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext3 reg file seek",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext2 reg file seek",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+ t.Errorf("expected seek position 0, got %d and error %v", n, err)
+ }
+
+ stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ // We should be able to seek beyond the end of file.
+ size := int64(stat.Size)
+ if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
+ }
+
+ // Make sure negative offsets work with SEEK_CUR.
+ if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ // Make sure SEEK_END works with regular files.
+ switch fd.Impl().(type) {
+ case *regularFileFD:
+ // Seek back to 0.
+ if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
+ }
+
+ // Seek forward beyond EOF.
+ if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+ }
+ })
+ }
+}
+
+// TestStatAt tests filesystem.StatAt functionality. For each image it stats a
+// known path and compares the stable statx fields against expected values.
+func TestStatAt(t *testing.T) {
+ type statAtTest struct {
+ name string
+ image string
+ path string
+ want linux.Statx
+ }
+
+ tests := []statAtTest{
+ {
+ name: "ext4 statx small file",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext3 statx small file",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext2 statx small file",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext4 statx big file",
+ image: ext4ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext3 statx big file",
+ image: ext3ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext2 statx big file",
+ image: ext2ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext4 statx symlink file",
+ image: ext4ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext3 statx symlink file",
+ image: ext3ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext2 statx symlink file",
+ image: ext2ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ }
+
+ // Ignore the fields that are not supported by filesystem.StatAt yet and
+ // those which are likely to change as the image does.
+ ignoredFields := map[string]bool{
+ "Attributes": true,
+ "AttributesMask": true,
+ "Atime": true,
+ "Blocks": true,
+ "Btime": true,
+ "Ctime": true,
+ "DevMajor": true,
+ "DevMinor": true,
+ "Ino": true,
+ "Mask": true,
+ "Mtime": true,
+ "RdevMajor": true,
+ "RdevMinor": true,
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ got, err := vfsfs.StatAt(ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.StatOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ _, ok := ignoredFields[p.String()]
+ return ok
+ }, cmp.Ignore())
+ // cmp.Diff marks its first argument with "-" and its second with "+", so
+ // want must be passed first to match the "(-want +got)" label below.
+ if diff := cmp.Diff(test.want, got, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" {
+ t.Errorf("stat mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
+
+// TestRead tests the read functionality for vfs file descriptions. Each file
+// in the image is read one byte at a time through the vfs file description and
+// compared against the same file read from the local assets directory.
+func TestRead(t *testing.T) {
+ type readTest struct {
+ name string
+ image string
+ absPath string
+ }
+
+ tests := []readTest{
+ {
+ name: "ext4 read small file",
+ image: ext4ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext3 read small file",
+ image: ext3ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext2 read small file",
+ image: ext2ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext4 read big file",
+ image: ext4ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext3 read big file",
+ image: ext3ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext2 read big file",
+ image: ext2ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ // Get a local file descriptor and compare its functionality with a vfs file
+ // description for the same file.
+ localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath))
+ if err != nil {
+ t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err)
+ }
+
+ f, err := os.Open(localFile)
+ if err != nil {
+ t.Fatalf("os.Open failed for %s: %v", localFile, err)
+ }
+ defer f.Close()
+
+ // Read the entire file by reading one byte repeatedly. Doing this stress
+ // tests the underlying file reader implementation.
+ got := make([]byte, 1)
+ want := make([]byte, 1)
+ for {
+ n, err := f.Read(want)
+ // Reaching EOF is expected at the end of the file; any other error from
+ // the vfs read is a genuine failure and must not be dropped silently.
+ if _, fdErr := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); fdErr != nil && fdErr != io.EOF {
+ t.Fatalf("vfs read failed for file %s in image %s: %v", test.absPath, test.image, fdErr)
+ }
+
+ // cmp.Diff marks its first argument with "-" and its second with "+", so
+ // want must be passed first to match the "(-want +got)" label below.
+ if diff := cmp.Diff(want, got); diff != "" {
+ t.Errorf("file data mismatch (-want +got):\n%s", diff)
+ }
+
+ // Make sure there is no more file data left after getting EOF.
+ if n == 0 || err == io.EOF {
+ if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+ t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
+ }
+
+ break
+ }
+
+ if err != nil {
+ t.Fatalf("read failed: %v", err)
+ }
+ }
+ })
+ }
+}
+
+// iterDirentsCb is a simple callback which just keeps adding the dirents to an
+// internal list. Implements vfs.IterDirentsCallback.
+type iterDirentsCb struct {
+ dirents []vfs.Dirent
+}
+
+// Compiles only if iterDirentsCb implements vfs.IterDirentsCallback.
+var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil)
+
+// newIterDirentCb is the iterDirentsCb constructor. It returns a callback
+// whose dirents list is initialized to an empty, non-nil slice.
+func newIterDirentCb() *iterDirentsCb {
+ return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)}
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+ cb.dirents = append(cb.dirents, dirent)
+ return true
+}
+
+// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. It
+// iterates the root directory of each image and compares the collected dirents
+// (sorted by name) against the expected list.
+func TestIterDirents(t *testing.T) {
+ type iterDirentTest struct {
+ name string
+ image string
+ path string
+ want []vfs.Dirent
+ }
+
+ wantDirents := []vfs.Dirent{
+ vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "..",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "lost+found",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "file.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "bigfile.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "symlink.txt",
+ Type: linux.DT_LNK,
+ },
+ }
+ tests := []iterDirentTest{
+ {
+ name: "ext4 root dir iteration",
+ image: ext4ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext3 root dir iteration",
+ image: ext3ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext2 root dir iteration",
+ image: ext2ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ cb := &iterDirentsCb{}
+ if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+ t.Fatalf("dir fd.IterDirents() failed: %v", err)
+ }
+
+ // Sort both lists by name so the comparison does not depend on the order
+ // in which the directory entries are returned.
+ sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name })
+ sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name })
+
+ // Ignore the inode number and offset of dirents because those are likely to
+ // change as the underlying image changes.
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ return p.String() == "Ino" || p.String() == "Off"
+ }, cmp.Ignore())
+ // cmp.Diff marks its first argument with "-" and its second with "+", so
+ // want must be passed first to match the "(-want +got)" label below.
+ if diff := cmp.Diff(test.want, cb.dirents, cmpIgnoreFields); diff != "" {
+ t.Errorf("dirents mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
}
// TestRootDir tests that the root directory inode is correctly initialized and
@@ -126,15 +648,15 @@ func TestRootDir(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- _, _, vfsd, tearDown, err := setUp(t, test.image)
+ _, _, vd, tearDown, err := setUp(t, test.image)
if err != nil {
t.Fatalf("setUp failed: %v", err)
}
defer tearDown()
- d, ok := vfsd.Impl().(*dentry)
+ d, ok := vd.Dentry().Impl().(*dentry)
if !ok {
- t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
+ t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl())
}
// Offload inode contents into local structs for comparison.
@@ -329,15 +851,15 @@ func TestFilesystemInit(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- _, vfsfs, _, tearDown, err := setUp(t, test.image)
+ _, _, vd, tearDown, err := setUp(t, test.image)
if err != nil {
t.Fatalf("setUp failed: %v", err)
}
defer tearDown()
- fs, ok := vfsfs.Impl().(*filesystem)
+ fs, ok := vd.Mount().Filesystem().Impl().(*filesystem)
if !ok {
- t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
+ t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl())
}
// Offload superblock and block group descriptors contents into
diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go
index 44fb9c01f..1b9bf449b 100644
--- a/pkg/sentry/fs/ext/extent_file.go
+++ b/pkg/sentry/fs/ext/extent_file.go
@@ -150,7 +150,11 @@ func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) {
return 0, io.EOF
}
- return f.read(&f.root, uint64(off), dst)
+ n, err := f.read(&f.root, uint64(off), dst)
+ if n < len(dst) && err == nil {
+ err = io.EOF
+ }
+ return n, err
}
// read is the recursive step of extentFile.ReadAt which traverses the extent
diff --git a/pkg/sentry/fs/ext/file_description.go b/pkg/sentry/fs/ext/file_description.go
new file mode 100644
index 000000000..d244cf1e7
--- /dev/null
+++ b/pkg/sentry/fs/ext/file_description.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileDescription is embedded by ext implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+
+ // flags is the same as vfs.OpenOptions.Flags which are passed to
+ // vfs.FilesystemImpl.OpenAt.
+ // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
+ // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
+ // Only close(2), fstat(2), fstatfs(2) should work.
+ flags uint32
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *fileDescription) OnClose() error { return nil }
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+ return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+ // no-op.
+ return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ fd.inode().statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ if opts.Stat.Mask == 0 {
+ return nil
+ }
+ return syserror.EPERM
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS. It fills a linux.Statfs
+// from the filesystem this file description belongs to.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ var stat linux.Statfs
+ fd.filesystem().statTo(&stat)
+ return stat, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
+ return waiter.EventIn | waiter.EventOut
+}
+
+// EventRegister implements waiter.Waitable.EventRegister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {}
+
+// EventUnregister implements waiter.Waitable.EventUnregister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventUnregister(e *waiter.Entry) {}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error {
+ return nil
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ // ioctl(2) specifies that ENOTTY must be returned if the file descriptor is
+ // not associated with a character special device (which is unimplemented).
+ return 0, syserror.ENOTTY
+}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
index 45b43b9e2..e08839f48 100644
--- a/pkg/sentry/fs/ext/filesystem.go
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -15,20 +15,27 @@
package ext
import (
+ "errors"
"io"
"sync"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
+var (
+ // errResolveDirent indicates that the vfs.ResolvingPath.Component() does
+ // not exist on the dentry tree but does exist on disk. So it has to be read in
+ // using the in-memory dirent and added to the dentry tree. Usually indicates
+ // the need to lock filesystem.mu for writing.
+ errResolveDirent = errors.New("resolve path component using dirent")
+)
+
// filesystem implements vfs.FilesystemImpl.
type filesystem struct {
- // TODO(b/134676337): Remove when all methods have been implemented.
- vfs.FilesystemImpl
-
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
@@ -44,8 +51,8 @@ type filesystem struct {
// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
//
- // Protected by mu because every addition and removal from this corresponds to
- // a change in the dentry tree.
+ // Protected by mu because most additions (see IterDirents) and all removals
+ // from this correspond to a change in the dentry tree.
inodeCache map[uint32]*inode
// sb represents the filesystem superblock. Immutable after initialization.
@@ -59,16 +66,172 @@ type filesystem struct {
// Compiles only if filesystem implements vfs.FilesystemImpl.
var _ vfs.FilesystemImpl = (*filesystem)(nil)
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
+// stepLocked resolves rp.Component() in parent directory vfsd. The write
+// parameter passed tells if the caller has acquired filesystem.mu for writing
+// or not. If set to true, an existing inode on disk can be added to the dentry
+// tree if not present already.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+// - inode == vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, nil, err
+ }
+
+ for {
+ nextVFSD, err := rp.ResolveComponent(vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is not the sole source of truth for extfs, if it's
+ // not in the Dentry tree, it might need to be pulled from disk.
+ childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+ if !ok {
+ // The underlying inode does not exist on disk.
+ return nil, nil, syserror.ENOENT
+ }
+
+ if !write {
+ // filesystem.mu must be held for writing to add to the dentry tree.
+ return nil, nil, errResolveDirent
+ }
+
+ // Create and add the component's dirent to the dentry tree.
+ fs := rp.Mount().Filesystem().Impl().(*filesystem)
+ childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
+ if err != nil {
+ return nil, nil, err
+ }
+ // incRef because this is being added to the dentry tree.
+ childInode.incRef()
+ child := newDentry(childInode)
+ vfsd.InsertChild(&child.vfsd, rp.Component())
+
+ // Continue as usual now that nextVFSD is not nil.
+ nextVFSD = &child.vfsd
+ }
+ nextInode := nextVFSD.Impl().(*dentry).inode
+ if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
+ // The symlink target lives on the child (nextInode). inode is the parent
+ // directory here, so asserting it to *symlink would panic.
+ if err := rp.HandleSymlink(nextInode.impl.(*symlink).target); err != nil {
+ return nil, nil, err
+ }
+ continue
+ }
+ rp.Advance()
+ return nextVFSD, nextInode, nil
+ }
+}
+
+// walkLocked resolves rp to an existing file. The write parameter
+// passed tells if the caller has acquired filesystem.mu for writing or not.
+// If set to true, additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Done() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if rp.MustBeDir() && !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walkParentLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp. The write parameter passed tells if the
+// caller has acquired filesystem.mu for writing or not. If set to true,
+// additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walk resolves rp to an existing file. If parent is set to true, it resolves
+// rp up to the parent of the last component, which should be an existing
+// directory. If parent is false, it resolves rp entirely. It attempts to
+// resolve the path as far as it can with a read lock and, if that walk returns
+// errResolveDirent, retries under the write lock.
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+ var (
+ vfsd *vfs.Dentry
+ inode *inode
+ err error
+ )
+
+ // Try walking with the hopes that all dentries have already been pulled out
+ // of disk. This reduces congestion (allows concurrent walks).
+ fs.mu.RLock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, false)
+ } else {
+ vfsd, inode, err = walkLocked(rp, false)
+ }
+ fs.mu.RUnlock()
+
+ if err == errResolveDirent {
+ // Upgrade lock and continue walking. Lock upgrading in the middle of the
+ // walk is fine as this is a read only filesystem.
+ fs.mu.Lock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, true)
+ } else {
+ vfsd, inode, err = walkLocked(rp, true)
+ }
+ fs.mu.Unlock()
+ }
+
+ return vfsd, inode, err
+}
+
+// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
+// The caller must increment the ref count if adding this to the dentry tree.
//
-// Precondition: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
+// Precondition: must be holding fs.mu for writing.
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
}
- in, err := newInode(ctx, fs, inodeNum)
+ in, err := newInode(fs, inodeNum)
if err != nil {
return nil, err
}
@@ -77,10 +240,92 @@ func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*i
return in, nil
}
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+// statTo writes the statfs fields to the output parameter.
+func (fs *filesystem) statTo(stat *linux.Statfs) {
+ stat.Type = uint64(fs.sb.Magic())
+ stat.BlockSize = int64(fs.sb.BlockSize())
+ stat.Blocks = fs.sb.BlocksCount()
+ stat.BlocksFree = fs.sb.FreeBlocksCount()
+ stat.BlocksAvailable = fs.sb.FreeBlocksCount()
+ stat.Files = uint64(fs.sb.InodesCount())
+ stat.FilesFree = uint64(fs.sb.FreeInodesCount())
+ stat.NameLength = disklayout.MaxFileName
+ stat.FragmentSize = int64(fs.sb.BlockSize())
+ // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ if opts.CheckSearchable {
+ if !inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+
+ inode.incRef()
+ return vfsd, nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ // EROFS is returned if write access is needed.
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
+ return nil, syserror.EROFS
+ }
+ return inode.open(rp, vfsd, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return "", err
+ }
+ symlink, ok := inode.impl.(*symlink)
+ if !ok {
+ return "", syserror.EINVAL
+ }
+ return symlink.target, nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ var stat linux.Statx
+ inode.statTo(&stat)
+ return stat, nil
}
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ if _, _, err := fs.walk(rp, false); err != nil {
+ return linux.Statfs{}, err
+ }
+
+ var stat linux.Statfs
+ fs.statTo(&stat)
+ return stat, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {}
+
// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
// This is a readonly filesystem for now.
@@ -89,42 +334,110 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// The vfs.FilesystemImpl functions below return EROFS because their respective
// man pages say that EROFS must be returned if the path resolves to a file on
-// a read-only filesystem.
+// this read-only filesystem.
-// TODO(b/134676337): Implement path traversal and return EROFS only if the
-// path resolves to a Dentry within ext fs.
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+ if rp.Done() {
+ return syserror.ENOENT
+ }
+
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if !inode.isDir() {
+ return syserror.ENOTDIR
+ }
+
return syserror.EROFS
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if inode.isDir() {
+ return syserror.EISDIR
+ }
+
return syserror.EROFS
}
diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go
deleted file mode 100644
index 67a538ba0..000000000
--- a/pkg/sentry/fs/ext/inline_file.go
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "io"
-)
-
-// inlineFile is a type of regular file. All the data here is stored in the
-// inode.Data() array.
-type inlineFile struct {
- regFile regularFile
-}
-
-// Compiles only if inlineFile implements io.ReaderAt.
-var _ io.ReaderAt = (*inlineFile)(nil)
-
-// newInlineFile is the inlineFile constructor.
-func newInlineFile(regFile regularFile) *inlineFile {
- file := &inlineFile{regFile: regFile}
- file.regFile.impl = file
- return file
-}
-
-// ReadAt implements io.ReaderAt.ReadAt.
-func (f *inlineFile) ReadAt(dst []byte, off int64) (int, error) {
- if len(dst) == 0 {
- return 0, nil
- }
-
- size := f.regFile.inode.diskInode.Size()
- if uint64(off) >= size {
- return 0, io.EOF
- }
-
- to := uint64(off) + uint64(len(dst))
- if to > size {
- to = size
- }
-
- n := copy(dst, f.regFile.inode.diskInode.Data()[off:to])
- return n, nil
-}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
index 364980e4c..178bd6376 100644
--- a/pkg/sentry/fs/ext/inode.go
+++ b/pkg/sentry/fs/ext/inode.go
@@ -15,12 +15,14 @@
package ext
import (
+ "fmt"
"io"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,13 +33,11 @@ import (
//
// Implementations:
// inode --
-// |-- pipe
// |-- dir
// |-- symlink
// |-- regular--
// |-- extent file
// |-- block map file
-// |-- inline file
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
@@ -92,7 +92,7 @@ func (in *inode) decRef(fs *filesystem) {
// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
-func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, error) {
+func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}
@@ -117,7 +117,6 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
// Build the inode based on its type.
inode := inode{
- refs: 1,
inodeNum: inodeNum,
dev: fs.dev,
blkSize: blkSize,
@@ -138,15 +137,76 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
}
return &f.inode, nil
case linux.ModeDirectory:
- return &newDirectroy(inode).inode, nil
- case linux.ModeNamedPipe:
- return &newNamedPipe(ctx, inode).inode, nil
+ f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
default:
- // TODO(b/134676337): Return appropriate errors for sockets and devices.
+ // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices.
return nil, syserror.EINVAL
}
}
+// open creates and returns a file description for the dentry passed in.
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ switch in.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably. This check is not necessary for a read
+ // only filesystem but will be required when write is implemented.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ fd.flags = flags
+ return &fd.vfsfd, nil
+ case *symlink:
+ if flags&linux.O_PATH == 0 {
+ // Can't open symlinks without O_PATH.
+ return nil, syserror.ELOOP
+ }
+ var fd symlinkFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", in.impl))
+ }
+}
+
+func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+}
+
+// statTo writes the statx fields to the output parameter.
+func (in *inode) statTo(stat *linux.Statx) {
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+ linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+ linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME
+ stat.Blksize = uint32(in.blkSize)
+ stat.Mode = uint16(in.diskInode.Mode())
+ stat.Nlink = uint32(in.diskInode.LinksCount())
+ stat.UID = uint32(in.diskInode.UID())
+ stat.GID = uint32(in.diskInode.GID())
+ stat.Ino = uint64(in.inodeNum)
+ stat.Size = in.diskInode.Size()
+ stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
+ stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
+ stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+ // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
+ // (including metadata blocks) required to represent this file.
+}
+
// getBGNum returns the block group number that a given inode belongs to.
func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
return (inodeNum - 1) / inodesPerGrp
diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go
deleted file mode 100644
index 0f3af1b53..000000000
--- a/pkg/sentry/fs/ext/named_pipe.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-// namedPipe represents a named pipe inode. It is currently just a wrapper
-// around pkg/sentry/kernel/pipe.
-type namedPipe struct {
- inode inode
-
- p *pipe.Pipe
- inodeOps fs.InodeOperations
-}
-
-// newNamedPipe is the namedPipe constructor.
-func newNamedPipe(ctx context.Context, inode inode) *namedPipe {
- file := &namedPipe{inode: inode}
- file.inode.impl = file
- file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
- file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p)
- return file
-}
diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go
index fb1bd38ef..ffc76ba5b 100644
--- a/pkg/sentry/fs/ext/regular_file.go
+++ b/pkg/sentry/fs/ext/regular_file.go
@@ -16,6 +16,15 @@ package ext
import (
"io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// regularFile represents a regular file's inode. This too follows the
@@ -26,6 +35,9 @@ type regularFile struct {
// This is immutable. The first field of fileReader implementations must be
// regularFile to ensure temporality.
+ // io.ReaderAt is more strict than io.Reader in the sense that a partial read
+ // is always accompanied by an error. If a read spans past the end of file, a
+ // partial read (within file range) is done and io.EOF is returned.
impl io.ReaderAt
}
@@ -48,16 +60,6 @@ func newRegularFile(inode inode) (*regularFile, error) {
return &file.regFile, nil
}
- if inodeFlags.Inline {
- if inode.diskInode.Size() > 60 {
- panic("ext fs: inline file larger than 60 bytes")
- }
-
- file := newInlineFile(regFile)
- file.regFile.inode.impl = &file.regFile
- return &file.regFile, nil
- }
-
file, err := newBlockMapFile(regFile)
if err != nil {
return nil, err
@@ -66,6 +68,92 @@ func newRegularFile(inode inode) (*regularFile, error) {
return &file.regFile, nil
}
-func (f *regularFile) blksUsed(blkSize uint64) uint64 {
- return (f.inode.diskInode.Size() + blkSize - 1) / blkSize
+func (in *inode) isRegular() bool {
+ _, ok := in.impl.(*regularFile)
+ return ok
+}
+
+// regularFileFD represents a regular file description. It implements
+// vfs.FileDescriptionImpl.
+type regularFileFD struct {
+ fileDescription
+
+ // off is the current file offset. Read, Write and Seek update off while
+ // holding offMu.
+ off int64
+
+ // offMu serializes operations that may mutate off.
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *regularFileFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// The regular file's io.ReaderAt is wrapped in a safemem reader positioned at
+// offset and handed to dst.CopyOutFrom, whose result is returned unchanged.
+// The file description's own offset is neither read nor modified.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	file := fd.inode().impl.(*regularFile)
+
+	// Copies data from disk directly into usermem without any intermediate
+	// allocations (if dst is converted into BlockSeq such that it does not need
+	// safe copying).
+	return dst.CopyOutFrom(ctx, safemem.FromIOReaderAt{
+		ReaderAt: file.impl,
+		Offset:   offset,
+	})
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// It reads via PRead at the current file offset and then advances the offset
+// by the number of bytes PRead reports, returning PRead's result unchanged.
+//
+// offMu is held across both the read of fd.off and its update. Previously
+// fd.off was read before the lock was taken, which raced with concurrent
+// Read/Write/Seek calls on the same file description and let two concurrent
+// reads start from the same offset.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	// Advance by n even on error: a partial read still consumed n bytes.
+	fd.off += n
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// No data is written: src, offset and opts are ignored and (0, EBADF) is
+// always returned.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ // write(2) specifies that EBADF must be returned if the fd is not open for
+ // writing.
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// It delegates to PWrite at the current file offset and advances the offset
+// by the number of bytes PWrite reports, returning PWrite's result unchanged.
+//
+// offMu is held across both the read of fd.off and its update, so fd.off is
+// never read while a concurrent Read/Write/Seek on the same file description
+// is modifying it.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// It always returns ENOTDIR; cb is never invoked.
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as specified.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += int64(fd.inode().diskInode.Size())
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+//
+// Memory mapping is not implemented yet, so it always returns ENODEV.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ // TODO(b/134676337): Implement mmap(2).
+ return syserror.ENODEV
 }
diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go
index 9f498d989..e06548a98 100644
--- a/pkg/sentry/fs/ext/symlink.go
+++ b/pkg/sentry/fs/ext/symlink.go
@@ -15,6 +15,10 @@
package ext
import (
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -43,8 +47,8 @@ func newSymlink(inode inode) (*symlink, error) {
}
link = make([]byte, size)
- if n, _ := regFile.impl.ReadAt(link, 0); uint64(n) < size {
- return nil, syserror.EIO
+ if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size {
+ return nil, err
}
}
@@ -52,3 +56,56 @@ func newSymlink(inode inode) (*symlink, error) {
file.inode.impl = file
return file, nil
}
+
+func (in *inode) isSymlink() bool {
+ _, ok := in.impl.(*symlink)
+ return ok
+}
+
+// symlinkFD represents a symlink file description and implements
+// vfs.FileDescriptionImpl. It may only be used if the open flags contain
+// O_PATH. For this reason most of the functions return EBADF.
+type symlinkFD struct {
+ fileDescription
+}
+
+// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *symlinkFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// It always returns (0, EBADF); dst is never written to.
+func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// It always returns (0, EBADF); dst is never written to.
+func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// It always returns (0, EBADF); src is never read.
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// It always returns (0, EBADF); src is never read.
+func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// It always returns ENOTDIR; cb is never invoked.
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// It always returns (0, EBADF) regardless of offset and whence.
+func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+//
+// It always returns EBADF.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ return syserror.EBADF
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index d5d4f68df..d2450e810 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -11,8 +11,8 @@ go_template_instance(
prefix = "dentry",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*Dentry",
- "Linker": "*Dentry",
+ "Element": "*dentry",
+ "Linker": "*dentry",
},
)
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
index b0c3ea39a..c52dc781c 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -23,23 +23,23 @@ import (
)
type directory struct {
- inode Inode
+ inode inode
// childList is a list containing (1) child Dentries and (2) fake Dentries
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is protected by Filesystem.mu.
+ // efficiently. childList is protected by filesystem.mu.
childList dentryList
}
-func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode {
dir := &directory{}
dir.inode.init(dir, fs, creds, mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
return &dir.inode
}
-func (i *Inode) isDir() bool {
+func (i *inode) isDir() bool {
_, ok := i.impl.(*directory)
return ok
}
@@ -48,8 +48,8 @@ type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- // Protected by Filesystem.mu.
- iter *Dentry
+ // Protected by filesystem.mu.
+ iter *dentry
off int64
}
@@ -68,7 +68,7 @@ func (fd *directoryFD) Release() {
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fs := fd.filesystem()
- d := fd.vfsfd.VirtualDentry().Dentry()
+ vfsd := fd.vfsfd.VirtualDentry().Dentry()
fs.mu.Lock()
defer fs.mu.Unlock()
@@ -77,7 +77,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
if !cb.Handle(vfs.Dirent{
Name: ".",
Type: linux.DT_DIR,
- Ino: d.Impl().(*Dentry).inode.ino,
+ Ino: vfsd.Impl().(*dentry).inode.ino,
Off: 0,
}) {
return nil
@@ -85,7 +85,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
if fd.off == 1 {
- parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+ parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
if !cb.Handle(vfs.Dirent{
Name: "..",
Type: parentInode.direntType(),
@@ -97,12 +97,12 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
- dir := d.Impl().(*Dentry).inode.impl.(*directory)
- var child *Dentry
+ dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+ var child *dentry
if fd.iter == nil {
// Start iteration at the beginning of dir.
child = dir.childList.Front()
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
// Continue iteration from where we left off.
child = fd.iter.Next()
@@ -130,32 +130,41 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- if whence != linux.SEEK_SET {
- // TODO: Linux also allows SEEK_CUR.
+ fs := fd.filesystem()
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
return 0, syserror.EINVAL
}
if offset < 0 {
return 0, syserror.EINVAL
}
+ // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+ // seek even if doing so might reposition the iterator due to concurrent
+ // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+ if fd.off == offset {
+ return offset, nil
+ }
+
fd.off = offset
// Compensate for "." and "..".
- var remChildren int64
- if offset < 2 {
- remChildren = 0
- } else {
+ remChildren := int64(0)
+ if offset >= 2 {
remChildren = offset - 2
}
- fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
- fs.mu.Lock()
- defer fs.mu.Unlock()
-
// Ensure that fd.iter exists and is not linked into dir.childList.
if fd.iter == nil {
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
dir.childList.Remove(fd.iter)
}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4d989eeaf..f79e2d9c8 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -28,9 +28,9 @@ import (
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
}
@@ -47,7 +47,7 @@ afterSymlink:
// not in the Dentry tree, it doesn't exist.
return nil, nil, syserror.ENOENT
}
- nextInode := nextVFSD.Impl().(*Dentry).inode
+ nextInode := nextVFSD.Impl().(*dentry).inode
if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -64,10 +64,10 @@ afterSymlink:
// walkExistingLocked is loosely analogous to Linux's
// fs/namei.c:path_lookupat().
//
-// Preconditions: Filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Done() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -88,10 +88,10 @@ func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Final() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -108,9 +108,9 @@ func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// checkCreateLocked checks that a file named rp.Component() may be created in
// directory parentVFSD, then returns rp.Component().
//
-// Preconditions: Filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+// Preconditions: filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
return "", err
}
@@ -144,7 +144,7 @@ func checkDeleteLocked(vfsd *vfs.Dentry) error {
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -164,7 +164,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -185,7 +185,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return err
}
defer rp.Mount().EndWrite()
- d := vd.Dentry().Impl().(*Dentry)
+ d := vd.Dentry().Impl().(*dentry)
if d.inode.isDir() {
return syserror.EPERM
}
@@ -197,7 +197,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -223,7 +223,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -246,7 +246,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Filter out flags that are not supported by memfs. O_DIRECTORY and
// O_NOFOLLOW have no effect here (they're handled by VFS by setting
// appropriate bits in rp), but are returned by
@@ -265,11 +265,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
mustCreate := opts.Flags&linux.O_EXCL != 0
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
fs.mu.Lock()
defer fs.mu.Unlock()
if rp.Done() {
- // FIXME: ???
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
@@ -327,7 +326,7 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- childInode := childVFSD.Impl().(*Dentry).inode
+ childInode := childVFSD.Impl().(*dentry).inode
if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -340,7 +339,7 @@ afterTrailingSymlink:
return childInode.open(rp, childVFSD, opts.Flags, false)
}
-func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(flags)
if !afterCreate {
if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
@@ -385,7 +384,7 @@ func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -400,9 +399,8 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
if rp.Done() {
- // FIXME
return syserror.ENOENT
}
fs.mu.Lock()
@@ -424,7 +422,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -447,12 +445,14 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decRef()
return nil
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -462,12 +462,12 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -480,7 +480,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -492,7 +492,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -517,7 +517,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -537,6 +537,8 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decLinksLocked()
return nil
}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index f381e1a88..59612da14 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -21,10 +21,10 @@
//
// Lock order:
//
-// Filesystem.mu
+// filesystem.mu
// regularFileFD.offMu
// regularFile.mu
-// Inode.mu
+// inode.mu
package memfs
import (
@@ -42,8 +42,8 @@ import (
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
-// Filesystem implements vfs.FilesystemImpl.
-type Filesystem struct {
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
@@ -54,44 +54,44 @@ type Filesystem struct {
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- var fs Filesystem
+ var fs filesystem
fs.vfsfs.Init(&fs)
root := fs.newDentry(fs.newDirectory(creds, 01777))
return &fs.vfsfs, &root.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *filesystem) Release() {
}
// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *Filesystem) Sync(ctx context.Context) error {
+func (fs *filesystem) Sync(ctx context.Context) error {
// All filesystem state is in-memory.
return nil
}
-// Dentry implements vfs.DentryImpl.
-type Dentry struct {
+// dentry implements vfs.DentryImpl.
+type dentry struct {
vfsd vfs.Dentry
- // inode is the inode represented by this Dentry. Multiple Dentries may
- // share a single non-directory Inode (with hard links). inode is
+ // inode is the inode represented by this dentry. Multiple Dentries may
+ // share a single non-directory inode (with hard links). inode is
// immutable.
- inode *Inode
+ inode *inode
- // memfs doesn't count references on Dentries; because the Dentry tree is
+ // memfs doesn't count references on dentries; because the dentry tree is
// the sole source of truth, it is by definition always consistent with the
- // state of the filesystem. However, it does count references on Inodes,
- // because Inode resources are released when all references are dropped.
+ // state of the filesystem. However, it does count references on inodes,
+ // because inode resources are released when all references are dropped.
// (memfs doesn't really have resources to release, but we implement
// reference counting because tmpfs regular files will.)
- // dentryEntry (ugh) links Dentries into their parent directory.childList.
+ // dentryEntry (ugh) links dentries into their parent directory.childList.
dentryEntry
}
-func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
- d := &Dentry{
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+ d := &dentry{
inode: inode,
}
d.vfsd.Init(d)
@@ -99,37 +99,37 @@ func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
}
// IncRef implements vfs.DentryImpl.IncRef.
-func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
d.inode.incRef()
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
return d.inode.tryIncRef()
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
d.inode.decRef()
}
-// Inode represents a filesystem object.
-type Inode struct {
+// inode represents a filesystem object.
+type inode struct {
// refs is a reference count. refs is accessed using atomic memory
// operations.
//
- // A reference is held on all Inodes that are reachable in the filesystem
+ // A reference is held on all inodes that are reachable in the filesystem
// tree. For non-directories (which may have multiple hard links), this
// means that a reference is dropped when nlink reaches 0. For directories,
// nlink never reaches 0 due to the "." entry; instead,
- // Filesystem.RmdirAt() drops the reference.
+ // filesystem.RmdirAt() drops the reference.
refs int64
// Inode metadata; protected by mu and accessed using atomic memory
// operations unless otherwise specified.
mu sync.RWMutex
mode uint32 // excluding file type bits, which are based on impl
- nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
gid uint32 // auth.KGID, but ...
ino uint64 // immutable
@@ -137,7 +137,7 @@ type Inode struct {
impl interface{} // immutable
}
-func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) {
i.refs = 1
i.mode = uint32(mode)
i.uid = uint32(creds.EffectiveKUID)
@@ -147,29 +147,29 @@ func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials,
i.impl = impl
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) incLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) incLinksLocked() {
if atomic.AddUint32(&i.nlink, 1) <= 1 {
- panic("memfs.Inode.incLinksLocked() called with no existing links")
+ panic("memfs.inode.incLinksLocked() called with no existing links")
}
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) decLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) decLinksLocked() {
if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
i.decRef()
} else if nlink == ^uint32(0) { // negative overflow
- panic("memfs.Inode.decLinksLocked() called with no existing links")
+ panic("memfs.inode.decLinksLocked() called with no existing links")
}
}
-func (i *Inode) incRef() {
+func (i *inode) incRef() {
if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("memfs.Inode.incRef() called without holding a reference")
+ panic("memfs.inode.incRef() called without holding a reference")
}
}
-func (i *Inode) tryIncRef() bool {
+func (i *inode) tryIncRef() bool {
for {
refs := atomic.LoadInt64(&i.refs)
if refs == 0 {
@@ -181,7 +181,7 @@ func (i *Inode) tryIncRef() bool {
}
}
-func (i *Inode) decRef() {
+func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
// This is unnecessary; it's mostly to simulate what tmpfs would do.
if regfile, ok := i.impl.(*regularFile); ok {
@@ -191,18 +191,18 @@ func (i *Inode) decRef() {
regfile.mu.Unlock()
}
} else if refs < 0 {
- panic("memfs.Inode.decRef() called without holding a reference")
+ panic("memfs.inode.decRef() called without holding a reference")
}
}
-func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
}
// Go won't inline this function, and returning linux.Statx (which is quite
// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
// output parameter.
-func (i *Inode) statTo(stat *linux.Statx) {
+func (i *inode) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
stat.Blksize = 1 // usermem.PageSize in tmpfs
stat.Nlink = atomic.LoadUint32(&i.nlink)
@@ -241,7 +241,7 @@ func allocatedBlocksForSize(size uint64) uint64 {
return (size + 511) / 512
}
-func (i *Inode) direntType() uint8 {
+func (i *inode) direntType() uint8 {
switch i.impl.(type) {
case *regularFile:
return linux.DT_REG
@@ -262,12 +262,12 @@ type fileDescription struct {
flags uint32 // status flags; immutable
}
-func (fd *fileDescription) filesystem() *Filesystem {
- return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem)
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
}
-func (fd *fileDescription) inode() *Inode {
- return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
}
// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
@@ -294,6 +294,6 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
index 4a3603cc8..7a16d5719 100644
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -28,16 +28,16 @@ import (
)
type regularFile struct {
- inode Inode
+ inode inode
mu sync.RWMutex
data []byte
// dataLen is len(data), but accessed using atomic memory operations to
- // avoid locking in Inode.stat().
+ // avoid locking in inode.stat().
dataLen int64
}
-func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode {
file := &regularFile{}
file.inode.init(file, fs, creds, mode)
file.inode.nlink = 1 // from parent directory
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
index e002d1727..b2ac2cbeb 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -19,11 +19,11 @@ import (
)
type symlink struct {
- inode Inode
+ inode inode
target string // immutable
}
-func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
link := &symlink{
target: target,
}
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go
index 5c3d73eb7..f039a5c34 100644
--- a/pkg/sentry/safemem/io.go
+++ b/pkg/sentry/safemem/io.go
@@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) {
}
// FromIOReader implements Reader for an io.Reader by repeatedly invoking
-// io.Reader.Read until it returns an error or partial read.
+// io.Reader.Read until it returns an error or partial read. This is not
+// thread-safe.
//
// FromIOReader will return a successful partial read iff Reader.Read does so.
type FromIOReader struct {
@@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
return wbn, buf, rerr
}
+// FromIOReaderAt implements Reader for an io.ReaderAt, reading sequentially
+// from Offset. ReadAt is invoked once per destination Block rather than
+// retried, because ReadAt is more strict than Read: a partial read indicates
+// an error. This is not thread-safe.
+type FromIOReaderAt struct {
+	ReaderAt io.ReaderAt
+	Offset   int64
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+	var buf []byte
+	var done uint64
+	for !dsts.IsEmpty() {
+		dst := dsts.Head()
+		n, nbuf, err := r.readToBlock(dst, buf)
+		buf = nbuf
+		done += uint64(n)
+		if n != dst.Len() {
+			return done, err
+		}
+		dsts = dsts.Tail()
+		// io.EOF alongside a completely filled final Block is a success.
+		if err != nil && !(err == io.EOF && dsts.IsEmpty()) {
+			return done, err
+		}
+	}
+	return done, nil
+}
+
+// readToBlock reads into dst at r.Offset and advances r.Offset. The pointer
+// receiver keeps that advance visible to ReadToBlocks' next iteration.
+func (r *FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+	// io.ReaderAt isn't safecopy-aware, so we have to buffer Blocks that
+	// require safecopy.
+	if !dst.NeedSafecopy() {
+		n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
+		r.Offset += int64(n)
+		return n, buf, err
+	}
+	if len(buf) < dst.Len() {
+		buf = make([]byte, dst.Len())
+	}
+	rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
+	r.Offset += int64(rn)
+	wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+	if wberr != nil {
+		return wbn, buf, wberr
+	}
+	return wbn, buf, rerr
+}
+
// FromIOWriter implements Writer for an io.Writer by repeatedly invoking
// io.Writer.Write until it returns an error or partial write.
//
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index e43775898..a3585e10d 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -6,7 +6,7 @@ go_library(
name = "notifier",
srcs = ["notifier.go"],
importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier",
- visibility = ["//pkg/sentry:internal"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
"//pkg/sentry/socket/rpcinet/conn",
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 63e2c5a5d..912cbe4ff 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
Ino: attr.InodeID,
Off: offset,
},
- Typ: toType(attr.Type),
+ Typ: fs.ToDirentType(attr.Type),
},
Name: []byte(name),
}
@@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint {
return uint(binary.Size(d.Hdr)) + a.Width()
}
-// toType converts an fs.InodeOperationsInfo to a linux dirent typ field.
-func toType(nodeType fs.InodeType) uint8 {
- switch nodeType {
- case fs.RegularFile, fs.SpecialFile:
- return linux.DT_REG
- case fs.Symlink:
- return linux.DT_LNK
- case fs.Directory, fs.SpecialDirectory:
- return linux.DT_DIR
- case fs.Pipe:
- return linux.DT_FIFO
- case fs.CharacterDevice:
- return linux.DT_CHR
- case fs.BlockDevice:
- return linux.DT_BLK
- case fs.Socket:
- return linux.DT_SOCK
- default:
- return linux.DT_UNKNOWN
- }
-}
-
// padRec pads the name field until the rec length is a multiple of the width,
// which must be a power of 2. It returns the padded rec length.
func (d *dirent) padRec(width int) uint16 {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index a7c98efcb..17e3dde1f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -207,6 +207,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if outOffset != 0 {
+ if !outFile.Flags().Pwrite {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(outOffset, &offset); err != nil {
return 0, nil, err
@@ -220,6 +224,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if inOffset != 0 {
+ if !inFile.Flags().Pread {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(inOffset, &offset); err != nil {
return 0, nil, err