-rw-r--r--  pkg/sentry/fs/attr.go  44
-rw-r--r--  pkg/sentry/fs/ext/BUILD  23
-rw-r--r--  pkg/sentry/fs/ext/README.md  117
-rw-r--r--  pkg/sentry/fs/ext/block_map_file.go  9
-rw-r--r--  pkg/sentry/fs/ext/dentry.go  2
-rw-r--r--  pkg/sentry/fs/ext/directory.go  290
-rw-r--r--  pkg/sentry/fs/ext/disklayout/dirent.go  3
-rw-r--r--  pkg/sentry/fs/ext/disklayout/dirent_test.go  6
-rw-r--r--  pkg/sentry/fs/ext/disklayout/superblock.go  2
-rw-r--r--  pkg/sentry/fs/ext/ext.go  41
-rw-r--r--  pkg/sentry/fs/ext/ext_test.go  544
-rw-r--r--  pkg/sentry/fs/ext/extent_file.go  6
-rw-r--r--  pkg/sentry/fs/ext/file_description.go  110
-rw-r--r--  pkg/sentry/fs/ext/filesystem.go  341
-rw-r--r--  pkg/sentry/fs/ext/inline_file.go  55
-rw-r--r--  pkg/sentry/fs/ext/inode.go  78
-rw-r--r--  pkg/sentry/fs/ext/named_pipe.go  40
-rw-r--r--  pkg/sentry/fs/ext/regular_file.go  112
-rw-r--r--  pkg/sentry/fs/ext/symlink.go  61
-rw-r--r--  pkg/sentry/fsimpl/memfs/BUILD  4
-rw-r--r--  pkg/sentry/fsimpl/memfs/directory.go  55
-rw-r--r--  pkg/sentry/fsimpl/memfs/filesystem.go  68
-rw-r--r--  pkg/sentry/fsimpl/memfs/memfs.go  92
-rw-r--r--  pkg/sentry/fsimpl/memfs/regular_file.go  6
-rw-r--r--  pkg/sentry/fsimpl/memfs/symlink.go  4
-rw-r--r--  pkg/sentry/safemem/io.go  55
-rw-r--r--  pkg/sentry/socket/rpcinet/notifier/BUILD  2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_getdents.go  24
-rw-r--r--  pkg/sentry/syscalls/linux/sys_splice.go  8
-rw-r--r--  pkg/tcpip/network/arp/arp.go  1
-rw-r--r--  pkg/tcpip/network/arp/arp_test.go  59
-rw-r--r--  pkg/tcpip/transport/tcp/accept.go  89
-rw-r--r--  pkg/tcpip/transport/tcp/dual_stack_test.go  86
-rw-r--r--  pkg/tcpip/transport/tcp/endpoint.go  49
-rw-r--r--  runsc/container/container.go  82
-rw-r--r--  runsc/container/multi_container_test.go  72
-rw-r--r--  runsc/test/testutil/testutil.go  11
-rw-r--r--  test/runtimes/README.md  2
-rw-r--r--  test/runtimes/go/Dockerfile  5
-rw-r--r--  test/runtimes/java/Dockerfile  43
-rw-r--r--  test/runtimes/java/proctor-java.go  5
-rw-r--r--  test/runtimes/nodejs/Dockerfile  3
-rw-r--r--  test/runtimes/php/Dockerfile  3
-rw-r--r--  test/runtimes/python/Dockerfile  3
-rw-r--r--  test/runtimes/runtimes_test.go  34
-rw-r--r--  test/syscalls/BUILD  4
-rw-r--r--  test/syscalls/linux/socket_inet_loopback.cc  61
-rw-r--r--  test/syscalls/linux/splice.cc  75
48 files changed, 2438 insertions, 451 deletions
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 9fc6a5bc2..4f3d6410e 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 {
}
}
+// ToDirentType converts an InodeType to a linux dirent type field.
+func ToDirentType(nodeType InodeType) uint8 {
+ switch nodeType {
+ case RegularFile, SpecialFile:
+ return linux.DT_REG
+ case Symlink:
+ return linux.DT_LNK
+ case Directory, SpecialDirectory:
+ return linux.DT_DIR
+ case Pipe:
+ return linux.DT_FIFO
+ case CharacterDevice:
+ return linux.DT_CHR
+ case BlockDevice:
+ return linux.DT_BLK
+ case Socket:
+ return linux.DT_SOCK
+ default:
+ return linux.DT_UNKNOWN
+ }
+}
+
+// ToInodeType converts a linux file type to InodeType.
+func ToInodeType(linuxFileType linux.FileMode) InodeType {
+ switch linuxFileType {
+ case linux.ModeRegular:
+ return RegularFile
+ case linux.ModeDirectory:
+ return Directory
+ case linux.ModeSymlink:
+ return Symlink
+ case linux.ModeNamedPipe:
+ return Pipe
+ case linux.ModeCharacterDevice:
+ return CharacterDevice
+ case linux.ModeBlockDevice:
+ return BlockDevice
+ case linux.ModeSocket:
+ return Socket
+ default:
+ panic(fmt.Sprintf("unknown file mode: %d", linuxFileType))
+ }
+}
+
// StableAttr contains Inode attributes that will be stable throughout the
// lifetime of the Inode.
//
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index e3d617576..a3b1e4bd6 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
load("//tools/go_generics:defs.bzl", "go_template_instance")
go_template_instance(
- name = "dentry_list",
- out = "dentry_list.go",
+ name = "dirent_list",
+ out = "dirent_list.go",
package = "ext",
- prefix = "dentry",
+ prefix = "dirent",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*dentry",
- "Linker": "*dentry",
+ "Element": "*dirent",
+ "Linker": "*dirent",
},
)
@@ -20,14 +20,13 @@ go_library(
srcs = [
"block_map_file.go",
"dentry.go",
- "dentry_list.go",
"directory.go",
+ "dirent_list.go",
"ext.go",
"extent_file.go",
+ "file_description.go",
"filesystem.go",
- "inline_file.go",
"inode.go",
- "named_pipe.go",
"regular_file.go",
"symlink.go",
"utils.go",
@@ -38,15 +37,19 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/arch",
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/memmap",
"//pkg/sentry/safemem",
+ "//pkg/sentry/syscalls/linux",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
"//pkg/syserror",
+ "//pkg/waiter",
],
)
@@ -73,7 +76,9 @@ go_test(
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fs/ext/disklayout",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/syserror",
"//runsc/test/testutil",
"@com_github_google_go-cmp//cmp:go_default_library",
"@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
diff --git a/pkg/sentry/fs/ext/README.md b/pkg/sentry/fs/ext/README.md
new file mode 100644
index 000000000..e212717aa
--- /dev/null
+++ b/pkg/sentry/fs/ext/README.md
@@ -0,0 +1,117 @@
+## EXT(2/3/4) File System
+
+This is a filesystem driver which supports ext2, ext3 and ext4 filesystems.
+Linux has specialized drivers for each variant but none which supports all. This
+library takes advantage of ext's backward compatibility and understands the
+internal organization of on-disk structures to support all variants.
+
+This driver implementation diverges from the Linux implementations in being
+more forgiving about versioning. For instance, if a filesystem contains both
+extent-based inodes and classical block map based inodes, this driver will
+interpret both correctly, whereas Linux would treat this as an issue. This
+blurs the line between the three ext fs variants.
+
+Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has
+been superseded by ext4, which offers large performance gains. Thus it is
+recommended to upgrade older filesystem images to ext4 using e2fsprogs for
+better performance.
+
+### Read Only
+
+This driver currently only allows read-only operations. Many of the design
+decisions are based on this. There are plans to implement write support (the
+process for which is documented in the future work section).
+
+### Performance
+
+One of the biggest wins of this driver is that it talks directly to the
+underlying block device (or whatever persistent storage is being used), instead
+of making expensive RPCs to a gofer.
+
+Another advantage is that ext fs supports fast concurrent reads. Currently the
+device is represented using an `io.ReaderAt`, which allows for concurrent
+reads. All reads are passed directly to the device driver, which intelligently
+serves the read requests in the optimal order. There is no congestion due to
+locking while reading at the filesystem level.
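
As a minimal illustration of this property, several goroutines can issue
positional reads against a single shared `io.ReaderAt` with no filesystem-level
lock. The image path and block size below are hypothetical, not part of the
driver:

```go
package main

import (
	"fmt"
	"io"
	"os"
	"sync"
)

// readBlock issues one positional read. io.ReaderAt implementations such as
// *os.File are safe for concurrent use, so no extra locking is needed here.
func readBlock(dev io.ReaderAt, off int64, size int, wg *sync.WaitGroup) {
	defer wg.Done()
	buf := make([]byte, size)
	if _, err := dev.ReadAt(buf, off); err != nil && err != io.EOF {
		fmt.Println("read failed:", err)
	}
}

func main() {
	dev, err := os.Open("/tmp/ext4.img") // hypothetical filesystem image
	if err != nil {
		panic(err)
	}
	defer dev.Close()

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go readBlock(dev, int64(i)*4096, 4096, &wg)
	}
	wg.Wait()
}
```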
+
+Reads are optimized further in the way file data is transferred over to user
+memory. Ext fs directly copies over file data from disk into user memory with no
+additional allocations on the way. We can only get faster by preloading file
+data into memory (see future work section).
+
+The internal structures used to represent files, inodes and file descriptors
+use a lot of inheritance. With the level of indirection that an interface adds
+via an internal pointer, it can quickly fragment a structure across memory. As
+this runs alongside a full-blown kernel (which is memory intensive), having a
+fragmented struct might hurt performance. Hence these internal structures,
+though interfaced, are tightly packed in memory using the same inheritance
+pattern that pkg/sentry/vfs uses. The pkg/sentry/fs/ext/disklayout package makes
+an exception to this pattern for reasons documented in the package.
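
A rough sketch of the embedding pattern being described; the types below are
illustrative only, not the driver's actual structures:

```go
package main

import "fmt"

// inodeCommon holds the state shared by all file types.
type inodeCommon struct {
	ino  uint32
	size uint64
}

// fileImpl is the polymorphic interface the rest of the code works against.
type fileImpl interface {
	common() *inodeCommon
}

// regularFile embeds inodeCommon by value, so the shared fields and the
// type-specific fields live in a single allocation instead of being scattered
// behind interface pointers.
type regularFile struct {
	inodeCommon
	blocks []uint32
}

func (r *regularFile) common() *inodeCommon { return &r.inodeCommon }

func main() {
	var f fileImpl = &regularFile{inodeCommon: inodeCommon{ino: 2, size: 4096}}
	fmt.Println(f.common().ino, f.common().size)
}
```

The interface is still satisfied for polymorphic use, but the underlying object
stays contiguous in memory.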
+
+### Security
+
+This driver also intends to help sandbox the container better by reducing the
+surface of the host kernel that the application touches. It prevents the
+application from exploiting vulnerabilities in the host filesystem driver. All
+`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` calls that are passed
+directly to the device driver in the kernel. This reduces the attack surface.
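
The idea can be sketched as follows, assuming a raw file descriptor for the
block device. The device path is hypothetical and this is not the driver's
actual fd wrapper (which lives in pkg/fd); it only shows how every `ReadAt`
maps to a single `pread(2)`:

```go
package main

import (
	"io"
	"syscall"
)

// preadReaderAt is a sketch of an io.ReaderAt over a raw file descriptor in
// which every ReadAt call issues exactly one pread(2).
type preadReaderAt struct {
	fd int
}

func (r preadReaderAt) ReadAt(p []byte, off int64) (int, error) {
	n, err := syscall.Pread(r.fd, p, off)
	if err == nil && n == 0 && len(p) > 0 {
		err = io.EOF
	}
	return n, err
}

func main() {
	// "/dev/loop0" is a hypothetical block device path.
	fd, err := syscall.Open("/dev/loop0", syscall.O_RDONLY, 0)
	if err != nil {
		panic(err)
	}
	defer syscall.Close(fd)

	buf := make([]byte, 1024)
	if _, err := (preadReaderAt{fd: fd}).ReadAt(buf, 1024); err != nil && err != io.EOF {
		panic(err)
	}
}
```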
+
+The application cannot affect any host filesystems other than the one passed
+in by the user via the block device.
+
+### Future Work
+
+#### Write
+
+To support write operations we would need to modify the block device
+underneath. Currently, the driver does not modify the device at all, not even
+to update access times for reads. Modifying the filesystem incorrectly can
+corrupt it and render it unreadable for other correct ext(x) drivers. Hence
+caution must be exercised while modifying metadata structures.
+
+Ext4 specifically is built for performance and has added a lot of complexity
+around how metadata structures are modified. For instance, files are organized
+via an extent tree, which must be kept balanced, and file data blocks must be
+placed in the same extent as much as possible to increase locality. Such
+properties must be maintained while modifying the tree.
+
+Ext filesystems place a strong emphasis on locality, which plays a big role in
+their performance. The block allocation algorithm in Linux does a good job of
+keeping related data together. This behavior must be maintained as much as
+possible, or we might end up degrading filesystem performance over time.
+
+Ext4 also supports a wide variety of features which are specialized for varying
+use cases. Implementing all of them can get difficult very quickly.
+
+Ext(x) checksums all its metadata structures to check for corruption, so any
+modification of a metadata struct must be accompanied by re-checksumming it.
+Linux filesystem drivers also order on-disk updates intelligently so as not to
+corrupt the filesystem while remaining performant. The in-memory metadata
+structures must be kept in sync with what is on disk.
+
+There is also replication of some important structures across the filesystem.
+All replicas must be updated when their original copy is updated. There is also
+provisioning for snapshotting which must be kept in mind, although it should not
+affect this implementation unless we allow users to create filesystem snapshots.
+
+Ext4 also introduced journaling (jbd2). The journal must be updated
+appropriately.
+
+#### Performance
+
+To improve performance we should implement a buffer cache, and optionally, read
+ahead for small files. While doing so we must also keep in mind the memory usage
+and have a reasonable cap on how much file data we want to hold in memory.
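
One possible shape for such a cache, sketched as a bounded LRU keyed by
physical block number; the names, sizing and eviction policy here are all
hypothetical design choices, not a planned implementation:

```go
package main

import (
	"container/list"
	"sync"
)

// blockCache is a bounded LRU cache of block contents.
type blockCache struct {
	mu    sync.Mutex
	max   int
	lru   *list.List               // front = most recently used
	index map[uint64]*list.Element // block number -> LRU element
}

type cacheEntry struct {
	blk  uint64
	data []byte
}

func newBlockCache(max int) *blockCache {
	return &blockCache{max: max, lru: list.New(), index: make(map[uint64]*list.Element)}
}

// get returns the cached block data, if present, and marks it recently used.
func (c *blockCache) get(blk uint64) ([]byte, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if e, ok := c.index[blk]; ok {
		c.lru.MoveToFront(e)
		return e.Value.(*cacheEntry).data, true
	}
	return nil, false
}

// put inserts or refreshes a block and evicts the least recently used entry
// once the cache exceeds its cap.
func (c *blockCache) put(blk uint64, data []byte) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if e, ok := c.index[blk]; ok {
		c.lru.MoveToFront(e)
		e.Value.(*cacheEntry).data = data
		return
	}
	c.index[blk] = c.lru.PushFront(&cacheEntry{blk: blk, data: data})
	if c.lru.Len() > c.max {
		old := c.lru.Back()
		c.lru.Remove(old)
		delete(c.index, old.Value.(*cacheEntry).blk)
	}
}

func main() {
	c := newBlockCache(2)
	c.put(1, []byte("a"))
	c.put(2, []byte("b"))
	c.put(3, []byte("c")) // evicts block 1
	if _, ok := c.get(1); ok {
		panic("expected block 1 to be evicted")
	}
}
```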
+
+#### Features
+
+Our current implementation will work with most ext4 filesystems for read-only
+purposes. However, the following features are not supported yet:
+
+- Journal
+- Snapshotting
+- Extended Attributes
+- Hash Tree Directories
+- Meta Block Groups
+- Multiple Mount Protection
+- Bigalloc
diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go
index f30c3a174..cea89bcd9 100644
--- a/pkg/sentry/fs/ext/block_map_file.go
+++ b/pkg/sentry/fs/ext/block_map_file.go
@@ -85,7 +85,8 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
}
offset := uint64(off)
- if offset >= f.regFile.inode.diskInode.Size() {
+ size := f.regFile.inode.diskInode.Size()
+ if offset >= size {
return 0, io.EOF
}
@@ -104,6 +105,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
read := 0
toRead := len(dst)
+ if uint64(toRead)+offset > size {
+ toRead = int(size - offset)
+ }
for read < toRead {
var err error
var curR int
@@ -131,6 +135,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
}
}
+ if read < len(dst) {
+ return read, io.EOF
+ }
return read, nil
}
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
index 19c9b3b2d..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -26,8 +26,6 @@ type dentry struct {
// share a single non-directory Inode (with hard links). inode is
// immutable.
inode *inode
- // dentryEntry links Dentries into their parent directory.childList.
- dentryEntry
}
// Compiles only if dentry implements vfs.DentryImpl.
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go
index ab2b59e44..1ba8bf54c 100644
--- a/pkg/sentry/fs/ext/directory.go
+++ b/pkg/sentry/fs/ext/directory.go
@@ -14,23 +14,295 @@
package ext
+import (
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
// directory represents a directory inode. It holds the childList in memory.
type directory struct {
inode inode
- // childList is a list containing (1) child Dentries and (2) fake Dentries
- // (with inode == nil) that represent the iteration position of
+ // mu serializes the changes to childList.
+ // Lock Order (outermost locks must be taken first):
+ // directory.mu
+ // filesystem.mu
+ mu sync.Mutex
+
+ // childList is a list containing (1) child dirents and (2) fake dirents
+ // (with diskDirent == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is immutable.
- childList dentryList
+ // efficiently. childList is protected by mu.
+ childList direntList
- // TODO(b/134676337): Add directory navigators.
+ // childMap maps the child's filename to the dirent structure stored in
+ // childList. This adds some data replication but helps in faster path
+ // traversal. For consistency, key == childMap[key].diskDirent.FileName().
+ // Immutable.
+ childMap map[string]*dirent
}
// newDirectroy is the directory constructor.
-func newDirectroy(inode inode) *directory {
- // TODO(b/134676337): initialize childList.
- file := &directory{inode: inode}
+func newDirectroy(inode inode, newDirent bool) (*directory, error) {
+ file := &directory{inode: inode, childMap: make(map[string]*dirent)}
file.inode.impl = file
- return file
+
+ // Initialize childList by reading dirents from the underlying file.
+ if inode.diskInode.Flags().Index {
+ // TODO(b/134676337): Support hash tree directories. Currently only the '.'
+ // and '..' entries are read in.
+
+ // Users cannot navigate this hash tree directory yet.
+ log.Warningf("hash tree directory being used which is unsupported")
+ return file, nil
+ }
+
+ // The dirents are organized in a linear array in the file data.
+ // Extract the file data and decode the dirents.
+ regFile, err := newRegularFile(inode)
+ if err != nil {
+ return nil, err
+ }
+
+ // buf is used as scratch space for reading in dirents from disk and
+ // unmarshalling them into dirent structs.
+ buf := make([]byte, disklayout.DirentSize)
+ size := inode.diskInode.Size()
+ for off, inc := uint64(0), uint64(0); off < size; off += inc {
+ toRead := size - off
+ if toRead > disklayout.DirentSize {
+ toRead = disklayout.DirentSize
+ }
+ if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead {
+ return nil, err
+ }
+
+ var curDirent dirent
+ if newDirent {
+ curDirent.diskDirent = &disklayout.DirentNew{}
+ } else {
+ curDirent.diskDirent = &disklayout.DirentOld{}
+ }
+ binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+
+ if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
+ // Inode number and name length fields being set to 0 are used to
+ // indicate an unused dirent.
+ file.childList.PushBack(&curDirent)
+ file.childMap[curDirent.diskDirent.FileName()] = &curDirent
+ }
+
+ // The next dirent is placed exactly after this dirent record on disk.
+ inc = uint64(curDirent.diskDirent.RecordSize())
+ }
+
+ return file, nil
+}
+
+func (i *inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// dirent is the directory.childList node.
+type dirent struct {
+ diskDirent disklayout.Dirent
+
+ // direntEntry links dirents into their parent directory.childList.
+ direntEntry
+}
+
+// directoryFD represents a directory file description. It implements
+// vfs.FileDescriptionImpl.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by directory.mu.
+ iter *dirent
+ off int64
+}
+
+// Compiles only if directoryFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+ if fd.iter == nil {
+ return
+ }
+
+ dir := fd.inode().impl.(*directory)
+ dir.mu.Lock()
+ dir.childList.Remove(fd.iter)
+ dir.mu.Unlock()
+ fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ extfs := fd.filesystem()
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Ensure that fd.iter exists and is not linked into dir.childList.
+ var child *dirent
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ fd.iter = &dirent{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ dir.childList.Remove(fd.iter)
+ }
+ for ; child != nil; child = child.Next() {
+ // Skip other directoryFD iterators.
+ if child.diskDirent != nil {
+ childType, ok := child.diskDirent.FileType()
+ if !ok {
+ // We will need to read the inode off disk. Do not increment
+ // ref count here because this inode is not being added to the
+ // dentry tree.
+ extfs.mu.Lock()
+ childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode())
+ extfs.mu.Unlock()
+ if err != nil {
+ // Usage of the file description after the error is
+ // undefined. This implementation would continue reading
+ // from the next dirent.
+ fd.off++
+ dir.childList.InsertAfter(child, fd.iter)
+ return err
+ }
+ childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
+ }
+
+ if !cb.Handle(vfs.Dirent{
+ Name: child.diskDirent.FileName(),
+ Type: fs.ToDirentType(childType),
+ Ino: uint64(child.diskDirent.Inode()),
+ Off: fd.off,
+ }) {
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ }
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if whence != linux.SEEK_SET && whence != linux.SEEK_CUR {
+ return 0, syserror.EINVAL
+ }
+
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Find resulting offset.
+ if whence == linux.SEEK_CUR {
+ offset += fd.off
+ }
+
+ if offset < 0 {
+ // lseek(2) specifies that EINVAL should be returned if the resulting offset
+ // is negative.
+ return 0, syserror.EINVAL
+ }
+
+ n := int64(len(dir.childMap))
+ realWantOff := offset
+ if realWantOff > n {
+ realWantOff = n
+ }
+ realCurOff := fd.off
+ if realCurOff > n {
+ realCurOff = n
+ }
+
+ // Ensure that fd.iter exists and is linked into dir.childList so we can
+ // intelligently seek from the optimal position.
+ if fd.iter == nil {
+ fd.iter = &dirent{}
+ dir.childList.PushFront(fd.iter)
+ }
+
+ // Guess that iterating from the current position is optimal.
+ child := fd.iter
+ diff := realWantOff - realCurOff // Shows direction and magnitude of travel.
+
+ // See if starting from the beginning or end is better.
+ abDiff := diff
+ if diff < 0 {
+ abDiff = -diff
+ }
+ if abDiff > realWantOff {
+ // Starting from the beginning is best.
+ child = dir.childList.Front()
+ diff = realWantOff
+ } else if abDiff > (n - realWantOff) {
+ // Starting from the end is best.
+ child = dir.childList.Back()
+ // (n - 1) because the last non-nil dirent represents the (n-1)th offset.
+ diff = realWantOff - (n - 1)
+ }
+
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.diskDirent != nil {
+ if diff == 0 {
+ if child != fd.iter {
+ dir.childList.Remove(fd.iter)
+ dir.childList.InsertBefore(child, fd.iter)
+ }
+
+ fd.off = offset
+ return offset, nil
+ }
+
+ if diff < 0 {
+ diff++
+ child = child.Prev()
+ } else {
+ diff--
+ child = child.Next()
+ }
+ continue
+ }
+
+ if diff < 0 {
+ child = child.Prev()
+ } else {
+ child = child.Next()
+ }
+ }
+
+ // Reaching here indicates that the offset is beyond the end of the childList.
+ dir.childList.Remove(fd.iter)
+ dir.childList.PushBack(fd.iter)
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ // mmap(2) specifies that EACCES should be returned for non-regular file fds.
+ return syserror.EACCES
}
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go
index 685bf57b8..417b6cf65 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent.go
@@ -21,6 +21,9 @@ import (
const (
// MaxFileName is the maximum length of an ext fs file's name.
MaxFileName = 255
+
+ // DirentSize is the size of ext dirent structures.
+ DirentSize = 263
)
var (
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go
index cc6dff2c9..934919f8a 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go
@@ -21,8 +21,6 @@ import (
// TestDirentSize tests that the dirent structs are of the correct
// size.
func TestDirentSize(t *testing.T) {
- want := uintptr(263)
-
- assertSize(t, DirentOld{}, want)
- assertSize(t, DirentNew{}, want)
+ assertSize(t, DirentOld{}, uintptr(DirentSize))
+ assertSize(t, DirentNew{}, uintptr(DirentSize))
}
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go
index 7a337a5e0..8bb327006 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock.go
@@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures {
// This is not exhaustive, unused features are not listed.
const (
// SbDirentFileType indicates that directory entries record the file type.
- // We should use struct ext4_dir_entry_2 for dirents then.
+ // We should use struct DirentNew for dirents then.
SbDirentFileType = 0x2
// SbRecovery indicates that the filesystem needs recovery.
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index d303dd122..c3e2c9efb 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -62,8 +63,40 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, err
return fd.NewReadWriter(devFd), nil
}
+// isCompatible checks if the superblock has feature sets which are compatible.
+// We only need to check the superblock incompatible feature set since we are
+// mounting readonly. We will also need to check readonly compatible feature
+// set when mounting for read/write.
+func isCompatible(sb disklayout.SuperBlock) bool {
+ // Please note that what is being checked is limited based on the fact that we
+ // are mounting readonly and that we are not journaling. When mounting
+ // read/write or with a journal, this must be reevaluated.
+ incompatFeatures := sb.IncompatibleFeatures()
+ if incompatFeatures.MetaBG {
+ log.Warningf("ext fs: meta block groups are not supported")
+ return false
+ }
+ if incompatFeatures.MMP {
+ log.Warningf("ext fs: multiple mount protection is not supported")
+ return false
+ }
+ if incompatFeatures.Encrypted {
+ log.Warningf("ext fs: encrypted inodes not supported")
+ return false
+ }
+ if incompatFeatures.InlineData {
+ log.Warningf("ext fs: inline files not supported")
+ return false
+ }
+ return true
+}
+
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ // TODO(b/134676337): Ensure that the user is mounting readonly. If not,
+ // EACCES should be returned according to mount(2). Filesystem independent
+ // flags (like readonly) are currently not available in pkg/sentry/vfs.
+
dev, err := getDeviceFd(source, opts)
if err != nil {
return nil, nil, err
@@ -82,15 +115,21 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred
return nil, nil, syserror.EINVAL
}
+ // Refuse to mount if the filesystem is incompatible.
+ if !isCompatible(fs.sb) {
+ return nil, nil, syserror.EINVAL
+ }
+
fs.bgs, err = readBlockGroups(dev, fs.sb)
if err != nil {
return nil, nil, err
}
- rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode)
+ rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
if err != nil {
return nil, nil, err
}
+ rootInode.incRef()
return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
}
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
index 6396886cc..270f38074 100644
--- a/pkg/sentry/fs/ext/ext_test.go
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -16,17 +16,22 @@ package ext
import (
"fmt"
+ "io"
"os"
"path"
+ "sort"
"testing"
"github.com/google/go-cmp/cmp"
+ "github.com/google/go-cmp/cmp/cmpopts"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/test/testutil"
)
@@ -44,7 +49,7 @@ var (
// setUp opens imagePath as an ext Filesystem and returns all necessary
// elements required to run tests. If error is non-nil, it also returns a tear
// down function which must be called after the test is run for clean up.
-func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
localImagePath, err := testutil.FindFile(imagePath)
if err != nil {
return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
@@ -55,20 +60,537 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
return nil, nil, nil, nil, err
}
- // Mount the ext4 fs and retrieve the inode structure for the file.
- mockCtx := contexttest.Context(t)
- fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+ ctx := contexttest.Context(t)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("extfs", filesystemType{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err
}
+ root := mntns.Root()
+
tearDown := func() {
+ root.DecRef()
+
if err := f.Close(); err != nil {
t.Fatalf("tearDown failed: %v", err)
}
}
- return mockCtx, fs, d, tearDown, nil
+ return ctx, vfsObj, &root, tearDown, nil
+}
+
+// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and
+// vfs.FilesystemImpl.StatFSAt which are not implemented in
+// vfs.VirtualFilesystem yet.
+
+// TestSeek tests vfs.FileDescriptionImpl.Seek functionality.
+func TestSeek(t *testing.T) {
+ type seekTest struct {
+ name string
+ image string
+ path string
+ }
+
+ tests := []seekTest{
+ {
+ name: "ext4 root dir seek",
+ image: ext4ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext3 root dir seek",
+ image: ext3ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext2 root dir seek",
+ image: ext2ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext4 reg file seek",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext3 reg file seek",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext2 reg file seek",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+ t.Errorf("expected seek position 0, got %d and error %v", n, err)
+ }
+
+ stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ // We should be able to seek beyond the end of file.
+ size := int64(stat.Size)
+ if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
+ }
+
+ // Make sure negative offsets work with SEEK_CUR.
+ if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ // Make sure SEEK_END works with regular files.
+ switch fd.Impl().(type) {
+ case *regularFileFD:
+ // Seek back to 0.
+ if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
+ }
+
+ // Seek forward beyond EOF.
+ if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+ }
+ })
+ }
+}
+
+// TestStatAt tests filesystem.StatAt functionality.
+func TestStatAt(t *testing.T) {
+ type statAtTest struct {
+ name string
+ image string
+ path string
+ want linux.Statx
+ }
+
+ tests := []statAtTest{
+ {
+ name: "ext4 statx small file",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext3 statx small file",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext2 statx small file",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext4 statx big file",
+ image: ext4ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext3 statx big file",
+ image: ext3ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext2 statx big file",
+ image: ext2ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext4 statx symlink file",
+ image: ext4ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext3 statx symlink file",
+ image: ext3ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext2 statx symlink file",
+ image: ext2ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ }
+
+ // Ignore the fields that are not supported by filesystem.StatAt yet and
+ // those which are likely to change as the image does.
+ ignoredFields := map[string]bool{
+ "Attributes": true,
+ "AttributesMask": true,
+ "Atime": true,
+ "Blocks": true,
+ "Btime": true,
+ "Ctime": true,
+ "DevMajor": true,
+ "DevMinor": true,
+ "Ino": true,
+ "Mask": true,
+ "Mtime": true,
+ "RdevMajor": true,
+ "RdevMinor": true,
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ got, err := vfsfs.StatAt(ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.StatOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ _, ok := ignoredFields[p.String()]
+ return ok
+ }, cmp.Ignore())
+ if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" {
+ t.Errorf("stat mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
+
+// TestRead tests the read functionality for vfs file descriptions.
+func TestRead(t *testing.T) {
+ type readTest struct {
+ name string
+ image string
+ absPath string
+ }
+
+ tests := []readTest{
+ {
+ name: "ext4 read small file",
+ image: ext4ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext3 read small file",
+ image: ext3ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext2 read small file",
+ image: ext2ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext4 read big file",
+ image: ext4ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext3 read big file",
+ image: ext3ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext2 read big file",
+ image: ext2ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ // Get a local file descriptor and compare its functionality with a vfs file
+ // description for the same file.
+ localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath))
+ if err != nil {
+ t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err)
+ }
+
+ f, err := os.Open(localFile)
+ if err != nil {
+ t.Fatalf("os.Open failed for %s: %v", localFile, err)
+ }
+ defer f.Close()
+
+ // Read the entire file by reading one byte repeatedly. Doing this stress
+ // tests the underlying file reader implementation.
+ got := make([]byte, 1)
+ want := make([]byte, 1)
+ for {
+ n, err := f.Read(want)
+ fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+
+ if diff := cmp.Diff(got, want); diff != "" {
+ t.Errorf("file data mismatch (-want +got):\n%s", diff)
+ }
+
+ // Make sure there is no more file data left after getting EOF.
+ if n == 0 || err == io.EOF {
+ if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+ t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
+ }
+
+ break
+ }
+
+ if err != nil {
+ t.Fatalf("read failed: %v", err)
+ }
+ }
+ })
+ }
+}
+
+// iterDirentsCb is a simple callback which just keeps adding the dirents to an
+// internal list. Implements vfs.IterDirentsCallback.
+type iterDirentsCb struct {
+ dirents []vfs.Dirent
+}
+
+// Compiles only if iterDirentCb implements vfs.IterDirentsCallback.
+var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil)
+
+// newIterDirentCb is the iterDirentsCb constructor.
+func newIterDirentCb() *iterDirentsCb {
+ return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)}
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+ cb.dirents = append(cb.dirents, dirent)
+ return true
+}
+
+// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
+func TestIterDirents(t *testing.T) {
+ type iterDirentTest struct {
+ name string
+ image string
+ path string
+ want []vfs.Dirent
+ }
+
+ wantDirents := []vfs.Dirent{
+ vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "..",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "lost+found",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "file.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "bigfile.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "symlink.txt",
+ Type: linux.DT_LNK,
+ },
+ }
+ tests := []iterDirentTest{
+ {
+ name: "ext4 root dir iteration",
+ image: ext4ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext3 root dir iteration",
+ image: ext3ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext2 root dir iteration",
+ image: ext2ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ cb := &iterDirentsCb{}
+ if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+ t.Fatalf("dir fd.IterDirents() failed: %v", err)
+ }
+
+ sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name })
+ sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name })
+
+ // Ignore the inode number and offset of dirents because those are likely to
+ // change as the underlying image changes.
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ return p.String() == "Ino" || p.String() == "Off"
+ }, cmp.Ignore())
+ if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" {
+ t.Errorf("dirents mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
}
// TestRootDir tests that the root directory inode is correctly initialized and
@@ -126,15 +648,15 @@ func TestRootDir(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- _, _, vfsd, tearDown, err := setUp(t, test.image)
+ _, _, vd, tearDown, err := setUp(t, test.image)
if err != nil {
t.Fatalf("setUp failed: %v", err)
}
defer tearDown()
- d, ok := vfsd.Impl().(*dentry)
+ d, ok := vd.Dentry().Impl().(*dentry)
if !ok {
- t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
+ t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl())
}
// Offload inode contents into local structs for comparison.
@@ -329,15 +851,15 @@ func TestFilesystemInit(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- _, vfsfs, _, tearDown, err := setUp(t, test.image)
+ _, _, vd, tearDown, err := setUp(t, test.image)
if err != nil {
t.Fatalf("setUp failed: %v", err)
}
defer tearDown()
- fs, ok := vfsfs.Impl().(*filesystem)
+ fs, ok := vd.Mount().Filesystem().Impl().(*filesystem)
if !ok {
- t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
+ t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl())
}
// Offload superblock and block group descriptors contents into
diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go
index 44fb9c01f..1b9bf449b 100644
--- a/pkg/sentry/fs/ext/extent_file.go
+++ b/pkg/sentry/fs/ext/extent_file.go
@@ -150,7 +150,11 @@ func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) {
return 0, io.EOF
}
- return f.read(&f.root, uint64(off), dst)
+ n, err := f.read(&f.root, uint64(off), dst)
+ if n < len(dst) && err == nil {
+ err = io.EOF
+ }
+ return n, err
}
// read is the recursive step of extentFile.ReadAt which traverses the extent
diff --git a/pkg/sentry/fs/ext/file_description.go b/pkg/sentry/fs/ext/file_description.go
new file mode 100644
index 000000000..d244cf1e7
--- /dev/null
+++ b/pkg/sentry/fs/ext/file_description.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileDescription is embedded by ext implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+
+ // flags is the same as vfs.OpenOptions.Flags which are passed to
+ // vfs.FilesystemImpl.OpenAt.
+ // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
+ // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
+ // Only close(2), fstat(2), fstatfs(2) should work.
+ flags uint32
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *fileDescription) OnClose() error { return nil }
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+ return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+ // no-op.
+ return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ fd.inode().statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ if opts.Stat.Mask == 0 {
+ return nil
+ }
+ return syserror.EPERM
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ var stat linux.Statfs
+ fd.filesystem().statTo(&stat)
+ return stat, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
+ return waiter.EventIn | waiter.EventOut
+}
+
+// EventRegister implements waiter.Waitable.EventRegister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {}
+
+// EventUnregister implements waiter.Waitable.EventUnregister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventUnregister(e *waiter.Entry) {}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error {
+ return nil
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ // ioctl(2) specifies that ENOTTY must be returned if the file descriptor is
+ // not associated with a character special device (which is unimplemented).
+ return 0, syserror.ENOTTY
+}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
index 45b43b9e2..e08839f48 100644
--- a/pkg/sentry/fs/ext/filesystem.go
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -15,20 +15,27 @@
package ext
import (
+ "errors"
"io"
"sync"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
+var (
+ // errResolveDirent indicates that the vfs.ResolvingPath.Component() does
+ // not exist in the dentry tree but does exist on disk. So it has to be read in
+ // using the in-memory dirent and added to the dentry tree. Usually indicates
+ // the need to lock filesystem.mu for writing.
+ errResolveDirent = errors.New("resolve path component using dirent")
+)
+
// filesystem implements vfs.FilesystemImpl.
type filesystem struct {
- // TODO(b/134676337): Remove when all methods have been implemented.
- vfs.FilesystemImpl
-
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
@@ -44,8 +51,8 @@ type filesystem struct {
// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
//
- // Protected by mu because every addition and removal from this corresponds to
- // a change in the dentry tree.
+ // Protected by mu because most additions (see IterDirents) and all removals
+ // from this correspond to a change in the dentry tree.
inodeCache map[uint32]*inode
// sb represents the filesystem superblock. Immutable after initialization.
@@ -59,16 +66,172 @@ type filesystem struct {
// Compiles only if filesystem implements vfs.FilesystemImpl.
var _ vfs.FilesystemImpl = (*filesystem)(nil)
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
+// stepLocked resolves rp.Component() in parent directory vfsd. The write
+// parameter passed tells if the caller has acquired filesystem.mu for writing
+// or not. If set to true, an existing inode on disk can be added to the dentry
+// tree if not present already.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+// - inode == vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, nil, err
+ }
+
+ for {
+ nextVFSD, err := rp.ResolveComponent(vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is not the sole source of truth for extfs, if it's
+ // not in the Dentry tree, it might need to be pulled from disk.
+ childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+ if !ok {
+ // The underlying inode does not exist on disk.
+ return nil, nil, syserror.ENOENT
+ }
+
+ if !write {
+ // filesystem.mu must be held for writing to add to the dentry tree.
+ return nil, nil, errResolveDirent
+ }
+
+ // Create and add the component's dirent to the dentry tree.
+ fs := rp.Mount().Filesystem().Impl().(*filesystem)
+ childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
+ if err != nil {
+ return nil, nil, err
+ }
+ // incRef because this is being added to the dentry tree.
+ childInode.incRef()
+ child := newDentry(childInode)
+ vfsd.InsertChild(&child.vfsd, rp.Component())
+
+ // Continue as usual now that nextVFSD is not nil.
+ nextVFSD = &child.vfsd
+ }
+ nextInode := nextVFSD.Impl().(*dentry).inode
+ if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
+ if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil {
+ return nil, nil, err
+ }
+ continue
+ }
+ rp.Advance()
+ return nextVFSD, nextInode, nil
+ }
+}
+
+// walkLocked resolves rp to an existing file. The write parameter
+// passed tells if the caller has acquired filesystem.mu for writing or not.
+// If set to true, additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Done() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if rp.MustBeDir() && !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walkParentLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp. The write parameter passed tells if the
+// caller has acquired filesystem.mu for writing or not. If set to true,
+// additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walk resolves rp to an existing file. If parent is set to true, it resolves
+// rp up to the parent of the last component, which must be an existing
+// directory. If parent is false, it resolves rp entirely. It attempts to
+// resolve the path as far as it can with a read lock and upgrades the lock if
+// needed.
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+ var (
+ vfsd *vfs.Dentry
+ inode *inode
+ err error
+ )
+
+ // Try walking in the hope that all dentries have already been pulled in
+ // from disk. This reduces congestion (allows concurrent walks).
+ fs.mu.RLock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, false)
+ } else {
+ vfsd, inode, err = walkLocked(rp, false)
+ }
+ fs.mu.RUnlock()
+
+ if err == errResolveDirent {
+ // Upgrade lock and continue walking. Lock upgrading in the middle of the
+ // walk is fine as this is a read only filesystem.
+ fs.mu.Lock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, true)
+ } else {
+ vfsd, inode, err = walkLocked(rp, true)
+ }
+ fs.mu.Unlock()
+ }
+
+ return vfsd, inode, err
+}
+
+// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
+// The caller must increment the ref count if adding this to the dentry tree.
//
-// Precondition: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
+// Precondition: must be holding fs.mu for writing.
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
}
- in, err := newInode(ctx, fs, inodeNum)
+ in, err := newInode(fs, inodeNum)
if err != nil {
return nil, err
}
@@ -77,10 +240,92 @@ func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*i
return in, nil
}
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+// statTo writes the statfs fields to the output parameter.
+func (fs *filesystem) statTo(stat *linux.Statfs) {
+ stat.Type = uint64(fs.sb.Magic())
+ stat.BlockSize = int64(fs.sb.BlockSize())
+ stat.Blocks = fs.sb.BlocksCount()
+ stat.BlocksFree = fs.sb.FreeBlocksCount()
+ stat.BlocksAvailable = fs.sb.FreeBlocksCount()
+ stat.Files = uint64(fs.sb.InodesCount())
+ stat.FilesFree = uint64(fs.sb.FreeInodesCount())
+ stat.NameLength = disklayout.MaxFileName
+ stat.FragmentSize = int64(fs.sb.BlockSize())
+ // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ if opts.CheckSearchable {
+ if !inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+
+ inode.incRef()
+ return vfsd, nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ // EROFS is returned if write access is needed.
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
+ return nil, syserror.EROFS
+ }
+ return inode.open(rp, vfsd, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return "", err
+ }
+ symlink, ok := inode.impl.(*symlink)
+ if !ok {
+ return "", syserror.EINVAL
+ }
+ return symlink.target, nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ var stat linux.Statx
+ inode.statTo(&stat)
+ return stat, nil
}
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ if _, _, err := fs.walk(rp, false); err != nil {
+ return linux.Statfs{}, err
+ }
+
+ var stat linux.Statfs
+ fs.statTo(&stat)
+ return stat, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {}
+
// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
// This is a readonly filesystem for now.
@@ -89,42 +334,110 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// The vfs.FilesystemImpl functions below return EROFS because their respective
// man pages say that EROFS must be returned if the path resolves to a file on
-// a read-only filesystem.
+// this read-only filesystem.
-// TODO(b/134676337): Implement path traversal and return EROFS only if the
-// path resolves to a Dentry within ext fs.
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+ if rp.Done() {
+ return syserror.ENOENT
+ }
+
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if !inode.isDir() {
+ return syserror.ENOTDIR
+ }
+
return syserror.EROFS
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
return syserror.EROFS
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if inode.isDir() {
+ return syserror.EISDIR
+ }
+
return syserror.EROFS
}
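
The write-path stubs above walk the path before returning EROFS so that path-resolution errors keep their usual precedence. A minimal sketch of that ordering with plain syscalls, assuming a hypothetical read-only ext image mounted at /mnt/ext (the mount point and file names are illustrative only):

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // Hypothetical read-only ext image mounted at /mnt/ext.
        fmt.Println(unix.Mkdir("/mnt/ext/missing/child", 0o755)) // ENOENT: parent does not exist
        fmt.Println(unix.Mkdir("/mnt/ext/existing", 0o755))      // EEXIST: name is already present
        fmt.Println(unix.Mkdir("/mnt/ext/newdir", 0o755))        // EROFS: walk succeeded, fs is read-only
        fmt.Println(unix.Unlink("/mnt/ext/somedir"))             // EISDIR: unlink(2) on a directory
    }
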
diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go
deleted file mode 100644
index 67a538ba0..000000000
--- a/pkg/sentry/fs/ext/inline_file.go
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "io"
-)
-
-// inlineFile is a type of regular file. All the data here is stored in the
-// inode.Data() array.
-type inlineFile struct {
- regFile regularFile
-}
-
-// Compiles only if inlineFile implements io.ReaderAt.
-var _ io.ReaderAt = (*inlineFile)(nil)
-
-// newInlineFile is the inlineFile constructor.
-func newInlineFile(regFile regularFile) *inlineFile {
- file := &inlineFile{regFile: regFile}
- file.regFile.impl = file
- return file
-}
-
-// ReadAt implements io.ReaderAt.ReadAt.
-func (f *inlineFile) ReadAt(dst []byte, off int64) (int, error) {
- if len(dst) == 0 {
- return 0, nil
- }
-
- size := f.regFile.inode.diskInode.Size()
- if uint64(off) >= size {
- return 0, io.EOF
- }
-
- to := uint64(off) + uint64(len(dst))
- if to > size {
- to = size
- }
-
- n := copy(dst, f.regFile.inode.diskInode.Data()[off:to])
- return n, nil
-}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
index 364980e4c..178bd6376 100644
--- a/pkg/sentry/fs/ext/inode.go
+++ b/pkg/sentry/fs/ext/inode.go
@@ -15,12 +15,14 @@
package ext
import (
+ "fmt"
"io"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -31,13 +33,11 @@ import (
//
// Implementations:
// inode --
-// |-- pipe
// |-- dir
// |-- symlink
// |-- regular--
// |-- extent file
// |-- block map file
-// |-- inline file
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
@@ -92,7 +92,7 @@ func (in *inode) decRef(fs *filesystem) {
// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
-func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, error) {
+func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}
@@ -117,7 +117,6 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
// Build the inode based on its type.
inode := inode{
- refs: 1,
inodeNum: inodeNum,
dev: fs.dev,
blkSize: blkSize,
@@ -138,15 +137,76 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
}
return &f.inode, nil
case linux.ModeDirectory:
- return &newDirectroy(inode).inode, nil
- case linux.ModeNamedPipe:
- return &newNamedPipe(ctx, inode).inode, nil
+ f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
default:
- // TODO(b/134676337): Return appropriate errors for sockets and devices.
+ // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices.
return nil, syserror.EINVAL
}
}
+// open creates and returns a file description for the dentry passed in.
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ switch in.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably. This check is not necessary for a
+ // read-only filesystem but will be required when write is implemented.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ fd.flags = flags
+ return &fd.vfsfd, nil
+ case *symlink:
+ if flags&linux.O_PATH == 0 {
+ // Can't open symlinks without O_PATH.
+ return nil, syserror.ELOOP
+ }
+ var fd symlinkFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", in.impl))
+ }
+}
+
+func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+}
+
+// statTo writes the statx fields to the output parameter.
+func (in *inode) statTo(stat *linux.Statx) {
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+ linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+ linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME
+ stat.Blksize = uint32(in.blkSize)
+ stat.Mode = uint16(in.diskInode.Mode())
+ stat.Nlink = uint32(in.diskInode.LinksCount())
+ stat.UID = uint32(in.diskInode.UID())
+ stat.GID = uint32(in.diskInode.GID())
+ stat.Ino = uint64(in.inodeNum)
+ stat.Size = in.diskInode.Size()
+ stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
+ stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
+ stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+ // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
+ // (including metadata blocks) required to represent this file.
+}
+
// getBGNum returns the block group number that a given inode belongs to.
func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
return (inodeNum - 1) / inodesPerGrp
diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go
deleted file mode 100644
index 0f3af1b53..000000000
--- a/pkg/sentry/fs/ext/named_pipe.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-// namedPipe represents a named pipe inode. It is currently just a wrapper
-// around pkg/sentry/kernel/pipe.
-type namedPipe struct {
- inode inode
-
- p *pipe.Pipe
- inodeOps fs.InodeOperations
-}
-
-// newNamedPipe is the namedPipe constructor.
-func newNamedPipe(ctx context.Context, inode inode) *namedPipe {
- file := &namedPipe{inode: inode}
- file.inode.impl = file
- file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
- file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p)
- return file
-}
diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go
index fb1bd38ef..ffc76ba5b 100644
--- a/pkg/sentry/fs/ext/regular_file.go
+++ b/pkg/sentry/fs/ext/regular_file.go
@@ -16,6 +16,15 @@ package ext
import (
"io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// regularFile represents a regular file's inode. This too follows the
@@ -26,6 +35,9 @@ type regularFile struct {
// This is immutable. The first field of fileReader implementations must be
// regularFile to ensure temporality.
+ // io.ReaderAt is stricter than io.Reader: a partial read is always
+ // accompanied by an error. If a read spans past the end of the file, a
+ // partial read (within the file range) is done and io.EOF is returned.
impl io.ReaderAt
}
@@ -48,16 +60,6 @@ func newRegularFile(inode inode) (*regularFile, error) {
return &file.regFile, nil
}
- if inodeFlags.Inline {
- if inode.diskInode.Size() > 60 {
- panic("ext fs: inline file larger than 60 bytes")
- }
-
- file := newInlineFile(regFile)
- file.regFile.inode.impl = &file.regFile
- return &file.regFile, nil
- }
-
file, err := newBlockMapFile(regFile)
if err != nil {
return nil, err
@@ -66,6 +68,92 @@ func newRegularFile(inode inode) (*regularFile, error) {
return &file.regFile, nil
}
-func (f *regularFile) blksUsed(blkSize uint64) uint64 {
- return (f.inode.diskInode.Size() + blkSize - 1) / blkSize
+func (in *inode) isRegular() bool {
+ _, ok := in.impl.(*regularFile)
+ return ok
+}
+
+// regularFileFD represents a file description for a regular file. It
+// implements vfs.FileDescriptionImpl.
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset. off is accessed using atomic memory operations.
+ off int64
+
+ // offMu serializes operations that may mutate off.
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ safeReader := safemem.FromIOReaderAt{
+ ReaderAt: fd.inode().impl.(*regularFile).impl,
+ Offset: offset,
+ }
+
+ // Copies data from disk directly into usermem without any intermediate
+ // allocations (if dst is converted into BlockSeq such that it does not need
+ // safe copying).
+ return dst.CopyOutFrom(ctx, safeReader)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.offMu.Lock()
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ // write(2) specifies that EBADF must be returned if the fd is not open for
+ // writing.
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ n, err := fd.PWrite(ctx, src, fd.off, opts)
+ fd.offMu.Lock()
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as specified.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += int64(fd.inode().diskInode.Size())
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ // TODO(b/134676337): Implement mmap(2).
+ return syserror.ENODEV
}
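
The comment on regularFile.impl relies on the io.ReaderAt contract: unlike io.Reader, a short read must be accompanied by an error. A small standalone sketch of that contract using bytes.Reader from the standard library (not part of this change):

    package main

    import (
        "bytes"
        "fmt"
    )

    func main() {
        r := bytes.NewReader([]byte("hello")) // bytes.Reader implements io.ReaderAt

        buf := make([]byte, 8)
        n, err := r.ReadAt(buf, 2) // the read spans past the end of the data
        // A partial read within range is performed and io.EOF accompanies it,
        // which is what regularFileFD.PRead relies on via safemem.FromIOReaderAt.
        fmt.Printf("n=%d err=%v data=%q\n", n, err, buf[:n]) // n=3 err=EOF data="llo"
    }
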
diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go
index 9f498d989..e06548a98 100644
--- a/pkg/sentry/fs/ext/symlink.go
+++ b/pkg/sentry/fs/ext/symlink.go
@@ -15,6 +15,10 @@
package ext
import (
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -43,8 +47,8 @@ func newSymlink(inode inode) (*symlink, error) {
}
link = make([]byte, size)
- if n, _ := regFile.impl.ReadAt(link, 0); uint64(n) < size {
- return nil, syserror.EIO
+ if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size {
+ return nil, err
}
}
@@ -52,3 +56,56 @@ func newSymlink(inode inode) (*symlink, error) {
file.inode.impl = file
return file, nil
}
+
+func (in *inode) isSymlink() bool {
+ _, ok := in.impl.(*symlink)
+ return ok
+}
+
+// symlinkFD represents a symlink file description and implements
+// vfs.FileDescriptionImpl. A symlink can only be opened with O_PATH, so most
+// of these methods return EBADF.
+type symlinkFD struct {
+ fileDescription
+}
+
+// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *symlinkFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ return syserror.EBADF
+}
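
The EBADF behaviour of symlinkFD mirrors what Linux does for O_PATH descriptors. A minimal sketch with plain syscalls, assuming a hypothetical symlink inside a mounted ext image:

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // Open the link itself rather than its target.
        fd, err := unix.Open("/mnt/ext/link", unix.O_PATH|unix.O_NOFOLLOW, 0)
        if err != nil {
            fmt.Println("open:", err)
            return
        }
        defer unix.Close(fd)

        // read(2) on an O_PATH descriptor fails with EBADF, just like
        // symlinkFD.Read above.
        _, err = unix.Read(fd, make([]byte, 16))
        fmt.Println("read:", err) // EBADF
    }
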
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index d5d4f68df..d2450e810 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -11,8 +11,8 @@ go_template_instance(
prefix = "dentry",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*Dentry",
- "Linker": "*Dentry",
+ "Element": "*dentry",
+ "Linker": "*dentry",
},
)
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
index b0c3ea39a..c52dc781c 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -23,23 +23,23 @@ import (
)
type directory struct {
- inode Inode
+ inode inode
// childList is a list containing (1) child Dentries and (2) fake Dentries
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is protected by Filesystem.mu.
+ // efficiently. childList is protected by filesystem.mu.
childList dentryList
}
-func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode {
dir := &directory{}
dir.inode.init(dir, fs, creds, mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
return &dir.inode
}
-func (i *Inode) isDir() bool {
+func (i *inode) isDir() bool {
_, ok := i.impl.(*directory)
return ok
}
@@ -48,8 +48,8 @@ type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- // Protected by Filesystem.mu.
- iter *Dentry
+ // Protected by filesystem.mu.
+ iter *dentry
off int64
}
@@ -68,7 +68,7 @@ func (fd *directoryFD) Release() {
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fs := fd.filesystem()
- d := fd.vfsfd.VirtualDentry().Dentry()
+ vfsd := fd.vfsfd.VirtualDentry().Dentry()
fs.mu.Lock()
defer fs.mu.Unlock()
@@ -77,7 +77,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
if !cb.Handle(vfs.Dirent{
Name: ".",
Type: linux.DT_DIR,
- Ino: d.Impl().(*Dentry).inode.ino,
+ Ino: vfsd.Impl().(*dentry).inode.ino,
Off: 0,
}) {
return nil
@@ -85,7 +85,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
if fd.off == 1 {
- parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+ parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
if !cb.Handle(vfs.Dirent{
Name: "..",
Type: parentInode.direntType(),
@@ -97,12 +97,12 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
- dir := d.Impl().(*Dentry).inode.impl.(*directory)
- var child *Dentry
+ dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+ var child *dentry
if fd.iter == nil {
// Start iteration at the beginning of dir.
child = dir.childList.Front()
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
// Continue iteration from where we left off.
child = fd.iter.Next()
@@ -130,32 +130,41 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- if whence != linux.SEEK_SET {
- // TODO: Linux also allows SEEK_CUR.
+ fs := fd.filesystem()
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
return 0, syserror.EINVAL
}
if offset < 0 {
return 0, syserror.EINVAL
}
+ // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+ // seek even if doing so might reposition the iterator due to concurrent
+ // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+ if fd.off == offset {
+ return offset, nil
+ }
+
fd.off = offset
// Compensate for "." and "..".
- var remChildren int64
- if offset < 2 {
- remChildren = 0
- } else {
+ remChildren := int64(0)
+ if offset >= 2 {
remChildren = offset - 2
}
- fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
- fs.mu.Lock()
- defer fs.mu.Unlock()
-
// Ensure that fd.iter exists and is not linked into dir.childList.
if fd.iter == nil {
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
dir.childList.Remove(fd.iter)
}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4d989eeaf..f79e2d9c8 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -28,9 +28,9 @@ import (
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
}
@@ -47,7 +47,7 @@ afterSymlink:
// not in the Dentry tree, it doesn't exist.
return nil, nil, syserror.ENOENT
}
- nextInode := nextVFSD.Impl().(*Dentry).inode
+ nextInode := nextVFSD.Impl().(*dentry).inode
if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -64,10 +64,10 @@ afterSymlink:
// walkExistingLocked is loosely analogous to Linux's
// fs/namei.c:path_lookupat().
//
-// Preconditions: Filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Done() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -88,10 +88,10 @@ func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Final() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -108,9 +108,9 @@ func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// checkCreateLocked checks that a file named rp.Component() may be created in
// directory parentVFSD, then returns rp.Component().
//
-// Preconditions: Filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+// Preconditions: filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
return "", err
}
@@ -144,7 +144,7 @@ func checkDeleteLocked(vfsd *vfs.Dentry) error {
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -164,7 +164,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -185,7 +185,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return err
}
defer rp.Mount().EndWrite()
- d := vd.Dentry().Impl().(*Dentry)
+ d := vd.Dentry().Impl().(*dentry)
if d.inode.isDir() {
return syserror.EPERM
}
@@ -197,7 +197,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -223,7 +223,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -246,7 +246,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Filter out flags that are not supported by memfs. O_DIRECTORY and
// O_NOFOLLOW have no effect here (they're handled by VFS by setting
// appropriate bits in rp), but are returned by
@@ -265,11 +265,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
mustCreate := opts.Flags&linux.O_EXCL != 0
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
fs.mu.Lock()
defer fs.mu.Unlock()
if rp.Done() {
- // FIXME: ???
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
@@ -327,7 +326,7 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- childInode := childVFSD.Impl().(*Dentry).inode
+ childInode := childVFSD.Impl().(*dentry).inode
if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -340,7 +339,7 @@ afterTrailingSymlink:
return childInode.open(rp, childVFSD, opts.Flags, false)
}
-func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(flags)
if !afterCreate {
if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
@@ -385,7 +384,7 @@ func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -400,9 +399,8 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
if rp.Done() {
- // FIXME
return syserror.ENOENT
}
fs.mu.Lock()
@@ -424,7 +422,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -447,12 +445,14 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decRef()
return nil
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -462,12 +462,12 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -480,7 +480,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -492,7 +492,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -517,7 +517,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -537,6 +537,8 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decLinksLocked()
return nil
}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index f381e1a88..59612da14 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -21,10 +21,10 @@
//
// Lock order:
//
-// Filesystem.mu
+// filesystem.mu
// regularFileFD.offMu
// regularFile.mu
-// Inode.mu
+// inode.mu
package memfs
import (
@@ -42,8 +42,8 @@ import (
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
-// Filesystem implements vfs.FilesystemImpl.
-type Filesystem struct {
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
@@ -54,44 +54,44 @@ type Filesystem struct {
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- var fs Filesystem
+ var fs filesystem
fs.vfsfs.Init(&fs)
root := fs.newDentry(fs.newDirectory(creds, 01777))
return &fs.vfsfs, &root.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *filesystem) Release() {
}
// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *Filesystem) Sync(ctx context.Context) error {
+func (fs *filesystem) Sync(ctx context.Context) error {
// All filesystem state is in-memory.
return nil
}
-// Dentry implements vfs.DentryImpl.
-type Dentry struct {
+// dentry implements vfs.DentryImpl.
+type dentry struct {
vfsd vfs.Dentry
- // inode is the inode represented by this Dentry. Multiple Dentries may
- // share a single non-directory Inode (with hard links). inode is
+ // inode is the inode represented by this dentry. Multiple Dentries may
+ // share a single non-directory inode (with hard links). inode is
// immutable.
- inode *Inode
+ inode *inode
- // memfs doesn't count references on Dentries; because the Dentry tree is
+ // memfs doesn't count references on dentries; because the dentry tree is
// the sole source of truth, it is by definition always consistent with the
- // state of the filesystem. However, it does count references on Inodes,
- // because Inode resources are released when all references are dropped.
+ // state of the filesystem. However, it does count references on inodes,
+ // because inode resources are released when all references are dropped.
// (memfs doesn't really have resources to release, but we implement
// reference counting because tmpfs regular files will.)
- // dentryEntry (ugh) links Dentries into their parent directory.childList.
+ // dentryEntry (ugh) links dentries into their parent directory.childList.
dentryEntry
}
-func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
- d := &Dentry{
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+ d := &dentry{
inode: inode,
}
d.vfsd.Init(d)
@@ -99,37 +99,37 @@ func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
}
// IncRef implements vfs.DentryImpl.IncRef.
-func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
d.inode.incRef()
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
return d.inode.tryIncRef()
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
d.inode.decRef()
}
-// Inode represents a filesystem object.
-type Inode struct {
+// inode represents a filesystem object.
+type inode struct {
// refs is a reference count. refs is accessed using atomic memory
// operations.
//
- // A reference is held on all Inodes that are reachable in the filesystem
+ // A reference is held on all inodes that are reachable in the filesystem
// tree. For non-directories (which may have multiple hard links), this
// means that a reference is dropped when nlink reaches 0. For directories,
// nlink never reaches 0 due to the "." entry; instead,
- // Filesystem.RmdirAt() drops the reference.
+ // filesystem.RmdirAt() drops the reference.
refs int64
// Inode metadata; protected by mu and accessed using atomic memory
// operations unless otherwise specified.
mu sync.RWMutex
mode uint32 // excluding file type bits, which are based on impl
- nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
gid uint32 // auth.KGID, but ...
ino uint64 // immutable
@@ -137,7 +137,7 @@ type Inode struct {
impl interface{} // immutable
}
-func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) {
i.refs = 1
i.mode = uint32(mode)
i.uid = uint32(creds.EffectiveKUID)
@@ -147,29 +147,29 @@ func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials,
i.impl = impl
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) incLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) incLinksLocked() {
if atomic.AddUint32(&i.nlink, 1) <= 1 {
- panic("memfs.Inode.incLinksLocked() called with no existing links")
+ panic("memfs.inode.incLinksLocked() called with no existing links")
}
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) decLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) decLinksLocked() {
if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
i.decRef()
} else if nlink == ^uint32(0) { // negative overflow
- panic("memfs.Inode.decLinksLocked() called with no existing links")
+ panic("memfs.inode.decLinksLocked() called with no existing links")
}
}
-func (i *Inode) incRef() {
+func (i *inode) incRef() {
if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("memfs.Inode.incRef() called without holding a reference")
+ panic("memfs.inode.incRef() called without holding a reference")
}
}
-func (i *Inode) tryIncRef() bool {
+func (i *inode) tryIncRef() bool {
for {
refs := atomic.LoadInt64(&i.refs)
if refs == 0 {
@@ -181,7 +181,7 @@ func (i *Inode) tryIncRef() bool {
}
}
-func (i *Inode) decRef() {
+func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
// This is unnecessary; it's mostly to simulate what tmpfs would do.
if regfile, ok := i.impl.(*regularFile); ok {
@@ -191,18 +191,18 @@ func (i *Inode) decRef() {
regfile.mu.Unlock()
}
} else if refs < 0 {
- panic("memfs.Inode.decRef() called without holding a reference")
+ panic("memfs.inode.decRef() called without holding a reference")
}
}
-func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
}
// Go won't inline this function, and returning linux.Statx (which is quite
// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
// output parameter.
-func (i *Inode) statTo(stat *linux.Statx) {
+func (i *inode) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
stat.Blksize = 1 // usermem.PageSize in tmpfs
stat.Nlink = atomic.LoadUint32(&i.nlink)
@@ -241,7 +241,7 @@ func allocatedBlocksForSize(size uint64) uint64 {
return (size + 511) / 512
}
-func (i *Inode) direntType() uint8 {
+func (i *inode) direntType() uint8 {
switch i.impl.(type) {
case *regularFile:
return linux.DT_REG
@@ -262,12 +262,12 @@ type fileDescription struct {
flags uint32 // status flags; immutable
}
-func (fd *fileDescription) filesystem() *Filesystem {
- return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem)
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
}
-func (fd *fileDescription) inode() *Inode {
- return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
}
// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
@@ -294,6 +294,6 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
index 4a3603cc8..7a16d5719 100644
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -28,16 +28,16 @@ import (
)
type regularFile struct {
- inode Inode
+ inode inode
mu sync.RWMutex
data []byte
// dataLen is len(data), but accessed using atomic memory operations to
- // avoid locking in Inode.stat().
+ // avoid locking in inode.stat().
dataLen int64
}
-func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode {
file := &regularFile{}
file.inode.init(file, fs, creds, mode)
file.inode.nlink = 1 // from parent directory
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
index e002d1727..b2ac2cbeb 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -19,11 +19,11 @@ import (
)
type symlink struct {
- inode Inode
+ inode inode
target string // immutable
}
-func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
link := &symlink{
target: target,
}
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go
index 5c3d73eb7..f039a5c34 100644
--- a/pkg/sentry/safemem/io.go
+++ b/pkg/sentry/safemem/io.go
@@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) {
}
// FromIOReader implements Reader for an io.Reader by repeatedly invoking
-// io.Reader.Read until it returns an error or partial read.
+// io.Reader.Read until it returns an error or partial read. This is not
+// thread-safe.
//
// FromIOReader will return a successful partial read iff Reader.Read does so.
type FromIOReader struct {
@@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
return wbn, buf, rerr
}
+// FromIOReaderAt implements Reader for an io.ReaderAt. It does not need to
+// invoke io.ReaderAt.ReadAt repeatedly because ReadAt is stricter than Read:
+// a partial read always indicates an error. This is not thread-safe.
+type FromIOReaderAt struct {
+ ReaderAt io.ReaderAt
+ Offset int64
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+ var buf []byte
+ var done uint64
+ for !dsts.IsEmpty() {
+ dst := dsts.Head()
+ var n int
+ var err error
+ n, buf, err = r.readToBlock(dst, buf)
+ done += uint64(n)
+ if n != dst.Len() {
+ return done, err
+ }
+ dsts = dsts.Tail()
+ if err != nil {
+ if dsts.IsEmpty() && err == io.EOF {
+ return done, nil
+ }
+ return done, err
+ }
+ }
+ return done, nil
+}
+
+func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+ // io.ReaderAt isn't safecopy-aware, so we have to buffer Blocks that require
+ // safecopy.
+ if !dst.NeedSafecopy() {
+ n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
+ r.Offset += int64(n)
+ return n, buf, err
+ }
+ if len(buf) < dst.Len() {
+ buf = make([]byte, dst.Len())
+ }
+ rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
+ r.Offset += int64(rn)
+ wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+ if wberr != nil {
+ return wbn, buf, wberr
+ }
+ return wbn, buf, rerr
+}
+
// FromIOWriter implements Writer for an io.Writer by repeatedly invoking
// io.Writer.Write until it returns an error or partial write.
//
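
A minimal usage sketch for the new FromIOReaderAt, assuming the existing safemem helpers BlockSeqOf and BlockFromSafeSlice and using a bytes.Reader as the io.ReaderAt source (in the ext code above the source is the extent or block map reader):

    package main

    import (
        "bytes"
        "fmt"

        "gvisor.dev/gvisor/pkg/sentry/safemem"
    )

    func main() {
        src := bytes.NewReader([]byte("on-disk file contents")) // any io.ReaderAt

        buf := make([]byte, 7)
        dsts := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))

        r := safemem.FromIOReaderAt{ReaderAt: src, Offset: 8}
        done, err := r.ReadToBlocks(dsts) // one ReadAt per block; a short read carries an error
        fmt.Println(done, err, string(buf[:done]))
    }
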
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index e43775898..a3585e10d 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -6,7 +6,7 @@ go_library(
name = "notifier",
srcs = ["notifier.go"],
importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier",
- visibility = ["//pkg/sentry:internal"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
"//pkg/sentry/socket/rpcinet/conn",
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 63e2c5a5d..912cbe4ff 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
Ino: attr.InodeID,
Off: offset,
},
- Typ: toType(attr.Type),
+ Typ: fs.ToDirentType(attr.Type),
},
Name: []byte(name),
}
@@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint {
return uint(binary.Size(d.Hdr)) + a.Width()
}
-// toType converts an fs.InodeOperationsInfo to a linux dirent typ field.
-func toType(nodeType fs.InodeType) uint8 {
- switch nodeType {
- case fs.RegularFile, fs.SpecialFile:
- return linux.DT_REG
- case fs.Symlink:
- return linux.DT_LNK
- case fs.Directory, fs.SpecialDirectory:
- return linux.DT_DIR
- case fs.Pipe:
- return linux.DT_FIFO
- case fs.CharacterDevice:
- return linux.DT_CHR
- case fs.BlockDevice:
- return linux.DT_BLK
- case fs.Socket:
- return linux.DT_SOCK
- default:
- return linux.DT_UNKNOWN
- }
-}
-
// padRec pads the name field until the rec length is a multiple of the width,
// which must be a power of 2. It returns the padded rec length.
func (d *dirent) padRec(width int) uint16 {
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index a7c98efcb..17e3dde1f 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -207,6 +207,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if outOffset != 0 {
+ if !outFile.Flags().Pwrite {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(outOffset, &offset); err != nil {
return 0, nil, err
@@ -220,6 +224,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if inOffset != 0 {
+ if !inFile.Flags().Pread {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(inOffset, &offset); err != nil {
return 0, nil, err
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 60070874d..ea7296e6a 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -109,6 +109,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) {
pkt.SetOp(header.ARPReply)
copy(pkt.HardwareAddressSender(), r.LocalLinkAddress[:])
copy(pkt.ProtocolAddressSender(), h.ProtocolAddressTarget())
+ copy(pkt.HardwareAddressTarget(), h.HardwareAddressSender())
copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender())
e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber)
fallthrough // also fill the cache from requests
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 66c55821b..e477046db 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -15,6 +15,7 @@
package arp_test
import (
+ "strconv"
"testing"
"time"
@@ -101,40 +102,30 @@ func TestDirectRequest(t *testing.T) {
c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView())
}
- inject(stackAddr1)
- {
- pkt := <-c.linkEP.C
- if pkt.Proto != arp.ProtocolNumber {
- t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto)
- }
- rep := header.ARP(pkt.Header)
- if !rep.IsValid() {
- t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header))
- }
- if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr1 {
- t.Errorf("stackAddr1: expected sender to be set")
- }
- if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr {
- t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got)
- }
- }
-
- inject(stackAddr2)
- {
- pkt := <-c.linkEP.C
- if pkt.Proto != arp.ProtocolNumber {
- t.Fatalf("stackAddr2: expected ARP response, got network protocol number %v", pkt.Proto)
- }
- rep := header.ARP(pkt.Header)
- if !rep.IsValid() {
- t.Fatalf("stackAddr2: invalid ARP response len(pkt.Header)=%d", len(pkt.Header))
- }
- if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr2 {
- t.Errorf("stackAddr2: expected sender to be set")
- }
- if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr {
- t.Errorf("stackAddr2: expected sender to be stackLinkAddr, got %q", got)
- }
+ for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
+ t.Run(strconv.Itoa(i), func(t *testing.T) {
+ inject(address)
+ pkt := <-c.linkEP.C
+ if pkt.Proto != arp.ProtocolNumber {
+ t.Fatalf("expected ARP response, got network protocol number %d", pkt.Proto)
+ }
+ rep := header.ARP(pkt.Header)
+ if !rep.IsValid() {
+ t.Fatalf("invalid ARP response len(pkt.Header)=%d", len(pkt.Header))
+ }
+ if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
+ t.Errorf("got HardwareAddressSender = %s, want = %s", got, want)
+ }
+ if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
+ t.Errorf("got ProtocolAddressSender = %s, want = %s", got, want)
+ }
+ if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want {
+ t.Errorf("got HardwareAddressTarget = %s, want = %s", got, want)
+ }
+ if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want {
+ t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, want)
+ }
+ })
}
inject(stackAddrBad)
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 52fd1bfa3..e9c5099ea 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -96,6 +96,17 @@ type listenContext struct {
hasher hash.Hash
v6only bool
netProto tcpip.NetworkProtocolNumber
+ // pendingMu protects pendingEndpoints. This should only be accessed
+ // by the listening endpoint's worker goroutine.
+ //
+ // Lock Ordering: listenEP.workerMu -> pendingMu
+ pendingMu sync.Mutex
+ // pending is used to wait for all pendingEndpoints to finish when
+ // a socket is closed.
+ pending sync.WaitGroup
+ // pendingEndpoints is a map of all endpoints for which a handshake is
+ // in progress.
+ pendingEndpoints map[stack.TransportEndpointID]*endpoint
}
// timeStamp returns an 8-bit timestamp with a granularity of 64 seconds.
@@ -133,14 +144,15 @@ func decSynRcvdCount() {
}
// newListenContext creates a new listen context.
-func newListenContext(stack *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
+func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
l := &listenContext{
- stack: stack,
- rcvWnd: rcvWnd,
- hasher: sha1.New(),
- v6only: v6only,
- netProto: netProto,
- listenEP: listenEP,
+ stack: stk,
+ rcvWnd: rcvWnd,
+ hasher: sha1.New(),
+ v6only: v6only,
+ netProto: netProto,
+ listenEP: listenEP,
+ pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
}
rand.Read(l.nonce[0][:])
@@ -253,6 +265,17 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
return nil, err
}
+ // listenEP is nil when listenContext is used by tcp.Forwarder.
+ if l.listenEP != nil {
+ l.listenEP.mu.Lock()
+ if l.listenEP.state != StateListen {
+ l.listenEP.mu.Unlock()
+ return nil, tcpip.ErrConnectionAborted
+ }
+ l.addPendingEndpoint(ep)
+ l.listenEP.mu.Unlock()
+ }
+
// Perform the 3-way handshake.
h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow()))
@@ -260,6 +283,9 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
if err := h.execute(); err != nil {
ep.stack.Stats().TCP.FailedConnectionAttempts.Increment()
ep.Close()
+ if l.listenEP != nil {
+ l.removePendingEndpoint(ep)
+ }
return nil, err
}
ep.mu.Lock()
@@ -274,15 +300,41 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
return ep, nil
}
+func (l *listenContext) addPendingEndpoint(n *endpoint) {
+ l.pendingMu.Lock()
+ l.pendingEndpoints[n.id] = n
+ l.pending.Add(1)
+ l.pendingMu.Unlock()
+}
+
+func (l *listenContext) removePendingEndpoint(n *endpoint) {
+ l.pendingMu.Lock()
+ delete(l.pendingEndpoints, n.id)
+ l.pending.Done()
+ l.pendingMu.Unlock()
+}
+
+func (l *listenContext) closeAllPendingEndpoints() {
+ l.pendingMu.Lock()
+ for _, n := range l.pendingEndpoints {
+ n.notifyProtocolGoroutine(notifyClose)
+ }
+ l.pendingMu.Unlock()
+ l.pending.Wait()
+}
+
// deliverAccepted delivers the newly-accepted endpoint to the listener. If the
// endpoint has transitioned out of the listen state, the new endpoint is closed
// instead.
func (e *endpoint) deliverAccepted(n *endpoint) {
- e.mu.RLock()
+ e.mu.Lock()
state := e.state
- e.mu.RUnlock()
+ e.pendingAccepted.Add(1)
+ defer e.pendingAccepted.Done()
+ acceptedChan := e.acceptedChan
+ e.mu.Unlock()
if state == StateListen {
- e.acceptedChan <- n
+ acceptedChan <- n
e.waiterQueue.Notify(waiter.EventIn)
} else {
n.Close()
@@ -304,7 +356,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
return
}
-
+ ctx.removePendingEndpoint(n)
e.deliverAccepted(n)
}
@@ -451,6 +503,11 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
// protocolListenLoop is the main loop of a listening TCP endpoint. It runs in
// its own goroutine and is responsible for handling connection requests.
func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
+ e.mu.Lock()
+ v6only := e.v6only
+ e.mu.Unlock()
+ ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.netProto)
+
defer func() {
// Mark endpoint as closed. This will prevent goroutines running
// handleSynSegment() from attempting to queue new connections
@@ -458,6 +515,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
e.mu.Lock()
e.state = StateClose
+ // Close any endpoints in SYN-RCVD state.
+ ctx.closeAllPendingEndpoints()
+
// Do cleanup if needed.
e.completeWorkerLocked()
@@ -470,12 +530,6 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut)
}()
- e.mu.Lock()
- v6only := e.v6only
- e.mu.Unlock()
-
- ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.netProto)
-
s := sleep.Sleeper{}
s.AddWaker(&e.notificationWaker, wakerForNotification)
s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
@@ -492,7 +546,6 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
e.handleListenSegment(ctx, s)
s.decRef()
}
- synRcvdCount.pending.Wait()
close(e.drainDone)
<-e.undrain
}
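
The pendingEndpoints/pending pairing in listenContext follows a common pattern: register each in-flight handshake in a mutex-guarded map, count it with a WaitGroup, and have close notify every pending entry and then wait for the group to drain. A reduced, self-contained sketch of that pattern (all names here are illustrative, not from the TCP code):

    package main

    import (
        "fmt"
        "sync"
    )

    type tracker struct {
        mu      sync.Mutex
        wg      sync.WaitGroup
        pending map[int]chan struct{} // keyed by a hypothetical connection ID
    }

    func (t *tracker) add(id int) chan struct{} {
        t.mu.Lock()
        defer t.mu.Unlock()
        stop := make(chan struct{})
        t.pending[id] = stop
        t.wg.Add(1)
        return stop
    }

    func (t *tracker) remove(id int) {
        t.mu.Lock()
        delete(t.pending, id)
        t.mu.Unlock()
        t.wg.Done()
    }

    func (t *tracker) closeAll() {
        t.mu.Lock()
        for _, stop := range t.pending {
            close(stop) // ask each pending handshake to abort
        }
        t.mu.Unlock()
        t.wg.Wait() // block until every pending handshake has called remove()
    }

    func main() {
        t := &tracker{pending: make(map[int]chan struct{})}
        for id := 0; id < 3; id++ {
            stop := t.add(id)
            go func(id int, stop chan struct{}) {
                <-stop // simulate a handshake waiting to be told to abort
                t.remove(id)
            }(id, stop)
        }
        t.closeAll()
        fmt.Println("all pending handshakes drained")
    }
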
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index d9f79e8c5..c54610a87 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -570,3 +570,89 @@ func TestV4AcceptOnV4(t *testing.T) {
// Test acceptance.
testV4Accept(t, c)
}
+
+func testV4ListenClose(t *testing.T, c *context.Context) {
+ // Set the SynRcvd threshold to zero to force a syn cookie based accept
+ // to happen.
+ saved := tcp.SynRcvdCountThreshold
+ defer func() {
+ tcp.SynRcvdCountThreshold = saved
+ }()
+ tcp.SynRcvdCountThreshold = 0
+ const n = uint16(32)
+
+ // Start listening.
+ if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil {
+ t.Fatalf("Listen failed: %v", err)
+ }
+
+ irs := seqnum.Value(789)
+ for i := uint16(0); i < n; i++ {
+ // Send a SYN request.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + i,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: irs,
+ RcvWnd: 30000,
+ })
+ }
+
+ // Each of these ACKs will cause a SYN cookie-based connection to be
+ // accepted and delivered to the listening endpoint.
+ for i := uint16(0); i < n; i++ {
+ b := c.GetPacket()
+ tcp := header.TCP(header.IPv4(b).Payload())
+ iss := seqnum.Value(tcp.SequenceNumber())
+ // Send ACK.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: tcp.DestinationPort(),
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ RcvWnd: 30000,
+ })
+ }
+
+ // Try to accept the connection.
+ we, ch := waiter.NewChannelEntry(nil)
+ c.WQ.EventRegister(&we, waiter.EventIn)
+ defer c.WQ.EventUnregister(&we)
+ nep, _, err := c.EP.Accept()
+ if err == tcpip.ErrWouldBlock {
+ // Wait for connection to be established.
+ select {
+ case <-ch:
+ nep, _, err = c.EP.Accept()
+ if err != nil {
+ t.Fatalf("Accept failed: %v", err)
+ }
+
+ case <-time.After(10 * time.Second):
+ t.Fatalf("Timed out waiting for accept")
+ }
+ }
+ nep.Close()
+ c.EP.Close()
+}
+
+func TestV4ListenCloseOnV4(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
+
+ // Bind to wildcard.
+ if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %v", err)
+ }
+
+ // Test acceptance.
+ testV4ListenClose(t, c)
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 353e2efaf..0e16877e7 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -362,6 +362,12 @@ type endpoint struct {
// without hearing a response, the connection is closed.
keepalive keepalive
+ // pendingAccepted is a synchronization primitive used to track the
+ // number of connections that are queued up to be delivered to the
+ // accepted channel. We use this to ensure that all goroutines blocked
+ // on writing to the acceptedChan below terminate before we close
+ // acceptedChan.
+ pendingAccepted sync.WaitGroup `state:"nosave"`
+
// acceptedChan is used by a listening endpoint protocol goroutine to
// send newly accepted connections to the endpoint so that they can be
// read by Accept() calls.
@@ -375,7 +381,11 @@ type endpoint struct {
// The goroutine drain completion notification channel.
drainDone chan struct{} `state:"nosave"`
- // The goroutine undrain notification channel.
+ // The goroutine undrain notification channel. This is currently used
+ // as a way to block the worker goroutines: nothing closes or writes to
+ // this channel today, so any goroutine waiting on it simply blocks.
+ // This is used during save/restore to prevent worker goroutines from
+ // mutating state while it is being saved.
undrain chan struct{} `state:"nosave"`
// probe if not nil is invoked on every received segment. It is passed
@@ -575,6 +585,34 @@ func (e *endpoint) Close() {
e.mu.Unlock()
}
+// closePendingAcceptableConnectionsLocked closes all connections that have
+// completed the handshake but have not yet been delivered to the application.
+func (e *endpoint) closePendingAcceptableConnectionsLocked() {
+ done := make(chan struct{})
+ // Spin up a goroutine, as ranging over e.acceptedChan will just block
+ // when there are no more connections in the channel. Using a
+ // non-blocking select does not work, as it can potentially select the
+ // default case even while there are pending writes that have not yet
+ // reached the channel.
+ go func() {
+ defer close(done)
+ for n := range e.acceptedChan {
+ n.mu.Lock()
+ n.resetConnectionLocked(tcpip.ErrConnectionAborted)
+ n.mu.Unlock()
+ n.Close()
+ }
+ }()
+ // pendingAccepted (see endpoint.deliverAccepted) tracks the number of
+ // endpoints which have completed the handshake but are not yet written
+ // to e.acceptedChan. We wait here until the goroutine above can drain
+ // all such connections from e.acceptedChan.
+ e.pendingAccepted.Wait()
+ close(e.acceptedChan)
+ <-done
+ e.acceptedChan = nil
+}
+
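deliverAccepted and closePendingAcceptableConnectionsLocked together implement a "wait for in-flight senders, then drain and close" pattern around acceptedChan: senders register with pendingAccepted while holding the endpoint mutex, and the closer waits for them before closing the channel so no send can race with the close. A minimal sketch of the same pattern, using illustrative listener/conn types rather than gVisor's actual endpoint API:

```go
package main

import (
	"fmt"
	"sync"
)

type conn struct{ id int }

type listener struct {
	mu       sync.Mutex
	open     bool
	accepted chan *conn
	inflight sync.WaitGroup // counts senders about to write to accepted
}

// deliver mirrors deliverAccepted: register as an in-flight sender and read
// the listener state while holding the lock, then send (or drop) the conn.
func (l *listener) deliver(c *conn) {
	l.mu.Lock()
	open := l.open
	l.inflight.Add(1)
	defer l.inflight.Done()
	ch := l.accepted
	l.mu.Unlock()
	if open {
		ch <- c
	}
}

// closeAcceptedLocked mirrors closePendingAcceptableConnectionsLocked: drain
// in a goroutine, wait for in-flight senders, then close the channel. It must
// be called with l.mu held so Add and Wait cannot race.
func (l *listener) closeAcceptedLocked() {
	ch := l.accepted
	done := make(chan struct{})
	go func() {
		defer close(done)
		for c := range ch {
			fmt.Println("aborting conn", c.id)
		}
	}()
	l.inflight.Wait()
	close(ch)
	<-done
	l.accepted = nil
}

func main() {
	l := &listener{open: true, accepted: make(chan *conn, 1)}
	go l.deliver(&conn{id: 1})
	l.mu.Lock()
	l.open = false
	l.closeAcceptedLocked()
	l.mu.Unlock()
}
```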
// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
@@ -582,14 +620,7 @@ func (e *endpoint) cleanupLocked() {
// Close all endpoints that might have been accepted by TCP but not by
// the client.
if e.acceptedChan != nil {
- close(e.acceptedChan)
- for n := range e.acceptedChan {
- n.mu.Lock()
- n.resetConnectionLocked(tcpip.ErrConnectionAborted)
- n.mu.Unlock()
- n.Close()
- }
- e.acceptedChan = nil
+ e.closePendingAcceptableConnectionsLocked()
}
e.workerCleanup = false
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 27e9c2e0f..bbb364214 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -138,6 +138,34 @@ type Container struct {
RootContainerDir string
}
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+ cids, err := List(rootDir)
+ if err != nil {
+ return nil, err
+ }
+
+ // Load the container metadata.
+ var containers []*Container
+ for _, cid := range cids {
+ container, err := Load(rootDir, cid)
+ if err != nil {
+ // The container file may not exist if it raced with creation/deletion
+ // or the directory was left behind. Load provides a snapshot in time,
+ // so it's fine to skip it.
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, fmt.Errorf("loading container %q: %v", id, err)
+ }
+ if container.Sandbox.ID == id {
+ containers = append(containers, container)
+ }
+ }
+ return containers, nil
+}
+
// Load loads a container with the given id from a metadata file. id may be an
// abbreviation of the full container id, in which case Load loads the
// container to which id unambiguously refers to.
@@ -180,7 +208,7 @@ func Load(rootDir, id string) (*Container, error) {
// If the status is "Running" or "Created", check that the sandbox
// process still exists, and set it to Stopped if it does not.
//
- // This is inherently racey.
+ // This is inherently racy.
if c.Status == Running || c.Status == Created {
// Check if the sandbox process is still running.
if !c.isSandboxRunning() {
@@ -237,7 +265,13 @@ func List(rootDir string) ([]string, error) {
}
var out []string
for _, f := range fs {
- out = append(out, f.Name())
+ // Filter out directories that do not belong to a container.
+ cid := f.Name()
+ if validateID(cid) == nil {
+ if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
+ out = append(out, f.Name())
+ }
+ }
}
return out, nil
}
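validateID and metadataFilename are not part of this diff; the filter above simply skips any directory whose name is not a well-formed container ID or that lacks a metadata file. The check below is offered only as an assumption about what validateID might look like, not runsc's actual implementation:

```go
package main

import (
	"fmt"
	"regexp"
)

// idRegex accepts IDs made of word characters plus a few safe punctuation
// characters; the exact character set is an assumption.
var idRegex = regexp.MustCompile(`^[\w+\-\.]+$`)

func validateID(id string) error {
	if !idRegex.MatchString(id) {
		return fmt.Errorf("invalid container id: %q", id)
	}
	return nil
}

func main() {
	fmt.Println(validateID("abc-123"))         // <nil>
	fmt.Println(validateID("not a container")) // error: spaces are rejected
}
```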
@@ -1108,6 +1142,10 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
// oom_score_adj is set to the lowest oom_score_adj among the containers
// running in the sandbox.
+//
+// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// created at the same time and end up setting the wrong oom_score_adj to the
+// sandbox.
func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
// If this container's OOMScoreAdj is nil then we can exit early as no
// change should be made to oom_score_adj for the sandbox.
@@ -1115,21 +1153,9 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
return nil
}
- ids, err := List(conf.RootDir)
+ containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
if err != nil {
- return err
- }
-
- // Load the container metadata.
- var containers []*Container
- for _, id := range ids {
- container, err := Load(conf.RootDir, id)
- if err != nil {
- return fmt.Errorf("loading container %q: %v", id, err)
- }
- if container.Sandbox.ID == c.Sandbox.ID {
- containers = append(containers, container)
- }
+ return fmt.Errorf("loading sandbox containers: %v", err)
}
// Get the lowest score for all containers.
@@ -1150,30 +1176,16 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
return nil
}
- // Set oom_score_adj for the sandbox.
+ // Set the sandbox's oom_score_adj to the lowest among all its containers.
if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
}
- // Set the gofer's oom_score_adj to the minimum of -500 and the
- // sandbox's oom_score_adj to better ensure that the sandbox is killed
- // before the gofer.
- //
- // TODO(gvisor.dev/issue/601) Set oom_score_adj for the gofer to
- // the same oom_score_adj as the sandbox.
- goferScoreAdj := -500
- if lowScore < goferScoreAdj {
- goferScoreAdj = lowScore
- }
-
- // Set oom_score_adj for gofers for all containers in the sandbox.
- for _, container := range containers {
- err := setOOMScoreAdj(container.GoferPid, goferScoreAdj)
- if err != nil {
- return fmt.Errorf("setting oom_score_adj for container %q: %v", container.ID, err)
- }
+ // Set the gofer's oom_score_adj to the container's value, since the gofer
+ // is dedicated to this container, in case the gofer uses up too much memory.
+ if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+ return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
}
-
return nil
}
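setOOMScoreAdj itself is not shown in this diff. On Linux, the usual way to adjust a process's OOM score is to write the value to /proc/&lt;pid&gt;/oom_score_adj; the sketch below is an assumption about what such a helper could look like, not necessarily runsc's implementation.

```go
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"strconv"
)

// setOOMScoreAdj writes score to /proc/<pid>/oom_score_adj. Assumed
// implementation for illustration only.
func setOOMScoreAdj(pid int, score int) error {
	path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
	if err := ioutil.WriteFile(path, []byte(strconv.Itoa(score)), 0644); err != nil {
		return fmt.Errorf("setting oom_score_adj for pid %d: %v", pid, err)
	}
	return nil
}

func main() {
	// Example: leave this process's score at 0.
	if err := setOOMScoreAdj(os.Getpid(), 0); err != nil {
		fmt.Println("error:", err)
	}
}
```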
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 2ef065a15..2d51fecc6 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -60,11 +60,14 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
}
func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+ // Set up the root dir if one hasn't been provided.
+ if len(conf.RootDir) == 0 {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+ }
+ conf.RootDir = rootDir
}
- conf.RootDir = rootDir
var containers []*Container
var bundles []string
@@ -75,7 +78,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
for _, b := range bundles {
os.RemoveAll(b)
}
- os.RemoveAll(rootDir)
+ os.RemoveAll(conf.RootDir)
}
for i, spec := range specs {
bundleDir, err := testutil.SetupBundleDir(spec)
@@ -1423,3 +1426,62 @@ func TestMultiContainerGoferKilled(t *testing.T) {
}
}
}
+
+func TestMultiContainerLoadSandbox(t *testing.T) {
+ sleep := []string{"sleep", "100"}
+ specs, ids := createSpecs(sleep, sleep, sleep)
+ conf := testutil.TestConfig()
+
+ // Create containers for the sandbox.
+ wants, cleanup, err := startContainers(conf, specs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Then create unrelated containers.
+ for i := 0; i < 3; i++ {
+ specs, ids = createSpecs(sleep, sleep, sleep)
+ _, cleanup, err = startContainers(conf, specs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+ }
+
+ // Create an unrelated directory under root.
+ dir := filepath.Join(conf.RootDir, "not-a-container")
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+ }
+
+ // Create a valid but empty container directory.
+ randomCID := testutil.UniqueContainerID()
+ dir = filepath.Join(conf.RootDir, randomCID)
+ if err := os.MkdirAll(dir, 0755); err != nil {
+ t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+ }
+
+ // Load the sandbox and check that the correct containers were returned.
+ id := wants[0].Sandbox.ID
+ gots, err := loadSandbox(conf.RootDir, id)
+ if err != nil {
+ t.Fatalf("loadSandbox()=%v", err)
+ }
+ wantIDs := make(map[string]struct{})
+ for _, want := range wants {
+ wantIDs[want.ID] = struct{}{}
+ }
+ for _, got := range gots {
+ if got.Sandbox.ID != id {
+ t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id)
+ }
+ if _, ok := wantIDs[got.ID]; !ok {
+ t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs)
+ }
+ delete(wantIDs, got.ID)
+ }
+ if len(wantIDs) != 0 {
+ t.Errorf("containers not found: %v", wantIDs)
+ }
+}
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 406f6d08a..e288c7758 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -189,11 +189,14 @@ func SetupRootDir() (string, error) {
// SetupContainer creates a bundle and root dir for the container, generates a
// test config, and writes the spec to config.json in the bundle dir.
func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) {
- rootDir, err = SetupRootDir()
- if err != nil {
- return "", "", err
+ // Set up the root dir if one hasn't been provided.
+ if len(conf.RootDir) == 0 {
+ rootDir, err = SetupRootDir()
+ if err != nil {
+ return "", "", err
+ }
+ conf.RootDir = rootDir
}
- conf.RootDir = rootDir
bundleDir, err = SetupBundleDir(spec)
return rootDir, bundleDir, err
}
diff --git a/test/runtimes/README.md b/test/runtimes/README.md
index 4e5a950bc..34d3507be 100644
--- a/test/runtimes/README.md
+++ b/test/runtimes/README.md
@@ -16,7 +16,7 @@ The following runtimes are currently supported:
1) [Install and configure Docker](https://docs.docker.com/install/)
-2) Build each Docker container:
+2) Build each Docker container from the runtimes directory:
```bash
$ docker build -f $LANG/Dockerfile [-t $NAME] .
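 # For example, to build the Go test image from the runtimes directory
 # (the tag name here is illustrative):
 $ docker build -f go/Dockerfile -t gvisor-go-tests .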
diff --git a/test/runtimes/go/Dockerfile b/test/runtimes/go/Dockerfile
index cd55608cd..1d5202b70 100644
--- a/test/runtimes/go/Dockerfile
+++ b/test/runtimes/go/Dockerfile
@@ -23,9 +23,12 @@ ENV LANG_DIR=${GOROOT}
WORKDIR ${LANG_DIR}/src
RUN ./make.bash
+# Pre-compile the tests for faster execution
+RUN ["/root/go/bin/go", "tool", "dist", "test", "-compile-only"]
WORKDIR ${LANG_DIR}
-COPY proctor-go.go ${LANG_DIR}
+COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common
+COPY go/proctor-go.go ${LANG_DIR}
ENTRYPOINT ["/root/go/bin/go", "run", "proctor-go.go"]
diff --git a/test/runtimes/java/Dockerfile b/test/runtimes/java/Dockerfile
index e162d7218..b9132b575 100644
--- a/test/runtimes/java/Dockerfile
+++ b/test/runtimes/java/Dockerfile
@@ -1,25 +1,16 @@
FROM ubuntu:bionic
# This hash is associated with a specific JDK release and needed for ensuring
# the same version is downloaded every time.
-ENV LANG_HASH=af47e0398606
-ENV LANG_VER=11u-dev
+ENV LANG_HASH=76072a077ee1
+ENV LANG_VER=11
ENV LANG_NAME=Java
RUN apt-get update && apt-get install -y \
autoconf \
build-essential \
- curl\
- file \
- libasound2-dev \
- libcups2-dev \
- libfontconfig1-dev \
- libx11-dev \
- libxext-dev \
- libxrandr-dev \
- libxrender-dev \
- libxt-dev \
- libxtst-dev \
+ curl \
make \
+ openjdk-${LANG_VER}-jdk \
unzip \
zip
@@ -27,26 +18,18 @@ WORKDIR /root
RUN curl -o go.tar.gz https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz
RUN tar -zxf go.tar.gz
-# Use curl instead of ADD to prevent redownload every time.
-RUN curl -o jdk.tar.gz http://hg.openjdk.java.net/jdk-updates/jdk${LANG_VER}/archive/${LANG_HASH}.tar.gz
-# Download Java version N-1 to be used as the Boot JDK to build Java version N.
-RUN curl -o bootjdk.tar.gz https://download.java.net/openjdk/jdk10/ri/openjdk-10+44_linux-x64_bin_ri.tar.gz
-
-RUN tar -zxf jdk.tar.gz
-RUN tar -zxf bootjdk.tar.gz
-
-# Specify the JDK to be used by jtreg.
-ENV JT_JAVA=/root/jdk${LANG_VER}-${LANG_HASH}/build/linux-x86_64-normal-server-release/jdk
-ENV LANG_DIR=/root/jdk${LANG_VER}-${LANG_HASH}
-
-WORKDIR ${LANG_DIR}
+# Download the JDK test library.
+RUN set -ex \
+ && curl -fsSL --retry 10 -o /tmp/jdktests.tar.gz http://hg.openjdk.java.net/jdk/jdk${LANG_VER}/archive/${LANG_HASH}.tar.gz/test \
+ && tar -xzf /tmp/jdktests.tar.gz -C /root \
+ && rm -f /tmp/jdktests.tar.gz
RUN curl -o jtreg.tar.gz https://ci.adoptopenjdk.net/view/Dependencies/job/jtreg/lastSuccessfulBuild/artifact/jtreg-4.2.0-tip.tar.gz
RUN tar -xzf jtreg.tar.gz
-RUN bash configure --with-boot-jdk=/root/jdk-10 --with-jtreg=${LANG_DIR}/jtreg
-RUN make clean
-RUN make images
-COPY proctor-java.go ${LANG_DIR}
+ENV LANG_DIR=/root
+
+COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common
+COPY java/proctor-java.go ${LANG_DIR}
ENTRYPOINT ["/root/go/bin/go", "run", "proctor-java.go"]
diff --git a/test/runtimes/java/proctor-java.go b/test/runtimes/java/proctor-java.go
index c1e611b4b..7f6a66f4f 100644
--- a/test/runtimes/java/proctor-java.go
+++ b/test/runtimes/java/proctor-java.go
@@ -29,6 +29,7 @@ import (
var (
dir = os.Getenv("LANG_DIR")
+ hash = os.Getenv("LANG_HASH")
jtreg = filepath.Join(dir, "jtreg/bin/jtreg")
exclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`)
)
@@ -44,7 +45,7 @@ func main() {
func (j javaRunner) ListTests() ([]string, error) {
args := []string{
- "-dir:test/jdk",
+ "-dir:/root/jdk11-" + hash + "/test/jdk",
"-ignore:quiet",
"-a",
"-listtests",
@@ -69,7 +70,7 @@ func (j javaRunner) ListTests() ([]string, error) {
}
func (j javaRunner) RunTest(test string) error {
- args := []string{"-dir:test/jdk/", test}
+ args := []string{"-noreport", "-dir:/root/jdk11-" + hash + "/test/jdk", test}
cmd := exec.Command(jtreg, args...)
cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
if err := cmd.Run(); err != nil {
diff --git a/test/runtimes/nodejs/Dockerfile b/test/runtimes/nodejs/Dockerfile
index b2416cce8..aba30d2ee 100644
--- a/test/runtimes/nodejs/Dockerfile
+++ b/test/runtimes/nodejs/Dockerfile
@@ -22,7 +22,8 @@ RUN ./configure
RUN make
RUN make test-build
-COPY proctor-nodejs.go ${LANG_DIR}
+COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common
+COPY nodejs/proctor-nodejs.go ${LANG_DIR}
# Including dumb-init emulates the Linux "init" process, preventing the failure
# of tests involving worker processes.
diff --git a/test/runtimes/php/Dockerfile b/test/runtimes/php/Dockerfile
index 1f8959b50..491ab902d 100644
--- a/test/runtimes/php/Dockerfile
+++ b/test/runtimes/php/Dockerfile
@@ -24,6 +24,7 @@ WORKDIR ${LANG_DIR}
RUN ./configure
RUN make
-COPY proctor-php.go ${LANG_DIR}
+COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common
+COPY php/proctor-php.go ${LANG_DIR}
ENTRYPOINT ["/root/go/bin/go", "run", "proctor-php.go"]
diff --git a/test/runtimes/python/Dockerfile b/test/runtimes/python/Dockerfile
index 811f48f8a..710daee43 100644
--- a/test/runtimes/python/Dockerfile
+++ b/test/runtimes/python/Dockerfile
@@ -26,6 +26,7 @@ WORKDIR ${LANG_DIR}
RUN ./configure --with-pydebug
RUN make -s -j2
-COPY proctor-python.go ${LANG_DIR}
+COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common
+COPY python/proctor-python.go ${LANG_DIR}
ENTRYPOINT ["/root/go/bin/go", "run", "proctor-python.go"]
diff --git a/test/runtimes/runtimes_test.go b/test/runtimes/runtimes_test.go
index 6bf954e78..43dd6f5b7 100644
--- a/test/runtimes/runtimes_test.go
+++ b/test/runtimes/runtimes_test.go
@@ -22,8 +22,16 @@ import (
"gvisor.dev/gvisor/runsc/test/testutil"
)
-func TestNodeJS(t *testing.T) {
- const img = "gcr.io/gvisor-proctor/nodejs"
+// Wait time for each test to run.
+const timeout = 180 * time.Second
+
+// testLang runs the docker container associated with the given language,
+// captures the output of its list function, and executes each test
+// individually, reporting any errors received.
+func testLang(t *testing.T, lang string) {
+ t.Helper()
+
+ img := "gcr.io/gvisor-presubmit/" + lang
if err := testutil.Pull(img); err != nil {
t.Fatalf("docker pull failed: %v", err)
}
@@ -49,7 +57,7 @@ func TestNodeJS(t *testing.T) {
}
defer d.CleanUp()
- status, err := d.Wait(60 * time.Second)
+ status, err := d.Wait(timeout)
if err != nil {
t.Fatalf("docker test %q failed to wait: %v", tc, err)
}
@@ -65,3 +73,23 @@ func TestNodeJS(t *testing.T) {
})
}
}
+
+func TestGo(t *testing.T) {
+ testLang(t, "go")
+}
+
+func TestJava(t *testing.T) {
+ testLang(t, "java")
+}
+
+func TestNodejs(t *testing.T) {
+ testLang(t, "nodejs")
+}
+
+func TestPhp(t *testing.T) {
+ testLang(t, "php")
+}
+
+func TestPython(t *testing.T) {
+ testLang(t, "python")
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index a3e43cad2..aa1e33fb4 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -440,7 +440,9 @@ syscall_test(
)
syscall_test(
- size = "medium",
+ size = "large",
+ parallel = False,
+ shard_count = 10,
test = "//test/syscalls/linux:socket_inet_loopback_test",
)
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index df31d25b5..322ee07ad 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -145,6 +145,67 @@ TEST_P(SocketInetLoopbackTest, TCP) {
ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
}
+TEST_P(SocketInetLoopbackTest, TCPListenClose) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), 1001), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ DisableSave ds; // Too many system calls.
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ constexpr int kFDs = 2048;
+ constexpr int kThreadCount = 4;
+ constexpr int kFDsPerThread = kFDs / kThreadCount;
+ FileDescriptor clients[kFDs];
+ std::unique_ptr<ScopedThread> threads[kThreadCount];
+ for (int i = 0; i < kFDs; i++) {
+ clients[i] = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ }
+ for (int i = 0; i < kThreadCount; i++) {
+ threads[i] = absl::make_unique<ScopedThread>([&connector, &conn_addr,
+ &clients, i]() {
+ for (int j = 0; j < kFDsPerThread; j++) {
+ int k = i * kFDsPerThread + j;
+ int ret =
+ connect(clients[k].get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ }
+ }
+ });
+ }
+ for (int i = 0; i < kThreadCount; i++) {
+ threads[i]->Join();
+ }
+ for (int i = 0; i < 32; i++) {
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+ // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
+ // before function end.
+ // ds.reset()
+}
+
TEST_P(SocketInetLoopbackTest, TCPbacklog) {
auto const& param = GetParam();
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index 1875f4533..e25f264f6 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -13,6 +13,7 @@
// limitations under the License.
#include <fcntl.h>
+#include <sys/eventfd.h>
#include <sys/sendfile.h>
#include <unistd.h>
@@ -135,6 +136,80 @@ TEST(SpliceTest, PipeOffsets) {
SyscallFailsWithErrno(ESPIPE));
}
+// Event FDs may be used with splice without an offset.
+TEST(SpliceTest, FromEventFD) {
+ // Open the input eventfd with an initial value so that it is readable.
+ constexpr uint64_t kEventFDValue = 1;
+ int efd;
+ ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds());
+ const FileDescriptor inf(efd);
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Splice 8-byte eventfd value to pipe.
+ constexpr int kEventFDSize = 8;
+ EXPECT_THAT(splice(inf.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0),
+ SyscallSucceedsWithValue(kEventFDSize));
+
+ // Contents should be equal.
+ std::vector<char> rbuf(kEventFDSize);
+ ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kEventFDSize));
+ EXPECT_EQ(memcmp(rbuf.data(), &kEventFDValue, rbuf.size()), 0);
+}
+
+// Event FDs may not be used with splice with an offset.
+TEST(SpliceTest, FromEventFDOffset) {
+ int efd;
+ ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
+ const FileDescriptor inf(efd);
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Attempt to splice 8-byte eventfd value to pipe with offset.
+ //
+ // This is not allowed because eventfd doesn't support pread.
+ constexpr int kEventFDSize = 8;
+ loff_t in_off = 0;
+ EXPECT_THAT(splice(inf.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Event FDs may not be used with splice with an offset.
+TEST(SpliceTest, ToEventFDOffset) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill with a value.
+ constexpr int kEventFDSize = 8;
+ std::vector<char> buf(kEventFDSize);
+ buf[0] = 1;
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kEventFDSize));
+
+ int efd;
+ ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
+ const FileDescriptor outf(efd);
+
+ // Attempt to splice the 8-byte value from the pipe to the eventfd with
+ // an offset.
+ //
+ // This is not allowed because eventfd doesn't support pwrite.
+ loff_t out_off = 0;
+ EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), &out_off, kEventFDSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
TEST(SpliceTest, ToPipe) {
// Open the input file.
const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());