48 files changed, 2438 insertions, 451 deletions
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 9fc6a5bc2..4f3d6410e 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 { } } +// ToDirentType converts an InodeType to a linux dirent type field. +func ToDirentType(nodeType InodeType) uint8 { + switch nodeType { + case RegularFile, SpecialFile: + return linux.DT_REG + case Symlink: + return linux.DT_LNK + case Directory, SpecialDirectory: + return linux.DT_DIR + case Pipe: + return linux.DT_FIFO + case CharacterDevice: + return linux.DT_CHR + case BlockDevice: + return linux.DT_BLK + case Socket: + return linux.DT_SOCK + default: + return linux.DT_UNKNOWN + } +} + +// ToInodeType coverts a linux file type to InodeType. +func ToInodeType(linuxFileType linux.FileMode) InodeType { + switch linuxFileType { + case linux.ModeRegular: + return RegularFile + case linux.ModeDirectory: + return Directory + case linux.ModeSymlink: + return Symlink + case linux.ModeNamedPipe: + return Pipe + case linux.ModeCharacterDevice: + return CharacterDevice + case linux.ModeBlockDevice: + return BlockDevice + case linux.ModeSocket: + return Socket + default: + panic(fmt.Sprintf("unknown file mode: %d", linuxFileType)) + } +} + // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. // diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD index e3d617576..a3b1e4bd6 100644 --- a/pkg/sentry/fs/ext/BUILD +++ b/pkg/sentry/fs/ext/BUILD @@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( - name = "dentry_list", - out = "dentry_list.go", + name = "dirent_list", + out = "dirent_list.go", package = "ext", - prefix = "dentry", + prefix = "dirent", template = "//pkg/ilist:generic_list", types = { - "Element": "*dentry", - "Linker": "*dentry", + "Element": "*dirent", + "Linker": "*dirent", }, ) @@ -20,14 +20,13 @@ go_library( srcs = [ "block_map_file.go", "dentry.go", - "dentry_list.go", "directory.go", + "dirent_list.go", "ext.go", "extent_file.go", + "file_description.go", "filesystem.go", - "inline_file.go", "inode.go", - "named_pipe.go", "regular_file.go", "symlink.go", "utils.go", @@ -38,15 +37,19 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/fd", + "//pkg/log", + "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/ext/disklayout", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", "//pkg/sentry/safemem", + "//pkg/sentry/syscalls/linux", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/waiter", ], ) @@ -73,7 +76,9 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/fs/ext/disklayout", "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/syserror", "//runsc/test/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", diff --git a/pkg/sentry/fs/ext/README.md b/pkg/sentry/fs/ext/README.md new file mode 100644 index 000000000..e212717aa --- /dev/null +++ b/pkg/sentry/fs/ext/README.md @@ -0,0 +1,117 @@ +## EXT(2/3/4) File System + +This is a filesystem driver which supports ext2, ext3 and ext4 filesystems. +Linux has specialized drivers for each variant but none which supports all. 
This +library takes advantage of ext's backward compatibility and understands the +internal organization of on-disk structures to support all variants. + +This driver implementation diverges from the Linux implementations in being more +forgiving about versioning. For instance, if a filesystem contains both extent +based inodes and classical block map based inodes, this driver will not complain +and will interpret both correctly, while in Linux this would be an issue. This +blurs the line between the three ext fs variants. + +Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has +been superseded by ext4, which brings large performance gains. Thus it is recommended to +upgrade older filesystem images to ext4 using e2fsprogs for better performance. + +### Read Only + +This driver currently supports only read-only operations. A lot of the design +decisions are based on this constraint. There are plans to implement write support (the +process for which is documented in the future work section). + +### Performance + +One of the biggest wins of this driver is that it talks directly to the +underlying block device (or whatever persistent storage is being used), instead +of making expensive RPCs to a gofer. + +Another advantage is that ext fs supports fast concurrent reads. Currently the +device is represented using an `io.ReaderAt`, which allows for concurrent reads. +All reads are passed directly to the device driver, which intelligently serves +the read requests in the optimal order. There is no congestion due to locking +while reading at the filesystem level. + +Reads are optimized further in the way file data is transferred over to user +memory. Ext fs copies file data from disk directly into user memory with no +additional allocations on the way. We can only get faster by preloading file +data into memory (see future work section). + +The internal structures used to represent files, inodes and file descriptors use +a lot of inheritance. With the level of indirection that an interface adds with +an internal pointer, it can quickly fragment a structure across memory. As this +runs alongside a full-blown kernel (which is memory intensive), having a +fragmented struct might hurt performance. Hence these internal structures, +though interfaced, are tightly packed in memory using the same inheritance +pattern that pkg/sentry/vfs uses. The pkg/sentry/fs/ext/disklayout package makes +an exception to this pattern for reasons documented in the package. + +### Security + +This driver also intends to help sandbox the container better by reducing the +surface of the host kernel that the application touches. It prevents the +application from exploiting vulnerabilities in the host filesystem driver. All +`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` calls, which are passed directly +to the device driver in the kernel. Hence this reduces the surface available for +attack. + +The application cannot affect any host filesystem other than the one passed in +as a block device by the user. + +### Future Work + +#### Write + +To support write operations we would need to modify the underlying block device. +Currently, the driver does not modify the device at all, not even to update +access times for reads. Modifying the filesystem incorrectly can corrupt it +and render it unreadable for other correct ext(x) drivers. Hence caution must be +exercised while modifying metadata structures.  + +Ext4 in particular is built for performance and adds a lot of complexity to +how metadata structures are modified. 
For instance, files are organized +via an extent tree, which must be kept balanced, and file data blocks must be placed in +the same extent as much as possible to increase locality. Such properties must +be maintained while modifying the tree. + +Ext filesystems place great emphasis on locality, which plays a big role in their +performance. The block allocation algorithm in Linux does a good job of keeping +related data together. This behavior must be maintained as much as possible, +otherwise we might end up degrading filesystem performance over time. + +Ext4 also supports a wide variety of features which are specialized for varying +use cases. Implementing all of them can get difficult very quickly. + +Ext(x) checksums all its metadata structures to check for corruption, so any +modification of a metadata struct must be accompanied by re-checksumming it. Linux filesystem drivers also order on-disk updates intelligently so as not to +corrupt the filesystem while remaining performant. The in-memory metadata +structures must be kept in sync with what is on disk. + +Some important structures are also replicated across the filesystem. +All replicas must be updated when the original copy is updated. There is also +provisioning for snapshotting, which must be kept in mind, although it should not +affect this implementation unless we allow users to create filesystem snapshots. + +Ext4 also introduced journaling (jbd2). The journal must be updated +appropriately. + +#### Performance + +To improve performance we should implement a buffer cache and, optionally, read +ahead for small files. While doing so we must also keep memory usage in mind +and place a reasonable cap on how much file data we hold in memory. + +#### Features + +Our current implementation will work with most ext4 filesystems for read-only +purposes. However, the following features are not supported yet: + +- Journal +- Snapshotting +- Extended Attributes +- Hash Tree Directories +- Meta Block Groups +- Multiple Mount Protection +- Bigalloc diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go index f30c3a174..cea89bcd9 100644 --- a/pkg/sentry/fs/ext/block_map_file.go +++ b/pkg/sentry/fs/ext/block_map_file.go @@ -85,7 +85,8 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { } offset := uint64(off) - if offset >= f.regFile.inode.diskInode.Size() { + size := f.regFile.inode.diskInode.Size() + if offset >= size { return 0, io.EOF } @@ -104,6 +105,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { read := 0 toRead := len(dst) + if uint64(toRead)+offset > size { + toRead = int(size - offset) + } for read < toRead { var err error var curR int @@ -131,6 +135,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { } } + if read < len(dst) { + return read, io.EOF + } return read, nil } diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go index 19c9b3b2d..054fb42b6 100644 --- a/pkg/sentry/fs/ext/dentry.go +++ b/pkg/sentry/fs/ext/dentry.go @@ -26,8 +26,6 @@ type dentry struct { // share a single non-directory Inode (with hard links). inode is // immutable. inode *inode - // dentryEntry links Dentries into their parent directory.childList. - dentryEntry } // Compiles only if dentry implements vfs.DentryImpl. 
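The ReadAt changes above (and the matching change to extentFile.ReadAt later in this diff) bring the ext file readers in line with the io.ReaderAt contract: when fewer than len(dst) bytes are available, the bytes that were read are returned together with io.EOF. The sketch below only illustrates that caller-visible contract using the standard library's strings.Reader; it is not code from this change, and readTail is a hypothetical helper.

```go
package main

// Sketch: a caller relying on the io.ReaderAt convention that a short read at
// end-of-file returns the bytes read together with io.EOF -- the same
// convention the ext ReadAt implementations in this diff now follow.
import (
	"fmt"
	"io"
	"strings"
)

func readTail(r io.ReaderAt, off int64, n int) ([]byte, error) {
	buf := make([]byte, n)
	read, err := r.ReadAt(buf, off)
	if err == io.EOF {
		// A short read at EOF is expected; keep whatever was read.
		return buf[:read], nil
	}
	if err != nil {
		return nil, err
	}
	return buf, nil
}

func main() {
	// strings.Reader implements io.ReaderAt with the same EOF semantics.
	r := strings.NewReader("hello world")
	tail, _ := readTail(r, 6, 16) // asks for more bytes than remain
	fmt.Printf("%q\n", tail)      // prints "world"
}
```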
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go index ab2b59e44..1ba8bf54c 100644 --- a/pkg/sentry/fs/ext/directory.go +++ b/pkg/sentry/fs/ext/directory.go @@ -14,23 +14,295 @@ package ext +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + // directory represents a directory inode. It holds the childList in memory. type directory struct { inode inode - // childList is a list containing (1) child Dentries and (2) fake Dentries - // (with inode == nil) that represent the iteration position of + // mu serializes the changes to childList. + // Lock Order (outermost locks must be taken first): + // directory.mu + // filesystem.mu + mu sync.Mutex + + // childList is a list containing (1) child dirents and (2) fake dirents + // (with diskDirent == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is immutable. - childList dentryList + // efficiently. childList is protected by mu. + childList direntList - // TODO(b/134676337): Add directory navigators. + // childMap maps the child's filename to the dirent structure stored in + // childList. This adds some data replication but helps in faster path + // traversal. For consistency, key == childMap[key].diskDirent.FileName(). + // Immutable. + childMap map[string]*dirent } // newDirectroy is the directory constructor. -func newDirectroy(inode inode) *directory { - // TODO(b/134676337): initialize childList. - file := &directory{inode: inode} +func newDirectroy(inode inode, newDirent bool) (*directory, error) { + file := &directory{inode: inode, childMap: make(map[string]*dirent)} file.inode.impl = file - return file + + // Initialize childList by reading dirents from the underlying file. + if inode.diskInode.Flags().Index { + // TODO(b/134676337): Support hash tree directories. Currently only the '.' + // and '..' entries are read in. + + // Users cannot navigate this hash tree directory yet. + log.Warningf("hash tree directory being used which is unsupported") + return file, nil + } + + // The dirents are organized in a linear array in the file data. + // Extract the file data and decode the dirents. + regFile, err := newRegularFile(inode) + if err != nil { + return nil, err + } + + // buf is used as scratch space for reading in dirents from disk and + // unmarshalling them into dirent structs. + buf := make([]byte, disklayout.DirentSize) + size := inode.diskInode.Size() + for off, inc := uint64(0), uint64(0); off < size; off += inc { + toRead := size - off + if toRead > disklayout.DirentSize { + toRead = disklayout.DirentSize + } + if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead { + return nil, err + } + + var curDirent dirent + if newDirent { + curDirent.diskDirent = &disklayout.DirentNew{} + } else { + curDirent.diskDirent = &disklayout.DirentOld{} + } + binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + + if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { + // Inode number and name length fields being set to 0 is used to indicate + // an unused dirent. 
+ file.childList.PushBack(&curDirent) + file.childMap[curDirent.diskDirent.FileName()] = &curDirent + } + + // The next dirent is placed exactly after this dirent record on disk. + inc = uint64(curDirent.diskDirent.RecordSize()) + } + + return file, nil +} + +func (i *inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +// dirent is the directory.childList node. +type dirent struct { + diskDirent disklayout.Dirent + + // direntEntry links dirents into their parent directory.childList. + direntEntry +} + +// directoryFD represents a directory file description. It implements +// vfs.FileDescriptionImpl. +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by directory.mu. + iter *dirent + off int64 +} + +// Compiles only if directoryFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter == nil { + return + } + + dir := fd.inode().impl.(*directory) + dir.mu.Lock() + dir.childList.Remove(fd.iter) + dir.mu.Unlock() + fd.iter = nil +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + extfs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Ensure that fd.iter exists and is not linked into dir.childList. + var child *dirent + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &dirent{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for ; child != nil; child = child.Next() { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + childType, ok := child.diskDirent.FileType() + if !ok { + // We will need to read the inode off disk. Do not increment + // ref count here because this inode is not being added to the + // dentry tree. + extfs.mu.Lock() + childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode()) + extfs.mu.Unlock() + if err != nil { + // Usage of the file description after the error is + // undefined. This implementation would continue reading + // from the next dirent. + fd.off++ + dir.childList.InsertAfter(child, fd.iter) + return err + } + childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) + } + + if !cb.Handle(vfs.Dirent{ + Name: child.diskDirent.FileName(), + Type: fs.ToDirentType(childType), + Ino: uint64(child.diskDirent.Inode()), + Off: fd.off, + }) { + dir.childList.InsertBefore(child, fd.iter) + return nil + } + fd.off++ + } + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if whence != linux.SEEK_SET && whence != linux.SEEK_CUR { + return 0, syserror.EINVAL + } + + dir := fd.inode().impl.(*directory) + + dir.mu.Lock() + defer dir.mu.Unlock() + + // Find resulting offset. + if whence == linux.SEEK_CUR { + offset += fd.off + } + + if offset < 0 { + // lseek(2) specifies that EINVAL should be returned if the resulting offset + // is negative. 
+ return 0, syserror.EINVAL + } + + n := int64(len(dir.childMap)) + realWantOff := offset + if realWantOff > n { + realWantOff = n + } + realCurOff := fd.off + if realCurOff > n { + realCurOff = n + } + + // Ensure that fd.iter exists and is linked into dir.childList so we can + // intelligently seek from the optimal position. + if fd.iter == nil { + fd.iter = &dirent{} + dir.childList.PushFront(fd.iter) + } + + // Guess that iterating from the current position is optimal. + child := fd.iter + diff := realWantOff - realCurOff // Shows direction and magnitude of travel. + + // See if starting from the beginning or end is better. + abDiff := diff + if diff < 0 { + abDiff = -diff + } + if abDiff > realWantOff { + // Starting from the beginning is best. + child = dir.childList.Front() + diff = realWantOff + } else if abDiff > (n - realWantOff) { + // Starting from the end is best. + child = dir.childList.Back() + // (n - 1) because the last non-nil dirent represents the (n-1)th offset. + diff = realWantOff - (n - 1) + } + + for child != nil { + // Skip other directoryFD iterators. + if child.diskDirent != nil { + if diff == 0 { + if child != fd.iter { + dir.childList.Remove(fd.iter) + dir.childList.InsertBefore(child, fd.iter) + } + + fd.off = offset + return offset, nil + } + + if diff < 0 { + diff++ + child = child.Prev() + } else { + diff-- + child = child.Next() + } + continue + } + + if diff < 0 { + child = child.Prev() + } else { + child = child.Next() + } + } + + // Reaching here indicates that the offset is beyond the end of the childList. + dir.childList.Remove(fd.iter) + dir.childList.PushBack(fd.iter) + fd.off = offset + return offset, nil +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + // mmap(2) specifies that EACCESS should be returned for non-regular file fds. + return syserror.EACCES } diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go index 685bf57b8..417b6cf65 100644 --- a/pkg/sentry/fs/ext/disklayout/dirent.go +++ b/pkg/sentry/fs/ext/disklayout/dirent.go @@ -21,6 +21,9 @@ import ( const ( // MaxFileName is the maximum length of an ext fs file's name. MaxFileName = 255 + + // DirentSize is the size of ext dirent structures. + DirentSize = 263 ) var ( diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go index cc6dff2c9..934919f8a 100644 --- a/pkg/sentry/fs/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go @@ -21,8 +21,6 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. func TestDirentSize(t *testing.T) { - want := uintptr(263) - - assertSize(t, DirentOld{}, want) - assertSize(t, DirentNew{}, want) + assertSize(t, DirentOld{}, uintptr(DirentSize)) + assertSize(t, DirentNew{}, uintptr(DirentSize)) } diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go index 7a337a5e0..8bb327006 100644 --- a/pkg/sentry/fs/ext/disklayout/superblock.go +++ b/pkg/sentry/fs/ext/disklayout/superblock.go @@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures { // This is not exhaustive, unused features are not listed. const ( // SbDirentFileType indicates that directory entries record the file type. - // We should use struct ext4_dir_entry_2 for dirents then. + // We should use struct DirentNew for dirents then. 
SbDirentFileType = 0x2 // SbRecovery indicates that the filesystem needs recovery. diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go index d303dd122..c3e2c9efb 100644 --- a/pkg/sentry/fs/ext/ext.go +++ b/pkg/sentry/fs/ext/ext.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -62,8 +63,40 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, err return fd.NewReadWriter(devFd), nil } +// isCompatible checks if the superblock has feature sets which are compatible. +// We only need to check the superblock incompatible feature set since we are +// mounting readonly. We will also need to check readonly compatible feature +// set when mounting for read/write. +func isCompatible(sb disklayout.SuperBlock) bool { + // Please note that what is being checked is limited based on the fact that we + // are mounting readonly and that we are not journaling. When mounting + // read/write or with a journal, this must be reevaluated. + incompatFeatures := sb.IncompatibleFeatures() + if incompatFeatures.MetaBG { + log.Warningf("ext fs: meta block groups are not supported") + return false + } + if incompatFeatures.MMP { + log.Warningf("ext fs: multiple mount protection is not supported") + return false + } + if incompatFeatures.Encrypted { + log.Warningf("ext fs: encrypted inodes not supported") + return false + } + if incompatFeatures.InlineData { + log.Warningf("ext fs: inline files not supported") + return false + } + return true +} + // NewFilesystem implements vfs.FilesystemType.NewFilesystem. func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // TODO(b/134676337): Ensure that the user is mounting readonly. If not, + // EACCESS should be returned according to mount(2). Filesystem independent + // flags (like readonly) are currently not available in pkg/sentry/vfs. + dev, err := getDeviceFd(source, opts) if err != nil { return nil, nil, err @@ -82,15 +115,21 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred return nil, nil, syserror.EINVAL } + // Refuse to mount if the filesystem is incompatible. 
+ if !isCompatible(fs.sb) { + return nil, nil, syserror.EINVAL + } + fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { return nil, nil, err } - rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode) + rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { return nil, nil, err } + rootInode.incRef() return &fs.vfsfs, &newDentry(rootInode).vfsd, nil } diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go index 6396886cc..270f38074 100644 --- a/pkg/sentry/fs/ext/ext_test.go +++ b/pkg/sentry/fs/ext/ext_test.go @@ -16,17 +16,22 @@ package ext import ( "fmt" + "io" "os" "path" + "sort" "testing" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/test/testutil" ) @@ -44,7 +49,7 @@ var ( // setUp opens imagePath as an ext Filesystem and returns all necessary // elements required to run tests. If error is non-nil, it also returns a tear // down function which must be called after the test is run for clean up. -func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) { +func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) { localImagePath, err := testutil.FindFile(imagePath) if err != nil { return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err) @@ -55,20 +60,537 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v return nil, nil, nil, nil, err } - // Mount the ext4 fs and retrieve the inode structure for the file. - mockCtx := contexttest.Context(t) - fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("extfs", filesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err } + root := mntns.Root() + tearDown := func() { + root.DecRef() + if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) } } - return mockCtx, fs, d, tearDown, nil + return ctx, vfsObj, &root, tearDown, nil +} + +// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and +// vfs.FilesystemImpl.StatFSAt which are not implemented in +// vfs.VirtualFilesystem yet. + +// TestSeek tests vfs.FileDescriptionImpl.Seek functionality. 
+func TestSeek(t *testing.T) { + type seekTest struct { + name string + image string + path string + } + + tests := []seekTest{ + { + name: "ext4 root dir seek", + image: ext4ImagePath, + path: "/", + }, + { + name: "ext3 root dir seek", + image: ext3ImagePath, + path: "/", + }, + { + name: "ext2 root dir seek", + image: ext2ImagePath, + path: "/", + }, + { + name: "ext4 reg file seek", + image: ext4ImagePath, + path: "/file.txt", + }, + { + name: "ext3 reg file seek", + image: ext3ImagePath, + path: "/file.txt", + }, + { + name: "ext2 reg file seek", + image: ext2ImagePath, + path: "/file.txt", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { + t.Errorf("expected seek position 0, got %d and error %v", n, err) + } + + stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) + } + + // We should be able to seek beyond the end of file. + size := int64(stat.Size) + if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + + if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) + } + + // Make sure negative offsets work with SEEK_CUR. + if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + + // Make sure SEEK_END works with regular files. + switch fd.Impl().(type) { + case *regularFileFD: + // Seek back to 0. + if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) + } + + // Seek forward beyond EOF. + if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { + t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) + } + + // EINVAL should be returned if the resulting offset is negative. + if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { + t.Errorf("expected error EINVAL but got %v", err) + } + } + }) + } +} + +// TestStatAt tests filesystem.StatAt functionality. 
+func TestStatAt(t *testing.T) { + type statAtTest struct { + name string + image string + path string + want linux.Statx + } + + tests := []statAtTest{ + { + name: "ext4 statx small file", + image: ext4ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext3 statx small file", + image: ext3ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext2 statx small file", + image: ext2ImagePath, + path: "/file.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13, + }, + }, + { + name: "ext4 statx big file", + image: ext4ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext3 statx big file", + image: ext3ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext2 statx big file", + image: ext2ImagePath, + path: "/bigfile.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0644 | linux.ModeRegular, + Size: 13042, + }, + }, + { + name: "ext4 statx symlink file", + image: ext4ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + { + name: "ext3 statx symlink file", + image: ext3ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + { + name: "ext2 statx symlink file", + image: ext2ImagePath, + path: "/symlink.txt", + want: linux.Statx{ + Blksize: 0x400, + Nlink: 1, + UID: 0, + GID: 0, + Mode: 0777 | linux.ModeSymlink, + Size: 8, + }, + }, + } + + // Ignore the fields that are not supported by filesystem.StatAt yet and + // those which are likely to change as the image does. + ignoredFields := map[string]bool{ + "Attributes": true, + "AttributesMask": true, + "Atime": true, + "Blocks": true, + "Btime": true, + "Ctime": true, + "DevMajor": true, + "DevMinor": true, + "Ino": true, + "Mask": true, + "Mtime": true, + "RdevMajor": true, + "RdevMinor": true, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + got, err := vfsfs.StatAt(ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.StatOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err) + } + + cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { + _, ok := ignoredFields[p.String()] + return ok + }, cmp.Ignore()) + if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" { + t.Errorf("stat mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// TestRead tests the read functionality for vfs file descriptions. 
+func TestRead(t *testing.T) { + type readTest struct { + name string + image string + absPath string + } + + tests := []readTest{ + { + name: "ext4 read small file", + image: ext4ImagePath, + absPath: "/file.txt", + }, + { + name: "ext3 read small file", + image: ext3ImagePath, + absPath: "/file.txt", + }, + { + name: "ext2 read small file", + image: ext2ImagePath, + absPath: "/file.txt", + }, + { + name: "ext4 read big file", + image: ext4ImagePath, + absPath: "/bigfile.txt", + }, + { + name: "ext3 read big file", + image: ext3ImagePath, + absPath: "/bigfile.txt", + }, + { + name: "ext2 read big file", + image: ext2ImagePath, + absPath: "/bigfile.txt", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + // Get a local file descriptor and compare its functionality with a vfs file + // description for the same file. + localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath)) + if err != nil { + t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err) + } + + f, err := os.Open(localFile) + if err != nil { + t.Fatalf("os.Open failed for %s: %v", localFile, err) + } + defer f.Close() + + // Read the entire file by reading one byte repeatedly. Doing this stress + // tests the underlying file reader implementation. + got := make([]byte, 1) + want := make([]byte, 1) + for { + n, err := f.Read(want) + fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) + + if diff := cmp.Diff(got, want); diff != "" { + t.Errorf("file data mismatch (-want +got):\n%s", diff) + } + + // Make sure there is no more file data left after getting EOF. + if n == 0 || err == io.EOF { + if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { + t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) + } + + break + } + + if err != nil { + t.Fatalf("read failed: %v", err) + } + } + }) + } +} + +// iterDirentsCb is a simple callback which just keeps adding the dirents to an +// internal list. Implements vfs.IterDirentsCallback. +type iterDirentsCb struct { + dirents []vfs.Dirent +} + +// Compiles only if iterDirentCb implements vfs.IterDirentsCallback. +var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil) + +// newIterDirentsCb is the iterDirent +func newIterDirentCb() *iterDirentsCb { + return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)} +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool { + cb.dirents = append(cb.dirents, dirent) + return true +} + +// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. 
+func TestIterDirents(t *testing.T) { + type iterDirentTest struct { + name string + image string + path string + want []vfs.Dirent + } + + wantDirents := []vfs.Dirent{ + vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + }, + vfs.Dirent{ + Name: "..", + Type: linux.DT_DIR, + }, + vfs.Dirent{ + Name: "lost+found", + Type: linux.DT_DIR, + }, + vfs.Dirent{ + Name: "file.txt", + Type: linux.DT_REG, + }, + vfs.Dirent{ + Name: "bigfile.txt", + Type: linux.DT_REG, + }, + vfs.Dirent{ + Name: "symlink.txt", + Type: linux.DT_LNK, + }, + } + tests := []iterDirentTest{ + { + name: "ext4 root dir iteration", + image: ext4ImagePath, + path: "/", + want: wantDirents, + }, + { + name: "ext3 root dir iteration", + image: ext3ImagePath, + path: "/", + want: wantDirents, + }, + { + name: "ext2 root dir iteration", + image: ext2ImagePath, + path: "/", + want: wantDirents, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, vfsfs, root, tearDown, err := setUp(t, test.image) + if err != nil { + t.Fatalf("setUp failed: %v", err) + } + defer tearDown() + + fd, err := vfsfs.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + cb := &iterDirentsCb{} + if err = fd.Impl().IterDirents(ctx, cb); err != nil { + t.Fatalf("dir fd.IterDirents() failed: %v", err) + } + + sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name }) + sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name }) + + // Ignore the inode number and offset of dirents because those are likely to + // change as the underlying image changes. + cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool { + return p.String() == "Ino" || p.String() == "Off" + }, cmp.Ignore()) + if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" { + t.Errorf("dirents mismatch (-want +got):\n%s", diff) + } + }) + } } // TestRootDir tests that the root directory inode is correctly initialized and @@ -126,15 +648,15 @@ func TestRootDir(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - _, _, vfsd, tearDown, err := setUp(t, test.image) + _, _, vd, tearDown, err := setUp(t, test.image) if err != nil { t.Fatalf("setUp failed: %v", err) } defer tearDown() - d, ok := vfsd.Impl().(*dentry) + d, ok := vd.Dentry().Impl().(*dentry) if !ok { - t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl()) + t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl()) } // Offload inode contents into local structs for comparison. 
@@ -329,15 +851,15 @@ func TestFilesystemInit(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - _, vfsfs, _, tearDown, err := setUp(t, test.image) + _, _, vd, tearDown, err := setUp(t, test.image) if err != nil { t.Fatalf("setUp failed: %v", err) } defer tearDown() - fs, ok := vfsfs.Impl().(*filesystem) + fs, ok := vd.Mount().Filesystem().Impl().(*filesystem) if !ok { - t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl()) + t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl()) } // Offload superblock and block group descriptors contents into diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go index 44fb9c01f..1b9bf449b 100644 --- a/pkg/sentry/fs/ext/extent_file.go +++ b/pkg/sentry/fs/ext/extent_file.go @@ -150,7 +150,11 @@ func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) { return 0, io.EOF } - return f.read(&f.root, uint64(off), dst) + n, err := f.read(&f.root, uint64(off), dst) + if n < len(dst) && err == nil { + err = io.EOF + } + return n, err } // read is the recursive step of extentFile.ReadAt which traverses the extent diff --git a/pkg/sentry/fs/ext/file_description.go b/pkg/sentry/fs/ext/file_description.go new file mode 100644 index 000000000..d244cf1e7 --- /dev/null +++ b/pkg/sentry/fs/ext/file_description.go @@ -0,0 +1,110 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ext + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileDescription is embedded by ext implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + + // flags is the same as vfs.OpenOptions.Flags which are passed to + // vfs.FilesystemImpl.OpenAt. + // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2), + // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set. + // Only close(2), fstat(2), fstatfs(2) should work. + flags uint32 +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) inode() *inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *fileDescription) OnClose() error { return nil } + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. 
+ return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// SetStat implements vfs.FileDescriptionImpl.StatFS. +func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + var stat linux.Statfs + fd.filesystem().statTo(&stat) + return stat, nil +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (fd *fileDescription) EventUnregister(e *waiter.Entry) {} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *fileDescription) Sync(ctx context.Context) error { + return nil +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // ioctl(2) specifies that ENOTTY must be returned if the file descriptor is + // not associated with a character special device (which is unimplemented). + return 0, syserror.ENOTTY +} diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go index 45b43b9e2..e08839f48 100644 --- a/pkg/sentry/fs/ext/filesystem.go +++ b/pkg/sentry/fs/ext/filesystem.go @@ -15,20 +15,27 @@ package ext import ( + "errors" "io" "sync" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) +var ( + // errResolveDirent indicates that the vfs.ResolvingPath.Component() does + // not exist on the dentry tree but does exist on disk. So it has to be read in + // using the in-memory dirent and added to the dentry tree. Usually indicates + // the need to lock filesystem.mu for writing. + errResolveDirent = errors.New("resolve path component using dirent") +) + // filesystem implements vfs.FilesystemImpl. type filesystem struct { - // TODO(b/134676337): Remove when all methods have been implemented. - vfs.FilesystemImpl - vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. @@ -44,8 +51,8 @@ type filesystem struct { // inodeCache maps absolute inode numbers to the corresponding Inode struct. // Inodes should be removed from this once their reference count hits 0. // - // Protected by mu because every addition and removal from this corresponds to - // a change in the dentry tree. + // Protected by mu because most additions (see IterDirents) and all removals + // from this corresponds to a change in the dentry tree. inodeCache map[uint32]*inode // sb represents the filesystem superblock. Immutable after initialization. 
@@ -59,16 +66,172 @@ type filesystem struct { // Compiles only if filesystem implements vfs.FilesystemImpl. var _ vfs.FilesystemImpl = (*filesystem)(nil) -// getOrCreateInode gets the inode corresponding to the inode number passed in. +// stepLocked resolves rp.Component() in parent directory vfsd. The write +// parameter passed tells if the caller has acquired filesystem.mu for writing +// or not. If set to true, an existing inode on disk can be added to the dentry +// tree if not present already. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). +// - inode == vfsd.Impl().(*Dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, nil, err + } + + for { + nextVFSD, err := rp.ResolveComponent(vfsd) + if err != nil { + return nil, nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is not the sole source of truth for extfs, if it's + // not in the Dentry tree, it might need to be pulled from disk. + childDirent, ok := inode.impl.(*directory).childMap[rp.Component()] + if !ok { + // The underlying inode does not exist on disk. + return nil, nil, syserror.ENOENT + } + + if !write { + // filesystem.mu must be held for writing to add to the dentry tree. + return nil, nil, errResolveDirent + } + + // Create and add the component's dirent to the dentry tree. + fs := rp.Mount().Filesystem().Impl().(*filesystem) + childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) + if err != nil { + return nil, nil, err + } + // incRef because this is being added to the dentry tree. + childInode.incRef() + child := newDentry(childInode) + vfsd.InsertChild(&child.vfsd, rp.Component()) + + // Continue as usual now that nextVFSD is not nil. + nextVFSD = &child.vfsd + } + nextInode := nextVFSD.Impl().(*dentry).inode + if nextInode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil { + return nil, nil, err + } + continue + } + rp.Advance() + return nextVFSD, nextInode, nil + } +} + +// walkLocked resolves rp to an existing file. The write parameter +// passed tells if the caller has acquired filesystem.mu for writing or not. +// If set to true, additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Done() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if rp.MustBeDir() && !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walkParentLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. The write parameter passed tells if the +// caller has acquired filesystem.mu for writing or not. 
If set to true, +// additions can be made to the dentry tree while walking. +// If errResolveDirent is returned, the walk needs to be continued with an +// upgraded filesystem.mu. +// +// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). +// +// Preconditions: +// - filesystem.mu must be locked (for writing if write param is true). +// - !rp.Done(). +func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*dentry).inode + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + if err != nil { + return nil, nil, err + } + } + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walk resolves rp to an existing file. If parent is set to true, it resolves +// the rp till the parent of the last component which should be an existing +// directory. If parent is false then resolves rp entirely. Attemps to resolve +// the path as far as it can with a read lock and upgrades the lock if needed. +func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { + var ( + vfsd *vfs.Dentry + inode *inode + err error + ) + + // Try walking with the hopes that all dentries have already been pulled out + // of disk. This reduces congestion (allows concurrent walks). + fs.mu.RLock() + if parent { + vfsd, inode, err = walkParentLocked(rp, false) + } else { + vfsd, inode, err = walkLocked(rp, false) + } + fs.mu.RUnlock() + + if err == errResolveDirent { + // Upgrade lock and continue walking. Lock upgrading in the middle of the + // walk is fine as this is a read only filesystem. + fs.mu.Lock() + if parent { + vfsd, inode, err = walkParentLocked(rp, true) + } else { + vfsd, inode, err = walkLocked(rp, true) + } + fs.mu.Unlock() + } + + return vfsd, inode, err +} + +// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. // It creates a new one with the given inode number if one does not exist. +// The caller must increment the ref count if adding this to the dentry tree. // -// Precondition: must be holding fs.mu. -func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) { +// Precondition: must be holding fs.mu for writing. +func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { if in, ok := fs.inodeCache[inodeNum]; ok { return in, nil } - in, err := newInode(ctx, fs, inodeNum) + in, err := newInode(fs, inodeNum) if err != nil { return nil, err } @@ -77,10 +240,92 @@ func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*i return in, nil } -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +// statTo writes the statfs fields to the output parameter. +func (fs *filesystem) statTo(stat *linux.Statfs) { + stat.Type = uint64(fs.sb.Magic()) + stat.BlockSize = int64(fs.sb.BlockSize()) + stat.Blocks = fs.sb.BlocksCount() + stat.BlocksFree = fs.sb.FreeBlocksCount() + stat.BlocksAvailable = fs.sb.FreeBlocksCount() + stat.Files = uint64(fs.sb.InodesCount()) + stat.FilesFree = uint64(fs.sb.FreeInodesCount()) + stat.NameLength = disklayout.MaxFileName + stat.FragmentSize = int64(fs.sb.BlockSize()) + // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + + inode.incRef() + return vfsd, nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + vfsd, inode, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + // EROFS is returned if write access is needed. + if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { + return nil, syserror.EROFS + } + return inode.open(rp, vfsd, opts.Flags) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return "", err + } + symlink, ok := inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + _, inode, err := fs.walk(rp, false) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + inode.statTo(&stat) + return stat, nil } +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + if _, _, err := fs.walk(rp, false); err != nil { + return linux.Statfs{}, err + } + + var stat linux.Statfs + fs.statTo(&stat) + return stat, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() {} + // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // This is a readonly filesystem for now. @@ -89,42 +334,110 @@ func (fs *filesystem) Sync(ctx context.Context) error { // The vfs.FilesystemImpl functions below return EROFS because their respective // man pages say that EROFS must be returned if the path resolves to a file on -// a read-only filesystem. +// this read-only filesystem. -// TODO(b/134676337): Implement path traversal and return EROFS only if the -// path resolves to a Dentry within ext fs. +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + + return syserror.EROFS +} // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + if _, _, err := fs.walk(rp, true); err != nil { + return err + } + return syserror.EROFS } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + return syserror.EROFS } // RenameAt implements vfs.FilesystemImpl.RenameAt. 
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + if rp.Done() { + return syserror.ENOENT + } + + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if !inode.isDir() { + return syserror.ENOTDIR + } + return syserror.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.EROFS } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + + _, _, err := fs.walk(rp, true) + if err != nil { + return err + } + return syserror.EROFS } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + + if inode.isDir() { + return syserror.EISDIR + } + return syserror.EROFS } diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go deleted file mode 100644 index 67a538ba0..000000000 --- a/pkg/sentry/fs/ext/inline_file.go +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "io" -) - -// inlineFile is a type of regular file. All the data here is stored in the -// inode.Data() array. -type inlineFile struct { - regFile regularFile -} - -// Compiles only if inlineFile implements io.ReaderAt. -var _ io.ReaderAt = (*inlineFile)(nil) - -// newInlineFile is the inlineFile constructor. -func newInlineFile(regFile regularFile) *inlineFile { - file := &inlineFile{regFile: regFile} - file.regFile.impl = file - return file -} - -// ReadAt implements io.ReaderAt.ReadAt. 
-func (f *inlineFile) ReadAt(dst []byte, off int64) (int, error) { - if len(dst) == 0 { - return 0, nil - } - - size := f.regFile.inode.diskInode.Size() - if uint64(off) >= size { - return 0, io.EOF - } - - to := uint64(off) + uint64(len(dst)) - if to > size { - to = size - } - - n := copy(dst, f.regFile.inode.diskInode.Data()[off:to]) - return n, nil -} diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go index 364980e4c..178bd6376 100644 --- a/pkg/sentry/fs/ext/inode.go +++ b/pkg/sentry/fs/ext/inode.go @@ -15,12 +15,14 @@ package ext import ( + "fmt" "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,13 +33,11 @@ import ( // // Implementations: // inode -- -// |-- pipe // |-- dir // |-- symlink // |-- regular-- // |-- extent file // |-- block map file -// |-- inline file type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 @@ -92,7 +92,7 @@ func (in *inode) decRef(fs *filesystem) { // newInode is the inode constructor. Reads the inode off disk. Identifies // inodes based on the absolute inode number on disk. -func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, error) { +func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { if inodeNum == 0 { panic("inode number 0 on ext filesystems is not possible") } @@ -117,7 +117,6 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err // Build the inode based on its type. inode := inode{ - refs: 1, inodeNum: inodeNum, dev: fs.dev, blkSize: blkSize, @@ -138,15 +137,76 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err } return &f.inode, nil case linux.ModeDirectory: - return &newDirectroy(inode).inode, nil - case linux.ModeNamedPipe: - return &newNamedPipe(ctx, inode).inode, nil + f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType) + if err != nil { + return nil, err + } + return &f.inode, nil default: - // TODO(b/134676337): Return appropriate errors for sockets and devices. + // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices. return nil, syserror.EINVAL } } +// open creates and returns a file description for the dentry passed in. +func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if err := in.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + switch in.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.flags = flags + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. This check is not necessary for a read + // only filesystem but will be required when write is implemented. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + fd.flags = flags + return &fd.vfsfd, nil + case *symlink: + if flags&linux.O_PATH == 0 { + // Can't open symlinks without O_PATH. 
+ return nil, syserror.ELOOP + } + var fd symlinkFD + fd.flags = flags + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + return &fd.vfsfd, nil + default: + panic(fmt.Sprintf("unknown inode type: %T", in.impl)) + } +} + +func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID()) +} + +// statTo writes the statx fields to the output parameter. +func (in *inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME + stat.Blksize = uint32(in.blkSize) + stat.Mode = uint16(in.diskInode.Mode()) + stat.Nlink = uint32(in.diskInode.LinksCount()) + stat.UID = uint32(in.diskInode.UID()) + stat.GID = uint32(in.diskInode.GID()) + stat.Ino = uint64(in.inodeNum) + stat.Size = in.diskInode.Size() + stat.Atime = in.diskInode.AccessTime().StatxTimestamp() + stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() + stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks + // (including metadata blocks) required to represent this file. +} + // getBGNum returns the block group number that a given inode belongs to. func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 { return (inodeNum - 1) / inodesPerGrp diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go deleted file mode 100644 index 0f3af1b53..000000000 --- a/pkg/sentry/fs/ext/named_pipe.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ext - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -// namedPipe represents a named pipe inode. It is currently just a wrapper -// around pkg/sentry/kernel/pipe. -type namedPipe struct { - inode inode - - p *pipe.Pipe - inodeOps fs.InodeOperations -} - -// newNamedPipe is the namedPipe constructor. 
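For context on the symlink branch of open above: on Linux a symlink itself can only be opened with O_PATH, which is exactly what the ELOOP return and the EBADF-everywhere symlinkFD mirror. A small host-side illustration of that kernel behavior follows; the temporary directory setup is purely for the demo and does not involve the ext driver.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
)

func main() {
	dir, err := os.MkdirTemp("", "symlink-demo")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	target := filepath.Join(dir, "target")
	link := filepath.Join(dir, "link")
	if err := os.WriteFile(target, []byte("hello"), 0644); err != nil {
		panic(err)
	}
	if err := os.Symlink(target, link); err != nil {
		panic(err)
	}

	// Without O_PATH, opening the link itself (O_NOFOLLOW) fails with ELOOP.
	_, err = unix.Open(link, unix.O_RDONLY|unix.O_NOFOLLOW, 0)
	fmt.Println(err == unix.ELOOP) // true

	// With O_PATH the open succeeds, but the descriptor cannot be read:
	// read(2) on an O_PATH fd fails with EBADF, just like symlinkFD.Read.
	fd, err := unix.Open(link, unix.O_PATH|unix.O_NOFOLLOW, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)
	buf := make([]byte, 8)
	_, err = unix.Read(fd, buf)
	fmt.Println(err == unix.EBADF) // true
}
```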
-func newNamedPipe(ctx context.Context, inode inode) *namedPipe { - file := &namedPipe{inode: inode} - file.inode.impl = file - file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) - file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p) - return file -} diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go index fb1bd38ef..ffc76ba5b 100644 --- a/pkg/sentry/fs/ext/regular_file.go +++ b/pkg/sentry/fs/ext/regular_file.go @@ -16,6 +16,15 @@ package ext import ( "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // regularFile represents a regular file's inode. This too follows the @@ -26,6 +35,9 @@ type regularFile struct { // This is immutable. The first field of fileReader implementations must be // regularFile to ensure temporality. + // io.ReaderAt is more strict than io.Reader in the sense that a partial read + // is always accompanied by an error. If a read spans past the end of file, a + // partial read (within file range) is done and io.EOF is returned. impl io.ReaderAt } @@ -48,16 +60,6 @@ func newRegularFile(inode inode) (*regularFile, error) { return &file.regFile, nil } - if inodeFlags.Inline { - if inode.diskInode.Size() > 60 { - panic("ext fs: inline file larger than 60 bytes") - } - - file := newInlineFile(regFile) - file.regFile.inode.impl = &file.regFile - return &file.regFile, nil - } - file, err := newBlockMapFile(regFile) if err != nil { return nil, err @@ -66,6 +68,92 @@ func newRegularFile(inode inode) (*regularFile, error) { return &file.regFile, nil } -func (f *regularFile) blksUsed(blkSize uint64) uint64 { - return (f.inode.diskInode.Size() + blkSize - 1) / blkSize +func (in *inode) isRegular() bool { + _, ok := in.impl.(*regularFile) + return ok +} + +// directoryFD represents a directory file description. It implements +// vfs.FileDescriptionImpl. +type regularFileFD struct { + fileDescription + + // off is the file offset. off is accessed using atomic memory operations. + off int64 + + // offMu serializes operations that may mutate off. + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + safeReader := safemem.FromIOReaderAt{ + ReaderAt: fd.inode().impl.(*regularFile).impl, + Offset: offset, + } + + // Copies data from disk directly into usermem without any intermediate + // allocations (if dst is converted into BlockSeq such that it does not need + // safe copying). + return dst.CopyOutFrom(ctx, safeReader) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // write(2) specifies that EBADF must be returned if the fd is not open for + // writing. 
+ return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.offMu.Lock() + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(fd.inode().diskInode.Size()) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + // TODO(b/134676337): Implement mmap(2). + return syserror.ENODEV } diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go index 9f498d989..e06548a98 100644 --- a/pkg/sentry/fs/ext/symlink.go +++ b/pkg/sentry/fs/ext/symlink.go @@ -15,6 +15,10 @@ package ext import ( + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -43,8 +47,8 @@ func newSymlink(inode inode) (*symlink, error) { } link = make([]byte, size) - if n, _ := regFile.impl.ReadAt(link, 0); uint64(n) < size { - return nil, syserror.EIO + if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size { + return nil, err } } @@ -52,3 +56,56 @@ func newSymlink(inode inode) (*symlink, error) { file.inode.impl = file return file, nil } + +func (in *inode) isSymlink() bool { + _, ok := in.impl.(*symlink) + return ok +} + +// symlinkFD represents a symlink file description and implements implements +// vfs.FileDescriptionImpl. which may only be used if open options contains +// O_PATH. For this reason most of the functions return EBADF. +type symlinkFD struct { + fileDescription +} + +// Compiles only if symlinkFD implements vfs.FileDescriptionImpl. +var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *symlinkFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EBADF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
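The io.ReaderAt contract noted on regularFile.impl, and relied on by both regularFileFD.PRead and the newSymlink change above, is that a short read is always accompanied by an error. A quick standard-library demonstration using strings.Reader, which implements io.ReaderAt:

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

func main() {
	r := strings.NewReader("symlink target") // 14 bytes, implements io.ReaderAt

	// A read fully inside the data returns len(dst) bytes and a nil error.
	dst := make([]byte, 7)
	n, err := r.ReadAt(dst, 0)
	fmt.Println(n, err) // 7 <nil>

	// A read spanning past the end returns the partial count *and* io.EOF in
	// the same call; a ReaderAt never returns a short read with a nil error.
	dst = make([]byte, 10)
	n, err = r.ReadAt(dst, 8)
	fmt.Println(n, err == io.EOF) // 6 true

	// A read starting at or beyond the end returns 0 and io.EOF.
	n, err = r.ReadAt(dst, 14)
	fmt.Println(n, err == io.EOF) // 0 true
}
```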
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.EBADF +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + return syserror.EBADF +} diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index d5d4f68df..d2450e810 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -11,8 +11,8 @@ go_template_instance( prefix = "dentry", template = "//pkg/ilist:generic_list", types = { - "Element": "*Dentry", - "Linker": "*Dentry", + "Element": "*dentry", + "Linker": "*dentry", }, ) diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go index b0c3ea39a..c52dc781c 100644 --- a/pkg/sentry/fsimpl/memfs/directory.go +++ b/pkg/sentry/fsimpl/memfs/directory.go @@ -23,23 +23,23 @@ import ( ) type directory struct { - inode Inode + inode inode // childList is a list containing (1) child Dentries and (2) fake Dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by Filesystem.mu. + // efficiently. childList is protected by filesystem.mu. childList dentryList } -func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode { +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode { dir := &directory{} dir.inode.init(dir, fs, creds, mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root return &dir.inode } -func (i *Inode) isDir() bool { +func (i *inode) isDir() bool { _, ok := i.impl.(*directory) return ok } @@ -48,8 +48,8 @@ type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - // Protected by Filesystem.mu. - iter *Dentry + // Protected by filesystem.mu. + iter *dentry off int64 } @@ -68,7 +68,7 @@ func (fd *directoryFD) Release() { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() - d := fd.vfsfd.VirtualDentry().Dentry() + vfsd := fd.vfsfd.VirtualDentry().Dentry() fs.mu.Lock() defer fs.mu.Unlock() @@ -77,7 +77,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba if !cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, - Ino: d.Impl().(*Dentry).inode.ino, + Ino: vfsd.Impl().(*dentry).inode.ino, Off: 0, }) { return nil @@ -85,7 +85,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.off++ } if fd.off == 1 { - parentInode := d.ParentOrSelf().Impl().(*Dentry).inode + parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode if !cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), @@ -97,12 +97,12 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.off++ } - dir := d.Impl().(*Dentry).inode.impl.(*directory) - var child *Dentry + dir := vfsd.Impl().(*dentry).inode.impl.(*directory) + var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. child = dir.childList.Front() - fd.iter = &Dentry{} + fd.iter = &dentry{} } else { // Continue iteration from where we left off. 
child = fd.iter.Next() @@ -130,32 +130,41 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - if whence != linux.SEEK_SET { - // TODO: Linux also allows SEEK_CUR. + fs := fd.filesystem() + fs.mu.Lock() + defer fs.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: return 0, syserror.EINVAL } if offset < 0 { return 0, syserror.EINVAL } + // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't + // seek even if doing so might reposition the iterator due to concurrent + // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). + if fd.off == offset { + return offset, nil + } + fd.off = offset // Compensate for "." and "..". - var remChildren int64 - if offset < 2 { - remChildren = 0 - } else { + remChildren := int64(0) + if offset >= 2 { remChildren = offset - 2 } - fs := fd.filesystem() dir := fd.inode().impl.(*directory) - fs.mu.Lock() - defer fs.mu.Unlock() - // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { - fd.iter = &Dentry{} + fd.iter = &dentry{} } else { dir.childList.Remove(fd.iter) } diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 4d989eeaf..f79e2d9c8 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -28,9 +28,9 @@ import ( // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode == -// vfsd.Impl().(*Dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) { +// Preconditions: filesystem.mu must be locked. !rp.Done(). inode == +// vfsd.Impl().(*dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } @@ -47,7 +47,7 @@ afterSymlink: // not in the Dentry tree, it doesn't exist. return nil, nil, syserror.ENOENT } - nextInode := nextVFSD.Impl().(*Dentry).inode + nextInode := nextVFSD.Impl().(*dentry).inode if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // TODO: symlink traversals update access time if err := rp.HandleSymlink(symlink.target); err != nil { @@ -64,10 +64,10 @@ afterSymlink: // walkExistingLocked is loosely analogous to Linux's // fs/namei.c:path_lookupat(). // -// Preconditions: Filesystem.mu must be locked. -func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { +// Preconditions: filesystem.mu must be locked. +func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error vfsd, inode, err = stepLocked(rp, vfsd, inode) @@ -88,10 +88,10 @@ func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: Filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { +// Preconditions: filesystem.mu must be locked. !rp.Done(). 
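The fake iterator dentry used by directoryFD above is a small but useful trick: instead of remembering a numeric position, which concurrent creates and unlinks would invalidate, the FD links a placeholder element into the directory's child list and resumes from it. Below is a minimal sketch of the same idea using container/list; the names Dir, cursor and next are made up for illustration, and a real implementation would hold the filesystem lock around every list operation, as memfs does with filesystem.mu.

```go
package main

import (
	"container/list"
	"fmt"
)

// Dir holds child names in insertion order.
type Dir struct {
	children *list.List // of string
}

// cursor is a placeholder element, analogous to the fake dentry with a nil
// inode: it marks where an in-progress readdir left off.
type cursor struct{}

// next returns up to max names after cur, re-linking the cursor so that later
// calls resume correctly even if entries were added or removed in between.
func (d *Dir) next(cur *list.Element, max int) ([]string, *list.Element) {
	var out []string
	e := d.children.Front()
	if cur != nil {
		e = cur.Next()
		d.children.Remove(cur)
	}
	for ; e != nil && len(out) < max; e = e.Next() {
		if _, isCursor := e.Value.(cursor); isCursor {
			continue // skip other FDs' cursors
		}
		out = append(out, e.Value.(string))
	}
	// Re-insert the cursor just before the first unread entry, or at the back.
	if e != nil {
		return out, d.children.InsertBefore(cursor{}, e)
	}
	return out, d.children.PushBack(cursor{})
}

func main() {
	d := &Dir{children: list.New()}
	for _, name := range []string{"a", "b", "c", "d"} {
		d.children.PushBack(name)
	}
	names, cur := d.next(nil, 2)
	fmt.Println(names)       // [a b]
	d.children.PushBack("e") // mutation between readdir calls
	names, _ = d.next(cur, 10)
	fmt.Println(names) // [c d e]
}
```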
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error vfsd, inode, err = stepLocked(rp, vfsd, inode) @@ -108,9 +108,9 @@ func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { // checkCreateLocked checks that a file named rp.Component() may be created in // directory parentVFSD, then returns rp.Component(). // -// Preconditions: Filesystem.mu must be locked. parentInode == -// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true. -func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) { +// Preconditions: filesystem.mu must be locked. parentInode == +// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) { if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { return "", err } @@ -144,7 +144,7 @@ func checkDeleteLocked(vfsd *vfs.Dentry) error { } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() vfsd, inode, err := walkExistingLocked(rp) @@ -164,7 +164,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op } // LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { return syserror.EEXIST } @@ -185,7 +185,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } defer rp.Mount().EndWrite() - d := vd.Dentry().Impl().(*Dentry) + d := vd.Dentry().Impl().(*dentry) if d.inode.isDir() { return syserror.EPERM } @@ -197,7 +197,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { return syserror.EEXIST } @@ -223,7 +223,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } // MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { return syserror.EEXIST } @@ -246,7 +246,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } // OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Filter out flags that are not supported by memfs. 
O_DIRECTORY and // O_NOFOLLOW have no effect here (they're handled by VFS by setting // appropriate bits in rp), but are returned by @@ -265,11 +265,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf mustCreate := opts.Flags&linux.O_EXCL != 0 vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + inode := vfsd.Impl().(*dentry).inode fs.mu.Lock() defer fs.mu.Unlock() if rp.Done() { - // FIXME: ??? if rp.MustBeDir() { return nil, syserror.EISDIR } @@ -327,7 +326,7 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - childInode := childVFSD.Impl().(*Dentry).inode + childInode := childVFSD.Impl().(*dentry).inode if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // TODO: symlink traversals update access time if err := rp.HandleSymlink(symlink.target); err != nil { @@ -340,7 +339,7 @@ afterTrailingSymlink: return childInode.open(rp, childVFSD, opts.Flags, false) } -func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { +func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(flags) if !afterCreate { if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { @@ -385,7 +384,7 @@ func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() _, inode, err := walkExistingLocked(rp) fs.mu.RUnlock() @@ -400,9 +399,8 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } // RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { if rp.Done() { - // FIXME return syserror.ENOENT } fs.mu.Lock() @@ -424,7 +422,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() vfsd, inode, err := walkExistingLocked(rp) @@ -447,12 +445,14 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { return err } + // Remove from parent directory's childList. + vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) inode.decRef() return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. 
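The mustCreate handling in memfs OpenAt above follows open(2) semantics: with O_CREAT|O_EXCL, an existing path must fail with EEXIST rather than be opened. A quick host-side illustration using only the standard os package (the temporary file is for the demo only):

```go
package main

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	dir, err := os.MkdirTemp("", "excl-demo")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)
	path := filepath.Join(dir, "file")

	// The first O_CREAT|O_EXCL open creates the file.
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0644)
	if err != nil {
		panic(err)
	}
	f.Close()

	// The second one must fail with EEXIST, which os reports as ErrExist.
	_, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0644)
	fmt.Println(errors.Is(err, os.ErrExist)) // true
}
```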
-func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() _, _, err := walkExistingLocked(rp) fs.mu.RUnlock() @@ -462,12 +462,12 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if opts.Stat.Mask == 0 { return nil } - // TODO: implement Inode.setStat + // TODO: implement inode.setStat return syserror.EPERM } // StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() _, inode, err := walkExistingLocked(rp) fs.mu.RUnlock() @@ -480,7 +480,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() _, _, err := walkExistingLocked(rp) fs.mu.RUnlock() @@ -492,7 +492,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. -func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { return syserror.EEXIST } @@ -517,7 +517,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() vfsd, inode, err := walkExistingLocked(rp) @@ -537,6 +537,8 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { return err } + // Remove from parent directory's childList. + vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) inode.decLinksLocked() return nil } diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index f381e1a88..59612da14 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -21,10 +21,10 @@ // // Lock order: // -// Filesystem.mu +// filesystem.mu // regularFileFD.offMu // regularFile.mu -// Inode.mu +// inode.mu package memfs import ( @@ -42,8 +42,8 @@ import ( // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} -// Filesystem implements vfs.FilesystemImpl. -type Filesystem struct { +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. @@ -54,44 +54,44 @@ type Filesystem struct { // NewFilesystem implements vfs.FilesystemType.NewFilesystem. 
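The renamed lock-order comment in the memfs package doc above (filesystem.mu before regularFileFD.offMu before regularFile.mu before inode.mu) is worth a tiny standalone illustration of why such an order is documented at all: as long as every path that needs two locks acquires them in the same order, no pair of paths can deadlock. The names fsMu, inodeMu and withBoth below are invented for the sketch.

```go
package main

import (
	"fmt"
	"sync"
)

var (
	fsMu    sync.Mutex // outer lock: always taken first
	inodeMu sync.Mutex // inner lock: only taken while fsMu is held, or alone
)

// withBoth acquires the two locks in the documented order. Because every
// caller goes through the same order, concurrent callers serialize cleanly
// instead of deadlocking against each other.
func withBoth(f func()) {
	fsMu.Lock()
	defer fsMu.Unlock()
	inodeMu.Lock()
	defer inodeMu.Unlock()
	f()
}

func main() {
	var wg sync.WaitGroup
	count := 0
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			withBoth(func() { count++ })
		}()
	}
	wg.Wait()
	fmt.Println(count) // 100
}
```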
func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - var fs Filesystem + var fs filesystem fs.vfsfs.Init(&fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil } // Release implements vfs.FilesystemImpl.Release. -func (fs *Filesystem) Release() { +func (fs *filesystem) Release() { } // Sync implements vfs.FilesystemImpl.Sync. -func (fs *Filesystem) Sync(ctx context.Context) error { +func (fs *filesystem) Sync(ctx context.Context) error { // All filesystem state is in-memory. return nil } -// Dentry implements vfs.DentryImpl. -type Dentry struct { +// dentry implements vfs.DentryImpl. +type dentry struct { vfsd vfs.Dentry - // inode is the inode represented by this Dentry. Multiple Dentries may - // share a single non-directory Inode (with hard links). inode is + // inode is the inode represented by this dentry. Multiple Dentries may + // share a single non-directory inode (with hard links). inode is // immutable. - inode *Inode + inode *inode - // memfs doesn't count references on Dentries; because the Dentry tree is + // memfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the - // state of the filesystem. However, it does count references on Inodes, - // because Inode resources are released when all references are dropped. + // state of the filesystem. However, it does count references on inodes, + // because inode resources are released when all references are dropped. // (memfs doesn't really have resources to release, but we implement // reference counting because tmpfs regular files will.) - // dentryEntry (ugh) links Dentries into their parent directory.childList. + // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry } -func (fs *Filesystem) newDentry(inode *Inode) *Dentry { - d := &Dentry{ +func (fs *filesystem) newDentry(inode *inode) *dentry { + d := &dentry{ inode: inode, } d.vfsd.Init(d) @@ -99,37 +99,37 @@ func (fs *Filesystem) newDentry(inode *Inode) *Dentry { } // IncRef implements vfs.DentryImpl.IncRef. -func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) { +func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { +func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. -func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) { +func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { d.inode.decRef() } -// Inode represents a filesystem object. -type Inode struct { +// inode represents a filesystem object. +type inode struct { // refs is a reference count. refs is accessed using atomic memory // operations. // - // A reference is held on all Inodes that are reachable in the filesystem + // A reference is held on all inodes that are reachable in the filesystem // tree. For non-directories (which may have multiple hard links), this // means that a reference is dropped when nlink reaches 0. For directories, // nlink never reaches 0 due to the "." entry; instead, - // Filesystem.RmdirAt() drops the reference. + // filesystem.RmdirAt() drops the reference. refs int64 // Inode metadata; protected by mu and accessed using atomic memory // operations unless otherwise specified. 
mu sync.RWMutex mode uint32 // excluding file type bits, which are based on impl - nlink uint32 // protected by Filesystem.mu instead of Inode.mu + nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... ino uint64 // immutable @@ -137,7 +137,7 @@ type Inode struct { impl interface{} // immutable } -func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) { +func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) { i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) @@ -147,29 +147,29 @@ func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, i.impl = impl } -// Preconditions: Filesystem.mu must be locked for writing. -func (i *Inode) incLinksLocked() { +// Preconditions: filesystem.mu must be locked for writing. +func (i *inode) incLinksLocked() { if atomic.AddUint32(&i.nlink, 1) <= 1 { - panic("memfs.Inode.incLinksLocked() called with no existing links") + panic("memfs.inode.incLinksLocked() called with no existing links") } } -// Preconditions: Filesystem.mu must be locked for writing. -func (i *Inode) decLinksLocked() { +// Preconditions: filesystem.mu must be locked for writing. +func (i *inode) decLinksLocked() { if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { i.decRef() } else if nlink == ^uint32(0) { // negative overflow - panic("memfs.Inode.decLinksLocked() called with no existing links") + panic("memfs.inode.decLinksLocked() called with no existing links") } } -func (i *Inode) incRef() { +func (i *inode) incRef() { if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("memfs.Inode.incRef() called without holding a reference") + panic("memfs.inode.incRef() called without holding a reference") } } -func (i *Inode) tryIncRef() bool { +func (i *inode) tryIncRef() bool { for { refs := atomic.LoadInt64(&i.refs) if refs == 0 { @@ -181,7 +181,7 @@ func (i *Inode) tryIncRef() bool { } } -func (i *Inode) decRef() { +func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { // This is unnecessary; it's mostly to simulate what tmpfs would do. if regfile, ok := i.impl.(*regularFile); ok { @@ -191,18 +191,18 @@ func (i *Inode) decRef() { regfile.mu.Unlock() } } else if refs < 0 { - panic("memfs.Inode.decRef() called without holding a reference") + panic("memfs.inode.decRef() called without holding a reference") } } -func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. 
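tryIncRef above is the usual compare-and-swap loop: it only takes a new reference while the count is still nonzero, so an object on which the final decRef has run can never be resurrected. A standalone sketch of the same loop with sync/atomic; the type name refCounted is invented for illustration.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

type refCounted struct {
	refs int64 // accessed only with atomic operations
}

// tryIncRef takes a reference only if at least one is already held; it
// returns false once the count has reached zero, meaning the object is being
// destroyed and must not be revived.
func (r *refCounted) tryIncRef() bool {
	for {
		refs := atomic.LoadInt64(&r.refs)
		if refs == 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&r.refs, refs, refs+1) {
			return true
		}
		// Lost a race with another incRef/decRef; reload and retry.
	}
}

func (r *refCounted) decRef() {
	if refs := atomic.AddInt64(&r.refs, -1); refs < 0 {
		panic("decRef called without holding a reference")
	}
}

func main() {
	r := &refCounted{refs: 1}
	fmt.Println(r.tryIncRef()) // true, refs is now 2
	r.decRef()
	r.decRef()
	fmt.Println(r.tryIncRef()) // false, the last reference is gone
}
```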
-func (i *Inode) statTo(stat *linux.Statx) { +func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO stat.Blksize = 1 // usermem.PageSize in tmpfs stat.Nlink = atomic.LoadUint32(&i.nlink) @@ -241,7 +241,7 @@ func allocatedBlocksForSize(size uint64) uint64 { return (size + 511) / 512 } -func (i *Inode) direntType() uint8 { +func (i *inode) direntType() uint8 { switch i.impl.(type) { case *regularFile: return linux.DT_REG @@ -262,12 +262,12 @@ type fileDescription struct { flags uint32 // status flags; immutable } -func (fd *fileDescription) filesystem() *Filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem) +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) } -func (fd *fileDescription) inode() *Inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +func (fd *fileDescription) inode() *inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode } // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. @@ -294,6 +294,6 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) if opts.Stat.Mask == 0 { return nil } - // TODO: implement Inode.setStat + // TODO: implement inode.setStat return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go index 4a3603cc8..7a16d5719 100644 --- a/pkg/sentry/fsimpl/memfs/regular_file.go +++ b/pkg/sentry/fsimpl/memfs/regular_file.go @@ -28,16 +28,16 @@ import ( ) type regularFile struct { - inode Inode + inode inode mu sync.RWMutex data []byte // dataLen is len(data), but accessed using atomic memory operations to - // avoid locking in Inode.stat(). + // avoid locking in inode.stat(). dataLen int64 } -func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode { +func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode { file := ®ularFile{} file.inode.init(file, fs, creds, mode) file.inode.nlink = 1 // from parent directory diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go index e002d1727..b2ac2cbeb 100644 --- a/pkg/sentry/fsimpl/memfs/symlink.go +++ b/pkg/sentry/fsimpl/memfs/symlink.go @@ -19,11 +19,11 @@ import ( ) type symlink struct { - inode Inode + inode inode target string // immutable } -func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode { +func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode { link := &symlink{ target: target, } diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index 5c3d73eb7..f039a5c34 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) { } // FromIOReader implements Reader for an io.Reader by repeatedly invoking -// io.Reader.Read until it returns an error or partial read. +// io.Reader.Read until it returns an error or partial read. This is not +// thread-safe. // // FromIOReader will return a successful partial read iff Reader.Read does so. type FromIOReader struct { @@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { return wbn, buf, rerr } +// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly +// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. 
A partial +// read indicates an error. This is not thread-safe. +type FromIOReaderAt struct { + ReaderAt io.ReaderAt + Offset int64 +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) + r.Offset += int64(n) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) + r.Offset += int64(rn) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + // FromIOWriter implements Writer for an io.Writer by repeatedly invoking // io.Writer.Write until it returns an error or partial write. // diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index e43775898..a3585e10d 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -6,7 +6,7 @@ go_library( name = "notifier", srcs = ["notifier.go"], importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier", - visibility = ["//pkg/sentry:internal"], + visibility = ["//:sandbox"], deps = [ "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", "//pkg/sentry/socket/rpcinet/conn", diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 63e2c5a5d..912cbe4ff 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent Ino: attr.InodeID, Off: offset, }, - Typ: toType(attr.Type), + Typ: fs.ToDirentType(attr.Type), }, Name: []byte(name), } @@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint { return uint(binary.Size(d.Hdr)) + a.Width() } -// toType converts an fs.InodeOperationsInfo to a linux dirent typ field. -func toType(nodeType fs.InodeType) uint8 { - switch nodeType { - case fs.RegularFile, fs.SpecialFile: - return linux.DT_REG - case fs.Symlink: - return linux.DT_LNK - case fs.Directory, fs.SpecialDirectory: - return linux.DT_DIR - case fs.Pipe: - return linux.DT_FIFO - case fs.CharacterDevice: - return linux.DT_CHR - case fs.BlockDevice: - return linux.DT_BLK - case fs.Socket: - return linux.DT_SOCK - default: - return linux.DT_UNKNOWN - } -} - // padRec pads the name field until the rec length is a multiple of the width, // which must be a power of 2. It returns the padded rec length. 
func (d *dirent) padRec(width int) uint16 { diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index a7c98efcb..17e3dde1f 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -207,6 +207,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.ESPIPE } if outOffset != 0 { + if !outFile.Flags().Pwrite { + return 0, nil, syserror.EINVAL + } + var offset int64 if _, err := t.CopyIn(outOffset, &offset); err != nil { return 0, nil, err @@ -220,6 +224,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.ESPIPE } if inOffset != 0 { + if !inFile.Flags().Pread { + return 0, nil, syserror.EINVAL + } + var offset int64 if _, err := t.CopyIn(inOffset, &offset); err != nil { return 0, nil, err diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 60070874d..ea7296e6a 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -109,6 +109,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { pkt.SetOp(header.ARPReply) copy(pkt.HardwareAddressSender(), r.LocalLinkAddress[:]) copy(pkt.ProtocolAddressSender(), h.ProtocolAddressTarget()) + copy(pkt.HardwareAddressTarget(), h.HardwareAddressSender()) copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender()) e.linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) fallthrough // also fill the cache from requests diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 66c55821b..e477046db 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -15,6 +15,7 @@ package arp_test import ( + "strconv" "testing" "time" @@ -101,40 +102,30 @@ func TestDirectRequest(t *testing.T) { c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView()) } - inject(stackAddr1) - { - pkt := <-c.linkEP.C - if pkt.Proto != arp.ProtocolNumber { - t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto) - } - rep := header.ARP(pkt.Header) - if !rep.IsValid() { - t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) - } - if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr1 { - t.Errorf("stackAddr1: expected sender to be set") - } - if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { - t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got) - } - } - - inject(stackAddr2) - { - pkt := <-c.linkEP.C - if pkt.Proto != arp.ProtocolNumber { - t.Fatalf("stackAddr2: expected ARP response, got network protocol number %v", pkt.Proto) - } - rep := header.ARP(pkt.Header) - if !rep.IsValid() { - t.Fatalf("stackAddr2: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) - } - if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr2 { - t.Errorf("stackAddr2: expected sender to be set") - } - if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { - t.Errorf("stackAddr2: expected sender to be stackLinkAddr, got %q", got) - } + for i, address := range []tcpip.Address{stackAddr1, stackAddr2} { + t.Run(strconv.Itoa(i), func(t *testing.T) { + inject(address) + pkt := <-c.linkEP.C + if pkt.Proto != arp.ProtocolNumber { + t.Fatalf("expected ARP response, got network protocol number %d", pkt.Proto) + } + rep := header.ARP(pkt.Header) + if !rep.IsValid() { + t.Fatalf("invalid ARP response 
len(pkt.Header)=%d", len(pkt.Header)) + } + if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want { + t.Errorf("got HardwareAddressSender = %s, want = %s", got, want) + } + if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want { + t.Errorf("got ProtocolAddressSender = %s, want = %s", got, want) + } + if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want { + t.Errorf("got HardwareAddressTarget = %s, want = %s", got, want) + } + if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want { + t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, want) + } + }) } inject(stackAddrBad) diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 52fd1bfa3..e9c5099ea 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -96,6 +96,17 @@ type listenContext struct { hasher hash.Hash v6only bool netProto tcpip.NetworkProtocolNumber + // pendingMu protects pendingEndpoints. This should only be accessed + // by the listening endpoint's worker goroutine. + // + // Lock Ordering: listenEP.workerMu -> pendingMu + pendingMu sync.Mutex + // pending is used to wait for all pendingEndpoints to finish when + // a socket is closed. + pending sync.WaitGroup + // pendingEndpoints is a map of all endpoints for which a handshake is + // in progress. + pendingEndpoints map[stack.TransportEndpointID]*endpoint } // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds. @@ -133,14 +144,15 @@ func decSynRcvdCount() { } // newListenContext creates a new listen context. -func newListenContext(stack *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { +func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ - stack: stack, - rcvWnd: rcvWnd, - hasher: sha1.New(), - v6only: v6only, - netProto: netProto, - listenEP: listenEP, + stack: stk, + rcvWnd: rcvWnd, + hasher: sha1.New(), + v6only: v6only, + netProto: netProto, + listenEP: listenEP, + pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint), } rand.Read(l.nonce[0][:]) @@ -253,6 +265,17 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head return nil, err } + // listenEP is nil when listenContext is used by tcp.Forwarder. + if l.listenEP != nil { + l.listenEP.mu.Lock() + if l.listenEP.state != StateListen { + l.listenEP.mu.Unlock() + return nil, tcpip.ErrConnectionAborted + } + l.addPendingEndpoint(ep) + l.listenEP.mu.Unlock() + } + // Perform the 3-way handshake. 
h := newHandshake(ep, seqnum.Size(ep.initialReceiveWindow())) @@ -260,6 +283,9 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head if err := h.execute(); err != nil { ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.Close() + if l.listenEP != nil { + l.removePendingEndpoint(ep) + } return nil, err } ep.mu.Lock() @@ -274,15 +300,41 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head return ep, nil } +func (l *listenContext) addPendingEndpoint(n *endpoint) { + l.pendingMu.Lock() + l.pendingEndpoints[n.id] = n + l.pending.Add(1) + l.pendingMu.Unlock() +} + +func (l *listenContext) removePendingEndpoint(n *endpoint) { + l.pendingMu.Lock() + delete(l.pendingEndpoints, n.id) + l.pending.Done() + l.pendingMu.Unlock() +} + +func (l *listenContext) closeAllPendingEndpoints() { + l.pendingMu.Lock() + for _, n := range l.pendingEndpoints { + n.notifyProtocolGoroutine(notifyClose) + } + l.pendingMu.Unlock() + l.pending.Wait() +} + // deliverAccepted delivers the newly-accepted endpoint to the listener. If the // endpoint has transitioned out of the listen state, the new endpoint is closed // instead. func (e *endpoint) deliverAccepted(n *endpoint) { - e.mu.RLock() + e.mu.Lock() state := e.state - e.mu.RUnlock() + e.pendingAccepted.Add(1) + defer e.pendingAccepted.Done() + acceptedChan := e.acceptedChan + e.mu.Unlock() if state == StateListen { - e.acceptedChan <- n + acceptedChan <- n e.waiterQueue.Notify(waiter.EventIn) } else { n.Close() @@ -304,7 +356,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header e.stack.Stats().TCP.FailedConnectionAttempts.Increment() return } - + ctx.removePendingEndpoint(n) e.deliverAccepted(n) } @@ -451,6 +503,11 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { // protocolListenLoop is the main loop of a listening TCP endpoint. It runs in // its own goroutine and is responsible for handling connection requests. func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { + e.mu.Lock() + v6only := e.v6only + e.mu.Unlock() + ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.netProto) + defer func() { // Mark endpoint as closed. This will prevent goroutines running // handleSynSegment() from attempting to queue new connections @@ -458,6 +515,9 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { e.mu.Lock() e.state = StateClose + // close any endpoints in SYN-RCVD state. + ctx.closeAllPendingEndpoints() + // Do cleanup if needed. e.completeWorkerLocked() @@ -470,12 +530,6 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut) }() - e.mu.Lock() - v6only := e.v6only - e.mu.Unlock() - - ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.netProto) - s := sleep.Sleeper{} s.AddWaker(&e.notificationWaker, wakerForNotification) s.AddWaker(&e.newSegmentWaker, wakerForNewSegment) @@ -492,7 +546,6 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { e.handleListenSegment(ctx, s) s.decRef() } - synRcvdCount.pending.Wait() close(e.drainDone) <-e.undrain } diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index d9f79e8c5..c54610a87 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -570,3 +570,89 @@ func TestV4AcceptOnV4(t *testing.T) { // Test acceptance. 
testV4Accept(t, c) } + +func testV4ListenClose(t *testing.T, c *context.Context) { + // Set the SynRcvd threshold to zero to force a syn cookie based accept + // to happen. + saved := tcp.SynRcvdCountThreshold + defer func() { + tcp.SynRcvdCountThreshold = saved + }() + tcp.SynRcvdCountThreshold = 0 + const n = uint16(32) + + // Start listening. + if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil { + t.Fatalf("Listen failed: %v", err) + } + + irs := seqnum.Value(789) + for i := uint16(0); i < n; i++ { + // Send a SYN request. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort + i, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: irs, + RcvWnd: 30000, + }) + } + + // Each of these ACK's will cause a syn-cookie based connection to be + // accepted and delivered to the listening endpoint. + for i := uint16(0); i < n; i++ { + b := c.GetPacket() + tcp := header.TCP(header.IPv4(b).Payload()) + iss := seqnum.Value(tcp.SequenceNumber()) + // Send ACK. + c.SendPacket(nil, &context.Headers{ + SrcPort: tcp.DestinationPort(), + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + RcvWnd: 30000, + }) + } + + // Try to accept the connection. + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.EventIn) + defer c.WQ.EventUnregister(&we) + nep, _, err := c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + nep, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(10 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + nep.Close() + c.EP.Close() +} + +func TestV4ListenCloseOnV4(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create TCP endpoint. + var err *tcpip.Error + c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %v", err) + } + + // Bind to wildcard. + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %v", err) + } + + // Test acceptance. + testV4ListenClose(t, c) +} diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 353e2efaf..0e16877e7 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -362,6 +362,12 @@ type endpoint struct { // without hearing a response, the connection is closed. keepalive keepalive + // pendingAccepted is a synchronization primitive used to track number + // of connections that are queued up to be delivered to the accepted + // channel. We use this to ensure that all goroutines blocked on writing + // to the acceptedChan below terminate before we close acceptedChan. + pendingAccepted sync.WaitGroup `state:"nosave"` + // acceptedChan is used by a listening endpoint protocol goroutine to // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. @@ -375,7 +381,11 @@ type endpoint struct { // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` - // The goroutine undrain notification channel. + // The goroutine undrain notification channel. This is currently used as + // a way to block the worker goroutines. Today nothing closes/writes + // this channel and this causes any goroutines waiting on this to just + // block. 
This is used during save/restore to prevent worker goroutines + // from mutating state as it's being saved. undrain chan struct{} `state:"nosave"` // probe if not nil is invoked on every received segment. It is passed @@ -575,6 +585,34 @@ func (e *endpoint) Close() { e.mu.Unlock() } +// closePendingAcceptableConnections closes all connections that have completed +// handshake but not yet been delivered to the application. +func (e *endpoint) closePendingAcceptableConnectionsLocked() { + done := make(chan struct{}) + // Spin a goroutine up as ranging on e.acceptedChan will just block when + // there are no more connections in the channel. Using a non-blocking + // select does not work as it can potentially select the default case + // even when there are pending writes but that are not yet written to + // the channel. + go func() { + defer close(done) + for n := range e.acceptedChan { + n.mu.Lock() + n.resetConnectionLocked(tcpip.ErrConnectionAborted) + n.mu.Unlock() + n.Close() + } + }() + // pendingAccepted(see endpoint.deliverAccepted) tracks the number of + // endpoints which have completed handshake but are not yet written to + // the e.acceptedChan. We wait here till the goroutine above can drain + // all such connections from e.acceptedChan. + e.pendingAccepted.Wait() + close(e.acceptedChan) + <-done + e.acceptedChan = nil +} + // cleanupLocked frees all resources associated with the endpoint. It is called // after Close() is called and the worker goroutine (if any) is done with its // work. @@ -582,14 +620,7 @@ func (e *endpoint) cleanupLocked() { // Close all endpoints that might have been accepted by TCP but not by // the client. if e.acceptedChan != nil { - close(e.acceptedChan) - for n := range e.acceptedChan { - n.mu.Lock() - n.resetConnectionLocked(tcpip.ErrConnectionAborted) - n.mu.Unlock() - n.Close() - } - e.acceptedChan = nil + e.closePendingAcceptableConnectionsLocked() } e.workerCleanup = false diff --git a/runsc/container/container.go b/runsc/container/container.go index 27e9c2e0f..bbb364214 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -138,6 +138,34 @@ type Container struct { RootContainerDir string } +// loadSandbox loads all containers that belong to the sandbox with the given +// ID. +func loadSandbox(rootDir, id string) ([]*Container, error) { + cids, err := List(rootDir) + if err != nil { + return nil, err + } + + // Load the container metadata. + var containers []*Container + for _, cid := range cids { + container, err := Load(rootDir, cid) + if err != nil { + // Container file may not exist if it raced with creation/deletion or + // directory was left behind. Load provides a snapshot in time, so it's + // fine to skip it. + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("loading container %q: %v", id, err) + } + if container.Sandbox.ID == id { + containers = append(containers, container) + } + } + return containers, nil +} + // Load loads a container with the given id from a metadata file. id may be an // abbreviation of the full container id, in which case Load loads the // container to which id unambiguously refers to. @@ -180,7 +208,7 @@ func Load(rootDir, id string) (*Container, error) { // If the status is "Running" or "Created", check that the sandbox // process still exists, and set it to Stopped if it does not. // - // This is inherently racey. + // This is inherently racy. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. 
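closePendingAcceptableConnectionsLocked above is an instance of a general drain-then-close pattern: writers register with a WaitGroup before they may send, a drainer goroutine keeps consuming the channel so no writer blocks forever, Wait guarantees no registered writer remains, and only then is the channel closed. The sketch below shows that shape with invented names (listener, deliver, close); the real code also resets and closes each drained endpoint, and the sleep in main exists only so the demo has something to drain.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type listener struct {
	mu       sync.Mutex
	open     bool
	pending  sync.WaitGroup // writers in flight toward accepted
	accepted chan int
}

// deliver registers with pending before sending, but only while the listener
// is still open, so close() can account for every possible sender.
func (l *listener) deliver(v int) {
	l.mu.Lock()
	if !l.open {
		l.mu.Unlock()
		return // listener is closing; drop the connection
	}
	l.pending.Add(1)
	l.mu.Unlock()
	defer l.pending.Done()
	l.accepted <- v
}

// close drains and then closes the channel without racing any deliver().
func (l *listener) close() {
	l.mu.Lock()
	l.open = false // no new writers may register after this point
	l.mu.Unlock()

	done := make(chan struct{})
	go func() { // drainer: keeps blocked writers moving while we wait
		defer close(done)
		for v := range l.accepted {
			fmt.Println("dropping pending connection", v)
		}
	}()
	l.pending.Wait()  // every registered writer has finished its send
	close(l.accepted) // now safe to close
	<-done
}

func main() {
	l := &listener{open: true, accepted: make(chan int)}
	for i := 0; i < 3; i++ {
		go l.deliver(i)
	}
	time.Sleep(50 * time.Millisecond) // demo only: let some deliveries start
	l.close()
}
```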
if !c.isSandboxRunning() { @@ -237,7 +265,13 @@ func List(rootDir string) ([]string, error) { } var out []string for _, f := range fs { - out = append(out, f.Name()) + // Filter out directories that do not belong to a container. + cid := f.Name() + if validateID(cid) == nil { + if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil { + out = append(out, f.Name()) + } + } } return out, nil } @@ -1108,6 +1142,10 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { // adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers. // oom_score_adj is set to the lowest oom_score_adj among the containers // running in the sandbox. +// +// TODO(gvisor.dev/issue/512): This call could race with other containers being +// created at the same time and end up setting the wrong oom_score_adj to the +// sandbox. func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error { // If this container's OOMScoreAdj is nil then we can exit early as no // change should be made to oom_score_adj for the sandbox. @@ -1115,21 +1153,9 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error { return nil } - ids, err := List(conf.RootDir) + containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID) if err != nil { - return err - } - - // Load the container metadata. - var containers []*Container - for _, id := range ids { - container, err := Load(conf.RootDir, id) - if err != nil { - return fmt.Errorf("loading container %q: %v", id, err) - } - if container.Sandbox.ID == c.Sandbox.ID { - containers = append(containers, container) - } + return fmt.Errorf("loading sandbox containers: %v", err) } // Get the lowest score for all containers. @@ -1150,30 +1176,16 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error { return nil } - // Set oom_score_adj for the sandbox. + // Set the sandbox's oom_score_adj to the lowest value among all its containers. if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil { return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err) } - // Set the gofer's oom_score_adj to the minimum of -500 and the - // sandbox's oom_score_adj to better ensure that the sandbox is killed - // before the gofer. - // - // TODO(gvisor.dev/issue/601) Set oom_score_adj for the gofer to - // the same oom_score_adj as the sandbox. - goferScoreAdj := -500 - if lowScore < goferScoreAdj { - goferScoreAdj = lowScore - } - - // Set oom_score_adj for gofers for all containers in the sandbox. - for _, container := range containers { - err := setOOMScoreAdj(container.GoferPid, goferScoreAdj) - if err != nil { - return fmt.Errorf("setting oom_score_adj for container %q: %v", container.ID, err) - } + // Set the gofer's oom_score_adj to the container's own value, since the gofer + // is dedicated to this container, in case the gofer uses up too much memory.
+ if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil { + return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err) } - return nil } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 2ef065a15..2d51fecc6 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -60,11 +60,14 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { } func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { - rootDir, err := testutil.SetupRootDir() - if err != nil { - return nil, nil, fmt.Errorf("error creating root dir: %v", err) + // Setup root dir if one hasn't been provided. + if len(conf.RootDir) == 0 { + rootDir, err := testutil.SetupRootDir() + if err != nil { + return nil, nil, fmt.Errorf("error creating root dir: %v", err) + } + conf.RootDir = rootDir } - conf.RootDir = rootDir var containers []*Container var bundles []string @@ -75,7 +78,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C for _, b := range bundles { os.RemoveAll(b) } - os.RemoveAll(rootDir) + os.RemoveAll(conf.RootDir) } for i, spec := range specs { bundleDir, err := testutil.SetupBundleDir(spec) @@ -1423,3 +1426,62 @@ func TestMultiContainerGoferKilled(t *testing.T) { } } } + +func TestMultiContainerLoadSandbox(t *testing.T) { + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep, sleep) + conf := testutil.TestConfig() + + // Create containers for the sandbox. + wants, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Then create unrelated containers. + for i := 0; i < 3; i++ { + specs, ids = createSpecs(sleep, sleep, sleep) + _, cleanup, err = startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + } + + // Create an unrelated directory under root. + dir := filepath.Join(conf.RootDir, "not-a-container") + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("os.MkdirAll(%q)=%v", dir, err) + } + + // Create a valid but empty container directory. + randomCID := testutil.UniqueContainerID() + dir = filepath.Join(conf.RootDir, randomCID) + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("os.MkdirAll(%q)=%v", dir, err) + } + + // Load the sandbox and check that the correct containers were returned. + id := wants[0].Sandbox.ID + gots, err := loadSandbox(conf.RootDir, id) + if err != nil { + t.Fatalf("loadSandbox()=%v", err) + } + wantIDs := make(map[string]struct{}) + for _, want := range wants { + wantIDs[want.ID] = struct{}{} + } + for _, got := range gots { + if got.Sandbox.ID != id { + t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id) + } + if _, ok := wantIDs[got.ID]; !ok { + t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs) + } + delete(wantIDs, got.ID) + } + if len(wantIDs) != 0 { + t.Errorf("containers not found: %v", wantIDs) + } +} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 406f6d08a..e288c7758 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -189,11 +189,14 @@ func SetupRootDir() (string, error) { // SetupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. 
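adjustOOMScoreAdj above now loads only the containers that share the sandbox and applies the lowest oom_score_adj among them to the sandbox process, while each gofer simply inherits its own container's value. A hedged sketch of that "lowest score wins" selection, with a stand-in container type rather than the real one:

```go
package main

import "fmt"

// container is a stand-in for the real Container type; only the field the
// computation needs is modeled here.
type container struct {
	ID          string
	OOMScoreAdj *int // nil means the container expressed no preference.
}

// lowestOOMScoreAdj returns the smallest oom_score_adj requested by any
// container in the sandbox, and false if none of them set one.
func lowestOOMScoreAdj(containers []*container) (int, bool) {
	low, found := 0, false
	for _, c := range containers {
		if c.OOMScoreAdj == nil {
			continue
		}
		if !found || *c.OOMScoreAdj < low {
			low, found = *c.OOMScoreAdj, true
		}
	}
	return low, found
}

func main() {
	ten, minusFive := 10, -5
	sandbox := []*container{
		{ID: "a", OOMScoreAdj: &ten},
		{ID: "b", OOMScoreAdj: &minusFive},
		{ID: "c"}, // no preference
	}
	if low, ok := lowestOOMScoreAdj(sandbox); ok {
		fmt.Printf("sandbox oom_score_adj would be set to %d\n", low)
	}
}
```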
func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) { - rootDir, err = SetupRootDir() - if err != nil { - return "", "", err + // Setup root dir if one hasn't been provided. + if len(conf.RootDir) == 0 { + rootDir, err = SetupRootDir() + if err != nil { + return "", "", err + } + conf.RootDir = rootDir } - conf.RootDir = rootDir bundleDir, err = SetupBundleDir(spec) return rootDir, bundleDir, err } diff --git a/test/runtimes/README.md b/test/runtimes/README.md index 4e5a950bc..34d3507be 100644 --- a/test/runtimes/README.md +++ b/test/runtimes/README.md @@ -16,7 +16,7 @@ The following runtimes are currently supported: 1) [Install and configure Docker](https://docs.docker.com/install/) -2) Build each Docker container: +2) Build each Docker container from the runtimes directory: ```bash $ docker build -f $LANG/Dockerfile [-t $NAME] . diff --git a/test/runtimes/go/Dockerfile b/test/runtimes/go/Dockerfile index cd55608cd..1d5202b70 100644 --- a/test/runtimes/go/Dockerfile +++ b/test/runtimes/go/Dockerfile @@ -23,9 +23,12 @@ ENV LANG_DIR=${GOROOT} WORKDIR ${LANG_DIR}/src RUN ./make.bash +# Pre-compile the tests for faster execution +RUN ["/root/go/bin/go", "tool", "dist", "test", "-compile-only"] WORKDIR ${LANG_DIR} -COPY proctor-go.go ${LANG_DIR} +COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common +COPY go/proctor-go.go ${LANG_DIR} ENTRYPOINT ["/root/go/bin/go", "run", "proctor-go.go"] diff --git a/test/runtimes/java/Dockerfile b/test/runtimes/java/Dockerfile index e162d7218..b9132b575 100644 --- a/test/runtimes/java/Dockerfile +++ b/test/runtimes/java/Dockerfile @@ -1,25 +1,16 @@ FROM ubuntu:bionic # This hash is associated with a specific JDK release and needed for ensuring # the same version is downloaded every time. -ENV LANG_HASH=af47e0398606 -ENV LANG_VER=11u-dev +ENV LANG_HASH=76072a077ee1 +ENV LANG_VER=11 ENV LANG_NAME=Java RUN apt-get update && apt-get install -y \ autoconf \ build-essential \ - curl\ - file \ - libasound2-dev \ - libcups2-dev \ - libfontconfig1-dev \ - libx11-dev \ - libxext-dev \ - libxrandr-dev \ - libxrender-dev \ - libxt-dev \ - libxtst-dev \ + curl \ make \ + openjdk-${LANG_VER}-jdk \ unzip \ zip @@ -27,26 +18,18 @@ WORKDIR /root RUN curl -o go.tar.gz https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz RUN tar -zxf go.tar.gz -# Use curl instead of ADD to prevent redownload every time. -RUN curl -o jdk.tar.gz http://hg.openjdk.java.net/jdk-updates/jdk${LANG_VER}/archive/${LANG_HASH}.tar.gz -# Download Java version N-1 to be used as the Boot JDK to build Java version N. -RUN curl -o bootjdk.tar.gz https://download.java.net/openjdk/jdk10/ri/openjdk-10+44_linux-x64_bin_ri.tar.gz - -RUN tar -zxf jdk.tar.gz -RUN tar -zxf bootjdk.tar.gz - -# Specify the JDK to be used by jtreg. -ENV JT_JAVA=/root/jdk${LANG_VER}-${LANG_HASH}/build/linux-x86_64-normal-server-release/jdk -ENV LANG_DIR=/root/jdk${LANG_VER}-${LANG_HASH} - -WORKDIR ${LANG_DIR} +# Download the JDK test library. 
+RUN set -ex \ + && curl -fsSL --retry 10 -o /tmp/jdktests.tar.gz http://hg.openjdk.java.net/jdk/jdk${LANG_VER}/archive/${LANG_HASH}.tar.gz/test \ + && tar -xzf /tmp/jdktests.tar.gz -C /root \ + && rm -f /tmp/jdktests.tar.gz RUN curl -o jtreg.tar.gz https://ci.adoptopenjdk.net/view/Dependencies/job/jtreg/lastSuccessfulBuild/artifact/jtreg-4.2.0-tip.tar.gz RUN tar -xzf jtreg.tar.gz -RUN bash configure --with-boot-jdk=/root/jdk-10 --with-jtreg=${LANG_DIR}/jtreg -RUN make clean -RUN make images -COPY proctor-java.go ${LANG_DIR} +ENV LANG_DIR=/root + +COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common +COPY java/proctor-java.go ${LANG_DIR} ENTRYPOINT ["/root/go/bin/go", "run", "proctor-java.go"] diff --git a/test/runtimes/java/proctor-java.go b/test/runtimes/java/proctor-java.go index c1e611b4b..7f6a66f4f 100644 --- a/test/runtimes/java/proctor-java.go +++ b/test/runtimes/java/proctor-java.go @@ -29,6 +29,7 @@ import ( var ( dir = os.Getenv("LANG_DIR") + hash = os.Getenv("LANG_HASH") jtreg = filepath.Join(dir, "jtreg/bin/jtreg") exclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`) ) @@ -44,7 +45,7 @@ func main() { func (j javaRunner) ListTests() ([]string, error) { args := []string{ - "-dir:test/jdk", + "-dir:/root/jdk11-" + hash + "/test/jdk", "-ignore:quiet", "-a", "-listtests", @@ -69,7 +70,7 @@ func (j javaRunner) ListTests() ([]string, error) { } func (j javaRunner) RunTest(test string) error { - args := []string{"-dir:test/jdk/", test} + args := []string{"-noreport", "-dir:/root/jdk11-" + hash + "/test/jdk", test} cmd := exec.Command(jtreg, args...) cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr if err := cmd.Run(); err != nil { diff --git a/test/runtimes/nodejs/Dockerfile b/test/runtimes/nodejs/Dockerfile index b2416cce8..aba30d2ee 100644 --- a/test/runtimes/nodejs/Dockerfile +++ b/test/runtimes/nodejs/Dockerfile @@ -22,7 +22,8 @@ RUN ./configure RUN make RUN make test-build -COPY proctor-nodejs.go ${LANG_DIR} +COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common +COPY nodejs/proctor-nodejs.go ${LANG_DIR} # Including dumb-init emulates the Linux "init" process, preventing the failure # of tests involving worker processes. diff --git a/test/runtimes/php/Dockerfile b/test/runtimes/php/Dockerfile index 1f8959b50..491ab902d 100644 --- a/test/runtimes/php/Dockerfile +++ b/test/runtimes/php/Dockerfile @@ -24,6 +24,7 @@ WORKDIR ${LANG_DIR} RUN ./configure RUN make -COPY proctor-php.go ${LANG_DIR} +COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common +COPY php/proctor-php.go ${LANG_DIR} ENTRYPOINT ["/root/go/bin/go", "run", "proctor-php.go"] diff --git a/test/runtimes/python/Dockerfile b/test/runtimes/python/Dockerfile index 811f48f8a..710daee43 100644 --- a/test/runtimes/python/Dockerfile +++ b/test/runtimes/python/Dockerfile @@ -26,6 +26,7 @@ WORKDIR ${LANG_DIR} RUN ./configure --with-pydebug RUN make -s -j2 -COPY proctor-python.go ${LANG_DIR} +COPY common /root/go/src/gvisor.dev/gvisor/test/runtimes/common/common +COPY python/proctor-python.go ${LANG_DIR} ENTRYPOINT ["/root/go/bin/go", "run", "proctor-python.go"] diff --git a/test/runtimes/runtimes_test.go b/test/runtimes/runtimes_test.go index 6bf954e78..43dd6f5b7 100644 --- a/test/runtimes/runtimes_test.go +++ b/test/runtimes/runtimes_test.go @@ -22,8 +22,16 @@ import ( "gvisor.dev/gvisor/runsc/test/testutil" ) -func TestNodeJS(t *testing.T) { - const img = "gcr.io/gvisor-proctor/nodejs" +// Wait time for each test to run. 
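The runtime images above now copy the shared test/runtimes/common package in next to each proctor, and proctor-java.go exposes ListTests and RunTest methods. A plausible shape for the contract those proctors implement is sketched below; the interface name, the runAll driver, and fakeRunner are assumptions for illustration, not the actual common package API:

```go
package main

import (
	"fmt"
	"log"
)

// TestRunner is a guess at the contract each per-language proctor implements:
// enumerate the upstream language tests, then run them one at a time.
type TestRunner interface {
	ListTests() ([]string, error)
	RunTest(test string) error
}

// runAll drives a runner the way the proctor binaries appear to: list once,
// then execute each test and report failures individually.
func runAll(r TestRunner) error {
	tests, err := r.ListTests()
	if err != nil {
		return fmt.Errorf("listing tests: %v", err)
	}
	for _, t := range tests {
		if err := r.RunTest(t); err != nil {
			return fmt.Errorf("test %q failed: %v", t, err)
		}
	}
	return nil
}

// fakeRunner stands in for a real language runner so the sketch is runnable.
type fakeRunner struct{}

func (fakeRunner) ListTests() ([]string, error) { return []string{"t1", "t2"}, nil }
func (fakeRunner) RunTest(test string) error    { return nil }

func main() {
	if err := runAll(fakeRunner{}); err != nil {
		log.Fatal(err)
	}
	fmt.Println("all tests passed")
}
```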
+const timeout = 180 * time.Second + +// testLang executes the docker container associated with the given +// language. It captures the output of the list function and executes +// each test individually, reporting any errors received. +func testLang(t *testing.T, lang string) { + t.Helper() + + img := "gcr.io/gvisor-presubmit/" + lang if err := testutil.Pull(img); err != nil { t.Fatalf("docker pull failed: %v", err) } @@ -49,7 +57,7 @@ func TestNodeJS(t *testing.T) { } defer d.CleanUp() - status, err := d.Wait(60 * time.Second) + status, err := d.Wait(timeout) if err != nil { t.Fatalf("docker test %q failed to wait: %v", tc, err) } @@ -65,3 +73,23 @@ func TestNodeJS(t *testing.T) { }) } } + +func TestGo(t *testing.T) { + testLang(t, "go") +} + +func TestJava(t *testing.T) { + testLang(t, "java") +} + +func TestNodejs(t *testing.T) { + testLang(t, "nodejs") +} + +func TestPhp(t *testing.T) { + testLang(t, "php") +} + +func TestPython(t *testing.T) { + testLang(t, "python") +} diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index a3e43cad2..aa1e33fb4 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -440,7 +440,9 @@ syscall_test( ) syscall_test( - size = "medium", + size = "large", + parallel = False, + shard_count = 10, test = "//test/syscalls/linux:socket_inet_loopback_test", ) diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index df31d25b5..322ee07ad 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -145,6 +145,67 @@ TEST_P(SocketInetLoopbackTest, TCP) { ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds()); } +TEST_P(SocketInetLoopbackTest, TCPListenClose) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + + // Create the listening socket. + FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + sockaddr_storage listen_addr = listener.addr; + ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr), + listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(listen_fd.get(), 1001), SyscallSucceeds()); + + // Get the port bound by the listening socket. + socklen_t addrlen = listener.addr_len; + ASSERT_THAT(getsockname(listen_fd.get(), + reinterpret_cast<sockaddr*>(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + + DisableSave ds; // Too many system calls.
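The TCPListenClose case that begins above floods a listener with connections and then closes it while most of them are still sitting in the accept queue, which is the scenario the acceptedChan changes earlier in this diff guard against. A rough userspace analogue in Go, purely illustrative and not part of any test suite:

```go
package main

import (
	"fmt"
	"net"
	"sync"
	"time"
)

func main() {
	// Listen on an ephemeral loopback port.
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}
	addr := ln.Addr().String()

	// Dial a burst of connections; most complete their handshake and sit in
	// the listener's accept queue without ever being Accept()ed.
	const dialers = 64
	var wg sync.WaitGroup
	for i := 0; i < dialers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c, err := net.DialTimeout("tcp", addr, 2*time.Second)
			if err != nil {
				return // refused/reset once the listener goes away is expected
			}
			defer c.Close()
			time.Sleep(200 * time.Millisecond)
		}()
	}

	// Accept only a few, then close the listener while connections are still
	// queued -- the situation the endpoint cleanup must handle safely.
	for i := 0; i < 4; i++ {
		c, err := ln.Accept()
		if err != nil {
			break
		}
		c.Close()
	}
	ln.Close()
	wg.Wait()
	fmt.Println("listener closed with handshaked connections still pending")
}
```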
+ sockaddr_storage conn_addr = connector.addr; + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + constexpr int kFDs = 2048; + constexpr int kThreadCount = 4; + constexpr int kFDsPerThread = kFDs / kThreadCount; + FileDescriptor clients[kFDs]; + std::unique_ptr<ScopedThread> threads[kThreadCount]; + for (int i = 0; i < kFDs; i++) { + clients[i] = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)); + } + for (int i = 0; i < kThreadCount; i++) { + threads[i] = absl::make_unique<ScopedThread>([&connector, &conn_addr, + &clients, i]() { + for (int j = 0; j < kFDsPerThread; j++) { + int k = i * kFDsPerThread + j; + int ret = + connect(clients[k].get(), reinterpret_cast<sockaddr*>(&conn_addr), + connector.addr_len); + if (ret != 0) { + EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS)); + } + } + }); + } + for (int i = 0; i < kThreadCount; i++) { + threads[i]->Join(); + } + for (int i = 0; i < 32; i++) { + auto accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr)); + } + // TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked + // before function end. + // ds.reset() +} + TEST_P(SocketInetLoopbackTest, TCPbacklog) { auto const& param = GetParam(); diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index 1875f4533..e25f264f6 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -13,6 +13,7 @@ // limitations under the License. #include <fcntl.h> +#include <sys/eventfd.h> #include <sys/sendfile.h> #include <unistd.h> @@ -135,6 +136,80 @@ TEST(SpliceTest, PipeOffsets) { SyscallFailsWithErrno(ESPIPE)); } +// Event FDs may be used with splice without an offset. +TEST(SpliceTest, FromEventFD) { + // Open the input eventfd with an initial value so that it is readable. + constexpr uint64_t kEventFDValue = 1; + int efd; + ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds()); + const FileDescriptor inf(efd); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Splice 8-byte eventfd value to pipe. + constexpr int kEventFDSize = 8; + EXPECT_THAT(splice(inf.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0), + SyscallSucceedsWithValue(kEventFDSize)); + + // Contents should be equal. + std::vector<char> rbuf(kEventFDSize); + ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kEventFDSize)); + EXPECT_EQ(memcmp(rbuf.data(), &kEventFDValue, rbuf.size()), 0); +} + +// Event FDs may not be used with splice with an offset. +TEST(SpliceTest, FromEventFDOffset) { + int efd; + ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds()); + const FileDescriptor inf(efd); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Attempt to splice 8-byte eventfd value to pipe with offset. + // + // This is not allowed because eventfd doesn't support pread. + constexpr int kEventFDSize = 8; + loff_t in_off = 0; + EXPECT_THAT(splice(inf.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + +// Event FDs may not be used with splice with an offset. +TEST(SpliceTest, ToEventFDOffset) { + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill with a value. 
+ constexpr int kEventFDSize = 8; + std::vector<char> buf(kEventFDSize); + buf[0] = 1; + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kEventFDSize)); + + int efd; + ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds()); + const FileDescriptor outf(efd); + + // Attempt to splice the 8-byte value from the pipe into the eventfd with an offset. + // + // This is not allowed because eventfd doesn't support pwrite. + loff_t out_off = 0; + EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), &out_off, kEventFDSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + TEST(SpliceTest, ToPipe) { // Open the input file. const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
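The three splice/eventfd tests above assert that an eventfd can be spliced only without an offset, since eventfd supports neither pread nor pwrite. The same behaviour can be reproduced from Go with golang.org/x/sys/unix; the sketch below is Linux-only and illustrative, mirroring the FromEventFD and FromEventFDOffset cases rather than reimplementing them:

```go
package main

import (
	"encoding/binary"
	"fmt"

	"golang.org/x/sys/unix"
)

// Demonstrates the splice/eventfd semantics the tests assert: splicing from an
// eventfd works without an offset, but supplying an offset fails with EINVAL
// because eventfd has no notion of a file position.
func main() {
	efd, err := unix.Eventfd(1, 0) // readable immediately: counter starts at 1
	if err != nil {
		panic(err)
	}
	defer unix.Close(efd)

	var p [2]int
	if err := unix.Pipe(p[:]); err != nil {
		panic(err)
	}
	rfd, wfd := p[0], p[1]
	defer unix.Close(rfd)
	defer unix.Close(wfd)

	// No offset: the 8-byte eventfd counter moves into the pipe.
	n, err := unix.Splice(efd, nil, wfd, nil, 8, 0)
	fmt.Printf("splice without offset: n=%d err=%v\n", n, err)

	buf := make([]byte, 8)
	if _, err := unix.Read(rfd, buf); err != nil {
		panic(err)
	}
	fmt.Println("pipe now holds counter value:", binary.LittleEndian.Uint64(buf))

	// With an offset: eventfd does not support pread, so splice fails early.
	var off int64
	_, err = unix.Splice(efd, &off, wfd, nil, 8, 0)
	fmt.Printf("splice with offset: err=%v (expect %v)\n", err, unix.EINVAL)
}
```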