ext: vfs.FileDescriptionImpl and vfs.FilesystemImpl implementations.

- This also gets rid of pipes for now because pipe does not have vfs2 specific support yet. - Added file path resolution logic. - Fixes testing infrastructure. - Does not include unit tests yet. PiperOrigin-RevId: 262213950
author: Ayush Ranjan <ayushranjan@google.com> 2019-08-07 14:22:19 -0700
committer: gVisor bot <gvisor-bot@google.com> 2019-08-07 14:23:42 -0700
commit: 1c9781a4edce5fa9688f868149a2506f2ec5fa86 (patch)
tree: 6538030ca97dcdb175bed5e1dc7cea382ff06ee1 /pkg/sentry/fs
parent: 79cc4397fd99fbdd5c74ac5bb7804a463d7981d8 (diff)
19 files changed, 1195 insertions, 171 deletions
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 9fc6a5bc2..4f3d6410e 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 {
 	}
 }
 
+// ToDirentType converts an InodeType to a linux dirent type field.
+func ToDirentType(nodeType InodeType) uint8 {
+	switch nodeType {
+	case RegularFile, SpecialFile:
+		return linux.DT_REG
+	case Symlink:
+		return linux.DT_LNK
+	case Directory, SpecialDirectory:
+		return linux.DT_DIR
+	case Pipe:
+		return linux.DT_FIFO
+	case CharacterDevice:
+		return linux.DT_CHR
+	case BlockDevice:
+		return linux.DT_BLK
+	case Socket:
+		return linux.DT_SOCK
+	default:
+		return linux.DT_UNKNOWN
+	}
+}
+
+// ToInodeType converts a linux file type to InodeType.
+func ToInodeType(linuxFileType linux.FileMode) InodeType {
+	switch linuxFileType {
+	case linux.ModeRegular:
+		return RegularFile
+	case linux.ModeDirectory:
+		return Directory
+	case linux.ModeSymlink:
+		return Symlink
+	case linux.ModeNamedPipe:
+		return Pipe
+	case linux.ModeCharacterDevice:
+		return CharacterDevice
+	case linux.ModeBlockDevice:
+		return BlockDevice
+	case linux.ModeSocket:
+		return Socket
+	default:
+		panic(fmt.Sprintf("unknown file mode: %d", linuxFileType))
+	}
+}
+
 // StableAttr contains Inode attributes that will be stable throughout the
 // lifetime of the Inode.
 //
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index e3d617576..c6168da0a 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -4,14 +4,14 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 go_template_instance(
-    name = "dentry_list",
-    out = "dentry_list.go",
+    name = "dirent_list",
+    out = "dirent_list.go",
     package = "ext",
-    prefix = "dentry",
+    prefix = "dirent",
     template = "//pkg/ilist:generic_list",
     types = {
-        "Element": "*dentry",
-        "Linker": "*dentry",
+        "Element": "*dirent",
+        "Linker": "*dirent",
     },
 )
 
@@ -20,14 +20,13 @@ go_library(
     srcs = [
         "block_map_file.go",
         "dentry.go",
-        "dentry_list.go",
         "directory.go",
+        "dirent_list.go",
         "ext.go",
         "extent_file.go",
+        "file_description.go",
         "filesystem.go",
-        "inline_file.go",
         "inode.go",
-        "named_pipe.go",
         "regular_file.go",
         "symlink.go",
         "utils.go",
@@ -38,15 +37,19 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/fd",
+        "//pkg/log",
+        "//pkg/sentry/arch",
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/ext/disklayout",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/safemem",
+        "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/waiter",
     ],
 )
 
diff --git a/pkg/sentry/fs/ext/README.md b/pkg/sentry/fs/ext/README.md
new file mode 100644
index 000000000..e212717aa
--- /dev/null
+++ b/pkg/sentry/fs/ext/README.md
@@ -0,0 +1,117 @@
+## EXT(2/3/4) File System
+
+This is a filesystem driver which supports ext2, ext3 and ext4 filesystems.
+Linux has specialized drivers for each variant but none which supports all. This
+library takes advantage of ext's backward compatibility and understands the
+internal organization of on-disk structures to support all variants.
+
+This driver implementation diverges from the Linux implementations in being more
+forgiving about versioning. For instance, if a filesystem contains both extent
+based inodes and classical block map based inodes, this driver will not complain
+and interpret them both correctly, whereas in Linux this would be an issue. This
+blurs the line between the three ext fs variants.
+
+Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has
+been superseded by ext4 with large performance gains. Thus it is recommended to
+upgrade older filesystem images to ext4 using e2fsprogs for better performance.
+
+### Read Only
+
+This driver currently only allows read only operations. A lot of the design
+decisions are based on this feature. There are plans to implement write (the
+process for which is documented in the future work section).
+
+### Performance
+
+One of the biggest wins about this driver is that it directly talks to the
+underlying block device (or whatever persistent storage is being used), instead
+of making expensive RPCs to a gofer.
+
+Another advantage is that ext fs supports fast concurrent reads. Currently the
+device is represented using an `io.ReaderAt` which allows for concurrent reads.
+All reads are directly passed to the device driver which intelligently serves
+the read requests in the optimal order. There is no congestion due to locking
+while reading in the filesystem level.
+
+Reads are optimized further in the way file data is transferred over to user
+memory. Ext fs directly copies over file data from disk into user memory with no
+additional allocations on the way. We can only get faster by preloading file
+data into memory (see future work section).
+
+The internal structures used to represent files, inodes and file descriptors use
+a lot of inheritance. With the level of indirection that an interface adds with
+an internal pointer, it can quickly fragment a structure across memory. As this
+runs alongside a full-blown kernel (which is memory intensive), having a
+fragmented struct might hurt performance. Hence these internal structures,
+though interfaced, are tightly packed in memory using the same inheritance
+pattern that pkg/sentry/vfs uses. The pkg/sentry/fs/ext/disklayout package makes
+an exception to this pattern for reasons documented in the package.
+
+### Security
+
+This driver also intends to help sandbox the container better by reducing the
+surface of the host kernel that the application touches. It prevents the
+application from exploiting vulnerabilities in the host filesystem driver. All
+`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` which are directly
+passed to the device driver in the kernel. Hence this reduces the surface for
+attack.
+
+The application can not affect any host filesystems other than the one passed
+via block device by the user.
+
+### Future Work
+
+#### Write
+
+To support write operations we would need to modify the block device underneath.
+Currently, the driver does not modify the device at all, not even for updating
+the access times for reads. Modifying the filesystem incorrectly can corrupt it
+and render it unreadable for other correct ext(x) drivers. Hence caution must be
+maintained while modifying metadata structures.
+
+Ext4 specifically is built for performance and has added a lot of complexity as
+to how metadata structures are modified. For instance, files are organized via
+an extent tree which must be balanced, and file data blocks must be placed in
+the same extent as much as possible to increase locality. Such properties must
+be maintained while modifying the tree.
+
+Ext filesystems boast a lot about locality, which plays a big role in them being
+performant. The block allocation algorithm in Linux does a good job in keeping
+related data together. This behavior must be maintained as much as possible,
+else we might end up degrading the filesystem performance over time.
+
+Ext4 also supports a wide variety of features which are specialized for varying
+use cases. Implementing all of them can get difficult very quickly.
+
+Ext(x) checksums all its metadata structures to check for corruption, so
+modification of any metadata struct must correspond with re-checksumming the
+struct. Linux filesystem drivers also order on-disk updates intelligently to not
+corrupt the filesystem and also remain performant. The in-memory metadata
+structures must be kept in sync with what is on disk.
+
+There is also replication of some important structures across the filesystem.
+All replicas must be updated when their original copy is updated. There is also
+provisioning for snapshotting which must be kept in mind, although it should not
+affect this implementation unless we allow users to create filesystem snapshots.
+
+Ext4 also introduced journaling (jbd2). The journal must be updated
+appropriately.
+
+#### Performance
+
+To improve performance we should implement a buffer cache, and optionally, read
+ahead for small files. While doing so we must also keep in mind the memory usage
+and have a reasonable cap on how much file data we want to hold in memory.
+
+#### Features
+
+Our current implementation will work with most ext4 filesystems for readonly
+purposes. However, the following features are not supported yet:
+
+-   Journal
+-   Snapshotting
+-   Extended Attributes
+-   Hash Tree Directories
+-   Meta Block Groups
+-   Multiple Mount Protection
+-   Bigalloc
diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go
index f30c3a174..cea89bcd9 100644
--- a/pkg/sentry/fs/ext/block_map_file.go
+++ b/pkg/sentry/fs/ext/block_map_file.go
@@ -85,7 +85,8 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
 	}
 
 	offset := uint64(off)
-	if offset >= f.regFile.inode.diskInode.Size() {
+	size := f.regFile.inode.diskInode.Size()
+	if offset >= size {
 		return 0, io.EOF
 	}
 
@@ -104,6 +105,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
 
 	read := 0
 	toRead := len(dst)
+	if uint64(toRead)+offset > size {
+		toRead = int(size - offset)
+	}
 	for read < toRead {
 		var err error
 		var curR int
@@ -131,6 +135,9 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
 		}
 	}
 
+	if read < len(dst) {
+		return read, io.EOF
+	}
 	return read, nil
 }
 
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
index 19c9b3b2d..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -26,8 +26,6 @@ type dentry struct {
 	// share a single non-directory Inode (with hard links). inode is
 	// immutable.
 	inode *inode
-	// dentryEntry links Dentries into their parent directory.childList.
-	dentryEntry
 }
 
 // Compiles only if dentry implements vfs.DentryImpl.
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go
index ab2b59e44..f896dbe1d 100644
--- a/pkg/sentry/fs/ext/directory.go
+++ b/pkg/sentry/fs/ext/directory.go
@@ -14,23 +14,293 @@
 
 package ext
 
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
 // directory represents a directory inode. It holds the childList in memory.
 type directory struct {
 	inode inode
 
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
-	// (with inode == nil) that represent the iteration position of
+	// mu serializes the changes to childList.
+	// Lock Order (outermost locks must be taken first):
+	//   directory.mu
+	//     filesystem.mu
+	mu sync.Mutex
+
+	// childList is a list containing (1) child dirents and (2) fake dirents
+	// (with diskDirent == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is immutable.
-	childList dentryList
+	// efficiently. childList is protected by mu.
+	childList direntList
 
-	// TODO(b/134676337): Add directory navigators.
+	// childMap maps the child's filename to the dirent structure stored in
+	// childList. This adds some data replication but helps in faster path
+	// traversal. For consistency, key == childMap[key].diskDirent.FileName().
+	// Immutable.
+	childMap map[string]*dirent
 }
 
 // newDirectroy is the directory constructor.
-func newDirectroy(inode inode) *directory {
-	// TODO(b/134676337): initialize childList.
-	file := &directory{inode: inode}
+func newDirectroy(inode inode, newDirent bool) (*directory, error) {
+	file := &directory{inode: inode, childMap: make(map[string]*dirent)}
 	file.inode.impl = file
-	return file
+
+	// Initialize childList by reading dirents from the underlying file.
+	if inode.diskInode.Flags().Index {
+		// TODO(b/134676337): Support hash tree directories. Currently only the '.'
+		// and '..' entries are read in.
+
+		// Users cannot navigate this hash tree directory yet.
+		log.Warningf("hash tree directory being used which is unsupported")
+		return file, nil
+	}
+
+	// The dirents are organized in a linear array in the file data.
+	// Extract the file data and decode the dirents.
+	regFile, err := newRegularFile(inode)
+	if err != nil {
+		return nil, err
+	}
+
+	// buf is used as scratch space for reading in dirents from disk and
+	// unmarshalling them into dirent structs.
+	buf := make([]byte, disklayout.DirentSize)
+	size := inode.diskInode.Size()
+	for off, inc := uint64(0), uint64(0); off < size; off += inc {
+		toRead := size - off
+		if toRead > disklayout.DirentSize {
+			toRead = disklayout.DirentSize
+		}
+		if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead {
+			return nil, err
+		}
+
+		var curDirent dirent
+		if newDirent {
+			curDirent.diskDirent = &disklayout.DirentNew{}
+		} else {
+			curDirent.diskDirent = &disklayout.DirentOld{}
+		}
+		binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+
+		if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
+			// Inode number and name length fields being set to 0 is used to indicate
+			// an unused dirent.
+			file.childList.PushBack(&curDirent)
+			file.childMap[curDirent.diskDirent.FileName()] = &curDirent
+		}
+
+		// The next dirent is placed exactly after this dirent record on disk.
+		inc = uint64(curDirent.diskDirent.RecordSize())
+	}
+
+	return file, nil
+}
+
+func (i *inode) isDir() bool {
+	_, ok := i.impl.(*directory)
+	return ok
+}
+
+// dirent is the directory.childList node.
+type dirent struct {
+	diskDirent disklayout.Dirent
+
+	// direntEntry links dirents into their parent directory.childList.
+	direntEntry
+}
+
+// directoryFD represents a directory file description. It implements
+// vfs.FileDescriptionImpl.
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	// Protected by directory.mu.
+	iter *dirent
+	off  int64
+}
+
+// Compiles only if directoryFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		return
+	}
+
+	dir := fd.inode().impl.(*directory)
+	dir.mu.Lock()
+	dir.childList.Remove(fd.iter)
+	dir.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	extfs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+
+	dir.mu.Lock()
+	defer dir.mu.Unlock()
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	var child *dirent
+	if fd.iter == nil {
+		// Start iteration at the beginning of dir.
+		child = dir.childList.Front()
+		fd.iter = &dirent{}
+	} else {
+		// Continue iteration from where we left off.
+		child = fd.iter.Next()
+		dir.childList.Remove(fd.iter)
+	}
+	for ; child != nil; child = child.Next() {
+		// Skip other directoryFD iterators.
+		if child.diskDirent != nil {
+			childType, ok := child.diskDirent.FileType()
+			if !ok {
+				// We will need to read the inode off disk. Do not increment
+				// ref count here because this inode is not being added to the
+				// dentry tree.
+				extfs.mu.Lock()
+				childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode())
+				extfs.mu.Unlock()
+				if err != nil {
+					// Usage of the file description after the error is
+					// undefined. This implementation would continue reading
+					// from the next dirent.
+					fd.off++
+					dir.childList.InsertAfter(child, fd.iter)
+					return err
+				}
+				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
+			}
+
+			if !cb.Handle(vfs.Dirent{
+				Name: child.diskDirent.FileName(),
+				Type: fs.ToDirentType(childType),
+				Ino:  uint64(child.diskDirent.Inode()),
+				Off:  fd.off,
+			}) {
+				dir.childList.InsertBefore(child, fd.iter)
+				return nil
+			}
+			fd.off++
+		}
+	}
+	dir.childList.PushBack(fd.iter)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if whence != linux.SEEK_SET && whence != linux.SEEK_CUR {
+		return 0, syserror.EINVAL
+	}
+
+	dir := fd.inode().impl.(*directory)
+	dir.mu.Lock()
+	defer dir.mu.Unlock()
+
+	// SEEK_SET offsets are absolute; only SEEK_CUR is relative to fd.off.
+	if whence == linux.SEEK_CUR {
+		offset += fd.off
+	}
+	if offset < 0 {
+		// lseek(2) specifies that EINVAL should be returned if the resulting offset
+		// is negative.
+		return 0, syserror.EINVAL
+	}
+
+	n := int64(len(dir.childMap))
+	realWantOff := offset
+	if realWantOff > n {
+		realWantOff = n
+	}
+	realCurOff := fd.off
+	if realCurOff > n {
+		realCurOff = n
+	}
+
+	// Ensure that fd.iter exists and is linked into dir.childList so we can
+	// intelligently seek from the optimal position.
+	if fd.iter == nil {
+		fd.iter = &dirent{}
+		dir.childList.PushFront(fd.iter)
+	}
+
+	// Guess that iterating from the current position is optimal.
+	child := fd.iter
+	diff := realWantOff - realCurOff // Shows direction and magnitude of travel.
+
+	// See if starting from the beginning or end is better.
+	abDiff := diff
+	if diff < 0 {
+		abDiff = -diff
+	}
+	if abDiff > realWantOff {
+		// Starting from the beginning is best.
+		child = dir.childList.Front()
+		diff = realWantOff
+	} else if abDiff > (n - realWantOff) {
+		// Starting from the end is best.
+		child = dir.childList.Back()
+		// (n - 1) because the last non-nil dirent represents the (n-1)th offset.
+		diff = realWantOff - (n - 1)
+	}
+
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.diskDirent != nil {
+			if diff == 0 {
+				if child != fd.iter {
+					dir.childList.Remove(fd.iter)
+					dir.childList.InsertBefore(child, fd.iter)
+				}
+
+				fd.off = offset
+				return offset, nil
+			}
+
+			if diff < 0 {
+				diff++
+				child = child.Prev()
+			} else {
+				diff--
+				child = child.Next()
+			}
+			continue
+		}
+
+		if diff < 0 {
+			child = child.Prev()
+		} else {
+			child = child.Next()
+		}
+	}
+
+	// Reaching here indicates that the offset is beyond the end of the childList.
+	dir.childList.Remove(fd.iter)
+	dir.childList.PushBack(fd.iter)
+	fd.off = offset
+	return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	// mmap(2) specifies that EACCES should be returned for non-regular file fds.
+	return syserror.EACCES
 }
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fs/ext/disklayout/dirent.go
index 685bf57b8..417b6cf65 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent.go
@@ -21,6 +21,9 @@ import (
 const (
 	// MaxFileName is the maximum length of an ext fs file's name.
 	MaxFileName = 255
+
+	// DirentSize is the size of ext dirent structures.
+	DirentSize = 263
 )
 
 var (
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fs/ext/disklayout/dirent_test.go
index cc6dff2c9..934919f8a 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fs/ext/disklayout/dirent_test.go
@@ -21,8 +21,6 @@ import (
 // TestDirentSize tests that the dirent structs are of the correct
 // size.
 func TestDirentSize(t *testing.T) {
-	want := uintptr(263)
-
-	assertSize(t, DirentOld{}, want)
-	assertSize(t, DirentNew{}, want)
+	assertSize(t, DirentOld{}, uintptr(DirentSize))
+	assertSize(t, DirentNew{}, uintptr(DirentSize))
 }
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fs/ext/disklayout/superblock.go
index 7a337a5e0..8bb327006 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock.go
+++ b/pkg/sentry/fs/ext/disklayout/superblock.go
@@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures {
 // This is not exhaustive, unused features are not listed.
 const (
 	// SbDirentFileType indicates that directory entries record the file type.
-	// We should use struct ext4_dir_entry_2 for dirents then.
+	// We should use struct DirentNew for dirents then.
 	SbDirentFileType = 0x2
 
 	// SbRecovery indicates that the filesystem needs recovery.
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index d303dd122..c3e2c9efb 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -22,6 +22,7 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -62,8 +63,40 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, err
 	return fd.NewReadWriter(devFd), nil
 }
 
+// isCompatible checks if the superblock has feature sets which are compatible.
+// We only need to check the superblock incompatible feature set since we are
+// mounting readonly. We will also need to check readonly compatible feature
+// set when mounting for read/write.
+func isCompatible(sb disklayout.SuperBlock) bool {
+	// Please note that what is being checked is limited based on the fact that we
+	// are mounting readonly and that we are not journaling. When mounting
+	// read/write or with a journal, this must be reevaluated.
+	incompatFeatures := sb.IncompatibleFeatures()
+	if incompatFeatures.MetaBG {
+		log.Warningf("ext fs: meta block groups are not supported")
+		return false
+	}
+	if incompatFeatures.MMP {
+		log.Warningf("ext fs: multiple mount protection is not supported")
+		return false
+	}
+	if incompatFeatures.Encrypted {
+		log.Warningf("ext fs: encrypted inodes not supported")
+		return false
+	}
+	if incompatFeatures.InlineData {
+		log.Warningf("ext fs: inline files not supported")
+		return false
+	}
+	return true
+}
+
 // NewFilesystem implements vfs.FilesystemType.NewFilesystem.
 func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
+	// EACCES should be returned according to mount(2). Filesystem independent
+	// flags (like readonly) are currently not available in pkg/sentry/vfs.
+
 	dev, err := getDeviceFd(source, opts)
 	if err != nil {
 		return nil, nil, err
@@ -82,15 +115,21 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred
 		return nil, nil, syserror.EINVAL
 	}
 
+	// Refuse to mount if the filesystem is incompatible.
+	if !isCompatible(fs.sb) {
+		return nil, nil, syserror.EINVAL
+	}
+
 	fs.bgs, err = readBlockGroups(dev, fs.sb)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode)
+	rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
 	if err != nil {
 		return nil, nil, err
 	}
+	rootInode.incRef()
 
 	return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
 }
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
index 6396886cc..6517e7ea5 100644
--- a/pkg/sentry/fs/ext/ext_test.go
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -44,7 +44,7 @@ var (
 // setUp opens imagePath as an ext Filesystem and returns all necessary
 // elements required to run tests. If error is non-nil, it also returns a tear
 // down function which must be called after the test is run for clean up.
-func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
 	localImagePath, err := testutil.FindFile(imagePath)
 	if err != nil {
 		return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
@@ -55,20 +55,28 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
 		return nil, nil, nil, nil, err
 	}
 
-	// Mount the ext4 fs and retrieve the inode structure for the file.
-	mockCtx := contexttest.Context(t)
-	fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("extfs", filesystemType{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
 	}
 
+	root := mntns.Root()
+
 	tearDown := func() {
+		root.DecRef()
+
 		if err := f.Close(); err != nil {
 			t.Fatalf("tearDown failed: %v", err)
 		}
 	}
-	return mockCtx, fs, d, tearDown, nil
+	return ctx, vfsObj, &root, tearDown, nil
 }
 
 // TestRootDir tests that the root directory inode is correctly initialized and
@@ -126,15 +134,15 @@ func TestRootDir(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			_, _, vfsd, tearDown, err := setUp(t, test.image)
+			_, _, vd, tearDown, err := setUp(t, test.image)
 			if err != nil {
 				t.Fatalf("setUp failed: %v", err)
 			}
 			defer tearDown()
 
-			d, ok := vfsd.Impl().(*dentry)
+			d, ok := vd.Dentry().Impl().(*dentry)
 			if !ok {
-				t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
+				t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl())
 			}
 
 			// Offload inode contents into local structs for comparison.
@@ -329,15 +337,15 @@ func TestFilesystemInit(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			_, vfsfs, _, tearDown, err := setUp(t, test.image)
+			_, _, vd, tearDown, err := setUp(t, test.image)
 			if err != nil {
 				t.Fatalf("setUp failed: %v", err)
 			}
 			defer tearDown()
 
-			fs, ok := vfsfs.Impl().(*filesystem)
+			fs, ok := vd.Mount().Filesystem().Impl().(*filesystem)
 			if !ok {
-				t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
+				t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl())
 			}
 
 			// Offload superblock and block group descriptors contents into
diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go
index 44fb9c01f..1b9bf449b 100644
--- a/pkg/sentry/fs/ext/extent_file.go
+++ b/pkg/sentry/fs/ext/extent_file.go
@@ -150,7 +150,11 @@ func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) {
 		return 0, io.EOF
 	}
 
-	return f.read(&f.root, uint64(off), dst)
+	n, err := f.read(&f.root, uint64(off), dst)
+	if n < len(dst) && err == nil {
+		err = io.EOF
+	}
+	return n, err
 }
 
 // read is the recursive step of extentFile.ReadAt which traverses the extent
diff --git a/pkg/sentry/fs/ext/file_description.go b/pkg/sentry/fs/ext/file_description.go
new file mode 100644
index 000000000..d244cf1e7
--- /dev/null
+++ b/pkg/sentry/fs/ext/file_description.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// fileDescription is embedded by ext implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+
+	// flags is the same as vfs.OpenOptions.Flags which are passed to
+	// vfs.FilesystemImpl.OpenAt.
+	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
+	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
+	// Only close(2), fstat(2), fstatfs(2) should work.
+	flags uint32
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *fileDescription) OnClose() error { return nil }
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	var stat linux.Statfs
+	fd.filesystem().statTo(&stat)
+	return stat, nil
+}
+
+// Readiness implements waiter.Waitable.Readiness analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
+	return waiter.EventIn | waiter.EventOut
+}
+
+// EventRegister implements waiter.Waitable.EventRegister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {}
+
+// EventUnregister implements waiter.Waitable.EventUnregister analogously to
+// file_operations::poll == NULL in Linux.
+func (fd *fileDescription) EventUnregister(e *waiter.Entry) {}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error {
+	return nil
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// ioctl(2) specifies that ENOTTY must be returned if the file descriptor is
+	// not associated with a character special device (which is unimplemented).
+	return 0, syserror.ENOTTY
+}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
index 45b43b9e2..e08839f48 100644
--- a/pkg/sentry/fs/ext/filesystem.go
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -15,20 +15,27 @@
 package ext
 
 import (
+	"errors"
 	"io"
 	"sync"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+var (
+	// errResolveDirent indicates that the vfs.ResolvingPath.Component() does
+	// not exist on the dentry tree but does exist on disk. So it has to be read in
+	// using the in-memory dirent and added to the dentry tree. Usually indicates
+	// the need to lock filesystem.mu for writing.
+	errResolveDirent = errors.New("resolve path component using dirent")
+)
+
 // filesystem implements vfs.FilesystemImpl.
 type filesystem struct {
-	// TODO(b/134676337): Remove when all methods have been implemented.
-	vfs.FilesystemImpl
-
 	vfsfs vfs.Filesystem
 
 	// mu serializes changes to the Dentry tree.
@@ -44,8 +51,8 @@ type filesystem struct {
 	// inodeCache maps absolute inode numbers to the corresponding Inode struct.
 	// Inodes should be removed from this once their reference count hits 0.
 	//
-	// Protected by mu because every addition and removal from this corresponds to
-	// a change in the dentry tree.
+	// Protected by mu because most additions (see IterDirents) and all removals
+	// from this correspond to a change in the dentry tree.
 	inodeCache map[uint32]*inode
 
 	// sb represents the filesystem superblock. Immutable after initialization.
@@ -59,16 +66,172 @@ type filesystem struct {
 // Compiles only if filesystem implements vfs.FilesystemImpl.
 var _ vfs.FilesystemImpl = (*filesystem)(nil)
 
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
+// stepLocked resolves rp.Component() in parent directory vfsd, following the
+// resolved child's symlink target when rp requires it. write tells whether the
+// caller holds filesystem.mu for writing; only then may an inode that exists on
+// disk be added to the dentry tree, otherwise errResolveDirent is returned.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+//     - !rp.Done().
+//     - inode == vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+	if !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, nil, err
+	}
+
+	for {
+		nextVFSD, err := rp.ResolveComponent(vfsd)
+		if err != nil {
+			return nil, nil, err
+		}
+		if nextVFSD == nil {
+			// Since the Dentry tree is not the sole source of truth for extfs, if it's
+			// not in the Dentry tree, it might need to be pulled from disk.
+			childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+			if !ok {
+				// The underlying inode does not exist on disk.
+				return nil, nil, syserror.ENOENT
+			}
+
+			if !write {
+				// filesystem.mu must be held for writing to add to the dentry tree.
+				return nil, nil, errResolveDirent
+			}
+
+			// Create and add the component's dirent to the dentry tree.
+			fs := rp.Mount().Filesystem().Impl().(*filesystem)
+			childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
+			if err != nil {
+				return nil, nil, err
+			}
+			// incRef because this is being added to the dentry tree.
+			childInode.incRef()
+			child := newDentry(childInode)
+			vfsd.InsertChild(&child.vfsd, rp.Component())
+
+			// Continue as usual now that nextVFSD is not nil.
+			nextVFSD = &child.vfsd
+		}
+		nextInode := nextVFSD.Impl().(*dentry).inode
+		if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
+			if err := rp.HandleSymlink(nextInode.impl.(*symlink).target); err != nil {
+				return nil, nil, err
+			}
+			continue
+		}
+		rp.Advance()
+		return nextVFSD, nextInode, nil
+	}
+}
+
+// walkLocked resolves rp to an existing file. The write parameter
+// passed tells if the caller has acquired filesystem.mu for writing or not.
+// If set to true, additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*dentry).inode
+	for !rp.Done() {
+		var err error
+		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	if rp.MustBeDir() && !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, inode, nil
+}
+
+// walkParentLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp. The write parameter passed tells if the
+// caller has acquired filesystem.mu for writing or not. If set to true,
+// additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
+//
+// Preconditions:
+//     - filesystem.mu must be locked (for writing if write param is true).
+//     - !rp.Done().
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*dentry).inode
+	for !rp.Final() {
+		var err error
+		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	if !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, inode, nil
+}
+
+// walk resolves rp to an existing file. If parent is set to true, it resolves
+// the rp till the parent of the last component which should be an existing
+// directory. If parent is false then resolves rp entirely. Attempts to resolve
+// the path as far as it can with a read lock and upgrades the lock if needed.
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+	var (
+		vfsd  *vfs.Dentry
+		inode *inode
+		err   error
+	)
+
+	// Try walking with the hopes that all dentries have already been pulled out
+	// of disk. This reduces congestion (allows concurrent walks).
+	fs.mu.RLock()
+	if parent {
+		vfsd, inode, err = walkParentLocked(rp, false)
+	} else {
+		vfsd, inode, err = walkLocked(rp, false)
+	}
+	fs.mu.RUnlock()
+
+	if err == errResolveDirent {
+		// Upgrade lock and continue walking. Lock upgrading in the middle of the
+		// walk is fine as this is a read only filesystem.
+		fs.mu.Lock()
+		if parent {
+			vfsd, inode, err = walkParentLocked(rp, true)
+		} else {
+			vfsd, inode, err = walkLocked(rp, true)
+		}
+		fs.mu.Unlock()
+	}
+
+	return vfsd, inode, err
+}
+
+// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
 // It creates a new one with the given inode number if one does not exist.
+// The caller must increment the ref count if adding this to the dentry tree.
 //
-// Precondition: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
+// Precondition: must be holding fs.mu for writing.
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
 	if in, ok := fs.inodeCache[inodeNum]; ok {
 		return in, nil
 	}
 
-	in, err := newInode(ctx, fs, inodeNum)
+	in, err := newInode(fs, inodeNum)
 	if err != nil {
 		return nil, err
 	}
@@ -77,10 +240,92 @@ func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*i
 	return in, nil
 }
 
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
+// statTo writes the statfs fields to the output parameter.
+func (fs *filesystem) statTo(stat *linux.Statfs) {
+	stat.Type = uint64(fs.sb.Magic())
+	stat.BlockSize = int64(fs.sb.BlockSize())
+	stat.Blocks = fs.sb.BlocksCount()
+	stat.BlocksFree = fs.sb.FreeBlocksCount()
+	stat.BlocksAvailable = fs.sb.FreeBlocksCount()
+	stat.Files = uint64(fs.sb.InodesCount())
+	stat.FilesFree = uint64(fs.sb.FreeInodesCount())
+	stat.NameLength = disklayout.MaxFileName
+	stat.FragmentSize = int64(fs.sb.BlockSize())
+	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		if !inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+
+	inode.incRef()
+	return vfsd, nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	vfsd, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+
+	// EROFS is returned if write access is needed.
+	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
+		return nil, syserror.EROFS
+	}
+	return inode.open(rp, vfsd, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	return symlink.target, nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	inode.statTo(&stat)
+	return stat, nil
 }
 
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	if _, _, err := fs.walk(rp, false); err != nil {
+		return linux.Statfs{}, err
+	}
+
+	var stat linux.Statfs
+	fs.statTo(&stat)
+	return stat, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {}
+
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
 	// This is a readonly filesystem for now.
@@ -89,42 +334,110 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 
 // The vfs.FilesystemImpl functions below return EROFS because their respective
 // man pages say that EROFS must be returned if the path resolves to a file on
-// a read-only filesystem.
+// this read-only filesystem.
 
-// TODO(b/134676337): Implement path traversal and return EROFS only if the
-// path resolves to a Dentry within ext fs.
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	if _, _, err := fs.walk(rp, true); err != nil {
+		return err
+	}
+
+	return syserror.EROFS
+}
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	if _, _, err := fs.walk(rp, true); err != nil {
+		return err
+	}
+
 	return syserror.EROFS
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	_, _, err := fs.walk(rp, true)
+	if err != nil {
+		return err
+	}
+
 	return syserror.EROFS
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
 func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	if rp.Done() {
+		return syserror.ENOENT
+	}
+
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
 	return syserror.EROFS
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	if !inode.isDir() {
+		return syserror.ENOTDIR
+	}
+
 	return syserror.EROFS
 }
 
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
 	return syserror.EROFS
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	_, _, err := fs.walk(rp, true)
+	if err != nil {
+		return err
+	}
+
 	return syserror.EROFS
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	_, inode, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+
+	if inode.isDir() {
+		return syserror.EISDIR
+	}
+
 	return syserror.EROFS
 }
diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go
deleted file mode 100644
index 67a538ba0..000000000
--- a/pkg/sentry/fs/ext/inline_file.go
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
-	"io"
-)
-
-// inlineFile is a type of regular file. All the data here is stored in the
-// inode.Data() array.
-type inlineFile struct {
-	regFile regularFile
-}
-
-// Compiles only if inlineFile implements io.ReaderAt.
-var _ io.ReaderAt = (*inlineFile)(nil)
-
-// newInlineFile is the inlineFile constructor.
-func newInlineFile(regFile regularFile) *inlineFile {
-	file := &inlineFile{regFile: regFile}
-	file.regFile.impl = file
-	return file
-}
-
-// ReadAt implements io.ReaderAt.ReadAt.
-func (f *inlineFile) ReadAt(dst []byte, off int64) (int, error) {
-	if len(dst) == 0 {
-		return 0, nil
-	}
-
-	size := f.regFile.inode.diskInode.Size()
-	if uint64(off) >= size {
-		return 0, io.EOF
-	}
-
-	to := uint64(off) + uint64(len(dst))
-	if to > size {
-		to = size
-	}
-
-	n := copy(dst, f.regFile.inode.diskInode.Data()[off:to])
-	return n, nil
-}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
index 364980e4c..178bd6376 100644
--- a/pkg/sentry/fs/ext/inode.go
+++ b/pkg/sentry/fs/ext/inode.go
@@ -15,12 +15,14 @@
 package ext
 
 import (
+	"fmt"
 	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -31,13 +33,11 @@ import (
 //
 // Implementations:
 //    inode --
-//           |-- pipe
 //           |-- dir
 //           |-- symlink
 //           |-- regular--
 //                       |-- extent file
 //                       |-- block map file
-//                       |-- inline file
 type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
@@ -92,7 +92,7 @@ func (in *inode) decRef(fs *filesystem) {
 
 // newInode is the inode constructor. Reads the inode off disk. Identifies
 // inodes based on the absolute inode number on disk.
-func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, error) {
+func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	if inodeNum == 0 {
 		panic("inode number 0 on ext filesystems is not possible")
 	}
@@ -117,7 +117,6 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
 
 	// Build the inode based on its type.
 	inode := inode{
-		refs:      1,
 		inodeNum:  inodeNum,
 		dev:       fs.dev,
 		blkSize:   blkSize,
@@ -138,15 +137,76 @@ func newInode(ctx context.Context, fs *filesystem, inodeNum uint32) (*inode, err
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		return &newDirectroy(inode).inode, nil
-	case linux.ModeNamedPipe:
-		return &newNamedPipe(ctx, inode).inode, nil
+		f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		if err != nil {
+			return nil, err
+		}
+		return &f.inode, nil
 	default:
-		// TODO(b/134676337): Return appropriate errors for sockets and devices.
+		// TODO(b/134676337): Return appropriate errors for sockets, pipes and devices.
 		return nil, syserror.EINVAL
 	}
 }
 
+// open creates and returns a file description for the dentry passed in.
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	switch in.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.flags = flags
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably. This check is not necessary for a read
+		// only filesystem but will be required when write is implemented.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		fd.flags = flags
+		return &fd.vfsfd, nil
+	case *symlink:
+		if flags&linux.O_PATH == 0 {
+			// Can't open symlinks without O_PATH.
+			return nil, syserror.ELOOP
+		}
+		var fd symlinkFD
+		fd.flags = flags
+		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		return &fd.vfsfd, nil
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
+	}
+}
+
+func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+}
+
+// statTo writes the statx fields to the output parameter.
+func (in *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+		linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME
+	stat.Blksize = uint32(in.blkSize)
+	stat.Mode = uint16(in.diskInode.Mode())
+	stat.Nlink = uint32(in.diskInode.LinksCount())
+	stat.UID = uint32(in.diskInode.UID())
+	stat.GID = uint32(in.diskInode.GID())
+	stat.Ino = uint64(in.inodeNum)
+	stat.Size = in.diskInode.Size()
+	stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
+	stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
+	stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+	// TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
+	// (including metadata blocks) required to represent this file.
+}
+
 // getBGNum returns the block group number that a given inode belongs to.
 func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
 	return (inodeNum - 1) / inodesPerGrp
diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go
deleted file mode 100644
index 0f3af1b53..000000000
--- a/pkg/sentry/fs/ext/named_pipe.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-// namedPipe represents a named pipe inode. It is currently just a wrapper
-// around pkg/sentry/kernel/pipe.
-type namedPipe struct {
-	inode inode
-
-	p        *pipe.Pipe
-	inodeOps fs.InodeOperations
-}
-
-// newNamedPipe is the namedPipe constructor.
-func newNamedPipe(ctx context.Context, inode inode) *namedPipe {
-	file := &namedPipe{inode: inode}
-	file.inode.impl = file
-	file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
-	file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p)
-	return file
-}
diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go
index fb1bd38ef..ffc76ba5b 100644
--- a/pkg/sentry/fs/ext/regular_file.go
+++ b/pkg/sentry/fs/ext/regular_file.go
@@ -16,6 +16,15 @@ package ext
 
 import (
 	"io"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // regularFile represents a regular file's inode. This too follows the
@@ -26,6 +35,9 @@ type regularFile struct {
 
 	// This is immutable. The first field of fileReader implementations must be
 	// regularFile to ensure temporality.
+	// io.ReaderAt is more strict than io.Reader in the sense that a partial read
+	// is always accompanied by an error. If a read spans past the end of file, a
+	// partial read (within file range) is done and io.EOF is returned.
 	impl io.ReaderAt
 }
 
@@ -48,16 +60,6 @@ func newRegularFile(inode inode) (*regularFile, error) {
 		return &file.regFile, nil
 	}
 
-	if inodeFlags.Inline {
-		if inode.diskInode.Size() > 60 {
-			panic("ext fs: inline file larger than 60 bytes")
-		}
-
-		file := newInlineFile(regFile)
-		file.regFile.inode.impl = &file.regFile
-		return &file.regFile, nil
-	}
-
 	file, err := newBlockMapFile(regFile)
 	if err != nil {
 		return nil, err
@@ -66,6 +68,92 @@ func newRegularFile(inode inode) (*regularFile, error) {
 	return &file.regFile, nil
 }
 
-func (f *regularFile) blksUsed(blkSize uint64) uint64 {
-	return (f.inode.diskInode.Size() + blkSize - 1) / blkSize
+func (in *inode) isRegular() bool {
+	_, ok := in.impl.(*regularFile)
+	return ok
+}
+
+// regularFileFD represents a regular file description. It implements
+// vfs.FileDescriptionImpl.
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	off int64
+
+	// offMu serializes operations that may mutate off.
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	safeReader := safemem.FromIOReaderAt{
+		ReaderAt: fd.inode().impl.(*regularFile).impl,
+		Offset:   offset,
+	}
+
+	// Copies data from disk directly into usermem without any intermediate
+	// allocations (if dst is converted into BlockSeq such that it does not need
+	// safe copying).
+	return dst.CopyOutFrom(ctx, safeReader)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock() // Held across PRead so fd.off is read and advanced atomically.
+	defer fd.offMu.Unlock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// write(2) specifies that EBADF must be returned if the fd is not open for
+	// writing.
+	return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock() // Held across PWrite so fd.off is read and advanced atomically.
+	defer fd.offMu.Unlock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as specified.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += int64(fd.inode().diskInode.Size())
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	// TODO(b/134676337): Implement mmap(2).
+	return syserror.ENODEV
 }
diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go
index 9f498d989..e06548a98 100644
--- a/pkg/sentry/fs/ext/symlink.go
+++ b/pkg/sentry/fs/ext/symlink.go
@@ -15,6 +15,10 @@
 package ext
 
 import (
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -43,8 +47,8 @@ func newSymlink(inode inode) (*symlink, error) {
 		}
 
 		link = make([]byte, size)
-		if n, _ := regFile.impl.ReadAt(link, 0); uint64(n) < size {
-			return nil, syserror.EIO
+		if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size {
+			return nil, err
 		}
 	}
 
@@ -52,3 +56,56 @@ func newSymlink(inode inode) (*symlink, error) {
 	file.inode.impl = file
 	return file, nil
 }
+
+func (in *inode) isSymlink() bool {
+	_, ok := in.impl.(*symlink)
+	return ok
+}
+
+// symlinkFD represents a symlink file description and implements
+// vfs.FileDescriptionImpl. It may only be used if open options contain
+// O_PATH. For this reason most of the functions return EBADF.
+type symlinkFD struct {
+	fileDescription
+}
+
+// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *symlinkFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	return syserror.EBADF
+}
author	Ayush Ranjan <ayushranjan@google.com>	2019-08-07 14:22:19 -0700
committer	gVisor bot <gvisor-bot@google.com>	2019-08-07 14:23:42 -0700
commit	1c9781a4edce5fa9688f868149a2506f2ec5fa86 (patch)
tree	6538030ca97dcdb175bed5e1dc7cea382ff06ee1 /pkg/sentry/fs
parent	79cc4397fd99fbdd5c74ac5bb7804a463d7981d8 (diff)