Diffstat (limited to 'pkg')
-rw-r--r--  pkg/eventchannel/BUILD                               |  13
-rw-r--r--  pkg/eventchannel/event.go                            |  64
-rw-r--r--  pkg/eventchannel/event_test.go                       | 146
-rw-r--r--  pkg/eventchannel/rate.go                             |  54
-rw-r--r--  pkg/fdnotifier/poll_unsafe.go                        |   8
-rw-r--r--  pkg/seccomp/seccomp_test_victim.go                   |   2
-rw-r--r--  pkg/sentry/fs/ext/BUILD                              |  26
-rw-r--r--  pkg/sentry/fs/ext/block_map_file.go                  |  65
-rw-r--r--  pkg/sentry/fs/ext/dentry.go                          |   2
-rw-r--r--  pkg/sentry/fs/ext/directory.go                       |  36
-rw-r--r--  pkg/sentry/fs/ext/ext.go                             |  25
-rw-r--r--  pkg/sentry/fs/ext/ext_test.go                        |   2
-rw-r--r--  pkg/sentry/fs/ext/extent_file.go                     | 260
-rw-r--r--  pkg/sentry/fs/ext/extent_test.go                     | 161
-rw-r--r--  pkg/sentry/fs/ext/filesystem.go                      |  30
-rw-r--r--  pkg/sentry/fs/ext/inline_file.go                     |  66
-rw-r--r--  pkg/sentry/fs/ext/inode.go                           | 158
-rw-r--r--  pkg/sentry/fs/ext/named_pipe.go                      |  40
-rw-r--r--  pkg/sentry/fs/ext/regular_file.go                    |  85
-rw-r--r--  pkg/sentry/fs/ext/symlink.go                         |  57
-rw-r--r--  pkg/sentry/fs/ext/utils.go                           |  33
-rw-r--r--  pkg/sentry/fs/mounts.go                              |   2
-rw-r--r--  pkg/sentry/kernel/kernel.go                          |  62
-rw-r--r--  pkg/sentry/kernel/task_context.go                    |  11
-rw-r--r--  pkg/sentry/loader/loader.go                          | 107
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go                 |   3
-rw-r--r--  pkg/sentry/syscalls/linux/sys_thread.go              |   2
-rw-r--r--  pkg/tcpip/link/fdbased/mmap_amd64.go                 |   2
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_amd64.s          |   7
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go  |   2
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_unsafe.go        |   8
-rw-r--r--  pkg/tcpip/link/rawfile/rawfile_unsafe.go             |   6
-rw-r--r--  pkg/unet/BUILD                                       |   2
-rw-r--r--  pkg/unet/unet_unsafe.go                              |  19
34 files changed, 1289 insertions, 277 deletions
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 4c336ea84..9961baaa9 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -1,4 +1,4 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
package(licenses = ["notice"])
@@ -7,6 +7,7 @@ go_library(
name = "eventchannel",
srcs = [
"event.go",
+ "rate.go",
],
importpath = "gvisor.dev/gvisor/pkg/eventchannel",
visibility = ["//:sandbox"],
@@ -16,6 +17,7 @@ go_library(
"//pkg/unet",
"@com_github_golang_protobuf//proto:go_default_library",
"@com_github_golang_protobuf//ptypes:go_default_library_gen",
+ "@org_golang_x_time//rate:go_default_library",
],
)
@@ -30,3 +32,12 @@ go_proto_library(
proto = ":eventchannel_proto",
visibility = ["//:sandbox"],
)
+
+go_test(
+ name = "eventchannel_test",
+ srcs = ["event_test.go"],
+ embed = [":eventchannel"],
+ deps = [
+ "@com_github_golang_protobuf//proto:go_default_library",
+ ],
+)
diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go
index f6d26532b..d37ad0428 100644
--- a/pkg/eventchannel/event.go
+++ b/pkg/eventchannel/event.go
@@ -43,18 +43,36 @@ type Emitter interface {
Close() error
}
-var (
- mu sync.Mutex
- emitters = make(map[Emitter]struct{})
-)
+// DefaultEmitter is the default emitter. Calls to Emit and AddEmitter are sent
+// to this Emitter.
+var DefaultEmitter = &multiEmitter{}
-// Emit emits a message using all added emitters.
+// Emit is a helper method that calls DefaultEmitter.Emit.
func Emit(msg proto.Message) error {
- mu.Lock()
- defer mu.Unlock()
+ _, err := DefaultEmitter.Emit(msg)
+ return err
+}
+
+// AddEmitter is a helper method that calls DefaultEmitter.AddEmitter.
+func AddEmitter(e Emitter) {
+ DefaultEmitter.AddEmitter(e)
+}
+
+// multiEmitter is an Emitter that forwards messages to multiple Emitters.
+type multiEmitter struct {
+ // mu protects emitters.
+ mu sync.Mutex
+ // emitters is initialized lazily in AddEmitter.
+ emitters map[Emitter]struct{}
+}
+
+// Emit emits a message using all added emitters.
+func (me *multiEmitter) Emit(msg proto.Message) (bool, error) {
+ me.mu.Lock()
+ defer me.mu.Unlock()
var err error
- for e := range emitters {
+ for e := range me.emitters {
hangup, eerr := e.Emit(msg)
if eerr != nil {
if err == nil {
@@ -68,18 +86,36 @@ func Emit(msg proto.Message) error {
}
if hangup {
log.Infof("Hangup on eventchannel emitter %v.", e)
- delete(emitters, e)
+ delete(me.emitters, e)
}
}
- return err
+ return false, err
}
// AddEmitter adds a new emitter.
-func AddEmitter(e Emitter) {
- mu.Lock()
- defer mu.Unlock()
- emitters[e] = struct{}{}
+func (me *multiEmitter) AddEmitter(e Emitter) {
+ me.mu.Lock()
+ defer me.mu.Unlock()
+ if me.emitters == nil {
+ me.emitters = make(map[Emitter]struct{})
+ }
+ me.emitters[e] = struct{}{}
+}
+
+// Close closes all emitters. If any Close call errors, it returns the first
+// one encountered.
+func (me *multiEmitter) Close() error {
+ me.mu.Lock()
+ defer me.mu.Unlock()
+ var err error
+ for e := range me.emitters {
+ if eerr := e.Close(); err == nil && eerr != nil {
+ err = eerr
+ }
+ delete(me.emitters, e)
+ }
+ return err
}
func marshal(msg proto.Message) ([]byte, error) {
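
The refactor above keeps the package-level Emit and AddEmitter helpers but routes them through DefaultEmitter, so the fan-out logic is now reusable and testable. A minimal usage sketch, assuming a hypothetical debugEmitter type that is not part of this change:

package main

import (
	"fmt"

	"github.com/golang/protobuf/proto"
	"gvisor.dev/gvisor/pkg/eventchannel"
)

// debugEmitter is a hypothetical Emitter that just prints messages.
type debugEmitter struct{}

// Emit implements eventchannel.Emitter.Emit. Returning false keeps the
// emitter registered; returning true would signal a hangup.
func (debugEmitter) Emit(msg proto.Message) (bool, error) {
	fmt.Printf("event: %v\n", msg)
	return false, nil
}

// Close implements eventchannel.Emitter.Close.
func (debugEmitter) Close() error { return nil }

func main() {
	// Register with the package-level default emitter, exactly as callers
	// did before this change.
	eventchannel.AddEmitter(debugEmitter{})

	// Every subsequent Emit fans out to all registered emitters. A real
	// caller would pass a concrete event proto; nil keeps the sketch
	// dependency-free.
	_ = eventchannel.Emit(nil)
}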
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
new file mode 100644
index 000000000..3649097d6
--- /dev/null
+++ b/pkg/eventchannel/event_test.go
@@ -0,0 +1,146 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventchannel
+
+import (
+ "fmt"
+ "sync"
+ "testing"
+ "time"
+
+ "github.com/golang/protobuf/proto"
+)
+
+// testEmitter is an emitter that can be used in tests. It records all events
+// emitted, and whether it has been closed.
+type testEmitter struct {
+ // mu protects all fields below.
+ mu sync.Mutex
+
+ // events contains all emitted events.
+ events []proto.Message
+
+ // closed records whether Close() was called.
+ closed bool
+}
+
+// Emit implements Emitter.Emit.
+func (te *testEmitter) Emit(msg proto.Message) (bool, error) {
+ te.mu.Lock()
+ defer te.mu.Unlock()
+ te.events = append(te.events, msg)
+ return false, nil
+}
+
+// Close implements Emitter.Close.
+func (te *testEmitter) Close() error {
+ te.mu.Lock()
+ defer te.mu.Unlock()
+ if te.closed {
+ return fmt.Errorf("closed called twice")
+ }
+ te.closed = true
+ return nil
+}
+
+// testMessage implements proto.Message for testing.
+type testMessage struct {
+ proto.Message
+
+ // name is the name of the message, used by tests to compare messages.
+ name string
+}
+
+func TestMultiEmitter(t *testing.T) {
+ // Create three testEmitters, tied together in a multiEmitter.
+ me := &multiEmitter{}
+ var emitters []*testEmitter
+ for i := 0; i < 3; i++ {
+ te := &testEmitter{}
+ emitters = append(emitters, te)
+ me.AddEmitter(te)
+ }
+
+ // Emit three messages to multiEmitter.
+ names := []string{"foo", "bar", "baz"}
+ for _, name := range names {
+ m := testMessage{name: name}
+ if _, err := me.Emit(m); err != nil {
+ t.Fatal("me.Emit(%v) failed: %v", m, err)
+ }
+ }
+
+ // All three emitters should have all three events.
+ for _, te := range emitters {
+ if got, want := len(te.events), len(names); got != want {
+ t.Fatalf("emitter got %d events, want %d", got, want)
+ }
+ for i, name := range names {
+ if got := te.events[i].(testMessage).name; got != name {
+ t.Errorf("emitter got message with name %q, want %q", got, name)
+ }
+ }
+ }
+
+ // Close multiEmitter.
+ if err := me.Close(); err != nil {
+ t.Fatal("me.Close() failed: %v", err)
+ }
+
+ // All testEmitters should be closed.
+ for _, te := range emitters {
+ if !te.closed {
+ t.Errorf("te.closed got false, want true")
+ }
+ }
+}
+
+func TestRateLimitedEmitter(t *testing.T) {
+ // Create a rate-limited emitter that wraps a testEmitter.
+ te := &testEmitter{}
+ max := float64(5) // events per second
+ burst := 10 // events
+ rle := RateLimitedEmitterFrom(te, max, burst)
+
+ // Send 50 messages in one shot.
+ for i := 0; i < 50; i++ {
+ if _, err := rle.Emit(testMessage{}); err != nil {
+ t.Fatalf("rle.Emit failed: %v", err)
+ }
+ }
+
+ // We should have received only 10 messages.
+ if got, want := len(te.events), 10; got != want {
+ t.Errorf("got %d events, want %d", got, want)
+ }
+
+ // Sleep for a second and then send another 50.
+ time.Sleep(1 * time.Second)
+ for i := 0; i < 50; i++ {
+ if _, err := rle.Emit(testMessage{}); err != nil {
+ t.Fatalf("rle.Emit failed: %v", err)
+ }
+ }
+
+ // We should have at least 5 more messages, plus maybe a few more if the
+ // test ran slowly.
+ got, wantAtLeast, wantAtMost := len(te.events), 15, 20
+ if got < wantAtLeast {
+ t.Errorf("got %d events, want at least %d", got, wantAtLeast)
+ }
+ if got > wantAtMost {
+ t.Errorf("got %d events, want at most %d", got, wantAtMost)
+ }
+}
diff --git a/pkg/eventchannel/rate.go b/pkg/eventchannel/rate.go
new file mode 100644
index 000000000..179226c92
--- /dev/null
+++ b/pkg/eventchannel/rate.go
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventchannel
+
+import (
+ "github.com/golang/protobuf/proto"
+ "golang.org/x/time/rate"
+)
+
+// rateLimitedEmitter wraps an emitter and limits events to the given limits.
+// Events that would exceed the limit are discarded.
+type rateLimitedEmitter struct {
+ inner Emitter
+ limiter *rate.Limiter
+}
+
+// RateLimitedEmitterFrom creates a new event channel emitter that wraps the
+// existing emitter and enforces rate limits. The limits are imposed via a
+// token bucket, with `maxRate` events per second, with burst size of `burst`
+// events. See the golang.org/x/time/rate package and
+// https://en.wikipedia.org/wiki/Token_bucket for more information about token
+// buckets generally.
+func RateLimitedEmitterFrom(inner Emitter, maxRate float64, burst int) Emitter {
+ return &rateLimitedEmitter{
+ inner: inner,
+ limiter: rate.NewLimiter(rate.Limit(maxRate), burst),
+ }
+}
+
+// Emit implements EventEmitter.Emit.
+func (rle *rateLimitedEmitter) Emit(msg proto.Message) (bool, error) {
+ if !rle.limiter.Allow() {
+ // Drop event.
+ return false, nil
+ }
+ return rle.inner.Emit(msg)
+}
+
+// Close implements EventEmitter.Close.
+func (rle *rateLimitedEmitter) Close() error {
+ return rle.inner.Close()
+}
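
RateLimitedEmitterFrom delegates throttling to golang.org/x/time/rate. A small stand-alone sketch of the token-bucket behavior the wrapper relies on, using the same numbers as the test above (illustrative only, not part of the commit):

package main

import (
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// 5 tokens per second with a bucket capacity of 10: the first 10 calls
	// to Allow() succeed immediately (the burst), after which events are
	// admitted at roughly 5 per second and the rest are dropped.
	limiter := rate.NewLimiter(rate.Limit(5), 10)

	allowed := 0
	for i := 0; i < 50; i++ {
		if limiter.Allow() {
			allowed++
		}
	}
	fmt.Println(allowed) // typically prints 10 (the initial burst)
}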
diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go
index ab8857b5e..4225b04dd 100644
--- a/pkg/fdnotifier/poll_unsafe.go
+++ b/pkg/fdnotifier/poll_unsafe.go
@@ -35,8 +35,14 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask {
events: int16(mask.ToLinux()),
}
+ ts := syscall.Timespec{
+ Sec: 0,
+ Nsec: 0,
+ }
+
for {
- n, _, err := syscall.RawSyscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&e)), 1, 0)
+ n, _, err := syscall.RawSyscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(&e)), 1,
+ uintptr(unsafe.Pointer(&ts)), 0, 0, 0)
// Interrupted by signal, try again.
if err == syscall.EINTR {
continue
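
The poll(2) call is replaced by ppoll(2) with a zero timespec, which keeps the call non-blocking; the seccomp and rawfile changes below allow the same syscall. A rough Linux/amd64-only sketch of that pattern in isolation (the helper below is hypothetical, not sentry code):

package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

// pollEvent mirrors struct pollfd.
type pollEvent struct {
	fd      int32
	events  int16
	revents int16
}

// nonBlockingPoll polls a single fd once and returns immediately, calling
// ppoll(2) with a zero timespec (equivalent to poll(2) with a 0ms timeout).
func nonBlockingPoll(fd int32, events int16) (int16, error) {
	e := pollEvent{fd: fd, events: events}
	ts := syscall.Timespec{} // Sec: 0, Nsec: 0 => do not block.
	for {
		n, _, errno := syscall.RawSyscall6(syscall.SYS_PPOLL,
			uintptr(unsafe.Pointer(&e)), 1,
			uintptr(unsafe.Pointer(&ts)), 0, 0, 0)
		switch {
		case errno == syscall.EINTR:
			continue // Interrupted by a signal; retry.
		case errno != 0:
			return 0, errno
		case n == 0:
			return 0, nil // Nothing ready.
		default:
			return e.revents, nil
		}
	}
}

func main() {
	const pollIn = 0x1
	revents, err := nonBlockingPoll(0, pollIn) // poll stdin
	fmt.Println(revents, err)
}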
diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go
index 62ae1fd9f..48413f1fb 100644
--- a/pkg/seccomp/seccomp_test_victim.go
+++ b/pkg/seccomp/seccomp_test_victim.go
@@ -70,7 +70,7 @@ func main() {
syscall.SYS_NANOSLEEP: {},
syscall.SYS_NEWFSTATAT: {},
syscall.SYS_OPEN: {},
- syscall.SYS_POLL: {},
+ syscall.SYS_PPOLL: {},
syscall.SYS_PREAD64: {},
syscall.SYS_PSELECT6: {},
syscall.SYS_PWRITE64: {},
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
index 2c15875f5..8158aa522 100644
--- a/pkg/sentry/fs/ext/BUILD
+++ b/pkg/sentry/fs/ext/BUILD
@@ -1,14 +1,35 @@
package(licenses = ["notice"])
load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "ext",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dentry",
+ "Linker": "*dentry",
+ },
+)
go_library(
name = "ext",
srcs = [
+ "block_map_file.go",
"dentry.go",
+ "dentry_list.go",
+ "directory.go",
"ext.go",
+ "extent_file.go",
"filesystem.go",
+ "inline_file.go",
"inode.go",
+ "named_pipe.go",
+ "regular_file.go",
+ "symlink.go",
"utils.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext",
@@ -16,9 +37,14 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/fd",
"//pkg/sentry/context",
+ "//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/usermem",
"//pkg/sentry/vfs",
"//pkg/syserror",
],
diff --git a/pkg/sentry/fs/ext/block_map_file.go b/pkg/sentry/fs/ext/block_map_file.go
new file mode 100644
index 000000000..9aabbd145
--- /dev/null
+++ b/pkg/sentry/fs/ext/block_map_file.go
@@ -0,0 +1,65 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/binary"
+)
+
+// blockMapFile is a type of regular file which uses direct/indirect block
+// addressing to store file data. This was deprecated in ext4.
+type blockMapFile struct {
+ regFile regularFile
+
+ // mu serializes changes to fileToPhysBlks.
+ mu sync.RWMutex
+
+ // fileToPhysBlks maps the file block numbers to the physical block numbers.
+ // The physical block number for the (i)th file block is stored in the (i)th
+ // index. This is initialized (at max) with the first 12 entries. The rest
+ // have to be read in from disk when required. Protected by mu.
+ fileToPhysBlks []uint32
+}
+
+// Compiles only if blockMapFile implements fileReader.
+var _ fileReader = (*blockMapFile)(nil)
+
+// getFileReader implements fileReader.getFileReader.
+func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
+ panic("unimplemented")
+}
+
+// newBlockMapFile is the blockMapFile constructor. It initializes the file to
+// physical blocks map with (at most) the first 12 (direct) blocks.
+func newBlockMapFile(blkSize uint64, regFile regularFile) (*blockMapFile, error) {
+ file := &blockMapFile{regFile: regFile}
+ file.regFile.impl = file
+
+ toFill := uint64(12)
+ blksUsed := regFile.blksUsed(blkSize)
+ if blksUsed < toFill {
+ toFill = blksUsed
+ }
+
+ blkMap := regFile.inode.diskInode.Data()
+ file.fileToPhysBlks = make([]uint32, toFill)
+ for i := uint64(0); i < toFill; i++ {
+ binary.Unmarshal(blkMap[i*4:(i+1)*4], binary.LittleEndian, &file.fileToPhysBlks[i])
+ }
+ return file, nil
+}
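
newBlockMapFile preloads only the 12 direct block pointers; indirect blocks are deferred until they are needed. A stdlib-only sketch of that decoding step, using encoding/binary instead of gVisor's pkg/binary (names are illustrative):

package main

import (
	"encoding/binary"
	"fmt"
)

// directBlocks decodes up to the first 12 little-endian uint32 block
// pointers from an ext2/ext3 inode block map, stopping early if the file
// uses fewer blocks.
func directBlocks(blkMap []byte, blksUsed uint64) []uint32 {
	toFill := uint64(12)
	if blksUsed < toFill {
		toFill = blksUsed
	}
	out := make([]uint32, toFill)
	for i := uint64(0); i < toFill; i++ {
		out[i] = binary.LittleEndian.Uint32(blkMap[i*4 : (i+1)*4])
	}
	return out
}

func main() {
	blkMap := make([]byte, 60) // the inode's Data() region is 60 bytes
	binary.LittleEndian.PutUint32(blkMap[0:4], 1234)
	binary.LittleEndian.PutUint32(blkMap[4:8], 1235)
	fmt.Println(directBlocks(blkMap, 2)) // [1234 1235]
}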
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fs/ext/dentry.go
index 054fb42b6..19c9b3b2d 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fs/ext/dentry.go
@@ -26,6 +26,8 @@ type dentry struct {
// share a single non-directory Inode (with hard links). inode is
// immutable.
inode *inode
+ // dentryEntry links Dentries into their parent directory.childList.
+ dentryEntry
}
// Compiles only if dentry implements vfs.DentryImpl.
diff --git a/pkg/sentry/fs/ext/directory.go b/pkg/sentry/fs/ext/directory.go
new file mode 100644
index 000000000..ab2b59e44
--- /dev/null
+++ b/pkg/sentry/fs/ext/directory.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+// directory represents a directory inode. It holds the childList in memory.
+type directory struct {
+ inode inode
+
+ // childList is a list containing (1) child Dentries and (2) fake Dentries
+ // (with inode == nil) that represent the iteration position of
+ // directoryFDs. childList is used to support directoryFD.IterDirents()
+ // efficiently. childList is immutable.
+ childList dentryList
+
+ // TODO(b/134676337): Add directory navigators.
+}
+
+// newDirectroy is the directory constructor.
+func newDirectroy(inode inode) *directory {
+ // TODO(b/134676337): initialize childList.
+ file := &directory{inode: inode}
+ file.inode.impl = file
+ return file
+}
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
index 10e235fb1..d303dd122 100644
--- a/pkg/sentry/fs/ext/ext.go
+++ b/pkg/sentry/fs/ext/ext.go
@@ -19,9 +19,9 @@ import (
"errors"
"fmt"
"io"
- "os"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -35,11 +35,11 @@ type filesystemType struct{}
// Compiles only if filesystemType implements vfs.FilesystemType.
var _ vfs.FilesystemType = (*filesystemType)(nil)
-// getDeviceFd returns the read seeker to the underlying device.
+// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
// 2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
+func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
if opts.InternalData == nil {
// User mount call.
// TODO(b/134676337): Open the device specified by `source` and return that.
@@ -47,20 +47,19 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, e
}
// NewFilesystem call originated from within the sentry.
- fd, ok := opts.InternalData.(uintptr)
+ devFd, ok := opts.InternalData.(int)
if !ok {
- return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
+ return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
}
- // We do not close this file because that would close the underlying device
- // file descriptor (which is required for reading the fs from disk).
- // TODO(b/134676337): Use pkg/fd instead.
- deviceFile := os.NewFile(fd, source)
- if deviceFile == nil {
- return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
+ if devFd < 0 {
+ return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
}
- return deviceFile, nil
+ // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
+ // of the file descriptor and hence will not close it when it is garbage
+ // collected.
+ return fd.NewReadWriter(devFd), nil
}
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
@@ -88,7 +87,7 @@ func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Cred
return nil, nil, err
}
- rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode)
+ rootInode, err := fs.getOrCreateInode(ctx, disklayout.RootDirInode)
if err != nil {
return nil, nil, err
}
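
Passing the device as a plain int and wrapping it with fd.NewReadWriter sidesteps the problem noted in the removed comment: an os.File created with os.NewFile owns its descriptor and closes it when the File is closed or garbage collected. A rough stdlib-only approximation of a non-owning io.ReaderAt over a host fd (the fdReaderAt type is hypothetical):

package main

import (
	"fmt"
	"io"
	"os"
	"syscall"
)

// fdReaderAt reads from a raw host file descriptor with pread(2). It holds
// no *os.File, so nothing ever closes the fd behind the caller's back.
type fdReaderAt struct {
	fd int
}

// ReadAt implements io.ReaderAt.
func (r fdReaderAt) ReadAt(p []byte, off int64) (int, error) {
	n, err := syscall.Pread(r.fd, p, off)
	if err == nil && n < len(p) {
		err = io.EOF // short read at end of file
	}
	return n, err
}

func main() {
	f, err := os.Open("/etc/hostname") // stand-in for the device image
	if err != nil {
		fmt.Println(err)
		return
	}
	defer f.Close()

	var dev io.ReaderAt = fdReaderAt{fd: int(f.Fd())}
	buf := make([]byte, 16)
	n, _ := dev.ReadAt(buf, 0)
	fmt.Printf("%q\n", buf[:n])
}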
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
index ee7f7907c..18764e92a 100644
--- a/pkg/sentry/fs/ext/ext_test.go
+++ b/pkg/sentry/fs/ext/ext_test.go
@@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
// Mount the ext4 fs and retrieve the inode structure for the file.
mockCtx := contexttest.Context(t)
- fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
+ fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err
diff --git a/pkg/sentry/fs/ext/extent_file.go b/pkg/sentry/fs/ext/extent_file.go
new file mode 100644
index 000000000..aa4102dbb
--- /dev/null
+++ b/pkg/sentry/fs/ext/extent_file.go
@@ -0,0 +1,260 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "sort"
+
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// extentFile is a type of regular file which uses extents to store file data.
+type extentFile struct {
+ regFile regularFile
+
+ // root is the root extent node. This lives in the 60 byte diskInode.Data().
+ // Immutable.
+ root disklayout.ExtentNode
+}
+
+// Compiles only if extentFile implements fileReader.
+var _ fileReader = (*extentFile)(nil)
+
+// getFileReader implements fileReader.getFileReader.
+func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
+ return &extentReader{
+ dev: dev,
+ file: f,
+ fileOff: offset,
+ blkSize: blkSize,
+ }
+}
+
+// newExtentFile is the extent file constructor. It reads the entire extent
+// tree into memory.
+// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
+func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) {
+ file := &extentFile{regFile: regFile}
+ file.regFile.impl = file
+ err := file.buildExtTree(dev, blkSize)
+ if err != nil {
+ return nil, err
+ }
+ return file, nil
+}
+
+// buildExtTree builds the extent tree by reading it from disk, performing a
+// simple DFS. It first reads the root node from the inode struct in
+// memory. Then it recursively builds the rest of the tree by reading it off
+// disk.
+//
+// Precondition: inode flag InExtents must be set.
+func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error {
+ rootNodeData := f.regFile.inode.diskInode.Data()
+
+ binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+
+ // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
+ if f.root.Header.NumEntries > 4 {
+ // read(2) specifies that EINVAL should be returned if the file is unsuitable
+ // for reading.
+ return syserror.EINVAL
+ }
+
+ f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
+ for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ var curEntry disklayout.ExtentEntry
+ if f.root.Header.Height == 0 {
+ // Leaf node.
+ curEntry = &disklayout.Extent{}
+ } else {
+ // Internal node.
+ curEntry = &disklayout.ExtentIdx{}
+ }
+ binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+ f.root.Entries[i].Entry = curEntry
+ }
+
+ // If this node is internal, perform DFS.
+ if f.root.Header.Height > 0 {
+ for i := uint16(0); i < f.root.Header.NumEntries; i++ {
+ var err error
+ if f.root.Entries[i].Node, err = buildExtTreeFromDisk(dev, f.root.Entries[i].Entry, blkSize); err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
+
+// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
+// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
+// by the ExtentEntry.
+func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
+ var header disklayout.ExtentHeader
+ off := entry.PhysicalBlock() * blkSize
+ err := readFromDisk(dev, int64(off), &header)
+ if err != nil {
+ return nil, err
+ }
+
+ entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
+ for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ var curEntry disklayout.ExtentEntry
+ if header.Height == 0 {
+ // Leaf node.
+ curEntry = &disklayout.Extent{}
+ } else {
+ // Internal node.
+ curEntry = &disklayout.ExtentIdx{}
+ }
+
+ err := readFromDisk(dev, int64(off), curEntry)
+ if err != nil {
+ return nil, err
+ }
+ entries[i].Entry = curEntry
+ }
+
+ // If this node is internal, perform DFS.
+ if header.Height > 0 {
+ for i := uint16(0); i < header.NumEntries; i++ {
+ var err error
+ entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize)
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ return &disklayout.ExtentNode{header, entries}, nil
+}
+
+// extentReader implements io.Reader which can traverse the extent tree and
+// read file data. This is not thread safe.
+type extentReader struct {
+ dev io.ReaderAt
+ file *extentFile
+ fileOff uint64 // Represents the current file offset being read from.
+ blkSize uint64
+}
+
+// Compiles only if extentReader implements io.Reader.
+var _ io.Reader = (*extentReader)(nil)
+
+// Read implements io.Reader.Read.
+func (r *extentReader) Read(dst []byte) (int, error) {
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ if r.fileOff >= r.file.regFile.inode.diskInode.Size() {
+ return 0, io.EOF
+ }
+
+ return r.read(&r.file.root, dst)
+}
+
+// read is a helper which traverses the extent tree and reads data.
+func (r *extentReader) read(node *disklayout.ExtentNode, dst []byte) (int, error) {
+ // Perform a binary search for the node covering bytes starting at r.fileOff.
+ // A highly fragmented filesystem can have up to 340 entries, so linear
+ // search should be avoided. Finds the first entry which does not cover the
+ // file block we want and subtracts 1 to get the desired index.
+ fileBlk := r.fileBlock()
+ n := len(node.Entries)
+ found := sort.Search(n, func(i int) bool {
+ return node.Entries[i].Entry.FileBlock() > fileBlk
+ }) - 1
+
+ // We should be in this recursive step only if the data we want exists under
+ // the current node.
+ if found < 0 {
+ panic("searching for a file block in an extent entry which does not cover it")
+ }
+
+ read := 0
+ toRead := len(dst)
+ var curR int
+ var err error
+ for i := found; i < n && read < toRead; i++ {
+ if node.Header.Height == 0 {
+ curR, err = r.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), dst[read:])
+ } else {
+ curR, err = r.read(node.Entries[i].Node, dst[read:])
+ }
+
+ read += curR
+ if err != nil {
+ return read, err
+ }
+ }
+
+ return read, nil
+}
+
+// readFromExtent reads file data from the extent. It takes advantage of the
+// sequential nature of extents and reads file data from multiple blocks in one
+// call. Also updates the file offset.
+//
+// A non-nil error indicates that this is a partial read and there is probably
+// more to read from this extent. The caller should propagate the error upward
+// and not move to the next extent in the tree.
+//
+// A subsequent call to extentReader.Read should continue reading from where we
+// left off as expected.
+func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, error) {
+ curFileBlk := r.fileBlock()
+ exFirstFileBlk := ex.FileBlock()
+ exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive.
+
+ // We should be in this recursive step only if the data we want exists under
+ // the current extent.
+ if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk {
+ panic("searching for a file block in an extent which does not cover it")
+ }
+
+ curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock()
+ readStart := curPhyBlk*r.blkSize + r.fileBlockOff()
+
+ endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length)
+ extentEnd := endPhyBlk * r.blkSize // This is exclusive.
+
+ toRead := int(extentEnd - readStart)
+ if len(dst) < toRead {
+ toRead = len(dst)
+ }
+
+ n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart))
+ r.fileOff += uint64(n)
+ if n < toRead {
+ return n, syserror.EIO
+ }
+ return n, nil
+}
+
+// fileBlock returns the file block number we are currently reading.
+func (r *extentReader) fileBlock() uint32 {
+ return uint32(r.fileOff / r.blkSize)
+}
+
+// fileBlockOff returns the current offset within the current file block.
+func (r *extentReader) fileBlockOff() uint64 {
+ return r.fileOff % r.blkSize
+}
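
extentReader.read locates the covering entry with the usual "first entry past the target, minus one" binary-search idiom, since a fragmented node can hold up to 340 entries. The idiom in isolation, on plain integers (illustrative only):

package main

import (
	"fmt"
	"sort"
)

// coveringIndex returns the index of the last entry whose starting file
// block is <= fileBlk, i.e. the entry that covers fileBlk. It returns -1 if
// fileBlk precedes every entry.
func coveringIndex(startBlocks []uint32, fileBlk uint32) int {
	return sort.Search(len(startBlocks), func(i int) bool {
		return startBlocks[i] > fileBlk
	}) - 1
}

func main() {
	starts := []uint32{0, 8, 32, 100} // first file block of each extent entry
	fmt.Println(coveringIndex(starts, 5))   // 0
	fmt.Println(coveringIndex(starts, 40))  // 2
	fmt.Println(coveringIndex(starts, 100)) // 3
}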
diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fs/ext/extent_test.go
index b3f342c8e..dff401114 100644
--- a/pkg/sentry/fs/ext/extent_test.go
+++ b/pkg/sentry/fs/ext/extent_test.go
@@ -16,6 +16,8 @@ package ext
import (
"bytes"
+ "io"
+ "math/rand"
"testing"
"github.com/google/go-cmp/cmp"
@@ -24,9 +26,14 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
)
-// TestExtentTree tests the extent tree building logic.
+const (
+ // mockExtentBlkSize is the mock block size used for testing.
+ // No block has more than 1 header + 4 entries.
+ mockExtentBlkSize = uint64(64)
+)
+
+// The tree described below looks like:
//
-// Test tree:
// 0.{Head}[Idx][Idx]
// / \
// / \
@@ -44,12 +51,8 @@ import (
//
// Please note that ext4 might not construct extent trees looking like this.
// This is purely for testing the tree traversal logic.
-func TestExtentTree(t *testing.T) {
- blkSize := uint64(64) // No block has more than 1 header + 4 entries.
- mockDisk := make([]byte, blkSize*10)
- mockInode := &inode{diskInode: &disklayout.InodeNew{}}
-
- node3 := &disklayout.ExtentNode{
+var (
+ node3 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 1,
@@ -68,7 +71,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node2 := &disklayout.ExtentNode{
+ node2 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 1,
@@ -86,7 +89,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node1 := &disklayout.ExtentNode{
+ node1 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 2,
@@ -113,7 +116,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node0 := &disklayout.ExtentNode{
+ node0 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 2,
@@ -137,22 +140,95 @@ func TestExtentTree(t *testing.T) {
},
},
}
+)
- writeTree(mockInode, mockDisk, node0, blkSize)
+// TestExtentReader tests extentReader functionality. We should be able to use
+// the file reader like any other io.Reader.
+func TestExtentReader(t *testing.T) {
+ type extentReaderTest struct {
+ name string
+ from func(uint64) uint64
+ to func(uint64) uint64
+ }
- r := bytes.NewReader(mockDisk)
- if err := mockInode.buildExtTree(r, blkSize); err != nil {
- t.Fatalf("inode.buildExtTree failed: %v", err)
+ tests := []extentReaderTest{
+ {
+ name: "read first half",
+ from: beginning,
+ to: middle,
+ },
+ {
+ name: "read entire file",
+ from: beginning,
+ to: end,
+ },
+ {
+ name: "read second half",
+ from: middle,
+ to: end,
+ },
}
+ dev, mockExtentFile, want := extentTreeSetUp(t, node0)
+ size := mockExtentFile.regFile.inode.diskInode.Size()
+
+ for _, test := range tests {
+ from := test.from(size)
+ to := test.to(size)
+ fileReader := mockExtentFile.getFileReader(dev, mockExtentBlkSize, from)
+
+ got := make([]byte, to-from)
+ if _, err := io.ReadFull(fileReader, got); err != nil {
+ t.Errorf("file read failed: %v", err)
+ }
+
+ if diff := cmp.Diff(got, want[from:to]); diff != "" {
+ t.Errorf("file data mismatch (-want +got):\n%s", diff)
+ }
+ }
+}
+
+// TestBuildExtentTree tests the extent tree building logic.
+func TestBuildExtentTree(t *testing.T) {
+ _, mockExtentFile, _ := extentTreeSetUp(t, node0)
+
opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{})
- if diff := cmp.Diff(mockInode.root, node0, opt); diff != "" {
+ if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" {
t.Errorf("extent tree mismatch (-want +got):\n%s", diff)
}
}
-// writeTree writes the tree represented by `root` to the inode and disk passed.
-func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) {
+// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
+// tree. It also constructs a mock extent file with the same tree built in it.
+// It also writes random file data to disk and returns that data.
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) {
+ t.Helper()
+
+ mockDisk := make([]byte, mockExtentBlkSize*10)
+ mockExtentFile := &extentFile{
+ regFile: regularFile{
+ inode: inode{
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
+ },
+ },
+ },
+ },
+ }
+
+ fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
+
+ r := bytes.NewReader(mockDisk)
+ if err := mockExtentFile.buildExtTree(r, mockExtentBlkSize); err != nil {
+ t.Fatalf("inode.buildExtTree failed: %v", err)
+ }
+ return r, mockExtentFile, fileData
+}
+
+// writeTree writes the tree represented by `root` to the inode and disk. It
+// also writes random file data on disk.
+func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
for _, ep := range root.Entries {
rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
@@ -160,26 +236,57 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint
copy(in.diskInode.Data(), rootData)
- if root.Header.Height > 0 {
- for _, ep := range root.Entries {
- writeTreeToDisk(disk, ep, blkSize)
+ var fileData []byte
+ for _, ep := range root.Entries {
+ if root.Header.Height == 0 {
+ fileData = append(fileData, writeRandomFileData(disk, ep.Entry.(*disklayout.Extent))...)
+ } else {
+ fileData = append(fileData, writeTreeToDisk(disk, ep)...)
}
}
+ return fileData
}
// writeTreeToDisk is the recursive step for writeTree which writes the tree
-// on the disk only.
-func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) {
+// on the disk only. Also writes random file data on disk.
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
for _, ep := range curNode.Node.Entries {
nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
}
- copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData)
+ copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
+
+ var fileData []byte
+ for _, ep := range curNode.Node.Entries {
+ if curNode.Node.Header.Height == 0 {
+ fileData = append(fileData, writeRandomFileData(disk, ep.Entry.(*disklayout.Extent))...)
+ } else {
+ fileData = append(fileData, writeTreeToDisk(disk, ep)...)
+ }
+ }
+ return fileData
+}
+
+// writeRandomFileData writes random bytes to the blocks on disk that the
+// passed extent points to.
+func writeRandomFileData(disk []byte, ex *disklayout.Extent) []byte {
+ phyExStartBlk := ex.PhysicalBlock()
+ phyExStartOff := phyExStartBlk * mockExtentBlkSize
+ phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize
+ rand.Read(disk[phyExStartOff:phyExEndOff])
+ return disk[phyExStartOff:phyExEndOff]
+}
- if curNode.Node.Header.Height > 0 {
- for _, ep := range curNode.Node.Entries {
- writeTreeToDisk(disk, ep, blkSize)
+// getNumPhyBlks returns the number of physical blocks covered under the node.
+func getNumPhyBlks(node *disklayout.ExtentNode) uint32 {
+ var res uint32
+ for _, ep := range node.Entries {
+ if node.Header.Height == 0 {
+ res += uint32(ep.Entry.(*disklayout.Extent).Length)
+ } else {
+ res += getNumPhyBlks(ep.Node)
}
}
+ return res
}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
index 7150e75a5..12aeb5dac 100644
--- a/pkg/sentry/fs/ext/filesystem.go
+++ b/pkg/sentry/fs/ext/filesystem.go
@@ -31,22 +31,16 @@ type filesystem struct {
vfsfs vfs.Filesystem
- // mu serializes changes to the Dentry tree and the usage of the read seeker.
- mu sync.Mutex
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
- // dev is the ReadSeeker for the underlying fs device. It is protected by mu.
- //
- // The ext filesystems aim to maximize locality, i.e. place all the data
- // blocks of a file close together. On a spinning disk, locality reduces the
- // amount of movement of the head hence speeding up IO operations. On an SSD
- // there are no moving parts but locality increases the size of each transer
- // request. Hence, having mutual exclusion on the read seeker while reading a
- // file *should* help in achieving the intended performance gains.
- //
- // Note: This synchronization was not coupled with the ReadSeeker itself
- // because we want to synchronize across read/seek operations for the
- // performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
- dev io.ReadSeeker
+ // dev is the io.ReaderAt for the underlying fs device. It does not require
+ // protection because io.ReaderAt permits concurrent read calls to it. It
+ // translates to the pread syscall which passes on the read request directly
+ // to the device driver. Device drivers are intelligent in serving multiple
+ // concurrent read requests in the optimal order (taking locality into
+ // consideration).
+ dev io.ReaderAt
// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
@@ -69,13 +63,13 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// getOrCreateInode gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
//
-// Preconditions: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) {
+// Precondition: must be holding fs.mu.
+func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
}
- in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum)
+ in, err := newInode(ctx, fs.dev, fs.sb, fs.bgs, inodeNum)
if err != nil {
return nil, err
}
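
Dropping io.ReadSeeker for io.ReaderAt is what removes the device lock: ReadAt takes an explicit offset, so concurrent readers never race on a shared seek position. A small sketch of that property, with bytes.Reader standing in for the device:

package main

import (
	"bytes"
	"fmt"
	"sync"
)

func main() {
	dev := bytes.NewReader(make([]byte, 1<<20)) // stand-in for the fs image

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(off int64) {
			defer wg.Done()
			buf := make([]byte, 4096)
			// ReadAt carries its own offset, so concurrent calls need no
			// external synchronization (unlike Read+Seek).
			if _, err := dev.ReadAt(buf, off); err != nil {
				fmt.Println("read failed:", err)
			}
		}(int64(i) * 4096)
	}
	wg.Wait()
	fmt.Println("all reads done")
}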
diff --git a/pkg/sentry/fs/ext/inline_file.go b/pkg/sentry/fs/ext/inline_file.go
new file mode 100644
index 000000000..b9adfe548
--- /dev/null
+++ b/pkg/sentry/fs/ext/inline_file.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+)
+
+// inlineFile is a type of regular file. All the data here is stored in the
+// inode.Data() array.
+type inlineFile struct {
+ regFile regularFile
+}
+
+// Compiles only if inlineFile implements fileReader.
+var _ fileReader = (*inlineFile)(nil)
+
+// getFileReader implements fileReader.getFileReader.
+func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader {
+ diskInode := f.regFile.inode.diskInode
+ return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]}
+}
+
+// newInlineFile is the inlineFile constructor.
+func newInlineFile(regFile regularFile) *inlineFile {
+ file := &inlineFile{regFile: regFile}
+ file.regFile.impl = file
+ return file
+}
+
+// inlineReader implements io.Reader which can read the underlying data. This
+// is not thread safe.
+type inlineReader struct {
+ offset uint64
+ data []byte
+}
+
+// Compiles only if inlineReader implements io.Reader.
+var _ io.Reader = (*inlineReader)(nil)
+
+// Read implements io.Reader.Read.
+func (r *inlineReader) Read(dst []byte) (int, error) {
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ if int(r.offset) >= len(r.data) {
+ return 0, io.EOF
+ }
+
+ n := copy(dst, r.data[r.offset:])
+ r.offset += uint64(n)
+ return n, nil
+}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
index df1ea0bda..00e022953 100644
--- a/pkg/sentry/fs/ext/inode.go
+++ b/pkg/sentry/fs/ext/inode.go
@@ -18,12 +18,26 @@ import (
"io"
"sync/atomic"
- "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// inode represents an ext inode.
+//
+// inode uses the same inheritance pattern that pkg/sentry/vfs structures use.
+// This has been done to increase memory locality.
+//
+// Implementations:
+// inode --
+// |-- pipe
+// |-- dir
+// |-- symlink
+// |-- regular--
+// |-- extent file
+// |-- block map file
+// |-- inline file
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
@@ -35,9 +49,9 @@ type inode struct {
// diskInode gives us access to the inode struct on disk. Immutable.
diskInode disklayout.Inode
- // root is the root extent node. This lives in the 60 byte diskInode.Blocks().
- // Immutable. Nil if the inode does not use extents.
- root *disklayout.ExtentNode
+ // This is immutable. The implementations must have inode as their first
+ // field to ensure temporality.
+ impl interface{}
}
// incRef increments the inode ref count.
@@ -61,7 +75,7 @@ func (in *inode) tryIncRef() bool {
// decRef decrements the inode ref count and releases the inode resources if
// the ref count hits 0.
//
-// Preconditions: Must have locked fs.mu.
+// Precondition: Must have locked fs.mu.
func (in *inode) decRef(fs *filesystem) {
if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
delete(fs.inodeCache, in.inodeNum)
@@ -72,19 +86,17 @@ func (in *inode) decRef(fs *filesystem) {
// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
+func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}
- in := &inode{refs: 1, inodeNum: inodeNum}
inodeRecordSize := sb.InodeSize()
+ var diskInode disklayout.Inode
if inodeRecordSize == disklayout.OldInodeSize {
- in.diskInode = &disklayout.InodeOld{}
+ diskInode = &disklayout.InodeOld{}
} else {
- in.diskInode = &disklayout.InodeNew{}
+ diskInode = &disklayout.InodeNew{}
}
// Calculate where the inode is actually placed.
@@ -93,16 +105,38 @@ func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.Bloc
inodeTableOff := bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize
inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp))
- // Read it from disk and figure out which type of inode this is.
- if err := readFromDisk(dev, int64(inodeOff), in.diskInode); err != nil {
+ if err := readFromDisk(dev, int64(inodeOff), diskInode); err != nil {
return nil, err
}
- if in.diskInode.Flags().Extents {
- in.buildExtTree(dev, blkSize)
+ // Build the inode based on its type.
+ inode := inode{
+ refs: 1,
+ inodeNum: inodeNum,
+ diskInode: diskInode,
}
- return in, nil
+ switch diskInode.Mode().FileType() {
+ case linux.ModeSymlink:
+ f, err := newSymlink(dev, blkSize, inode)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
+ case linux.ModeRegular:
+ f, err := newRegularFile(dev, blkSize, inode)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
+ case linux.ModeDirectory:
+ return &newDirectroy(inode).inode, nil
+ case linux.ModeNamedPipe:
+ return &newNamedPipe(ctx, inode).inode, nil
+ default:
+ // TODO(b/134676337): Return appropriate errors for sockets and devices.
+ return nil, syserror.EINVAL
+ }
}
// getBGNum returns the block group number that a given inode belongs to.
@@ -115,95 +149,3 @@ func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 {
return (inodeNum - 1) % inodesPerGrp
}
-
-// buildExtTree builds the extent tree by reading it from disk by doing
-// running a simple DFS. It first reads the root node from the inode struct in
-// memory. Then it recursively builds the rest of the tree by reading it off
-// disk.
-//
-// Preconditions:
-// - Must hold the mutex of the filesystem containing dev.
-// - Inode flag InExtents must be set.
-func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
- rootNodeData := in.diskInode.Data()
-
- var rootHeader disklayout.ExtentHeader
- binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader)
-
- // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
- if rootHeader.NumEntries > 4 {
- // read(2) specifies that EINVAL should be returned if the file is unsuitable
- // for reading.
- return syserror.EINVAL
- }
-
- rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries)
- for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
- var curEntry disklayout.ExtentEntry
- if rootHeader.Height == 0 {
- // Leaf node.
- curEntry = &disklayout.Extent{}
- } else {
- // Internal node.
- curEntry = &disklayout.ExtentIdx{}
- }
- binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
- rootEntries[i].Entry = curEntry
- }
-
- // If this node is internal, perform DFS.
- if rootHeader.Height > 0 {
- for i := uint16(0); i < rootHeader.NumEntries; i++ {
- var err error
- if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil {
- return err
- }
- }
- }
-
- in.root = &disklayout.ExtentNode{rootHeader, rootEntries}
- return nil
-}
-
-// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
-// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
-// by the ExtentEntry.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
- var header disklayout.ExtentHeader
- off := entry.PhysicalBlock() * blkSize
- if err := readFromDisk(dev, int64(off), &header); err != nil {
- return nil, err
- }
-
- entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
- for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
- var curEntry disklayout.ExtentEntry
- if header.Height == 0 {
- // Leaf node.
- curEntry = &disklayout.Extent{}
- } else {
- // Internal node.
- curEntry = &disklayout.ExtentIdx{}
- }
-
- if err := readFromDisk(dev, int64(off), curEntry); err != nil {
- return nil, err
- }
- entries[i].Entry = curEntry
- }
-
- // If this node is internal, perform DFS.
- if header.Height > 0 {
- for i := uint16(0); i < header.NumEntries; i++ {
- var err error
- entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize)
- if err != nil {
- return nil, err
- }
- }
- }
-
- return &disklayout.ExtentNode{header, entries}, nil
-}
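
The reworked inode follows the pkg/sentry/vfs embedding pattern: each concrete type embeds inode as its first field and inode.impl points back at the outer struct, so the two share an address and callers can recover the concrete type with a type switch. A stripped-down sketch of the pattern with made-up types:

package main

import "fmt"

// inode is the common, embedded part.
type inode struct {
	inodeNum uint32
	impl     interface{} // points back at the containing implementation
}

// symlink embeds inode as its first field, so the symlink and its inode
// start at the same address.
type symlink struct {
	inode  inode
	target string
}

func newSymlink(num uint32, target string) *inode {
	s := &symlink{inode: inode{inodeNum: num}, target: target}
	s.inode.impl = s
	return &s.inode
}

// describe recovers the concrete type from the generic inode.
func describe(in *inode) string {
	switch impl := in.impl.(type) {
	case *symlink:
		return fmt.Sprintf("inode %d: symlink -> %s", in.inodeNum, impl.target)
	default:
		return fmt.Sprintf("inode %d: unknown type", in.inodeNum)
	}
}

func main() {
	fmt.Println(describe(newSymlink(12, "/etc/hosts")))
}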
diff --git a/pkg/sentry/fs/ext/named_pipe.go b/pkg/sentry/fs/ext/named_pipe.go
new file mode 100644
index 000000000..0f3af1b53
--- /dev/null
+++ b/pkg/sentry/fs/ext/named_pipe.go
@@ -0,0 +1,40 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// namedPipe represents a named pipe inode. It is currently just a wrapper
+// around pkg/sentry/kernel/pipe.
+type namedPipe struct {
+ inode inode
+
+ p *pipe.Pipe
+ inodeOps fs.InodeOperations
+}
+
+// newNamedPipe is the namedPipe constructor.
+func newNamedPipe(ctx context.Context, inode inode) *namedPipe {
+ file := &namedPipe{inode: inode}
+ file.inode.impl = file
+ file.p = pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+ file.inodeOps = pipe.NewInodeOperations(ctx, fs.FilePermsFromMode(file.inode.diskInode.Mode()), file.p)
+ return file
+}
diff --git a/pkg/sentry/fs/ext/regular_file.go b/pkg/sentry/fs/ext/regular_file.go
new file mode 100644
index 000000000..b48f61795
--- /dev/null
+++ b/pkg/sentry/fs/ext/regular_file.go
@@ -0,0 +1,85 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+)
+
+// fileReader is used to abstract away the complexity of how the file data is
+// stored under the hood. Provides a method to get a file reader which can be
+// used to read file data without worrying about how it is organized on disk.
+type fileReader interface {
+
+ // getFileReader returns a Reader implementation which can be used to read a
+ // file. It abstracts away the complexity of how the file is actually
+ // organized on disk. The reader is initialized with the passed offset.
+ //
+ // This reader is not meant to be retained across Read operations as it needs
+ // to be reinitialized with the correct offset for every Read.
+ getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader
+}
+
+// regularFile represents a regular file's inode. This too follows the
+// inheritance pattern prevalent in the vfs layer described in
+// pkg/sentry/vfs/README.md.
+type regularFile struct {
+ inode inode
+
+ // This is immutable. The first field of fileReader implementations must be
+ // regularFile to ensure temporality.
+ impl fileReader
+}
+
+// newRegularFile is the regularFile constructor. It figures out what kind of
+// file this is and initializes the fileReader.
+func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) {
+ regFile := regularFile{
+ inode: inode,
+ }
+
+ inodeFlags := inode.diskInode.Flags()
+
+ if inodeFlags.Extents {
+ file, err := newExtentFile(dev, blkSize, regFile)
+ if err != nil {
+ return nil, err
+ }
+
+ file.regFile.inode.impl = &file.regFile
+ return &file.regFile, nil
+ }
+
+ if inodeFlags.Inline {
+ if inode.diskInode.Size() > 60 {
+ panic("ext fs: inline file larger than 60 bytes")
+ }
+
+ file := newInlineFile(regFile)
+ file.regFile.inode.impl = &file.regFile
+ return &file.regFile, nil
+ }
+
+ file, err := newBlockMapFile(blkSize, regFile)
+ if err != nil {
+ return nil, err
+ }
+ file.regFile.inode.impl = &file.regFile
+ return &file.regFile, nil
+}
+
+func (f *regularFile) blksUsed(blkSize uint64) uint64 {
+ return (f.inode.diskInode.Size() + blkSize - 1) / blkSize
+}
diff --git a/pkg/sentry/fs/ext/symlink.go b/pkg/sentry/fs/ext/symlink.go
new file mode 100644
index 000000000..6a55c1a7b
--- /dev/null
+++ b/pkg/sentry/fs/ext/symlink.go
@@ -0,0 +1,57 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// symlink represents a symlink inode.
+type symlink struct {
+ inode inode
+ target string // immutable
+}
+
+// newSymlink is the symlink constructor. It reads out the symlink target from
+// the inode (however it might have been stored).
+func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) {
+ var file *symlink
+ var link []byte
+
+ // If the symlink target is less than 60 bytes, it is stored in inode.Data().
+ // Otherwise either extents or block maps will be used to store the link.
+ size := inode.diskInode.Size()
+ if size < 60 {
+ link = inode.diskInode.Data()[:size]
+ } else {
+ // Create a regular file out of this inode and read out the target.
+ regFile, err := newRegularFile(dev, blkSize, inode)
+ if err != nil {
+ return nil, err
+ }
+
+ link = make([]byte, size)
+ reader := regFile.impl.getFileReader(dev, blkSize, 0)
+ if _, err := io.ReadFull(reader, link); err != nil {
+ return nil, syserror.EIO
+ }
+ }
+
+ file = &symlink{inode: inode, target: string(link)}
+ file.inode.impl = file
+ return file, nil
+}
diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fs/ext/utils.go
index 3472c5fa8..3d89d664d 100644
--- a/pkg/sentry/fs/ext/utils.go
+++ b/pkg/sentry/fs/ext/utils.go
@@ -15,38 +15,30 @@
package ext
import (
- "encoding/binary"
"io"
+ "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
-//
-// All disk reads should use this helper so we avoid reading from stale
-// previously used offsets. This function forces the offset parameter.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
- if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
- return syserror.EIO
- }
-
- if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
+func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
+ n := binary.Size(v)
+ buf := make([]byte, n)
+ if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
+ binary.Unmarshal(buf, binary.LittleEndian, v)
return nil
}
// readSuperBlock reads the SuperBlock from block group 0 in the underlying
// device. There are three versions of the superblock. This function identifies
// and returns the correct version.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
+func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
return nil, err
@@ -76,19 +68,12 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
blocksPerGroup := uint64(sb.BlocksPerGroup())
// Round up the result. float64 can compromise precision so do it manually.
- bgCount := blocksCount / blocksPerGroup
- if blocksCount%blocksPerGroup != 0 {
- bgCount++
- }
-
- return bgCount
+ return (blocksCount + blocksPerGroup - 1) / blocksPerGroup
}
// readBlockGroups reads the block group descriptor table from block group 0 in
// the underlying device.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
+func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
bgCount := blockGroupsCount(sb)
bgdSize := uint64(sb.BgDescSize())
is64Bit := sb.IncompatibleFeatures().Is64Bit
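
For reference, here is how the new ReaderAt-based helper behaves against an in-memory disk image. This sketch substitutes the standard library's encoding/binary for gvisor's pkg/binary, and dummySuperBlock is an invented stand-in for the much larger disklayout structs:

	package main

	import (
		"bytes"
		"encoding/binary"
		"fmt"
	)

	// dummySuperBlock is a hypothetical two-field struct standing in for
	// disklayout.SuperBlockOld.
	type dummySuperBlock struct {
		InodesCount uint32
		BlocksCount uint32
	}

	// readFromDisk mirrors the helper above: read exactly binary.Size(v) bytes
	// at the absolute offset and unmarshal them little-endian into v.
	func readFromDisk(dev *bytes.Reader, abOff int64, v interface{}) error {
		buf := make([]byte, binary.Size(v))
		if read, _ := dev.ReadAt(buf, abOff); read < len(buf) {
			return fmt.Errorf("short read at offset %d", abOff)
		}
		return binary.Read(bytes.NewReader(buf), binary.LittleEndian, v)
	}

	func main() {
		// A fake 1024-byte "device" with the struct serialized at offset 8.
		disk := make([]byte, 1024)
		copy(disk[8:], []byte{0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00})

		var sb dummySuperBlock
		if err := readFromDisk(bytes.NewReader(disk), 8, &sb); err != nil {
			panic(err)
		}
		fmt.Printf("inodes=%d blocks=%d\n", sb.InodesCount, sb.BlocksCount) // inodes=16 blocks=32
	}
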
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 693ffc760..728575864 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -678,7 +678,7 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s
return "", syserror.ENOENT
}
-// GetPath returns the PATH as a slice of strings given the environemnt
+// GetPath returns the PATH as a slice of strings given the environment
// variables.
func GetPath(env []string) []string {
const prefix = "PATH="
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 4c2d48e65..55a9d3d29 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -197,6 +197,11 @@ type Kernel struct {
// caches. Not all caches use it, only the caches that use host resources use
// the limiter. It may be nil if disabled.
DirentCacheLimiter *fs.DirentCacheLimiter
+
+ // unimplementedSyscallEmitter is used to emit unimplemented syscall
+ // events. This is initialized lazily on the first unimplemented
+ // syscall.
+ unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
}
// InitKernelArgs holds arguments to Init.
@@ -290,7 +295,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
-
return nil
}
@@ -586,11 +590,17 @@ func (k *Kernel) UniqueID() uint64 {
// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
- // Filename is the filename to load.
+ // Filename is the filename to load as the init binary.
//
- // If this is provided as "", then the file will be guessed via Argv[0].
+	// If this is provided as "", File is checked next; if File is also unset,
+	// the file is guessed via Argv[0].
Filename string
+ // File is a passed host FD pointing to a file to load as the init binary.
+ //
+ // This is checked if and only if Filename is "".
+ File *fs.File
+
 	// Argv is a list of arguments.
Argv []string
@@ -775,8 +785,16 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
defer wd.DecRef()
}
- if args.Filename == "" {
- // Was anything provided?
+ // Check which file to start from.
+ switch {
+ case args.Filename != "":
+ // If a filename is given, take that.
+ // Set File to nil so we resolve the path in LoadTaskImage.
+ args.File = nil
+ case args.File != nil:
+ // If File is set, take the File provided directly.
+ default:
+ // Otherwise look at Argv and see if the first argument is a valid path.
if len(args.Argv) == 0 {
return nil, 0, fmt.Errorf("no filename or command provided")
}
@@ -788,7 +806,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
// Create a fresh task context.
remainingTraversals = uint(args.MaxSymlinkTraversals)
- tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+
+ tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
+
if se != nil {
return nil, 0, errors.New(se.String())
}
@@ -1168,16 +1188,6 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
-// channel.
-func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
- t := TaskFromContext(ctx)
- eventchannel.Emit(&uspb.UnimplementedSyscall{
- Tid: int32(t.ThreadID()),
- Registers: t.Arch().StateData().Proto(),
- })
-}
-
// SocketEntry represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
@@ -1272,3 +1282,23 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return nil
}
}
+
+// Rate limits for the number of unimplemented syscall events.
+const (
+ unimplementedSyscallsMaxRate = 100 // events per second
+ unimplementedSyscallBurst = 1000 // events
+)
+
+// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
+// channel.
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
+ if k.unimplementedSyscallEmitter == nil {
+ k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
+ }
+
+ t := TaskFromContext(ctx)
+ k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
+ Tid: int32(t.ThreadID()),
+ Registers: t.Arch().StateData().Proto(),
+ })
+}
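
RateLimitedEmitterFrom is defined in the eventchannel package's new rate limiter, whose implementation is not shown here. A rough sketch of what such a wrapper could look like on top of golang.org/x/time/rate; the type name and the drop-on-overflow behavior are assumptions for illustration:

	package eventchannel

	import (
		"golang.org/x/time/rate"

		"github.com/golang/protobuf/proto"
	)

	// rateLimitedEmitter forwards events to an inner Emitter until the limiter
	// says no, at which point events are silently dropped.
	type rateLimitedEmitter struct {
		inner   Emitter
		limiter *rate.Limiter
	}

	// RateLimitedEmitterFrom wraps inner so that at most maxOps events per
	// second are forwarded, allowing bursts of up to burst events.
	func RateLimitedEmitterFrom(inner Emitter, maxOps float64, burst int) Emitter {
		return &rateLimitedEmitter{
			inner:   inner,
			limiter: rate.NewLimiter(rate.Limit(maxOps), burst),
		}
	}

	// Emit implements Emitter.Emit.
	func (re *rateLimitedEmitter) Emit(msg proto.Message) (bool, error) {
		if !re.limiter.Allow() {
			// Over the rate limit: drop the event rather than blocking.
			return false, nil
		}
		return re.inner.Emit(msg)
	}

	// Close implements Emitter.Close.
	func (re *rateLimitedEmitter) Close() error {
		return re.inner.Close()
	}
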
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 54b1676b0..8639d379f 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,15 +140,22 @@ func (t *Task) Stack() *arch.Stack {
// * wd: Working directory to lookup filename under
// * maxTraversals: maximum number of symlinks to follow
// * filename: path to binary to load
+// * file: an open fs.File of the binary to load. If set, file is loaded
+// instead of filename.
// * argv: Binary argv
// * envv: Binary envv
// * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+ // If File is not nil, we should load that instead of resolving filename.
+ if file != nil {
+ filename = file.MappedName(ctx)
+ }
+
// Prepare a new user address space to load into.
m := mm.NewMemoryManager(k, k)
defer m.DecUsers(ctx)
- os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso)
+ os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso)
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index baa12d9a0..f6f1ae762 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -67,8 +67,64 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
if err != nil {
return nil, nil, err
}
+
+	// openFile takes its own reference to the Dirent, so release this one.
defer d.DecRef()
+ return openFile(ctx, nil, d, name)
+}
+
+// openFile performs checks on a file to be executed. If provided a *fs.File,
+// openFile takes that file's Dirent and performs checks on it. If provided a
+// *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's
+// Inode and performs checks on that.
+//
+// openFile returns a *fs.Dirent and a *fs.File, and the caller takes
+// ownership of both references.
+//
+// Preconditions: at least one of "dirent" and "file" must be non-nil, and it
+// must refer to a readable, executable, regular file.
+func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) {
+	// file and dirent cannot both be nil.
+ if dirent == nil && file == nil {
+ ctx.Infof("dirent and file cannot both be nil.")
+ return nil, nil, syserror.ENOENT
+ }
+
+ if file != nil {
+ dirent = file.Dirent
+ }
+
+ // Perform permissions checks on the file.
+ if err := checkFile(ctx, dirent, name); err != nil {
+ return nil, nil, err
+ }
+
+ if file == nil {
+ var ferr error
+ if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil {
+ return nil, nil, ferr
+ }
+ } else {
+		// GetFile returns the file with a reference held, so take an extra
+		// reference here to match when the file was passed in already open.
+ file.IncRef()
+ }
+
+ // We must be able to read at arbitrary offsets.
+ if !file.Flags().Pread {
+ file.DecRef()
+ ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags())
+ return nil, nil, syserror.EACCES
+ }
+
+ // Grab reference for caller.
+ dirent.IncRef()
+ return dirent, file, nil
+}
+
+// checkFile performs the file permission checks required for binaries; it is
+// called from both openPath and openFile.
+func checkFile(ctx context.Context, d *fs.Dirent, name string) error {
perms := fs.PermMask{
// TODO(gvisor.dev/issue/160): Linux requires only execute
// permission, not read. However, our backing filesystems may
@@ -80,7 +136,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
Execute: true,
}
if err := d.Inode.CheckPermission(ctx, perms); err != nil {
- return nil, nil, err
+ return err
}
// If they claim it's a directory, then make sure.
@@ -88,31 +144,17 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
// N.B. we reject directories below, but we must first reject
// non-directories passed as directories.
if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
- return nil, nil, syserror.ENOTDIR
+ return syserror.ENOTDIR
}
// No exec-ing directories, pipes, etc!
if !fs.IsRegular(d.Inode.StableAttr) {
ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr)
- return nil, nil, syserror.EACCES
+ return syserror.EACCES
}
- // Create a new file.
- file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
- if err != nil {
- return nil, nil, err
- }
+ return nil
- // We must be able to read at arbitrary offsets.
- if !file.Flags().Pread {
- file.DecRef()
- ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags())
- return nil, nil, syserror.EACCES
- }
-
- // Grab a reference for the caller.
- d.IncRef()
- return d, file, nil
}
// allocStack allocates and maps a stack in to any available part of the address space.
@@ -131,16 +173,30 @@ const (
maxLoaderAttempts = 6
)
-// loadPath resolves filename to a binary and loads it.
+// loadBinary loads the binary pointed to by "file". If file is nil, the path
+// "filename" is resolved and loaded instead.
//
// It returns:
// * loadedELF, description of the loaded binary
// * arch.Context matching the binary arch
// * fs.Dirent of the binary file
// * Possibly updated argv
-func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
for i := 0; i < maxLoaderAttempts; i++ {
- d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+ var (
+ d *fs.Dirent
+ f *fs.File
+ err error
+ )
+ if passedFile == nil {
+			d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+		} else {
+ d, f, err = openFile(ctx, passedFile, nil, "")
+			// Set to nil in case we loop on an interpreter script.
+ passedFile = nil
+ }
+
if err != nil {
ctx.Infof("Error opening %s: %v", filename, err)
return loadedELF{}, nil, nil, nil, err
@@ -165,7 +221,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
switch {
case bytes.Equal(hdr[:], []byte(elfMagic)):
- loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f)
+ loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f)
if err != nil {
ctx.Infof("Error loading ELF: %v", err)
return loadedELF{}, nil, nil, nil, err
@@ -190,7 +246,8 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
return loadedELF{}, nil, nil, nil, syserror.ELOOP
}
-// Load loads filename into a MemoryManager.
+// Load loads "file" into a MemoryManager. If file is nil, the path "filename"
+// is resolved and loaded instead.
//
// If Load returns ErrSwitchFile it should be called again with the returned
// path and argv.
@@ -198,9 +255,9 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
// Preconditions:
// * The Task MemoryManager is empty.
// * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
// Load the binary itself.
- loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv)
+ loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv)
if err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
}
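
loadBinary loops because an interpreter script resolves to another file that must itself be identified, which is also why passedFile is cleared after the first iteration. A standalone sketch of the magic-byte dispatch performed on each opened file; the literal values here are illustrative, the loader defines its own elfMagic and script-magic constants:

	package main

	import (
		"bytes"
		"fmt"
	)

	// detectFormat classifies a file by its leading bytes, the same way
	// loadBinary decides between loadELF and the interpreter-script path:
	// ELF binaries start with "\x7fELF", scripts with "#!".
	func detectFormat(hdr []byte) string {
		switch {
		case len(hdr) >= 4 && bytes.Equal(hdr[:4], []byte("\x7fELF")):
			return "ELF"
		case len(hdr) >= 2 && bytes.Equal(hdr[:2], []byte("#!")):
			return "interpreter script"
		default:
			return "unknown format"
		}
	}

	func main() {
		fmt.Println(detectFormat([]byte("\x7fELF\x02\x01\x01"))) // ELF
		fmt.Println(detectFormat([]byte("#!/bin/sh\n")))         // interpreter script
		fmt.Println(detectFormat([]byte("GIF89a")))              // unknown format
	}
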
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 51db2d8f7..ed996ba51 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -30,8 +30,7 @@ import (
const _AUDIT_ARCH_X86_64 = 0xc000003e
// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
-// numbers from Linux 4.4. The entries commented out are those syscalls we
-// don't currently support.
+// numbers from Linux 4.4.
var AMD64 = &kernel.SyscallTable{
OS: abi.Linux,
Arch: arch.AMD64,
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 595eb9155..8ab7ffa25 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -96,7 +96,7 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Load the new TaskContext.
maxTraversals := uint(linux.MaxSymlinkTraversals)
- tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet())
+ tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet())
if se != nil {
return 0, nil, se.ToError()
}
diff --git a/pkg/tcpip/link/fdbased/mmap_amd64.go b/pkg/tcpip/link/fdbased/mmap_amd64.go
index 8bbb4f9ab..029f86a18 100644
--- a/pkg/tcpip/link/fdbased/mmap_amd64.go
+++ b/pkg/tcpip/link/fdbased/mmap_amd64.go
@@ -134,7 +134,7 @@ func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
FD: int32(d.fd),
Events: unix.POLLIN | unix.POLLERR,
}
- if _, errno := rawfile.BlockingPoll(&event, 1, -1); errno != 0 {
+ if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 {
if errno == syscall.EINTR {
continue
}
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
index b54131573..298bad55d 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
@@ -14,17 +14,18 @@
#include "textflag.h"
-// BlockingPoll makes the poll() syscall while calling the version of
+// BlockingPoll makes the ppoll() syscall while calling the version of
// entersyscall that relinquishes the P so that other Gs can run. This is meant
// to be called in cases when the syscall is expected to block.
//
-// func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (n int, err syscall.Errno)
+// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
CALL ·callEntersyscallblock(SB)
MOVQ fds+0(FP), DI
MOVQ nfds+8(FP), SI
MOVQ timeout+16(FP), DX
- MOVQ $0x7, AX // SYS_POLL
+	MOVQ	$0x0, R10  // sigmask parameter (unused here)
+ MOVQ $0x10f, AX // SYS_PPOLL
SYSCALL
CMPQ AX, $0xfffffffffffff001
JLS ok
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
index c87268610..47039a446 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go
@@ -26,7 +26,7 @@ import (
)
//go:noescape
-func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno)
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno)
// Use go:linkname to call into the runtime. As of Go 1.12 this has to
// be done from Go code so that we make an ABIInternal call to an
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
index 4eab77c74..84dc0e918 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go
@@ -21,9 +21,11 @@ import (
"unsafe"
)
-// BlockingPoll is just a stub function that forwards to the poll() system call
+// BlockingPoll is just a stub function that forwards to the ppoll() system call
// on non-amd64 platforms.
-func BlockingPoll(fds *PollEvent, nfds int, timeout int64) (int, syscall.Errno) {
- n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout))
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) {
+ n, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(fds)),
+ uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0)
+
return int(n), e
}
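
With ppoll() the timeout is a *Timespec rather than poll()'s millisecond count, which is why the callers above now pass nil (block forever) where they previously passed -1. A hypothetical helper, not part of this change, showing how a poll-style timeout would map onto the new parameter:

	package rawfile

	import (
		"syscall"
		"time"
	)

	// pollTimeoutToTimespec converts a poll()-style millisecond timeout into
	// the *syscall.Timespec expected by BlockingPoll: a negative value means
	// block forever (nil), anything else becomes an equivalent Timespec.
	func pollTimeoutToTimespec(ms int64) *syscall.Timespec {
		if ms < 0 {
			return nil
		}
		ts := syscall.NsecToTimespec((time.Duration(ms) * time.Millisecond).Nanoseconds())
		return &ts
	}
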
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
index e3fbb15c2..7e286a3a6 100644
--- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -123,7 +123,7 @@ func BlockingRead(fd int, b []byte) (int, *tcpip.Error) {
Events: 1, // POLLIN
}
- _, e = BlockingPoll(&event, 1, -1)
+ _, e = BlockingPoll(&event, 1, nil)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@@ -145,7 +145,7 @@ func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) {
Events: 1, // POLLIN
}
- _, e = BlockingPoll(&event, 1, -1)
+ _, e = BlockingPoll(&event, 1, nil)
if e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
@@ -175,7 +175,7 @@ func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
Events: 1, // POLLIN
}
- if _, e := BlockingPoll(&event, 1, -1); e != 0 && e != syscall.EINTR {
+ if _, e := BlockingPoll(&event, 1, nil); e != 0 && e != syscall.EINTR {
return 0, TranslateErrno(e)
}
}
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index 769509e80..cbd92fc05 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -11,8 +11,8 @@ go_library(
importpath = "gvisor.dev/gvisor/pkg/unet",
visibility = ["//visibility:public"],
deps = [
- "//pkg/abi/linux",
"//pkg/gate",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go
index f8a42c914..85ef46edf 100644
--- a/pkg/unet/unet_unsafe.go
+++ b/pkg/unet/unet_unsafe.go
@@ -16,12 +16,11 @@ package unet
import (
"io"
- "math"
"sync/atomic"
"syscall"
"unsafe"
- "gvisor.dev/gvisor/pkg/abi/linux"
+ "golang.org/x/sys/unix"
)
// wait blocks until the socket FD is ready for reading or writing, depending
@@ -37,23 +36,23 @@ func (s *Socket) wait(write bool) error {
return errClosing
}
- events := []linux.PollFD{
+ events := []unix.PollFd{
{
// The actual socket FD.
- FD: fd,
- Events: linux.POLLIN,
+ Fd: fd,
+ Events: unix.POLLIN,
},
{
// The eventfd, signaled when we are closing.
- FD: int32(s.efd),
- Events: linux.POLLIN,
+ Fd: int32(s.efd),
+ Events: unix.POLLIN,
},
}
if write {
- events[0].Events = linux.POLLOUT
+ events[0].Events = unix.POLLOUT
}
- _, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&events[0])), 2, uintptr(math.MaxUint64))
+ _, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(&events[0])), 2, 0, 0, 0, 0)
if e == syscall.EINTR {
continue
}
@@ -61,7 +60,7 @@ func (s *Socket) wait(write bool) error {
return e
}
- if events[1].REvents&linux.POLLIN == linux.POLLIN {
+ if events[1].Revents&unix.POLLIN == unix.POLLIN {
// eventfd signaled, we're closing.
return errClosing
}