author    | Googler <noreply@google.com>         | 2018-04-27 10:37:02 -0700
committer | Adin Scannell <ascannell@google.com> | 2018-04-28 01:44:26 -0400
commit    | d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree      | 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/mm
parent    | f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r-- | pkg/sentry/mm/BUILD | 155
-rw-r--r-- | pkg/sentry/mm/README.md | 279
-rw-r--r-- | pkg/sentry/mm/address_space.go | 223
-rw-r--r-- | pkg/sentry/mm/aio_context.go | 377
-rw-r--r-- | pkg/sentry/mm/aio_context_state.go | 20
-rw-r--r-- | pkg/sentry/mm/debug.go | 98
-rw-r--r-- | pkg/sentry/mm/io.go | 604
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 218
-rw-r--r-- | pkg/sentry/mm/metadata.go | 139
-rw-r--r-- | pkg/sentry/mm/mm.go | 417
-rw-r--r-- | pkg/sentry/mm/mm_test.go | 174
-rw-r--r-- | pkg/sentry/mm/pma.go | 928
-rw-r--r-- | pkg/sentry/mm/proc_pid_maps.go | 105
-rw-r--r-- | pkg/sentry/mm/save_restore.go | 57
-rw-r--r-- | pkg/sentry/mm/special_mappable.go | 147
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 794
-rw-r--r-- | pkg/sentry/mm/vma.go | 476 |
17 files changed, 5211 insertions, 0 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD new file mode 100644 index 000000000..39bde2be3 --- /dev/null +++ b/pkg/sentry/mm/BUILD @@ -0,0 +1,155 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "mm_state", + srcs = [ + "aio_context.go", + "aio_context_state.go", + "file_refcount_set.go", + "io_list.go", + "mm.go", + "pma_set.go", + "save_restore.go", + "special_mappable.go", + "vma_set.go", + ], + out = "mm_state.go", + package = "mm", +) + +go_template_instance( + name = "file_refcount_set", + out = "file_refcount_set.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "mm", + prefix = "fileRefcount", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "int32", + "Functions": "fileRefcountSetFunctions", + }, +) + +go_template_instance( + name = "vma_set", + out = "vma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "vma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "vma", + "Functions": "vmaSetFunctions", + }, +) + +go_template_instance( + name = "pma_set", + out = "pma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "pma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "pma", + "Functions": "pmaSetFunctions", + }, +) + +go_template_instance( + name = "io_list", + out = "io_list.go", + package = "mm", + prefix = "io", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ioResult", + }, +) + +go_library( + name = "mm", + srcs = [ + "address_space.go", + "aio_context.go", + "aio_context_state.go", + "debug.go", + "file_refcount_set.go", + "io.go", + "io_list.go", + "lifecycle.go", + "metadata.go", + "mm.go", + "mm_state.go", + "pma.go", + "pma_set.go", + "proc_pid_maps.go", + "save_restore.go", + "special_mappable.go", + "syscalls.go", + "vma.go", + "vma_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/mm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip/buffer", + ], +) + +go_test( + name = "mm_test", + size = "small", + srcs = ["mm_test.go"], + embed = [":mm"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md new file mode 100644 index 000000000..067733475 --- /dev/null +++ b/pkg/sentry/mm/README.md @@ -0,0 +1,279 @@ +This package provides an emulation of Linux semantics for 
application virtual +memory mappings. + +For completeness, this document also describes aspects of the memory management +subsystem defined outside this package. + +# Background + +We begin by describing semantics for virtual memory in Linux. + +A virtual address space is defined as a collection of mappings from virtual +addresses to physical memory. However, userspace applications do not configure +mappings to physical memory directly. Instead, applications configure memory +mappings from virtual addresses to offsets into a file using the `mmap` system +call.[^mmap-anon] For example, a call to: + + mmap( + /* addr = */ 0x400000, + /* length = */ 0x1000, + PROT_READ | PROT_WRITE, + MAP_SHARED, + /* fd = */ 3, + /* offset = */ 0); + +creates a mapping of length 0x1000 bytes, starting at virtual address (VA) +0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within +the Linux kernel, virtual memory mappings are represented by *virtual memory +areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the +virtual memory subsystem after the `mmap` call may be depicted as: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + +Establishing a virtual memory area does not necessarily establish a mapping to a +physical address, because Linux has not necessarily provisioned physical memory +to store the file's contents. Thus, if the application attempts to read the +contents of VA 0x400000, it may incur a *page fault*, a CPU exception that +forces the kernel to create such a mapping to service the read. + +For a file, doing so consists of several logical phases: + +1. The kernel allocates physical memory to store the contents of the required + part of the file, and copies file contents to the allocated memory. Supposing + that the kernel chooses the physical memory at physical address (PA) + 0x2fb000, the resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + + (In Linux the state of the mapping from file offset to physical memory is + stored in `struct address_space`, but to avoid confusion with other notions + of address space we will refer to this system as filemap, named after Linux + kernel source file `mm/filemap.c`.) + +2. The kernel stores the effective mapping from virtual to physical address in a + *page table entry* (PTE) in the application's *page tables*, which are used + by the CPU's virtual memory hardware to perform address translation. The + resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 + + The PTE is required for the application to actually use the contents of the + mapped file as virtual memory. However, the PTE is derived from the VMA and + filemap state, both of which are independently mutable, such that mutations + to either will affect the PTE. For example: + + - The application may remove the VMA using the `munmap` system call. This + breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently the + mapping from VA:0x400000 to PA:0x2fb000. However, it does not necessarily + break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a future mapping of + the same file offset may reuse this physical memory. + + - The application may invalidate the file's contents by passing a length of 0 + to the `ftruncate` system call. This breaks the mapping from /tmp/foo:0x0 + to PA:0x2fb000, and consequently the mapping from VA:0x400000 to + PA:0x2fb000. 
However, it does not break the mapping from VA:0x400000 to + /tmp/foo:0x0, so future changes to the file's contents may again be made + visible at VA:0x400000 after another page fault results in the allocation + of a new physical address. + + Note that, in order to correctly break the mapping from VA:0x400000 to + PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* + from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. + +[^mmap-anon]: Memory mappings to non-files are discussed in later sections. + +## Private Mappings + +The preceding example considered VMAs created using the `MAP_SHARED` flag, which +means that PTEs derived from the mapping should always use physical memory that +represents the current state of the mapped file.[^mmap-dev-zero] Applications +can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*. +Private mappings are *copy-on-write*. + +Suppose that the application instead created a private mapping in the previous +example. In Linux, the state of the system after a read page fault would be: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only) + +Now suppose the application attempts to write to VA:0x400000. For a shared +mapping, the write would be propagated to PA:0x2fb000, and the kernel would be +responsible for ensuring that the write is later propagated to the mapped file. +For a private mapping, the write incurs another page fault since the PTE is +marked read-only. In response, the kernel allocates physical memory to store the +mapping's *private copy* of the file's contents, copies file contents to the +allocated memory, and changes the PTE to map to the private copy. Supposing that +the kernel chooses the physical memory at physical address (PA) 0x5ea000, the +resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x5ea000 + +Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist, +but is now irrelevant to this mapping. + +[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`. + +## Anonymous Mappings + +Instead of passing a file to the `mmap` system call, applications can instead +request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag. +Semantically, an anonymous mapping is essentially a mapping to an ephemeral file +initially filled with zero bytes. Practically speaking, this is how shared +anonymous mappings are implemented, but private anonymous mappings do not result +in the creation of an ephemeral file; since there would be no way to modify the +contents of the underlying file through a private mapping, all private anonymous +mappings use a single shared page filled with zero bytes until copy-on-write +occurs. + +# Virtual Memory in the Sentry + +The sentry implements application virtual memory atop a host kernel, introducing +an additional level of indirection to the above. + +Consider the same scenario as in the previous section. 
Since the sentry handles +application system calls, the effect of an application `mmap` system call is to +create a VMA in the sentry (as opposed to the host kernel): + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + +When the application first incurs a page fault on this address, the host kernel +delivers information about the page fault to the sentry in a platform-dependent +manner, and the sentry handles the fault: + +1. The sentry allocates memory to store the contents of the required part of the + file, and copies file contents to the allocated memory. However, since the + sentry is implemented atop a host kernel, it does not configure mappings to + physical memory directly. Instead, mappable "memory" in the sentry is + represented by a host file descriptor and offset, since (as noted in + "Background") this is the memory mapping primitive provided by the host + kernel. In general, memory is allocated from a temporary host file using the + `filemem` package. Supposing that the sentry allocates offset 0x3000 from + host file "memory-file", the resulting state is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + +2. The sentry stores the effective mapping from virtual address to host file in + a host VMA by invoking the `mmap` system call: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + +3. The sentry returns control to the application, which immediately incurs the + page fault again.[^mmap-populate] However, since a host VMA now exists for + the faulting virtual address, the host kernel now handles the page fault as + described in "Background": + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 + +Thus, from an implementation standpoint, host VMAs serve the same purpose in the +sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is +independently mutable, and the desired state of host VMAs is derived from that +state. + +[^mmap-populate]: The sentry could force the host kernel to establish PTEs when + it creates the host VMA by passing the `MAP_POPULATE` flag to + the `mmap` system call, but usually does not. This is because, + to reduce the number of page faults that require handling by + the sentry and (correspondingly) the number of host `mmap` + system calls, the sentry usually creates host VMAs that are + much larger than the single faulting page. + +## Private Mappings + +The sentry implements private mappings consistently with Linux. Before +copy-on-write, the private mapping example given in the Background results in: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only) + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only) + +When the application attempts to write to this address, the host kernel delivers +information about the resulting page fault to the sentry. 
Analogous to Linux, +the sentry allocates memory to store the mapping's private copy of the file's +contents, copies file contents to the allocated memory, and changes the host VMA +to map to the private copy. Supposing that the sentry chooses the offset 0x4000 +in host file `memory-file` to store the private copy, the state of the system +after copy-on-write is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000 + Host filemap: host:memory-file:0x4000 -> PA:0x5ea000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000 + +However, this highlights an important difference between Linux and the sentry. +In Linux, page tables are concrete (architecture-dependent) data structures +owned by the kernel. Conversely, the sentry has the ability to create and +destroy host VMAs using host system calls, but it does not have direct access to +their state. Thus, as written, if the application invokes the `munmap` system +call to remove the sentry VMA, it is non-trivial for the sentry to determine +that it should deallocate `host:memory-file:0x4000`. This implies that the +sentry must retain information about the host VMAs that it has created. + +## Anonymous Mappings + +The sentry implements anonymous mappings consistently with Linux, except that +there is no shared zero page. + +# Implementation Constructs + +In Linux: + +- A virtual address space is represented by `struct mm_struct`. + +- VMAs are represented by `struct vm_area_struct`, stored in `struct + mm_struct::mmap`. + +- Mappings from file offsets to physical memory are stored in `struct + address_space`. + +- Reverse mappings from file offsets to virtual mappings are stored in `struct + address_space::i_mmap`. + +- Physical memory pages are represented by a pointer to `struct page` or an + index called a *page frame number* (PFN), represented by `pfn_t`. + +- PTEs are represented by architecture-dependent type `pte_t`, stored in a table + hierarchy rooted at `struct mm_struct::pgd`. + +In the sentry: + +- A virtual address space is represented by type [`mm.MemoryManager`][mm]. + +- Sentry VMAs are represented by type [`mm.vma`][mm], stored in + `mm.MemoryManager.vmas`. + +- Mappings from sentry file offsets to host file offsets are abstracted through + interface method [`memmap.Mappable.Translate`][memmap]. + +- Reverse mappings from sentry file offsets to virtual mappings are abstracted + through interface methods [`memmap.Mappable.AddMapping` and + `memmap.Mappable.RemoveMapping`][memmap]. + +- Host files that may be mapped into host VMAs are represented by type + [`platform.File`][platform]. + +- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform + mapping area"), stored in `mm.MemoryManager.pmas`. + +- Creation and destruction of host VMAs is abstracted through interface methods + [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. 
+ +[filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go +[memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go +[mm]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/mm/mm.go +[platform]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go new file mode 100644 index 000000000..4dd67b1ea --- /dev/null +++ b/pkg/sentry/mm/address_space.go @@ -0,0 +1,223 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// AddressSpace returns the platform.AddressSpace bound to mm. +// +// Preconditions: The caller must have called mm.Activate(). +func (mm *MemoryManager) AddressSpace() platform.AddressSpace { + if atomic.LoadInt32(&mm.active) == 0 { + panic("trying to use inactive address space?") + } + return mm.as +} + +// Activate ensures this MemoryManager has a platform.AddressSpace. +// +// The caller must not hold any locks when calling Activate. +// +// When this MemoryManager is no longer needed by a task, it should call +// Deactivate to release the reference. +func (mm *MemoryManager) Activate() error { + // Fast path: the MemoryManager already has an active + // platform.AddressSpace, and we just need to indicate that we need it too. + if atomicbitops.IncUnlessZeroInt32(&mm.active) { + return nil + } + + for { + // Slow path: may need to synchronize with other goroutines changing + // mm.active to or from zero. + mm.activeMu.Lock() + // Inline Unlock instead of using a defer for performance since this + // method is commonly in the hot-path. + + // Check if we raced with another goroutine performing activation. + if atomic.LoadInt32(&mm.active) > 0 { + // This can't race; Deactivate can't decrease mm.active from 1 to 0 + // without holding activeMu. + atomic.AddInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Do we have a context? If so, then we never unmapped it. This can + // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). + if mm.as != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Get a new address space. We must force unmapping by passing nil to + // NewAddressSpace if requested. (As in the nil interface object, not a + // typed nil.) + mappingsID := (interface{})(mm) + if mm.unmapAllOnActivate { + mappingsID = nil + } + as, c, err := mm.p.NewAddressSpace(mappingsID) + if err != nil { + mm.activeMu.Unlock() + return err + } + if as == nil { + // AddressSpace is unavailable, we must wait. + // + // activeMu must not be held while waiting, as the user + // of the address space we are waiting on may attempt + // to take activeMu. 
+ // + // Don't call UninterruptibleSleepStart to register the + // wait to allow the watchdog stuck task to trigger in + // case a process is starved waiting for the address + // space. + mm.activeMu.Unlock() + <-c + continue + } + + // Okay, we could restore all mappings at this point. + // But forget that. Let's just let them fault in. + mm.as = as + + // Unmapping is done, if necessary. + mm.unmapAllOnActivate = false + + // Now that m.as has been assigned, we can set m.active to a non-zero value + // to enable the fast path. + atomic.StoreInt32(&mm.active, 1) + + mm.activeMu.Unlock() + return nil + } +} + +// Deactivate releases a release to the MemoryManager. +func (mm *MemoryManager) Deactivate() error { + // Fast path: this is not the last goroutine to deactivate the + // MemoryManager. + if atomicbitops.DecUnlessOneInt32(&mm.active) { + return nil + } + + mm.activeMu.Lock() + // Same as Activate. + + // Still active? + if atomic.AddInt32(&mm.active, -1) > 0 { + mm.activeMu.Unlock() + return nil + } + + // Can we hold on to the address space? + if !mm.p.CooperativelySchedulesAddressSpace() { + mm.activeMu.Unlock() + return nil + } + + // Release the address space. + if err := mm.as.Release(); err != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return err + } + + // Lost it. + mm.as = nil + mm.activeMu.Unlock() + return nil +} + +// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings +// for all addresses in ar should be precommitted. +// +// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. +// ar must be page-aligned. pseg.Range().Contains(ar.Start). +func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { + // By default, map entire pmas at a time, under the assumption that there + // is no cost to mapping more of a pma than necessary. + mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)} + if precommit { + // When explicitly precommitting, only map ar, since overmapping may + // incur unexpected resource usage. + mapAR = ar + } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { + // Limit the range we map to ar, aligned to mapUnit. + mapMask := usermem.Addr(mapUnit - 1) + mapAR.Start = ar.Start &^ mapMask + // If rounding ar.End up overflows, just keep the existing mapAR.End. + if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { + mapAR.End = end + } + } + if checkInvariants { + if !mapAR.IsSupersetOf(ar) { + panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) + } + } + + for { + pma := pseg.ValuePtr() + pmaAR := pseg.Range() + pmaMapAR := pmaAR.Intersect(mapAR) + perms := pma.vmaEffectivePerms + if pma.needCOW { + perms.Write = false + } + if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + return err + } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } + pseg = pseg.NextSegment() + } +} + +// unmapASLocked removes all AddressSpace mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) { + if mm.as == nil { + // No AddressSpace? Force all mappings to be unmapped on the next + // Activate. 
+ mm.unmapAllOnActivate = true + return + } + + // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be + // passed ranges that include addresses that can't be mapped by the + // application. + ar = ar.Intersect(mm.applicationAddrRange()) + + // Note that this AddressSpace may or may not be active. If the + // platform does not require cooperative sharing of AddressSpaces, they + // are retained between Deactivate/Activate calls. Despite not being + // active, it is still valid to perform operations on these address + // spaces. + mm.as.Unmap(ar.Start, uint64(ar.Length())) +} diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..992bde5a5 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,377 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// aioManager creates and manages asynchronous I/O contexts. +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // aioContexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. +func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + done: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. +// +// False is returned if the context does not exist. +func (a *aioManager) destroyAIOContext(id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return false + } + delete(a.contexts, id) + ctx.destroy() + return true +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +type AIOContext struct { + // done is the notification channel used for all requests. + done chan struct{} `state:"nosave"` + + // mu protects below. 
+ mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. + outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. +func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + if ctx.outstanding == 0 { + close(ctx.done) + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. +func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + ctx.results.Remove(e) + ctx.outstanding-- + if ctx.outstanding == 0 && ctx.dead { + close(ctx.done) + } + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The done channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.done <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. +// +// The boolean return value indicates whether or not the context is active. +func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding == 0 && ctx.dead { + return nil, false + } + return ctx.done, true +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +type aioMappable struct { + refs.AtomicRefCount + + p platform.Platform + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(p platform.Platform) (*aioMappable, error) { + fr, err := p.Memory().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + return &aioMappable{p: p, fr: fr}, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.p.Memory().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. 
+func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. + am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.p.Memory(), + Offset: m.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.p) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // TODO: Linux does "do_mmap_pgoff(..., PROT_READ | + // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this + // mapping read-only? 
+ Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. It returns false if +// the context does not exist. +func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return false + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go new file mode 100644 index 000000000..1a5e56f8e --- /dev/null +++ b/pkg/sentry/mm/aio_context_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +// afterLoad is invoked by stateify. +func (a *AIOContext) afterLoad() { + a.done = make(chan struct{}, 1) +} diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go new file mode 100644 index 000000000..56d0490f0 --- /dev/null +++ b/pkg/sentry/mm/debug.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the mm package. 
This is normally disabled since MM is a + // significant hot path in general, and some such checks (notably + // memmap.CheckTranslateResult) are very expensive. + checkInvariants = false + + // If logIOErrors is true, log I/O errors that originate from MM before + // converting them to EFAULT. + logIOErrors = false +) + +// String implements fmt.Stringer.String. +func (mm *MemoryManager) String() string { + return mm.DebugString(context.Background()) +} + +// DebugString returns a string containing information about mm for debugging. +func (mm *MemoryManager) DebugString(ctx context.Context) string { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.debugStringLocked(ctx) +} + +// Preconditions: mm.mappingMu and mm.activeMu must be locked. +func (mm *MemoryManager) debugStringLocked(ctx context.Context) string { + var b bytes.Buffer + b.WriteString("VMAs:\n") + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) + } + b.WriteString("PMAs:\n") + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + b.Write(pseg.debugStringEntryLocked()) + } + return string(b.Bytes()) +} + +// Preconditions: mm.activeMu must be locked. +func (pseg pmaIterator) debugStringEntryLocked() []byte { + var b bytes.Buffer + + fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) + + pma := pseg.ValuePtr() + if pma.vmaEffectivePerms.Read { + b.WriteByte('r') + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Write { + if pma.needCOW { + b.WriteByte('c') + } else { + b.WriteByte('w') + } + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Execute { + b.WriteByte('x') + } else { + b.WriteByte('-') + } + if pma.private { + b.WriteByte('p') + } else { + b.WriteByte('s') + } + + fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + return b.Bytes() +} diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go new file mode 100644 index 000000000..cac81a59d --- /dev/null +++ b/pkg/sentry/mm/io.go @@ -0,0 +1,604 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// There are two supported ways to copy data to/from application virtual +// memory: +// +// 1. Internally-mapped copying: Determine the platform.File that backs the +// copied-to/from virtual address, obtain a mapping of its pages, and read or +// write to the mapping. +// +// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is +// true, AddressSpace permissions are applicable, and an AddressSpace is +// available, copy directly through the AddressSpace, handling faults as +// needed. 
+// +// (Given that internally-mapped copying requires that backing memory is always +// implemented using a host file descriptor, we could also preadv/pwritev to it +// instead. But this would incur a host syscall for each use of the mapped +// page, whereas mmap is a one-time cost.) +// +// The fixed overhead of internally-mapped copying is expected to be higher +// than that of AddressSpace copying since the former always needs to translate +// addresses, whereas the latter only needs to do so when faults occur. +// However, the throughput of internally-mapped copying is expected to be +// somewhat higher than that of AddressSpace copying due to the high cost of +// page faults and because implementations of the latter usually rely on +// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace +// copying (when available) for smaller copies, and switch to internally-mapped +// copying once a size threshold is exceeded. +const ( + // copyMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOut, CopyIn, and ZeroOut. + copyMapMinBytes = 32 << 10 // 32 KB + + // rwMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes + // since AddressSpace copying in this case requires additional buffering; + // see CopyOutFrom for details. + rwMapMinBytes = 512 +) + +// checkIORange is similar to usermem.Addr.ToRange, but applies bounds checks +// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). +// +// Preconditions: length >= 0. +func (mm *MemoryManager) checkIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { + // Note that access_ok() constrains end even if length == 0. + ar, ok := addr.ToRange(uint64(length)) + return ar, (ok && ar.End <= mm.layout.MaxAddr) +} + +// checkIOVec applies bound checks consistent with Linux's +// arch/x86/include/asm/uaccess.h:access_ok() to ars. +func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool { + for !ars.IsEmpty() { + ar := ars.Head() + if _, ok := mm.checkIORange(ar.Start, int64(ar.Length())); !ok { + return false + } + ars = ars.Tail() + } + return true +} + +func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool { + return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive +} + +// translateIOError converts errors to EFAULT, as is usually reported for all +// I/O errors originating from MM in Linux. +func translateIOError(ctx context.Context, err error) error { + if err == nil { + return nil + } + if logIOErrors { + ctx.Debugf("MM I/O error: %v", err) + } + return syserror.EFAULT +} + +// CopyOut implements usermem.IO.CopyOut. +func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(src))) + if !ok { + return 0, syserror.EFAULT + } + + if len(src) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { + return mm.asCopyOut(ctx, addr, src) + } + + // Go through internal mappings. 
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(src))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(dst))) + if !ok { + return 0, syserror.EFAULT + } + + if len(dst) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { + return mm.asCopyIn(ctx, addr, dst) + } + + // Go through internal mappings. + n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(dst))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + ar, ok := mm.checkIORange(addr, toZero) + if !ok { + return 0, syserror.EFAULT + } + + if toZero == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && toZero < copyMapMinBytes { + return mm.asZeroOut(ctx, addr, toZero) + } + + // Go through internal mappings. + return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := safemem.ZeroSeq(dsts) + return n, translateIOError(ctx, err) + }) +} + +func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) { + var done int64 + for { + n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done)) + done += int64(n) + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(toZero)) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. 
+func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + // We have to introduce a buffered copy, instead of just passing a + // safemem.BlockSeq representing addresses in the AddressSpace to src. + // This is because usermem.IO.CopyOutFrom() guarantees that it calls + // src.ReadToBlocks() at most once, which is incompatible with handling + // faults between calls. In the future, this is probably best resolved + // by introducing a CopyOutFrom variant or option that allows it to + // call src.ReadToBlocks() any number of times. + // + // This issue applies to CopyInTo as well. + buf := make([]byte, int(ars.NumBytes())) + bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) + var done int64 + for done < int64(bufN) { + ar := ars.Head() + cplen := int64(ar.Length()) + if cplen > int64(bufN)-done { + cplen = int64(bufN) - done + } + n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) + done += int64(n) + if err != nil { + return done, err + } + ars = ars.Tail() + } + // Do not convert errors returned by src to EFAULT. + return done, bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks) +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + buf := make([]byte, int(ars.NumBytes())) + var done int + var bufErr error + for !ars.IsEmpty() { + ar := ars.Head() + var n int + n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) + done += n + if bufErr != nil { + break + } + ars = ars.Tail() + } + n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) + if err != nil { + return int64(n), err + } + // Do not convert errors returned by dst to EFAULT. + return int64(n), bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks) +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + old, err := mm.as.SwapUint32(addr, new) + if err == nil { + return old, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var old uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. 
+ return 0, syserror.EFAULT + } + im := ims.Head() + var err error + old, err = safemem.SwapUint32(im, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return old, err +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + prev, err := mm.as.CompareAndSwapUint32(addr, old, new) + if err == nil { + return prev, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var prev uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + prev, err = safemem.CompareAndSwapUint32(im, old, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return prev, err +} + +// handleASIOFault handles a page fault at address addr for an AddressSpaceIO +// operation spanning ioar. +// +// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). +func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { + // Try to map all remaining pages in the I/O operation. This RoundUp can't + // overflow because otherwise it would have been caught by checkIORange. + end, _ := ioar.End.RoundUp() + ar := usermem.AddrRange{addr.RoundDown(), end} + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have usable vmas. Here and below, only return early if we + // can't map the first (faulting) page; failure to map later pages are + // silently ignored. This maximizes partial success. + mm.mappingMu.RLock() + vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return translateIOError(ctx, err) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return translateIOError(ctx, err) + } + ar.End = pendaddr + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return translateIOError(ctx, err) +} + +// withInternalMappings ensures that pmas exist for all addresses in ar, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subrange of ar for which this property holds. 
+// +// withInternalMappings takes a function returning uint64 since many safemem +// functions have this property, but returns an int64 since this is usually +// more useful for usermem.IO methods. +// +// Preconditions: 0 < ar.Length() <= math.MaxInt64. +func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, po, true /* needInternalMappings */); pseg.Ok() { + n, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, po) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + ar.End = pendaddr + } + imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar) + mm.activeMu.DowngradeLock() + if imendaddr := imend.Start(); imendaddr < ar.End { + if imendaddr <= ar.Start { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + ar.End = imendaddr + } + + // Do I/O. + un, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ar. + if err != nil { + // Do not convert errors returned by f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// withVecInternalMappings ensures that pmas exist for all addresses in ars, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subset of ars for which this property holds. +// +// Preconditions: !ars.IsEmpty(). +func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + // withInternalMappings is faster than withVecInternalMappings because of + // iterator plumbing (this isn't generally practical in the vector case due + // to iterator invalidation between AddrRanges). Use it if possible. + if ars.NumRanges() == 1 { + return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) + } + + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if mm.existingVecPMAsLocked(ars, at, ignorePermissions, po, true /* needInternalMappings */) { + n, err := f(mm.vecInternalMappingsLocked(ars)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. 
+ return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions) + if vars.NumBytes() == 0 { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pars, perr := mm.getVecPMAsLocked(ctx, vars, po) + mm.mappingMu.RUnlock() + if pars.NumBytes() == 0 { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + imars, imerr := mm.getVecPMAInternalMappingsLocked(pars) + mm.activeMu.DowngradeLock() + if imars.NumBytes() == 0 { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + + // Do I/O. + un, err := f(mm.vecInternalMappingsLocked(imars)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ars. + if err != nil { + // Do not convert errors from f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to +// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to +// truncate usermem.AddrRangeSeq when errors occur. +// +// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { + ar := arsit.Head() + if end <= ar.Start { + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes()) + } + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start)) +} diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go new file mode 100644 index 000000000..de7f29b04 --- /dev/null +++ b/pkg/sentry/mm/lifecycle.go @@ -0,0 +1,218 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewMemoryManager returns a new MemoryManager with no mappings and 1 user. +func NewMemoryManager(p platform.Platform) *MemoryManager { + return &MemoryManager{ + p: p, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } +} + +// SetMmapLayout initializes mm's layout from the given arch.Context. +// +// Preconditions: mm contains no mappings and is not used concurrently. 
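+//
+// A minimal usage sketch (the platform p, arch.Context ac, and LimitSet r are
+// assumed to be supplied by the caller, e.g. the binary loader):
+//
+//	mm := NewMemoryManager(p)
+//	layout, err := mm.SetMmapLayout(ac, r)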
+func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) { + layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) + if err != nil { + return arch.MmapLayout{}, err + } + mm.layout = layout + return layout, nil +} + +// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or +// clone() (without CLONE_VM). +func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm2 := &MemoryManager{ + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, + captureInvalidations: true, + argv: mm.argv, + envv: mm.envv, + auxv: append(arch.Auxv(nil), mm.auxv...), + // IncRef'd below, once we know that there isn't an error. + executable: mm.executable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } + + // Copy vmas. + dstvgap := mm2.vmas.FirstGap() + for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { + vma := srcvseg.ValuePtr() + vmaAR := srcvseg.Range() + // Inform the Mappable, if any, of the new mapping. + if vma.mappable != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off); err != nil { + mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) + return nil, err + } + } + if vma.id != nil { + vma.id.IncRef() + } + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + // We don't need to update mm2.usageAS since we copied it from mm + // above. + } + + // Copy pmas. We have to lock mm.activeMu for writing to make existing + // private pmas copy-on-write. We also have to lock mm2.activeMu since + // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We + // only copy private pmas, since in the common case where fork(2) is + // immediately followed by execve(2), copying non-private pmas that can be + // regenerated by calling memmap.Mappable.Translate is a waste of time. + // (Linux does the same; compare kernel/fork.c:dup_mmap() => + // mm/memory.c:copy_page_range().) + mm2.activeMu.Lock() + defer mm2.activeMu.Unlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + dstpgap := mm2.pmas.FirstGap() + var unmapAR usermem.AddrRange + for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { + pma := srcpseg.ValuePtr() + if !pma.private { + continue + } + if !pma.needCOW { + pma.needCOW = true + if pma.vmaEffectivePerms.Write { + // We don't want to unmap the whole address space, even though + // doing so would reduce calls to unmapASLocked(), because mm + // will most likely continue to be used after the fork, so + // unmapping pmas unnecessarily will result in extra page + // faults. But we do want to merge consecutive AddrRanges + // across pma boundaries. 
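+ // For example (illustrative): if consecutive writable private pmas
+ // cover [0x400000, 0x401000) and [0x401000, 0x404000), a single
+ // unmap of [0x400000, 0x404000) is issued rather than one per pma.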
+ if unmapAR.End == srcpseg.Start() { + unmapAR.End = srcpseg.End() + } else { + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + unmapAR = srcpseg.Range() + } + } + } + fr := srcpseg.fileRange() + mm2.incPrivateRef(fr) + srcpseg.ValuePtr().file.IncRef(fr) + addrRange := srcpseg.Range() + mm2.addRSSLocked(addrRange) + dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() + } + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + + // Between when we call memmap.Mappable.AddMapping while copying vmas and + // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are + // ineffective because the pmas they invalidate haven't yet been copied, + // possibly allowing mm2 to get invalidated translations: + // + // Invalidating Mappable mm.Fork + // --------------------- ------- + // + // mm2.Invalidate() + // mm.activeMu.Lock() + // mm.Invalidate() /* blocks */ + // mm2.activeMu.Lock() + // (mm copies invalidated pma to mm2) + // + // This would technically be both safe (since we only copy private pmas, + // which will still hold a reference on their memory) and consistent with + // Linux, but we avoid it anyway by setting mm2.captureInvalidations during + // construction, causing calls to mm2.Invalidate() to be captured in + // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. + // here. + mm2.captureInvalidations = false + for _, invArgs := range mm2.capturedInvalidations { + mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) + } + mm2.capturedInvalidations = nil + + if mm2.executable != nil { + mm2.executable.IncRef() + } + return mm2, nil +} + +// IncUsers increments mm's user count and returns true. If the user count is +// already 0, IncUsers does nothing and returns false. +func (mm *MemoryManager) IncUsers() bool { + return atomicbitops.IncUnlessZeroInt32(&mm.users) +} + +// DecUsers decrements mm's user count. If the user count reaches 0, all +// mappings in mm are unmapped. +func (mm *MemoryManager) DecUsers(ctx context.Context) { + if users := atomic.AddInt32(&mm.users, -1); users > 0 { + return + } else if users < 0 { + panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) + } + + mm.aioManager.destroy() + + mm.metadataMu.Lock() + exe := mm.executable + mm.executable = nil + mm.metadataMu.Unlock() + if exe != nil { + exe.DecRef() + } + + mm.activeMu.Lock() + // Sanity check. + if atomic.LoadInt32(&mm.active) != 0 { + panic("active address space lost?") + } + // Make sure the AddressSpace is returned. + if mm.as != nil { + mm.as.Release() + mm.as = nil + } + mm.activeMu.Unlock() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + mm.unmapLocked(ctx, mm.applicationAddrRange()) +} diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go new file mode 100644 index 000000000..32d5e2ff6 --- /dev/null +++ b/pkg/sentry/mm/metadata.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ArgvStart returns the start of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvEnd. +func (mm *MemoryManager) ArgvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.Start +} + +// SetArgvStart sets the start of the application argument vector. +func (mm *MemoryManager) SetArgvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.Start = a +} + +// ArgvEnd returns the end of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvStart. +func (mm *MemoryManager) ArgvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.End +} + +// SetArgvEnd sets the end of the application argument vector. +func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.End = a +} + +// EnvvStart returns the start of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvEnd. +func (mm *MemoryManager) EnvvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.Start +} + +// SetEnvvStart sets the start of the application environment vector. +func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.Start = a +} + +// EnvvEnd returns the end of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvStart. +func (mm *MemoryManager) EnvvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.End +} + +// SetEnvvEnd sets the end of the application environment vector. +func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.End = a +} + +// Auxv returns the current map of auxiliary vectors. +func (mm *MemoryManager) Auxv() arch.Auxv { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return append(arch.Auxv(nil), mm.auxv...) +} + +// SetAuxv sets the entire map of auxiliary vectors. +func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.auxv = append(arch.Auxv(nil), auxv...) +} + +// Executable returns the executable, if available. +// +// An additional reference will be taken in the case of a non-nil executable, +// which must be released by the caller. +func (mm *MemoryManager) Executable() *fs.Dirent { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + + if mm.executable == nil { + return nil + } + + mm.executable.IncRef() + return mm.executable +} + +// SetExecutable sets the executable. +// +// This takes a reference on d. +func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { + mm.metadataMu.Lock() + + // Grab a new reference. + d.IncRef() + + // Set the executable. + orig := mm.executable + mm.executable = d + + mm.metadataMu.Unlock() + + // Release the old reference. + // + // Do this without holding the lock, since it may wind up doing some + // I/O to sync the dirent, etc. 
+ if orig != nil { + orig.DecRef() + } +} diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go new file mode 100644 index 000000000..ce8097b7f --- /dev/null +++ b/pkg/sentry/mm/mm.go @@ -0,0 +1,417 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mm provides a memory management subsystem. See README.md for a +// detailed overview. +// +// Lock order: +// +// fs locks, except for memmap.Mappable locks +// mm.MemoryManager.metadataMu +// mm.MemoryManager.mappingMu +// Locks taken by memmap.Mappable methods other than Translate +// mm.MemoryManager.activeMu +// Locks taken by memmap.Mappable.Translate +// mm.privateRefs.mu +// platform.File locks +// mm.aioManager.mu +// mm.AIOContext.mu +// +// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in +// multiple mm.MemoryManagers, as it does so in a well-defined order (forked +// child first). +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// MemoryManager implements a virtual address space. +type MemoryManager struct { + // p is the platform. + // + // p is immutable. + p platform.Platform + + // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from + // eliminating an indirect call in the hot I/O path, this makes + // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. + // + // haveASIO is immutable. + haveASIO bool `state:"nosave"` + + // layout is the memory layout. + // + // layout is set by the binary loader before the MemoryManager can be used. + layout arch.MmapLayout + + // privateRefs stores reference counts for private memory (memory whose + // ownership is shared by one or more pmas instead of being owned by a + // memmap.Mappable). + // + // NOTE: This should be replaced using refcounts on + // platform.File. + // + // privateRefs is immutable. + privateRefs *privateRefs + + // users is the number of dependences on the mappings in the MemoryManager. + // When the number of references in users reaches zero, all mappings are + // unmapped. + // + // users is accessed using atomic memory operations. + users int32 + + // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. + mappingMu ssync.DowngradableRWMutex `state:"nosave"` + + // vmas stores virtual memory areas. Since vmas are stored by value, + // clients should usually use vmaIterator.ValuePtr() instead of + // vmaIterator.Value() to get a pointer to the vma rather than a copy. + // + // Invariants: vmas are always page-aligned. + // + // vmas is protected by mappingMu. + vmas vmaSet + + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. 
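+ //
+ // For example (a sketch of how the mapping path consults it): creating
+ // a vma of length l fails with ENOMEM if
+ // usageAS+l > limits.FromContext(ctx).Get(limits.AS).Cur.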
+ usageAS uint64 + + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. + // + // brk is protected by mappingMu. + brk usermem.AddrRange + + // activeMu is loosely analogous to Linux's struct + // mm_struct::page_table_lock. + activeMu ssync.DowngradableRWMutex `state:"nosave"` + + // pmas stores platform mapping areas used to implement vmas. Since pmas + // are stored by value, clients should usually use pmaIterator.ValuePtr() + // instead of pmaIterator.Value() to get a pointer to the pma rather than + // a copy. + // + // Inserting or removing segments from pmas should happen along with a + // call to mm.insertRSS or mm.removeRSS. + // + // Invariants: pmas are always page-aligned. If a pma exists for a given + // address, a vma must also exist for that address. + // + // pmas is protected by activeMu. + pmas pmaSet + + // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is + // reported as the MemoryManager's RSS. + // + // maxRSS should be modified only via insertRSS and removeRSS, not + // directly. + // + // maxRSS is protected by activeMu. + curRSS uint64 + + // maxRSS is the maximum resident set size in bytes of a MemoryManager. + // It is tracked as the application adds and removes mappings to pmas. + // + // maxRSS should be modified only via insertRSS, not directly. + // + // maxRSS is protected by activeMu. + maxRSS uint64 + + // as is the platform.AddressSpace that pmas are mapped into. active is the + // number of contexts that require as to be non-nil; if active == 0, as may + // be nil. + // + // as is protected by activeMu. active is manipulated with atomic memory + // operations; transitions to and from zero are additionally protected by + // activeMu. (This is because such transitions may need to be atomic with + // changes to as.) + as platform.AddressSpace `state:"nosave"` + active int32 `state:"zerovalue"` + + // unmapAllOnActivate indicates that the next Activate call should activate + // an empty AddressSpace. + // + // This is used to ensure that an AddressSpace cached in + // NewAddressSpace is not used after some change in the MemoryManager + // or VMAs has made that AddressSpace stale. + // + // unmapAllOnActivate is protected by activeMu. It must only be set when + // there is no active or cached AddressSpace. If as != nil, then + // invalidations should be propagated immediately. + unmapAllOnActivate bool `state:"nosave"` + + // If captureInvalidations is true, calls to MM.Invalidate() are recorded + // in capturedInvalidations rather than being applied immediately to pmas. + // This is to avoid a race condition in MM.Fork(); see that function for + // details. + // + // Both captureInvalidations and capturedInvalidations are protected by + // activeMu. Neither need to be saved since captureInvalidations is only + // enabled during MM.Fork(), during which saving can't occur. + captureInvalidations bool `state:"zerovalue"` + capturedInvalidations []invalidateArgs `state:"nosave"` + + metadataMu sync.Mutex `state:"nosave"` + + // argv is the application argv. This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No + // requirements apply to argv; we do not require that argv.WellFormed(). + // + // argv is protected by metadataMu. + argv usermem.AddrRange + + // envv is the application envv. 
This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No + // requirements apply to envv; we do not require that envv.WellFormed(). + // + // envv is protected by metadataMu. + envv usermem.AddrRange + + // auxv is the ELF's auxiliary vector. + // + // auxv is protected by metadataMu. + auxv arch.Auxv + + // executable is the executable for this MemoryManager. If executable + // is not nil, it holds a reference on the Dirent. + // + // executable is protected by metadataMu. + executable *fs.Dirent + + // aioManager keeps track of AIOContexts used for async IOs. AIOManager + // must be cloned when CLONE_VM is used. + aioManager aioManager +} + +// vma represents a virtual memory area. +type vma struct { + // mappable is the virtual memory object mapped by this vma. If mappable is + // nil, the vma represents a private anonymous mapping. + mappable memmap.Mappable + + // off is the offset into mappable at which this vma begins. If mappable is + // nil, off is meaningless. + off uint64 + + // To speedup VMA save/restore, we group and save the following booleans + // as a single integer. + + // realPerms are the memory permissions on this vma, as defined by the + // application. + realPerms usermem.AccessType `state:".(int)"` + + // effectivePerms are the memory permissions on this vma which are + // actually used to control access. + // + // Invariant: effectivePerms == realPerms.Effective(). + effectivePerms usermem.AccessType `state:"manual"` + + // maxPerms limits the set of permissions that may ever apply to this + // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions + // is true (e.g. ptrace(PTRACE_POKEDATA)). + // + // Invariant: maxPerms == maxPerms.Effective(). + maxPerms usermem.AccessType `state:"manual"` + + // private is true if this is a MAP_PRIVATE mapping, such that writes to + // the mapping are propagated to a copy. + private bool `state:"manual"` + + // growsDown is true if the mapping may be automatically extended downward + // under certain conditions. If growsDown is true, mappable must be nil. + // + // There is currently no corresponding growsUp flag; in Linux, the only + // architectures that can have VM_GROWSUP mappings are ia64, parisc, and + // metag, none of which we currently support. + growsDown bool `state:"manual"` + + // If id is not nil, it controls the lifecycle of mappable and provides vma + // metadata shown in /proc/[pid]/maps, and the vma holds a reference. + id memmap.MappingIdentity + + // If hint is non-empty, it is a description of the vma printed in + // /proc/[pid]/maps. hint takes priority over id.MappedName(). 
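+ //
+ // For example (illustrative): if a vma has both an id whose
+ // MappedName() is "/lib/libc.so" and a non-empty hint, the hint is
+ // what appears in the maps entry.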
+ hint string +} + +const ( + vmaRealPermsRead = 1 << iota + vmaRealPermsWrite + vmaRealPermsExecute + vmaEffectivePermsRead + vmaEffectivePermsWrite + vmaEffectivePermsExecute + vmaMaxPermsRead + vmaMaxPermsWrite + vmaMaxPermsExecute + vmaPrivate + vmaGrowsDown +) + +func (v *vma) saveRealPerms() int { + var b int + if v.realPerms.Read { + b |= vmaRealPermsRead + } + if v.realPerms.Write { + b |= vmaRealPermsWrite + } + if v.realPerms.Execute { + b |= vmaRealPermsExecute + } + if v.effectivePerms.Read { + b |= vmaEffectivePermsRead + } + if v.effectivePerms.Write { + b |= vmaEffectivePermsWrite + } + if v.effectivePerms.Execute { + b |= vmaEffectivePermsExecute + } + if v.maxPerms.Read { + b |= vmaMaxPermsRead + } + if v.maxPerms.Write { + b |= vmaMaxPermsWrite + } + if v.maxPerms.Execute { + b |= vmaMaxPermsExecute + } + if v.private { + b |= vmaPrivate + } + if v.growsDown { + b |= vmaGrowsDown + } + return b +} + +func (v *vma) loadRealPerms(b int) { + if b&vmaRealPermsRead > 0 { + v.realPerms.Read = true + } + if b&vmaRealPermsWrite > 0 { + v.realPerms.Write = true + } + if b&vmaRealPermsExecute > 0 { + v.realPerms.Execute = true + } + if b&vmaEffectivePermsRead > 0 { + v.effectivePerms.Read = true + } + if b&vmaEffectivePermsWrite > 0 { + v.effectivePerms.Write = true + } + if b&vmaEffectivePermsExecute > 0 { + v.effectivePerms.Execute = true + } + if b&vmaMaxPermsRead > 0 { + v.maxPerms.Read = true + } + if b&vmaMaxPermsWrite > 0 { + v.maxPerms.Write = true + } + if b&vmaMaxPermsExecute > 0 { + v.maxPerms.Execute = true + } + if b&vmaPrivate > 0 { + v.private = true + } + if b&vmaGrowsDown > 0 { + v.growsDown = true + } +} + +// pma represents a platform mapping area. +type pma struct { + // file is the file mapped by this pma. Only pmas for which file == + // platform.Platform.Memory() may be saved. pmas hold a reference to the + // corresponding file range while they exist. + file platform.File `state:"nosave"` + + // off is the offset into file at which this pma begins. + off uint64 + + // vmaEffectivePerms and vmaMaxPerms are duplicated from the + // corresponding vma so that the IO implementation can avoid iterating + // mm.vmas when pmas already exist. + vmaEffectivePerms usermem.AccessType + vmaMaxPerms usermem.AccessType + + // needCOW is true if writes to the mapping must be propagated to a copy. + needCOW bool + + // private is true if this pma represents private memory. + // + // If private is true, file must be platform.Platform.Memory(), the pma + // holds a reference on the mapped memory that is tracked in privateRefs, + // and calls to Invalidate for which + // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. + // + // If private is false, this pma caches a translation from the + // corresponding vma's memmap.Mappable.Translate. + private bool + + // If internalMappings is not empty, it is the cached return value of + // file.MapInternal for the platform.FileRange mapped by this pma. + internalMappings safemem.BlockSeq `state:"nosave"` +} + +type privateRefs struct { + mu sync.Mutex `state:"nosave"` + + // refs maps offsets into Platform.Memory() to the number of pmas (or, + // equivalently, MemoryManagers) that share ownership of the memory at that + // offset. + refs fileRefcountSet +} + +type invalidateArgs struct { + ar usermem.AddrRange + opts memmap.InvalidateOpts +} + +// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. 
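+//
+// As a sketch of the semantics below: adjacent ranges merge only when their
+// refcounts are equal, and splitting a range gives both halves the original
+// refcount, so the per-page reference count is preserved across segment set
+// operations.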
+type fileRefcountSetFunctions struct{} + +func (fileRefcountSetFunctions) MinKey() uint64 { + return 0 +} + +func (fileRefcountSetFunctions) MaxKey() uint64 { + return ^uint64(0) +} + +func (fileRefcountSetFunctions) ClearValue(_ *int32) { +} + +func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { + return rc1, rc1 == rc2 +} + +func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { + return rc, rc +} diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go new file mode 100644 index 000000000..b47aa7263 --- /dev/null +++ b/pkg/sentry/mm/mm_test.go @@ -0,0 +1,174 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func testMemoryManager(ctx context.Context) *MemoryManager { + p := platform.FromContext(ctx) + mm := NewMemoryManager(p) + mm.layout = arch.MmapLayout{ + MinAddr: p.MinUserAddress(), + MaxAddr: p.MaxUserAddress(), + BottomUpBase: p.MinUserAddress(), + TopDownBase: p.MaxUserAddress(), + } + return mm +} + +func (mm *MemoryManager) realUsageAS() uint64 { + return uint64(mm.vmas.Span()) +} + +func TestUsageASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 2 * usermem.PageSize, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + realUsage := mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realUsage = mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } +} + +func TestBrkDataLimitUpdates(t *testing.T) { + limitSet := limits.NewLimitSet() + limitSet.Set(limits.Data, limits.Limit{}) // zero RLIMIT_DATA + + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + // Try to extend the brk by one page and expect doing so to fail. + oldBrk, _ := mm.Brk(ctx, 0) + if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk { + t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x", oldBrk, newBrk) + } +} + +// TestIOAfterUnmap ensures that IO fails after unmap. 
+func TestIOAfterUnmap(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.Read, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // IO works before munmap. + b := make([]byte, 1) + n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyIn got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyIn got %d want 1", n) + } + + err = mm.MUnmap(ctx, addr, usermem.PageSize) + if err != nil { + t.Fatalf("MUnmap got err %v want nil", err) + } + + n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyIn got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyIn got %d want 0", n) + } +} + +// TestIOAfterMProtect tests IO interaction with mprotect permissions. +func TestIOAfterMProtect(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // Writing works before mprotect. + b := make([]byte, 1) + n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } + + err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false) + if err != nil { + t.Errorf("MProtect got err %v want nil", err) + } + + // Without IgnorePermissions, CopyOut should no longer succeed. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyOut got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyOut got %d want 0", n) + } + + // With IgnorePermissions, CopyOut should succeed despite mprotect. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{ + IgnorePermissions: true, + }) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } +} diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go new file mode 100644 index 000000000..35e873762 --- /dev/null +++ b/pkg/sentry/mm/pma.go @@ -0,0 +1,928 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type pmaOpts struct { + // If breakCOW is true, pmas must not be copy-on-write. + breakCOW bool +} + +// existingPMAsLocked checks that pmas exist for all addresses in ar, and +// support access of type (at, ignorePermissions). If so, it returns an +// iterator to the pma containing ar.Start. Otherwise it returns a terminal +// iterator. +// +// Preconditions: mm.activeMu must be locked. ar.Length() != 0. +func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) pmaIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + first := mm.pmas.FindSegment(ar.Start) + pseg := first + for pseg.Ok() { + pma := pseg.ValuePtr() + perms := pma.vmaEffectivePerms + if ignorePermissions { + perms = pma.vmaMaxPerms + } + if !perms.SupersetOf(at) { + // These are the vma's permissions, so the caller will get an error + // when they try to get new pmas. + return pmaIterator{} + } + if opts.breakCOW && pma.needCOW { + return pmaIterator{} + } + if needInternalMappings && pma.internalMappings.IsEmpty() { + return pmaIterator{} + } + + if ar.End <= pseg.End() { + return first + } + pseg, _ = pseg.NextNonEmpty() + } + + // Ran out of pmas before reaching ar.End. + return pmaIterator{} +} + +// existingVecPMAsLocked returns true if pmas exist for all addresses in ars, +// and support access of type (at, ignorePermissions). +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) bool { + for ; !ars.IsEmpty(); ars = ars.Tail() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, opts, needInternalMappings).Ok() { + return false + } + } + return true +} + +// getPMAsLocked ensures that pmas exist for all addresses in ar, subject to +// opts. It returns: +// +// - An iterator to the pma containing ar.Start. If no pma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist +// for all addresses in ar. 
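+//
+// A typical call sequence (sketch, as in the I/O and fault paths in this
+// package):
+//
+//	mm.mappingMu.RLock()
+//	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
+//	mm.activeMu.Lock()
+//	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{breakCOW: at.Write})
+//	mm.mappingMu.RUnlock()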
+func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, opts pmaOpts) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, vseg, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, perr + } + // ensurePMAsLocked may not have pstart due to iterator invalidation. We + // need it, either to return it immediately or to pass to + // breakCopyOnWriteLocked. + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + + var cowerr error + if opts.breakCOW { + var invalidated bool + pend, invalidated, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, cowerr + } + if invalidated { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + } + + if cowerr != nil { + return pstart, pend, cowerr + } + if perr != nil { + return pstart, pend, perr + } + return pstart, pend, alignerr +} + +// getVecPMAsLocked ensures that pmas exist for all addresses in ars. It +// returns the subset of ars for which pmas exist. If this is not equal to ars, +// it returns a non-nil error explaining why. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vmas must exist for all addresses in ars. +func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, opts pmaOpts) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, mm.vmas.FindSegment(ar.Start), ar) + if pend.Start() <= ar.Start { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + + var cowerr error + if opts.breakCOW { + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + pend, _, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + } + + if cowerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), cowerr + } + if perr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + if alignerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr + } + } + + return ars, nil +} + +// ensurePMAsLocked ensures that pmas exist for all addresses in ar. It returns: +// +// - An iterator to the pma containing ar.Start, on a best-effort basis (that +// is, the returned iterator may be terminal, even if such a pma exists). +// Returning this iterator on a best-effort basis allows callers that require +// it to use it when it's cheaply available, while also avoiding the overhead +// of retrieving it when it's not. +// +// - An iterator to the gap after the last pma containing an address in ar. 
If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. ar must be page-aligned. +// vseg.Range().Contains(ar.Start). vmas must exist for all addresses in ar. +func (mm *MemoryManager) ensurePMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + pstart, pgap := mm.pmas.Find(ar.Start) + if pstart.Ok() { + pgap = pstart.NextGap() + } + for pgap.Start() < ar.End { + if pgap.Range().Length() == 0 { + pgap = pgap.NextGap() + continue + } + // A single pgap might be spanned by multiple vmas. Insert pmas to + // cover the first (vma, pgap) pair. + pgapAR := pgap.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(pgapAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", pgapAR.Start)) + } + if pgapAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", pgapAR.Start, vseg.Start())) + } + } + var err error + pgap, err = mm.insertPMAsLocked(ctx, vseg, pgap, ar) + // insertPMAsLocked most likely invalidated iterators, so pstart is now + // unknown. + pstart = pmaIterator{} + if err != nil { + return pstart, pgap, err + } + } + return pstart, pgap, nil +} + +const ( + // When memory is allocated for a private pma, align the allocated address + // range to a privateAllocUnit boundary when possible. Larger values of + // privateAllocUnit may reduce page faults by allowing fewer, larger pmas + // to be mapped, but may result in larger amounts of wasted memory in the + // presence of fragmentation. privateAllocUnit must be a power-of-2 + // multiple of usermem.PageSize. + privateAllocUnit = usermem.HugePageSize + + privateAllocMask = privateAllocUnit - 1 +) + +func privateAligned(ar usermem.AddrRange) usermem.AddrRange { + aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End} + if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End { + aligned.End = end + } + if checkInvariants { + if !aligned.IsSupersetOf(ar) { + panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar)) + } + } + return aligned +} + +// insertPMAsLocked inserts pmas into pgap corresponding to the vma iterated by +// vseg, spanning at least ar. It returns: +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vseg.Range().Intersect(pgap.Range()).Intersect(ar).Length() != 0. +// ar must be page-aligned. 
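+//
+// In outline: for private anonymous vmas, insertPMAsLocked allocates
+// anonymous memory, widening the request toward privateAllocUnit boundaries
+// where possible (e.g., illustratively, with a 2 MB unit a request at
+// [0x201000, 0x202000) is widened toward [0x200000, 0x400000) before being
+// clipped to the vma and gap); for mappable-backed vmas it inserts one pma
+// per memmap.Translation returned by Translate.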
+func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, pgap pmaGapIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + optAR := vseg.Range().Intersect(pgap.Range()) + if checkInvariants { + if optAR.Length() <= 0 { + panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) + } + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar %v", ar)) + } + } + vma := vseg.ValuePtr() + + // Private anonymous mappings get pmas by allocating. + if vma.mappable == nil { + // Limit the range we allocate to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + allocAR := optAR.Intersect(maskAR) + mem := mm.p.Memory() + fr, err := mem.Allocate(uint64(allocAR.Length()), usage.Anonymous) + if err != nil { + return pgap, err + } + mm.incPrivateRef(fr) + + if checkInvariants { + if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { + panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) + } + } + + mm.addRSSLocked(allocAR) + mem.IncRef(fr) + + return mm.pmas.Insert(pgap, allocAR, pma{ + file: mem, + off: fr.Start, + vmaEffectivePerms: vma.effectivePerms, + vmaMaxPerms: vma.maxPerms, + private: true, + // Since we just allocated this memory and have the only reference, + // the new pma does not need copy-on-write. + }).NextGap(), nil + } + + // Other mappings get pmas by translating. Limit the required range + // to ar. + optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := vma.maxPerms + if vma.private { + perms.Write = false + } + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v): %v", vma.mappable, reqMR, optMR, err)) + } + } + + // Install a pma for each Translation. + for _, t := range ts { + // This is valid because memmap.Mappable.Translate is required to + // return Translations in increasing Translation.Source order. + addrRange := vseg.addrRangeOf(t.Source) + mm.addRSSLocked(addrRange) + pseg := mm.pmas.Insert(pgap, addrRange, pma{ + file: t.File, + off: t.Offset, + vmaEffectivePerms: vma.effectivePerms, + vmaMaxPerms: vma.maxPerms, + needCOW: vma.private, + }) + // The new pseg may have been merged with existing segments, only take a + // ref on the inserted range. + t.File.IncRef(pseg.fileRangeOf(addrRange)) + pgap = pseg.NextGap() + } + + // Even if Translate returned an error, if we got to ar.End, + // insertPMAsLocked succeeded. + if ar.End <= pgap.Start() { + return pgap, nil + } + return pgap, err +} + +// breakCopyOnWriteLocked ensures that pmas in ar are not copy-on-write. It +// returns: +// +// - An iterator to the gap after the last non-COW pma containing an address in +// ar. If non-COW pmas exist for no addresses in ar, the iterator is to a gap +// that begins before ar.Start. +// +// - A boolean that is true if iterators into mm.pmas may have been +// invalidated. +// +// - An error that is non-nil if non-COW pmas exist for only a subset of ar. +// +// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. pseg.Range().Contains(ar.Start). pmas must exist for +// all addresses in ar. 
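+//
+// In outline, for each copy-on-write pma intersecting ar: allocate anonymous
+// memory and fill it through the old pma's internal mappings, unmap the
+// affected range from the AddressSpace, then repoint the pma at the copy and
+// clear needCOW, merging with neighboring pmas where possible.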
+func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, bool, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + // Limit the range we copy to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + var invalidatedIterators, didUnmapAS bool + mem := mm.p.Memory() + for { + if mm.isPMACopyOnWriteLocked(pseg) { + // Determine the range to copy. + copyAR := pseg.Range().Intersect(maskAR) + + // Get internal mappings from the pma to copy from. + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), invalidatedIterators, err + } + + // Copy contents. + fr, err := platform.AllocateAndFill(mem, uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + if _, ok := err.(safecopy.BusError); ok { + // If we got SIGBUS during the copy, deliver SIGBUS to + // userspace (instead of SIGSEGV) if we're breaking + // copy-on-write due to application page fault. + err = &memmap.BusError{err} + } + if fr.Length() == 0 { + return pseg.PrevGap(), invalidatedIterators, err + } + mm.incPrivateRef(fr) + mem.IncRef(fr) + + // Unmap all of maskAR, not just copyAR, to minimize host syscalls. + // AddressSpace mappings must be removed before mm.decPrivateRef(). + if !didUnmapAS { + mm.unmapASLocked(maskAR) + didUnmapAS = true + } + + // Replace the pma with a copy in the part of the address range + // where copying was successful. + copyAR.End = copyAR.Start + usermem.Addr(fr.Length()) + if copyAR != pseg.Range() { + pseg = mm.pmas.Isolate(pseg, copyAR) + invalidatedIterators = true + } + pma := pseg.ValuePtr() + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + + pma.file = mem + pma.off = fr.Start + pma.private = true + pma.needCOW = false + pma.internalMappings = safemem.BlockSeq{} + + // Try to merge pma with its neighbors. + if prev := pseg.PrevSegment(); prev.Ok() { + if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { + pseg = merged + invalidatedIterators = true + } + } + if next := pseg.NextSegment(); next.Ok() { + if merged := mm.pmas.Merge(pseg, next); merged.Ok() { + pseg = merged + invalidatedIterators = true + } + } + + // If an error occurred after ar.End, breakCopyOnWriteLocked still + // did its job, so discard the error. + if err != nil && pseg.End() < ar.End { + return pseg.NextGap(), invalidatedIterators, err + } + } + // This checks against ar.End, not maskAR.End, so we will never break + // COW on a pma that does not intersect ar. + if ar.End <= pseg.End() { + return pseg.NextGap(), invalidatedIterators, nil + } + pseg = pseg.NextSegment() + } +} + +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) isPMACopyOnWriteLocked(pseg pmaIterator) bool { + pma := pseg.ValuePtr() + if !pma.needCOW { + return false + } + if !pma.private { + return true + } + // If we have the only reference on private memory to be copied, just take + // ownership of it instead of copying. If we do hold the only reference, + // additional references can only be taken by mm.Fork(), which is excluded + // by mm.activeMu, so this isn't racy. 
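+ // For example (illustrative): immediately after Fork, a private page
+ // is referenced by both MemoryManagers (refcount 2), so a write in
+ // either must copy; once the other unmaps it (refcount 1), the next
+ // write simply clears needCOW and takes ownership without copying.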
+ mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + fr := pseg.fileRange() + // This check relies on mm.privateRefs.refs being kept fully merged. + rseg := mm.privateRefs.refs.FindSegment(fr.Start) + if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() { + pma.needCOW = false + return false + } + return true +} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + if mm.captureInvalidations { + mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts}) + return + } + mm.invalidateLocked(ar, opts.InvalidatePrivate, true) +} + +// invalidateLocked removes pmas and AddressSpace mappings of those pmas for +// addresses in ar. +// +// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) { + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + mm.removeRSSLocked(pseg.Range()) + pma.file.DecRef(pseg.fileRange()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } else { + pseg = pseg.NextSegment() + } + } +} + +// movePMAsLocked moves all pmas in oldAR to newAR. +// +// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. +// oldAR.Length() == newAR.Length(). !oldAR.Overlaps(newAR). +// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. +func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { + if checkInvariants { + if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) + } + if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + panic(fmt.Sprintf("invalid newAR: %v", newAR)) + } + if oldAR.Length() != newAR.Length() { + panic(fmt.Sprintf("old and new address ranges have different lengths: %v, %v", oldAR, newAR)) + } + if oldAR.Overlaps(newAR) { + panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) + } + // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. 
+ } + + type movedPMA struct { + oldAR usermem.AddrRange + pma pma + } + var movedPMAs []movedPMA + pseg := mm.pmas.LowerBoundSegment(oldAR.Start) + for pseg.Ok() && pseg.Start() < oldAR.End { + pseg = mm.pmas.Isolate(pseg, oldAR) + movedPMAs = append(movedPMAs, movedPMA{ + oldAR: pseg.Range(), + pma: pseg.Value(), + }) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } + + off := newAR.Start - oldAR.Start + pgap := mm.pmas.FindGap(newAR.Start) + for i := range movedPMAs { + mpma := &movedPMAs[i] + pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} + mm.addRSSLocked(pmaNewAR) + pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() + } + + mm.unmapASLocked(oldAR) +} + +// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have +// cached internal mappings. It returns: +// +// - An iterator to the gap after the last pma with internal mappings +// containing an address in ar. If internal mappings exist for no addresses in +// ar, the iterator is to a gap that begins before ar.Start. +// +// - An error that is non-nil if internal mappings exist for only a subset of +// ar. +// +// Preconditions: mm.activeMu must be locked for writing. +// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar. +// ar.Length() != 0. +// +// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + for { + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), err + } + if ar.End <= pseg.End() { + return pseg.NextGap(), nil + } + pseg, _ = pseg.NextNonEmpty() + } +} + +// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars +// have cached internal mappings. It returns the subset of ars for which +// internal mappings exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.activeMu must be locked for writing. pmas must exist for +// all addresses in ar. +// +// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err + } + } + return ars, nil +} + +// internalMappingsLocked returns internal mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ar. ar.Length() != 0. +// pseg.Range().Contains(ar.Start). 
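+//
+// The returned BlockSeq can be fed directly to safemem copy routines, e.g.
+// (illustrative sketch, where dsts is some destination safemem.BlockSeq):
+//
+//	n, err := safemem.CopySeq(dsts, mm.internalMappingsLocked(pseg, ar))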
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + if ar.End <= pseg.End() { + // Since only one pma is involved, we can use pma.internalMappings + // directly, avoiding a slice allocation. + offset := uint64(ar.Start - pseg.Start()) + return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) + } + + var ims []safemem.Block + for { + pr := pseg.Range().Intersect(ar) + for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + if ar.End <= pseg.End() { + break + } + pseg = pseg.NextSegment() + } + return safemem.BlockSeqFromSlice(ims) +} + +// vecInternalMappingsLocked returns internal mappings for addresses in ars. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ars. +func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { + var ims []safemem.Block + for ; !ars.IsEmpty(); ars = ars.Tail() { + ar := ars.Head() + if ar.Length() == 0 { + continue + } + for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + } + return safemem.BlockSeqFromSlice(ims) +} + +// incPrivateRef acquires a reference on private pages in fr. +func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { + mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + refSet := &mm.privateRefs.refs + seg, gap := refSet.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refSet.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty() + default: + refSet.MergeAdjacent(fr) + return + } + } +} + +// decPrivateRef releases a reference on private pages in fr. +func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { + var freed []platform.FileRange + + mm.privateRefs.mu.Lock() + refSet := &mm.privateRefs.refs + seg := refSet.LowerBoundSegment(fr.Start) + for seg.Ok() && seg.Start() < fr.End { + seg = refSet.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + freed = append(freed, seg.Range()) + seg = refSet.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refSet.MergeAdjacent(fr) + mm.privateRefs.mu.Unlock() + + mem := mm.p.Memory() + for _, fr := range freed { + mem.DecRef(fr) + } +} + +// addRSSLocked updates the current and maximum resident set size of a +// MemoryManager to reflect the insertion of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) { + mm.curRSS += uint64(ar.Length()) + if mm.curRSS > mm.maxRSS { + mm.maxRSS = mm.curRSS + } +} + +// removeRSSLocked updates the current resident set size of a MemoryManager to +// reflect the removal of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. 
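+//
+// For example (illustrative): removing a 3-page pma subtracts 12 KiB from
+// curRSS but leaves maxRSS unchanged, so maxRSS keeps reporting the
+// high-water mark.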
+func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) { + mm.curRSS -= uint64(ar.Length()) +} + +// pmaSetFunctions implements segment.Functions for pmaSet. +type pmaSetFunctions struct{} + +func (pmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (pmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (pmaSetFunctions) ClearValue(pma *pma) { + pma.file = nil + pma.internalMappings = safemem.BlockSeq{} +} + +func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) { + if pma1.file != pma2.file || + pma1.off+uint64(ar1.Length()) != pma2.off || + pma1.vmaEffectivePerms != pma2.vmaEffectivePerms || + pma1.vmaMaxPerms != pma2.vmaMaxPerms || + pma1.needCOW != pma2.needCOW || + pma1.private != pma2.private { + return pma{}, false + } + + // Discard internal mappings instead of trying to merge them, since merging + // them requires an allocation and getting them again from the + // platform.File might not. + pma1.internalMappings = safemem.BlockSeq{} + return pma1, true +} + +func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) { + newlen1 := uint64(split - ar.Start) + p2 := p + p2.off += newlen1 + if !p.internalMappings.IsEmpty() { + p.internalMappings = p.internalMappings.TakeFirst64(newlen1) + p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) + } + return p, p2 +} + +// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do +// so by scanning linearly backward from pgap. +// +// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). +func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { + if checkInvariants { + if !pgap.Ok() { + panic("terminal pma iterator") + } + if addr > pgap.Start() { + panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) + } + } + // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, + // which is the case if findOrSeekPrevUpperBoundPMA is called to find the + // start of a range containing only a single PMA. + if pseg := pgap.PrevSegment(); pseg.Start() <= addr { + return pseg + } + return mm.pmas.UpperBoundSegment(addr) +} + +// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is +// non-empty. +// +// Preconditions: mm.activeMu must be locked for writing. +func (pseg pmaIterator) getInternalMappingsLocked() error { + pma := pseg.ValuePtr() + if pma.internalMappings.IsEmpty() { + // Internal mappings are used for ignorePermissions accesses, + // so we need to use vma.maxPerms instead of + // vma.effectivePerms. However, we will never execute + // application code through an internal mapping, and we don't + // actually need a writable mapping if copy-on-write is in + // effect. (But get a writable mapping anyway if the pma is + // private, so that if breakCopyOnWriteLocked => + // isPMACopyOnWriteLocked takes ownership of the pma instead of + // copying, it doesn't need to get a new mapping.) + perms := pma.vmaMaxPerms + perms.Execute = false + if pma.needCOW && !pma.private { + perms.Write = false + } + ims, err := pma.file.MapInternal(pseg.fileRange(), perms) + if err != nil { + return err + } + pma.internalMappings = ims + } + return nil +} + +func (pseg pmaIterator) fileRange() platform.FileRange { + return pseg.fileRangeOf(pseg.Range()) +} + +// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. 
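+//
+// For example (hypothetical values): if pseg covers addresses
+// [0x400000, 0x402000) and pseg.ValuePtr().off is 0x1000, then
+// fileRangeOf([0x400000, 0x401000)) is the platform.FileRange
+// [0x1000, 0x2000).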
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { + if checkInvariants { + if !pseg.Ok() { + panic("terminal pma iterator") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) + } + } + + pma := pseg.ValuePtr() + pstart := pseg.Start() + return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} +} diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go new file mode 100644 index 000000000..5840b257c --- /dev/null +++ b/pkg/sentry/mm/proc_pid_maps.go @@ -0,0 +1,105 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // devMinorBits is the number of minor bits in a device number. Linux: + // include/linux/kdev_t.h:MINORBITS + devMinorBits = 20 +) + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (mm *MemoryManager) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData is called by fs/proc.mapsData.ReadSeqFileData. +func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaMapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by +// vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + vma := vseg.ValuePtr() + private := "p" + if !vma.private { + private = "s" + } + + var dev, ino uint64 + if vma.id != nil { + dev = vma.id.DeviceID() + ino = vma.id.InodeID() + } + devMajor := uint32(dev >> devMinorBits) + devMinor := uint32(dev & ((1 << devMinorBits) - 1)) + + var b bytes.Buffer + // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => + // stack_guard_page_start(). + fmt.Fprintf(&b, "%08x-%08x %s%s %08x %02x:%02x %d ", + vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) + + // Figure out our filename or hint. 
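+	// For illustration only (hypothetical values), the entry built above reads
+	// something like "00400000-00402000 r-xp 00000000 00:00 0", and the code
+	// below appends the file name or hint, padded to start at character 74.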
+ var s string + if vma.hint != "" { + s = vma.hint + } else if vma.id != nil { + // FIXME: We are holding mm.mappingMu here, which is + // consistent with Linux's holding mmap_sem in + // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). + // However, it's not clear that fs.File.MappedName() is actually + // consistent with this lock order. + s = vma.id.MappedName(ctx) + } + if s != "" { + // Per linux, we pad until the 74th character. + if pad := 73 - b.Len(); pad > 0 { + b.WriteString(strings.Repeat(" ", pad)) + } + b.WriteString(s) + } + b.WriteString("\n") + return b.Bytes() +} diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go new file mode 100644 index 000000000..36fed8f1c --- /dev/null +++ b/pkg/sentry/mm/save_restore.go @@ -0,0 +1,57 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all +// Mappables mapped by mm. +func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vma := vseg.ValuePtr(); vma.mappable != nil { + if err := vma.mappable.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +// beforeSave is invoked by stateify. +func (mm *MemoryManager) beforeSave() { + mem := mm.p.Memory() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + if pma := pseg.ValuePtr(); pma.file != mem { + // InvalidateUnsavable should have caused all such pmas to be + // invalidated. + panic(fmt.Sprintf("Can't save pma %#v with non-Memory file of type %T:\n%s", pseg.Range(), pma.file, mm)) + } + } +} + +// afterLoad is invoked by stateify. +func (mm *MemoryManager) afterLoad() { + mm.haveASIO = mm.p.SupportsAddressSpaceIO() + mem := mm.p.Memory() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + pseg.ValuePtr().file = mem + } +} diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go new file mode 100644 index 000000000..9d3614034 --- /dev/null +++ b/pkg/sentry/mm/special_mappable.go @@ -0,0 +1,147 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with +// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except +// that SpecialMappable takes ownership of the memory that it represents +// (_install_special_mapping() does not.) +type SpecialMappable struct { + refs.AtomicRefCount + + p platform.Platform + fr platform.FileRange + name string +} + +// NewSpecialMappable returns a SpecialMappable that owns fr, which represents +// offsets in p.Memory() that contain the SpecialMappable's data. The +// SpecialMappable will use the given name in /proc/[pid]/maps. +// +// Preconditions: fr.Length() != 0. +func NewSpecialMappable(name string, p platform.Platform, fr platform.FileRange) *SpecialMappable { + return &SpecialMappable{p: p, fr: fr, name: name} +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *SpecialMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.p.Memory().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *SpecialMappable) MappedName(ctx context.Context) string { + return m.name +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *SpecialMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *SpecialMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: vm_file is NULL, causing msync to skip it entirely. + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *SpecialMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *SpecialMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *SpecialMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.p.Memory(), + Offset: m.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { + // Since data is stored in platform.Platform.Memory(), the contents of + // which are preserved across save/restore, we don't need to do anything. + return nil +} + +// Platform returns the Platform whose Memory stores the SpecialMappable's +// contents. 
+func (m *SpecialMappable) Platform() platform.Platform { + return m.p +} + +// FileRange returns the offsets into Platform().Memory() that store the +// SpecialMappable's contents. +func (m *SpecialMappable) FileRange() platform.FileRange { + return m.fr +} + +// Length returns the length of the SpecialMappable. +func (m *SpecialMappable) Length() uint64 { + return m.fr.Length() +} + +// NewSharedAnonMappable returns a SpecialMappable that implements the +// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. +// +// TODO: The use of SpecialMappable is a lazy code reuse hack. Linux +// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should +// do the same to get non-zero device and inode IDs. +func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) { + if length == 0 || length != uint64(usermem.Addr(length).RoundDown()) { + return nil, syserror.EINVAL + } + fr, err := p.Memory().Allocate(length, usage.Anonymous) + if err != nil { + return nil, err + } + return NewSpecialMappable("/dev/zero (deleted)", p, fr), nil +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go new file mode 100644 index 000000000..0730be65b --- /dev/null +++ b/pkg/sentry/mm/syscalls.go @@ -0,0 +1,794 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + mrand "math/rand" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// HandleUserFault handles an application page fault. sp is the faulting +// application thread's stack pointer. +// +// Preconditions: mm.as != nil. +func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error { + ar, ok := addr.RoundDown().ToRange(usermem.PageSize) + if !ok { + return syserror.EFAULT + } + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have a usable vma. Here and below, since we are only + // asking for a single page, there is no possibility of partial success, + // and any error is immediately fatal. + mm.mappingMu.RLock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false) + if err != nil { + mm.mappingMu.RUnlock() + return err + } + + // Ensure that we have a usable pma. + mm.activeMu.Lock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return err + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // Map the faulted page into the active AddressSpace. 
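+	// ar covers only the single faulted page (it was built from
+	// addr.RoundDown().ToRange(usermem.PageSize) above), so this maps at most
+	// one page and does not request precommit.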
+ err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return err +} + +// MMap establishes a memory mapping. +func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) { + if opts.Length == 0 { + return 0, syserror.EINVAL + } + length, ok := usermem.Addr(opts.Length).RoundUp() + if !ok { + return 0, syserror.ENOMEM + } + opts.Length = uint64(length) + + if opts.Mappable != nil { + // Offset must be aligned. + if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) { + return 0, syserror.EINVAL + } + // Offset + length must not overflow. + if end := opts.Offset + opts.Length; end < opts.Offset { + return 0, syserror.ENOMEM + } + } else { + opts.Offset = 0 + if !opts.Private { + if opts.MappingIdentity != nil { + return 0, syserror.EINVAL + } + m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + if err != nil { + return 0, err + } + opts.MappingIdentity = m + opts.Mappable = m + } + } + + if opts.Addr.RoundDown() != opts.Addr { + // MAP_FIXED requires addr to be page-aligned; non-fixed mappings + // don't. + if opts.Fixed { + return 0, syserror.EINVAL + } + opts.Addr = opts.Addr.RoundDown() + } + + if !opts.MaxPerms.SupersetOf(opts.Perms) { + return 0, syserror.EACCES + } + if opts.Unmap && !opts.Fixed { + return 0, syserror.EINVAL + } + if opts.GrowsDown && opts.Mappable != nil { + return 0, syserror.EINVAL + } + + // Get the new vma. + mm.mappingMu.Lock() + vseg, ar, err := mm.createVMALocked(ctx, opts) + if err != nil { + mm.mappingMu.Unlock() + return 0, err + } + + switch { + case opts.Precommit: + // Get pmas and map with precommit as requested. + mm.populateAndUnlock(ctx, vseg, ar, true) + + case opts.Mappable == nil && length <= privateAllocUnit: + // NOTE: Get pmas and map eagerly in the hope + // that doing so will save on future page faults. We only do this for + // anonymous mappings, since otherwise the cost of + // memmap.Mappable.Translate is unknown; and only for small mappings, + // to avoid needing to allocate large amounts of memory that we may + // subsequently need to checkpoint. + mm.populateAndUnlock(ctx, vseg, ar, false) + + default: + mm.mappingMu.Unlock() + } + + return ar.Start, nil +} + +// Preconditions: mm.mappingMu must be locked for writing. +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux doesn't populate inaccessible pages. See + // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + // Even if we get a new pma, we can't actually map it if we don't have an + // AddressSpace. + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from + // mm/gup.c:mm_populate(). If it matters, we'll get it again when + // userspace actually tries to use the failing page. + mm.activeMu.Unlock() + return + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // As above, errors are silently ignored. + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + +// MapStack allocates the initial process stack. 
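+//
+// As a worked example (hypothetical limits): with a common RLIMIT_STACK soft
+// limit of 8 MiB, MapStack creates a private, read-write, growsDown "[stack]"
+// vma of 8 MiB whose end is placed at a randomized offset below
+// mm.layout.MaxAddr, per the logic below.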
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { + // maxStackSize is the maximum supported process stack size in bytes. + // + // This limit exists because stack growing isn't implemented, so the entire + // process stack must be mapped up-front. + const maxStackSize = 128 << 20 + + stackSize := limits.FromContext(ctx).Get(limits.Stack) + r, ok := usermem.Addr(stackSize.Cur).RoundUp() + sz := uint64(r) + if !ok { + // RLIM_INFINITY rounds up to 0. + sz = linux.DefaultStackSoftLimit + } else if sz > maxStackSize { + ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) + sz = maxStackSize + } else if sz == 0 { + return usermem.AddrRange{}, syserror.ENOMEM + } + szaddr := usermem.Addr(sz) + ctx.Debugf("Allocating stack with size of %v bytes", sz) + + // Determine the stack's desired location. Unlike Linux, address + // randomization can't be disabled. + stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() + if stackEnd < szaddr { + return usermem.AddrRange{}, syserror.ENOMEM + } + stackStart := stackEnd - szaddr + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: sz, + Addr: stackStart, + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + GrowsDown: true, + Hint: "[stack]", + }) + return ar, err +} + +// MUnmap implements the semantics of Linux's munmap(2). +func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return syserror.EINVAL + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.EINVAL + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + mm.unmapLocked(ctx, ar) + return nil +} + +// MRemapOpts specifies options to MRemap. +type MRemapOpts struct { + // Move controls whether MRemap moves the remapped mapping to a new address. + Move MRemapMoveMode + + // NewAddr is the new address for the remapping. NewAddr is ignored unless + // Move is MMRemapMustMove. + NewAddr usermem.Addr +} + +// MRemapMoveMode controls MRemap's moving behavior. +type MRemapMoveMode int + +const ( + // MRemapNoMove prevents MRemap from moving the remapped mapping. + MRemapNoMove MRemapMoveMode = iota + + // MRemapMayMove allows MRemap to move the remapped mapping. + MRemapMayMove + + // MRemapMustMove requires MRemap to move the remapped mapping to + // MRemapOpts.NewAddr, replacing any existing mappings in the remapped + // range. + MRemapMustMove +) + +// MRemap implements the semantics of Linux's mremap(2). +func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) { + // "Note that old_address has to be page aligned." - mremap(2) + if oldAddr.RoundDown() != oldAddr { + return 0, syserror.EINVAL + } + + // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a + // valid size. However, new_size can't be 0 after rounding. 
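+	// For example, an old_size of 0 stays 0 and is accepted, while a new_size
+	// of 0, or one whose round-up overflows, fails with EINVAL below.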
+ oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp() + oldSize = uint64(oldSizeAddr) + newSizeAddr, ok := usermem.Addr(newSize).RoundUp() + if !ok || newSizeAddr == 0 { + return 0, syserror.EINVAL + } + newSize = uint64(newSizeAddr) + + oldEnd, ok := oldAddr.AddLength(oldSize) + if !ok { + return 0, syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // All cases require that a vma exists at oldAddr. + vseg := mm.vmas.FindSegment(oldAddr) + if !vseg.Ok() { + return 0, syserror.EFAULT + } + + if opts.Move != MRemapMustMove { + // Handle noops and in-place shrinking. These cases don't care if + // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all + // (aside from oldAddr). + if newSize <= oldSize { + if newSize < oldSize { + // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't + // either. + newEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd}) + } + return oldAddr, nil + } + + // Handle in-place growing. + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + // "Grow" the existing vma by creating a new mergeable one. + vma := vseg.ValuePtr() + var newOffset uint64 + if vma.mappable != nil { + newOffset = vseg.mappableRange().End + } + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: newSize - oldSize, + MappingIdentity: vma.id, + Mappable: vma.mappable, + Offset: newOffset, + Addr: oldEnd, + Fixed: true, + Perms: vma.realPerms, + MaxPerms: vma.maxPerms, + Private: vma.private, + GrowsDown: vma.growsDown, + Hint: vma.hint, + }) + if err == nil { + return oldAddr, nil + } + // In-place growth failed. In the MRemapMayMove case, fall through to + // moving below. + if opts.Move == MRemapNoMove { + return 0, err + } + } + + // Handle moving, which is the only remaining case. + + // Find a destination for the move. + var newAR usermem.AddrRange + switch opts.Move { + case MRemapMayMove: + newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) + if err != nil { + return 0, err + } + newAR, _ = newAddr.ToRange(newSize) + + case MRemapMustMove: + newAddr := opts.NewAddr + if newAddr.RoundDown() != newAddr { + return 0, syserror.EINVAL + } + var ok bool + newAR, ok = newAddr.ToRange(newSize) + if !ok { + return 0, syserror.EINVAL + } + if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { + return 0, syserror.EINVAL + } + + // Unmap any mappings at the destination. + mm.unmapLocked(ctx, newAR) + + // If the sizes specify shrinking, unmap everything between the new and + // old sizes at the source. + if newSize < oldSize { + oldNewEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd}) + oldEnd = oldNewEnd + } + + // unmapLocked may have invalidated vseg; look it up again. + vseg = mm.vmas.FindSegment(oldAddr) + } + + oldAR := usermem.AddrRange{oldAddr, oldEnd} + + // In the MRemapMustMove case, these checks happen after unmapping: + // mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize(). + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return 0, syserror.ENOMEM + } + + if vma := vseg.ValuePtr(); vma.mappable != nil { + // Check that offset+length does not overflow. 
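+		// (The comparison below detects uint64 wraparound: the sum is smaller
+		// than one of its operands only if the addition overflowed.)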
+ if vma.off+uint64(newAR.Length()) < vma.off { + return 0, syserror.EINVAL + } + // Inform the Mappable, if any, of the copied mapping. + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil { + return 0, err + } + } + + // Remove the existing vma before inserting the new one to minimize + // iterator invalidation. We do this directly (instead of calling + // removeVMAsLocked) because: + // + // 1. We can't drop the reference on vma.id, which will be transferred to + // the new vma. + // + // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at + // oldAR, so calling RemoveMapping could cause us to miss an invalidation + // overlapping oldAR. + // + // Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of + // the vma. + vseg = mm.vmas.Isolate(vseg, oldAR) + vma := vseg.Value() + mm.vmas.Remove(vseg) + + // Insert the new vma, transferring the reference on vma.id. + mm.vmas.Add(newAR, vma) + + // Move pmas. This is technically optional for non-private pmas, which + // could just go through memmap.Mappable.Translate again, but it's required + // for private pmas. + mm.activeMu.Lock() + mm.movePMAsLocked(oldAR, newAR) + mm.activeMu.Unlock() + + // Now that pmas have been moved to newAR, we can notify vma.mappable that + // oldAR is no longer mapped. + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off) + } + + return newAR.Start, nil +} + +// MProtect implements the semantics of Linux's mprotect(2). +func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error { + if addr.RoundDown() != addr { + return syserror.EINVAL + } + if length == 0 { + return nil + } + rlength, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(rlength)) + if !ok { + return syserror.ENOMEM + } + effectivePerms := realPerms.Effective() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Non-growsDown mprotect requires that all of ar is mapped, and stops at + // the first non-empty gap. growsDown mprotect requires that the first vma + // be growsDown, but does not require it to extend all the way to ar.Start; + // vmas after the first must be contiguous but need not be growsDown, like + // the non-growsDown case. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + return syserror.ENOMEM + } + if growsDown { + if !vseg.ValuePtr().growsDown { + return syserror.EINVAL + } + if ar.End <= vseg.Start() { + return syserror.ENOMEM + } + ar.Start = vseg.Start() + } else { + if ar.Start < vseg.Start() { + return syserror.ENOMEM + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + mm.pmas.MergeRange(ar) + mm.pmas.MergeAdjacent(ar) + }() + pseg := mm.pmas.LowerBoundSegment(ar.Start) + var didUnmapAS bool + for { + // Check for permission validity before splitting vmas, for consistency + // with Linux. + if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { + return syserror.EACCES + } + vseg = mm.vmas.Isolate(vseg, ar) + + // Update vma permissions. + vma := vseg.ValuePtr() + vma.realPerms = realPerms + vma.effectivePerms = effectivePerms + + // Propagate vma permission changes to pmas. 
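+		// If the new effective permissions do not include all of a pma's
+		// previous permissions (i.e. something is being revoked), the
+		// AddressSpace must be unmapped so that stale, more-permissive host
+		// mappings do not outlive the mprotect; later faults re-map with the
+		// reduced permissions.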
+ for pseg.Ok() && pseg.Start() < vseg.End() { + if pseg.Range().Overlaps(vseg.Range()) { + pseg = mm.pmas.Isolate(pseg, vseg.Range()) + if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS { + // Unmap all of ar, not just vseg.Range(), to minimize host + // syscalls. + mm.unmapASLocked(ar) + didUnmapAS = true + } + pseg.ValuePtr().vmaEffectivePerms = effectivePerms + } + pseg = pseg.NextSegment() + } + + // Continue to the next vma. + if ar.End <= vseg.End() { + return nil + } + vseg, _ = vseg.NextNonEmpty() + if !vseg.Ok() { + return syserror.ENOMEM + } + } +} + +// BrkSetup sets mm's brk address to addr and its brk size to 0. +func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Unmap the existing brk. + if mm.brk.Length() != 0 { + mm.unmapLocked(ctx, mm.brk) + } + mm.brk = usermem.AddrRange{addr, addr} +} + +// Brk implements the semantics of Linux's brk(2), except that it returns an +// error on failure. +func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + if addr < mm.brk.Start { + return mm.brk.End, syserror.EINVAL + } + + // TODO: This enforces RLIMIT_DATA, but is slightly more + // permissive than the usual data limit. In particular, this only + // limits the size of the heap; a true RLIMIT_DATA limits the size of + // heap + data + bss. The segment sizes need to be plumbed from the + // loader package to fully enforce RLIMIT_DATA. + if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + return mm.brk.End, syserror.ENOMEM + } + + oldbrkpg, _ := mm.brk.End.RoundUp() + newbrkpg, ok := addr.RoundUp() + if !ok { + return mm.brk.End, syserror.EFAULT + } + + switch { + case newbrkpg < oldbrkpg: + mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + + case oldbrkpg < newbrkpg: + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: uint64(newbrkpg - oldbrkpg), + Addr: oldbrkpg, + Fixed: true, + // Compare Linux's + // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + Hint: "[heap]", + }) + if err != nil { + return mm.brk.End, err + } + } + + mm.brk.End = addr + return addr, nil +} + +// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). +func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + + // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() + // is analogous to our mm.invalidateLocked(ar, true, true). We inline this + // here, with the special case that we synchronously decommit + // uniquely-owned (non-copy-on-write) pages for private anonymous vma, + // which is the common case for MADV_DONTNEED. Invalidating these pmas, and + // allowing them to be reallocated when touched again, increases pma + // fragmentation, which may significantly reduce performance for + // non-vectored I/O implementations. Also, decommitting synchronously + // ensures that Decommit immediately reduces host memory usage. 
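+	//
+	// Concretely, the loop below takes one of two paths per pma: uniquely-owned
+	// private pmas backed by anonymous vmas are decommitted in place via
+	// mem.Decommit, while all other pmas (and any pma for which Decommit fails)
+	// are dropped entirely, releasing their file and private-page references,
+	// to be re-established on the next access.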
+ var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) + mem := mm.p.Memory() + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) + } + } + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. + } + } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() + } + + // "If there are some parts of the specified address space that are not + // mapped, the Linux version of madvise() ignores them and applies the call + // to the rest (but returns ENOMEM from the system call, as it should)." - + // madvise(2) + if mm.vmas.SpanRange(ar) != ar.Length() { + return syserror.ENOMEM + } + return nil +} + +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.ENOMEM + } + + mm.mappingMu.RLock() + // Can't defer mm.mappingMu.RUnlock(); see below. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + var unmapped bool + lastEnd := ar.Start + for { + if !vseg.Ok() { + mm.mappingMu.RUnlock() + unmapped = true + break + } + if lastEnd < vseg.Start() { + unmapped = true + } + lastEnd = vseg.End() + vma := vseg.ValuePtr() + // It's only possible to have dirtied the Mappable through a shared + // mapping. Don't check if the mapping is writable, because mprotect + // may have changed this, and also because Linux doesn't. + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + // We can't call memmap.MappingIdentity.Msync while holding + // mm.mappingMu since it may take fs locks that precede it in the + // lock order. + id.IncRef() + mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) + mm.mappingMu.RUnlock() + err := id.Msync(ctx, mr) + id.DecRef() + if err != nil { + return err + } + if lastEnd >= ar.End { + break + } + mm.mappingMu.RLock() + vseg = mm.vmas.LowerBoundSegment(lastEnd) + } else { + if lastEnd >= ar.End { + mm.mappingMu.RUnlock() + break + } + vseg = vseg.NextSegment() + } + } + + if unmapped { + return syserror.ENOMEM + } + return nil +} + +// VirtualMemorySize returns the combined length in bytes of all mappings in +// mm. +func (mm *MemoryManager) VirtualMemorySize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.usageAS) +} + +// VirtualMemorySizeRange returns the combined length in bytes of all mappings +// in ar in mm. 
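+// Only the portions of vmas that fall within ar are counted.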
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.vmas.SpanRange(ar)) +} + +// ResidentSetSize returns the value advertised as mm's RSS in bytes. +func (mm *MemoryManager) ResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.curRSS) +} + +// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. +func (mm *MemoryManager) MaxResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.maxRSS) +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go new file mode 100644 index 000000000..b6af48cb7 --- /dev/null +++ b/pkg/sentry/mm/vma.go @@ -0,0 +1,476 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Preconditions: mm.mappingMu must be locked for writing. opts must be valid +// as defined by the checks in MMap. +func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { + if opts.MaxPerms != opts.MaxPerms.Effective() { + panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) + } + + // Find a useable range. + addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + }) + if err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + ar, _ := addr.ToRange(opts.Length) + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS + opts.Length + if opts.Unmap { + newUsageAS -= uint64(mm.vmas.SpanRange(ar)) + } + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM + } + + // Remove overwritten mappings. This ordering is consistent with Linux: + // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), + // file->f_op->mmap(). + var vgap vmaGapIterator + if opts.Unmap { + vgap = mm.unmapLocked(ctx, ar) + } else { + vgap = mm.vmas.FindGap(ar.Start) + } + + // Inform the Mappable, if any, of the new mapping. + if opts.Mappable != nil { + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset); err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + } + + // Take a reference on opts.MappingIdentity before inserting the vma since + // vma merging can drop the reference. + if opts.MappingIdentity != nil { + opts.MappingIdentity.IncRef() + } + + // Finally insert the vma. 
+ vseg := mm.vmas.Insert(vgap, ar, vma{ + mappable: opts.Mappable, + off: opts.Offset, + realPerms: opts.Perms, + effectivePerms: opts.Perms.Effective(), + maxPerms: opts.MaxPerms, + private: opts.Private, + growsDown: opts.GrowsDown, + id: opts.MappingIdentity, + hint: opts.Hint, + }) + mm.usageAS += opts.Length + + return vseg, ar, nil +} + +type findAvailableOpts struct { + // Addr is a suggested address. Addr must be page-aligned. + Addr usermem.Addr + + // Fixed is true if only the suggested address is acceptable. + Fixed bool + + // Unmap is true if existing vmas and guard pages may exist in the returned + // range. + Unmap bool +} + +// findAvailableLocked finds an allocatable range. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + // Does the provided suggestion work? + if ar, ok := opts.Addr.ToRange(length); ok { + if mm.applicationAddrRange().IsSupersetOf(ar) { + if opts.Unmap { + return ar.Start, nil + } + // Check for the presence of an existing vma or guard page. + if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { + return ar.Start, nil + } + } + } + + // Fixed mappings accept only the requested address. + if opts.Fixed { + return 0, syserror.ENOMEM + } + + // Prefer hugepage alignment if a hugepage or more is requested. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + if mm.layout.DefaultDirection == arch.MmapBottomUp { + return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) + } + return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) +} + +func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { + return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift up to match the alignment? + if offset := uint64(gr.Start) % alignment; offset != 0 { + if uint64(gr.Length()) >= length+alignment-offset { + // Yes, we're aligned. + return gr.Start + usermem.Addr(alignment-offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return gr.Start, nil + } + } + return 0, syserror.ENOMEM +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift down to match the alignment? + start := gr.End - usermem.Addr(length) + if offset := uint64(start) % alignment; offset != 0 { + if gr.Start <= start-usermem.Addr(offset) { + // Yes, we're aligned. + return start - usermem.Addr(offset), nil + } + } + + // Either aligned perfectly, or can't align it. 
+ return start, nil + } + } + return 0, syserror.ENOMEM +} + +// getVMAsLocked ensures that vmas exist for all addresses in ar, and support +// access of type (at, ignorePermissions). It returns: +// +// - An iterator to the vma containing ar.Start. If no vma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last vma containing an address in ar. If +// vmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if vmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. ar.Length() != 0. +func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if + // !vbegin.Ok(). + vbegin, vgap := mm.vmas.Find(ar.Start) + if !vbegin.Ok() { + vbegin = vgap.NextSegment() + // vseg.Ok() is checked before entering the following loop. + } else { + vgap = vbegin.PrevGap() + } + + addr := ar.Start + vseg := vbegin + for vseg.Ok() { + // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). + vma := vseg.ValuePtr() + if addr < vseg.Start() { + // TODO: Implement vma.growsDown here. + return vbegin, vgap, syserror.EFAULT + } + + perms := vma.effectivePerms + if ignorePermissions { + perms = vma.maxPerms + } + if !perms.SupersetOf(at) { + return vbegin, vgap, syserror.EPERM + } + + addr = vseg.End() + vgap = vseg.NextGap() + if addr >= ar.End { + return vbegin, vgap, nil + } + vseg = vgap.NextSegment() + } + + // Ran out of vmas before ar.End. + return vbegin, vgap, syserror.EFAULT +} + +// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and +// support access to type of (at, ignorePermissions). It retuns the subset of +// ars for which vmas exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. +// +// Postconditions: ars is not mutated. +func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil { + return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err + } + } + return ars, nil +} + +// vma extension will not shrink the number of unmapped bytes between the start +// of a growsDown vma and the end of its predecessor non-growsDown vma below +// guardBytes. +// +// guardBytes is equivalent to Linux's stack_guard_gap after upstream +// 1be7107fbe18 "mm: larger stack guard gap, between vmas". +const guardBytes = 256 * usermem.PageSize + +// unmapLocked unmaps all addresses in ar and returns the resulting gap in +// mm.vmas. +// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. +// ar must be page-aligned. 
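+//
+// unmapLocked is the common path for removing mappings: MUnmap, MRemap (both
+// when shrinking and when clearing the destination of a move), Brk, BrkSetup,
+// and createVMALocked with opts.Unmap all call it, as seen earlier in this
+// change.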
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // AddressSpace mappings and pmas must be invalidated before + // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). + mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) + return mm.removeVMAsLocked(ctx, ar) +} + +// removeVMAsLocked removes vmas for addresses in ar and returns the resulting +// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients +// must do so before calling removeVMAsLocked. +// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + vseg, vgap := mm.vmas.Find(ar.Start) + if vgap.Ok() { + vseg = vgap.NextSegment() + } + for vseg.Ok() && vseg.Start() < ar.End { + vseg = mm.vmas.Isolate(vseg, ar) + vmaAR := vseg.Range() + vma := vseg.ValuePtr() + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off) + } + if vma.id != nil { + vma.id.DecRef() + } + mm.usageAS -= uint64(vmaAR.Length()) + vgap = mm.vmas.Remove(vseg) + vseg = vgap.NextSegment() + } + return vgap +} + +// vmaSetFunctions implements segment.Functions for vmaSet. +type vmaSetFunctions struct{} + +func (vmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (vmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (vmaSetFunctions) ClearValue(vma *vma) { + vma.mappable = nil + vma.id = nil + vma.hint = "" +} + +func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) { + if vma1.mappable != vma2.mappable || + (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || + vma1.realPerms != vma2.realPerms || + vma1.maxPerms != vma2.maxPerms || + vma1.private != vma2.private || + vma1.growsDown != vma2.growsDown || + vma1.id != vma2.id || + vma1.hint != vma2.hint { + return vma{}, false + } + + if vma2.id != nil { + vma2.id.DecRef() + } + return vma1, true +} + +func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) { + v2 := v + if v2.mappable != nil { + v2.off += uint64(split - ar.Start) + } + if v2.id != nil { + v2.id.IncRef() + } + return v, v2 +} + +// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). +func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("Mappable offset is meaningless for anonymous vma") + } + if !vseg.Range().Contains(addr) { + panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return vma.off + uint64(addr-vstart) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +func (vseg vmaIterator) mappableRange() memmap.MappableRange { + return vseg.mappableRangeOf(vseg.Range()) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. 
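+//
+// For example (hypothetical values): if vseg covers [0x400000, 0x404000) and
+// vseg.ValuePtr().off is 0x2000, then mappableRangeOf([0x401000, 0x402000))
+// is the memmap.MappableRange [0x3000, 0x4000).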
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !mr.WellFormed() || mr.Length() <= 0 { + panic(fmt.Sprintf("invalid mr: %v", mr)) + } + if !vseg.mappableRange().IsSupersetOf(mr) { + panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)} +} + +// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by +// scanning linearly forward from vseg. +// +// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if addr < vseg.Start() { + panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) + } + } + for vseg.Ok() && addr >= vseg.End() { + vseg = vseg.NextSegment() + } + return vseg +} + +// availableRange returns the subset of vgap.Range() in which new vmas may be +// created without MMapOpts.Unmap == true. +func (vgap vmaGapIterator) availableRange() usermem.AddrRange { + ar := vgap.Range() + next := vgap.NextSegment() + if !next.Ok() || !next.ValuePtr().growsDown { + return ar + } + // Exclude guard pages. + if ar.Length() < guardBytes { + return usermem.AddrRange{ar.Start, ar.Start} + } + ar.End -= guardBytes + return ar +} |
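+
+// As a worked example of availableRange's guard-page handling (hypothetical
+// addresses, assuming 4 KiB pages): guardBytes is 256 pages, i.e. 1 MiB. For a
+// gap covering [0x10000000, 0x10200000) whose next vma is growsDown, the
+// available range is [0x10000000, 0x10100000); if the gap were only 512 KiB,
+// the available range would be empty.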