Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r--  pkg/sentry/mm/BUILD                  142
-rw-r--r--  pkg/sentry/mm/README.md              280
-rw-r--r--  pkg/sentry/mm/address_space.go       236
-rw-r--r--  pkg/sentry/mm/aio_context.go         429
-rw-r--r--  pkg/sentry/mm/aio_context_state.go    20
-rw-r--r--  pkg/sentry/mm/debug.go                98
-rw-r--r--  pkg/sentry/mm/io.go                  639
-rw-r--r--  pkg/sentry/mm/lifecycle.go           283
-rw-r--r--  pkg/sentry/mm/metadata.go            183
-rw-r--r--  pkg/sentry/mm/mm.go                  478
-rw-r--r--  pkg/sentry/mm/mm_test.go             230
-rw-r--r--  pkg/sentry/mm/pma.go                1036
-rw-r--r--  pkg/sentry/mm/procfs.go              329
-rw-r--r--  pkg/sentry/mm/save_restore.go         57
-rw-r--r--  pkg/sentry/mm/shm.go                  66
-rw-r--r--  pkg/sentry/mm/special_mappable.go    157
-rw-r--r--  pkg/sentry/mm/syscalls.go           1286
-rw-r--r--  pkg/sentry/mm/vma.go                 568
18 files changed, 6517 insertions, 0 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
new file mode 100644
index 000000000..a036ce53c
--- /dev/null
+++ b/pkg/sentry/mm/BUILD
@@ -0,0 +1,142 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+ name = "file_refcount_set",
+ out = "file_refcount_set.go",
+ imports = {
+ "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ },
+ package = "mm",
+ prefix = "fileRefcount",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "platform.FileRange",
+ "Value": "int32",
+ "Functions": "fileRefcountSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "vma_set",
+ out = "vma_set.go",
+ consts = {
+ "minDegree": "8",
+ "trackGaps": "1",
+ },
+ imports = {
+ "usermem": "gvisor.dev/gvisor/pkg/usermem",
+ },
+ package = "mm",
+ prefix = "vma",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "usermem.Addr",
+ "Range": "usermem.AddrRange",
+ "Value": "vma",
+ "Functions": "vmaSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "pma_set",
+ out = "pma_set.go",
+ consts = {
+ "minDegree": "8",
+ },
+ imports = {
+ "usermem": "gvisor.dev/gvisor/pkg/usermem",
+ },
+ package = "mm",
+ prefix = "pma",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "usermem.Addr",
+ "Range": "usermem.AddrRange",
+ "Value": "pma",
+ "Functions": "pmaSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "io_list",
+ out = "io_list.go",
+ package = "mm",
+ prefix = "io",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*ioResult",
+ "Linker": "*ioResult",
+ },
+)
+
+go_library(
+ name = "mm",
+ srcs = [
+ "address_space.go",
+ "aio_context.go",
+ "aio_context_state.go",
+ "debug.go",
+ "file_refcount_set.go",
+ "io.go",
+ "io_list.go",
+ "lifecycle.go",
+ "metadata.go",
+ "mm.go",
+ "pma.go",
+ "pma_set.go",
+ "procfs.go",
+ "save_restore.go",
+ "shm.go",
+ "special_mappable.go",
+ "syscalls.go",
+ "vma.go",
+ "vma_set.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/atomicbitops",
+ "//pkg/context",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/safecopy",
+ "//pkg/safemem",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs/proc/seqfile",
+ "//pkg/sentry/fsbridge",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/futex",
+ "//pkg/sentry/kernel/shm",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/usage",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/tcpip/buffer",
+ "//pkg/usermem",
+ ],
+)
+
+go_test(
+ name = "mm_test",
+ size = "small",
+ srcs = ["mm_test.go"],
+ library = ":mm",
+ deps = [
+ "//pkg/context",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/contexttest",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md
new file mode 100644
index 000000000..f4d43d927
--- /dev/null
+++ b/pkg/sentry/mm/README.md
@@ -0,0 +1,280 @@
+This package provides an emulation of Linux semantics for application virtual
+memory mappings.
+
+For completeness, this document also describes aspects of the memory management
+subsystem defined outside this package.
+
+# Background
+
+We begin by describing semantics for virtual memory in Linux.
+
+A virtual address space is defined as a collection of mappings from virtual
+addresses to physical memory. However, userspace applications do not configure
+mappings to physical memory directly. Instead, applications configure memory
+mappings from virtual addresses to offsets into a file using the `mmap` system
+call.[^mmap-anon] For example, a call to:
+
+ mmap(
+ /* addr = */ 0x400000,
+ /* length = */ 0x1000,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ /* fd = */ 3,
+ /* offset = */ 0);
+
+creates a mapping of length 0x1000 bytes, starting at virtual address (VA)
+0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within
+the Linux kernel, virtual memory mappings are represented by *virtual memory
+areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the
+virtual memory subsystem after the `mmap` call may be depicted as:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+
+Establishing a virtual memory area does not necessarily establish a mapping to a
+physical address, because Linux has not necessarily provisioned physical memory
+to store the file's contents. Thus, if the application attempts to read the
+contents of VA 0x400000, it may incur a *page fault*, a CPU exception that
+forces the kernel to create such a mapping to service the read.
+
+For a file, doing so consists of several logical phases:
+
+1. The kernel allocates physical memory to store the contents of the required
+ part of the file, and copies file contents to the allocated memory.
+ Supposing that the kernel chooses the physical memory at physical address
+ (PA) 0x2fb000, the resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+
+ (In Linux the state of the mapping from file offset to physical memory is
+ stored in `struct address_space`, but to avoid confusion with other notions
+ of address space we will refer to this system as filemap, named after Linux
+ kernel source file `mm/filemap.c`.)
+
+2. The kernel stores the effective mapping from virtual to physical address in
+ a *page table entry* (PTE) in the application's *page tables*, which are
+ used by the CPU's virtual memory hardware to perform address translation.
+ The resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x2fb000
+
+ The PTE is required for the application to actually use the contents of the
+ mapped file as virtual memory. However, the PTE is derived from the VMA and
+ filemap state, both of which are independently mutable, such that mutations
+ to either will affect the PTE. For example:
+
+ - The application may remove the VMA using the `munmap` system call. This
+ breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently
+ the mapping from VA:0x400000 to PA:0x2fb000. However, it does not
+ necessarily break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a
+ future mapping of the same file offset may reuse this physical memory.
+
+ - The application may invalidate the file's contents by passing a length
+ of 0 to the `ftruncate` system call. This breaks the mapping from
+ /tmp/foo:0x0 to PA:0x2fb000, and consequently the mapping from
+ VA:0x400000 to PA:0x2fb000. However, it does not break the mapping from
+ VA:0x400000 to /tmp/foo:0x0, so future changes to the file's contents
+ may again be made visible at VA:0x400000 after another page fault
+ results in the allocation of a new physical address.
+
+ Note that, in order to correctly break the mapping from VA:0x400000 to
+ PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping*
+ from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE.
+
+[^mmap-anon]: Memory mappings to non-files are discussed in later sections.
+
+## Private Mappings
+
+The preceding example considered VMAs created using the `MAP_SHARED` flag, which
+means that PTEs derived from the mapping should always use physical memory that
+represents the current state of the mapped file.[^mmap-dev-zero] Applications
+can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*.
+Private mappings are *copy-on-write*.
+
+Suppose that the application instead created a private mapping in the previous
+example. In Linux, the state of the system after a read page fault would be:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only)
+
+Now suppose the application attempts to write to VA:0x400000. For a shared
+mapping, the write would be propagated to PA:0x2fb000, and the kernel would be
+responsible for ensuring that the write is later propagated to the mapped file.
+For a private mapping, the write incurs another page fault since the PTE is
+marked read-only. In response, the kernel allocates physical memory to store the
+mapping's *private copy* of the file's contents, copies file contents to the
+allocated memory, and changes the PTE to map to the private copy. Supposing that
+the kernel chooses the physical memory at physical address (PA) 0x5ea000, the
+resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x5ea000
+
+Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist,
+but is now irrelevant to this mapping.
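+
+This behavior is observable from an ordinary program. The following Go sketch
+(illustrative only; error handling is omitted) maps /tmp/foo privately and
+shows that a write through the private mapping does not reach the file:
+
+    package main
+
+    import (
+        "fmt"
+        "os"
+        "syscall"
+    )
+
+    func main() {
+        f, _ := os.OpenFile("/tmp/foo", os.O_RDWR|os.O_CREATE, 0644)
+        f.Truncate(0x1000)
+
+        // MAP_PRIVATE: writes trigger copy-on-write rather than reaching the file.
+        priv, _ := syscall.Mmap(int(f.Fd()), 0, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE)
+
+        priv[0] = 'x' // first write faults; the kernel installs a private copy
+
+        buf := make([]byte, 1)
+        f.ReadAt(buf, 0)
+        fmt.Printf("mapping sees %q, file still contains %q\n", priv[0], buf[0])
+
+        syscall.Munmap(priv)
+        f.Close()
+    }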
+
+[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`.
+
+## Anonymous Mappings
+
+Instead of passing a file to the `mmap` system call, applications can instead
+request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag.
+Semantically, an anonymous mapping is essentially a mapping to an ephemeral file
+initially filled with zero bytes. Practically speaking, this is how shared
+anonymous mappings are implemented, but private anonymous mappings do not result
+in the creation of an ephemeral file; since there would be no way to modify the
+contents of the underlying file through a private mapping, all private anonymous
+mappings use a single shared page filled with zero bytes until copy-on-write
+occurs.
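+
+A minimal Go sketch of requesting an anonymous mapping directly (the fd
+argument is ignored for `MAP_ANONYMOUS` and is -1 by convention):
+
+    package main
+
+    import (
+        "fmt"
+        "syscall"
+    )
+
+    func main() {
+        mem, err := syscall.Mmap(-1, 0, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE,
+            syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS)
+        if err != nil {
+            panic(err)
+        }
+        fmt.Println(mem[0], mem[0xfff]) // 0 0: anonymous memory reads as zeroes
+        mem[0] = 1                      // private copy-on-write from the zero page
+        syscall.Munmap(mem)
+    }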
+
+# Virtual Memory in the Sentry
+
+The sentry implements application virtual memory atop a host kernel, introducing
+an additional level of indirection to the above.
+
+Consider the same scenario as in the previous section. Since the sentry handles
+application system calls, the effect of an application `mmap` system call is to
+create a VMA in the sentry (as opposed to the host kernel):
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+
+When the application first incurs a page fault on this address, the host kernel
+delivers information about the page fault to the sentry in a platform-dependent
+manner, and the sentry handles the fault:
+
+1. The sentry allocates memory to store the contents of the required part of
+ the file, and copies file contents to the allocated memory. However, since
+ the sentry is implemented atop a host kernel, it does not configure mappings
+ to physical memory directly. Instead, mappable "memory" in the sentry is
+ represented by a host file descriptor and offset, since (as noted in
+ "Background") this is the memory mapping primitive provided by the host
+ kernel. In general, memory is allocated from a temporary host file using the
+ `pgalloc` package. Supposing that the sentry allocates offset 0x3000 from
+ host file "memory-file", the resulting state is:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+
+2. The sentry stores the effective mapping from virtual address to host file in
+ a host VMA by invoking the `mmap` system call:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000
+
+3. The sentry returns control to the application, which immediately incurs the
+ page fault again.[^mmap-populate] However, since a host VMA now exists for
+ the faulting virtual address, the host kernel now handles the page fault as
+ described in "Background":
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000
+ Host filemap: host:memory-file:0x3000 -> PA:0x2fb000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000
+
+Thus, from an implementation standpoint, host VMAs serve the same purpose in the
+sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is
+independently mutable, and the desired state of host VMAs is derived from that
+state.
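+
+The host-level primitive used here can be sketched outside the sentry as well:
+allocate a range of a temporary "memory file" and map it with the host `mmap`,
+which is roughly what the `pgalloc` package and the platform's address space
+management do on the sentry's behalf (simplified Go sketch; error handling
+omitted):
+
+    package main
+
+    import (
+        "os"
+        "syscall"
+    )
+
+    func main() {
+        // A temporary host file standing in for the sentry's memory file.
+        memFile, _ := os.CreateTemp("", "memory-file")
+        defer os.Remove(memFile.Name())
+        memFile.Truncate(1 << 20)
+
+        // "Allocate" offset 0x3000 of the memory file and establish a host
+        // mapping for it, analogous to a host VMA backing a sentry mapping.
+        mem, _ := syscall.Mmap(int(memFile.Fd()), 0x3000, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+        mem[0] = 42 // backed by memory-file:0x3000 on the host
+        syscall.Munmap(mem)
+    }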
+
+[^mmap-populate]: The sentry could force the host kernel to establish PTEs when
+ it creates the host VMA by passing the `MAP_POPULATE` flag to
+ the `mmap` system call, but usually does not. This is because,
+ to reduce the number of page faults that require handling by
+ the sentry and (correspondingly) the number of host `mmap`
+ system calls, the sentry usually creates host VMAs that are
+ much larger than the single faulting page.
+
+## Private Mappings
+
+The sentry implements private mappings consistently with Linux. Before
+copy-on-write, the private mapping example given in the Background results in:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only)
+ Host filemap: host:memory-file:0x3000 -> PA:0x2fb000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only)
+
+When the application attempts to write to this address, the host kernel delivers
+information about the resulting page fault to the sentry. Analogous to Linux,
+the sentry allocates memory to store the mapping's private copy of the file's
+contents, copies file contents to the allocated memory, and changes the host VMA
+to map to the private copy. Supposing that the sentry chooses the offset 0x4000
+in host file `memory-file` to store the private copy, the state of the system
+after copy-on-write is:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000
+ Host filemap: host:memory-file:0x4000 -> PA:0x5ea000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000
+
+However, this highlights an important difference between Linux and the sentry.
+In Linux, page tables are concrete (architecture-dependent) data structures
+owned by the kernel. Conversely, the sentry has the ability to create and
+destroy host VMAs using host system calls, but it does not have direct access to
+their state. Thus, as written, if the application invokes the `munmap` system
+call to remove the sentry VMA, it is non-trivial for the sentry to determine
+that it should deallocate `host:memory-file:0x4000`. This implies that the
+sentry must retain information about the host VMAs that it has created.
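+
+Conceptually, what must be retained for each mapped range is which host file
+range currently backs it and with what permissions. A simplified, hypothetical
+record (the real structure is the pma described under "Implementation
+Constructs" below) might look like:
+
+    package sketch
+
+    import (
+        "gvisor.dev/gvisor/pkg/sentry/platform"
+        "gvisor.dev/gvisor/pkg/usermem"
+    )
+
+    // hostMapping is an illustrative stand-in for the per-range bookkeeping
+    // the sentry keeps about the host VMAs it has created.
+    type hostMapping struct {
+        file    platform.File      // host file backing this virtual range
+        off     uint64             // offset of the range's start within file
+        perms   usermem.AccessType // effective permissions of the host VMA
+        private bool               // true if this range holds a private copy
+    }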
+
+## Anonymous Mappings
+
+The sentry implements anonymous mappings consistently with Linux, except that
+there is no shared zero page.
+
+# Implementation Constructs
+
+In Linux:
+
+- A virtual address space is represented by `struct mm_struct`.
+
+- VMAs are represented by `struct vm_area_struct`, stored in `struct
+ mm_struct::mmap`.
+
+- Mappings from file offsets to physical memory are stored in `struct
+ address_space`.
+
+- Reverse mappings from file offsets to virtual mappings are stored in `struct
+ address_space::i_mmap`.
+
+- Physical memory pages are represented by a pointer to `struct page` or an
+ index called a *page frame number* (PFN), represented by `pfn_t`.
+
+- PTEs are represented by architecture-dependent type `pte_t`, stored in a
+ table hierarchy rooted at `struct mm_struct::pgd`.
+
+In the sentry:
+
+- A virtual address space is represented by type [`mm.MemoryManager`][mm].
+
+- Sentry VMAs are represented by type [`mm.vma`][mm], stored in
+ `mm.MemoryManager.vmas`.
+
+- Mappings from sentry file offsets to host file offsets are abstracted
+ through interface method [`memmap.Mappable.Translate`][memmap].
+
+- Reverse mappings from sentry file offsets to virtual mappings are abstracted
+ through interface methods
+ [`memmap.Mappable.AddMapping` and `memmap.Mappable.RemoveMapping`][memmap].
+
+- Host files that may be mapped into host VMAs are represented by type
+ [`platform.File`][platform].
+
+- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform
+ mapping area"), stored in `mm.MemoryManager.pmas`.
+
+- Creation and destruction of host VMAs is abstracted through interface
+ methods
+ [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform].
+
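+To make the correspondence concrete, the following is a deliberately
+simplified Go sketch of how a single faulting page could be resolved in terms
+of these interfaces. The sentry's real fault path (in `mm.MemoryManager`) also
+maintains the vma and pma sets, permission checks, and copy-on-write; the
+helper below exists only for illustration:
+
+    package sketch
+
+    import (
+        "gvisor.dev/gvisor/pkg/context"
+        "gvisor.dev/gvisor/pkg/sentry/memmap"
+        "gvisor.dev/gvisor/pkg/sentry/platform"
+        "gvisor.dev/gvisor/pkg/usermem"
+    )
+
+    // handleFault resolves a fault at addr within a vma that maps m starting
+    // at mappableOffset, by translating the faulting offset to a host file
+    // range and installing a host mapping for it.
+    func handleFault(ctx context.Context, as platform.AddressSpace, m memmap.Mappable, addr, vmaStart usermem.Addr, mappableOffset uint64, at usermem.AccessType) error {
+        // Offset of the faulting page within the Mappable.
+        off := mappableOffset + uint64(addr.RoundDown()-vmaStart)
+        required := memmap.MappableRange{Start: off, End: off + usermem.PageSize}
+
+        ts, err := m.Translate(ctx, required, required, at)
+        if err != nil {
+            return err
+        }
+        for _, t := range ts {
+            fr := platform.FileRange{Start: t.Offset, End: t.Offset + t.Source.Length()}
+            va := vmaStart + usermem.Addr(t.Source.Start-mappableOffset)
+            if err := as.MapFile(va, t.File, fr, t.Perms, false /* precommit */); err != nil {
+                return err
+            }
+        }
+        return nil
+    }
+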
+[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go
+[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go
+[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go
+[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
new file mode 100644
index 000000000..5c667117c
--- /dev/null
+++ b/pkg/sentry/mm/address_space.go
@@ -0,0 +1,236 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// AddressSpace returns the platform.AddressSpace bound to mm.
+//
+// Preconditions: The caller must have called mm.Activate().
+func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
+ if atomic.LoadInt32(&mm.active) == 0 {
+ panic("trying to use inactive address space?")
+ }
+ return mm.as
+}
+
+// Activate ensures this MemoryManager has a platform.AddressSpace.
+//
+// The caller must not hold any locks when calling Activate.
+//
+// When this MemoryManager is no longer needed by a task, it should call
+// Deactivate to release the reference.
+func (mm *MemoryManager) Activate(ctx context.Context) error {
+ // Fast path: the MemoryManager already has an active
+ // platform.AddressSpace, and we just need to indicate that we need it too.
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 0 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+ return nil
+ }
+ }
+
+ for {
+ // Slow path: may need to synchronize with other goroutines changing
+ // mm.active to or from zero.
+ mm.activeMu.Lock()
+ // Inline Unlock instead of using a defer for performance since this
+ // method is commonly in the hot-path.
+
+ // Check if we raced with another goroutine performing activation.
+ if atomic.LoadInt32(&mm.active) > 0 {
+ // This can't race; Deactivate can't decrease mm.active from 1 to 0
+ // without holding activeMu.
+ atomic.AddInt32(&mm.active, 1)
+ mm.activeMu.Unlock()
+ return nil
+ }
+
+ // Do we have a context? If so, then we never unmapped it. This can
+ // only be the case if !mm.p.CooperativelySchedulesAddressSpace().
+ if mm.as != nil {
+ atomic.StoreInt32(&mm.active, 1)
+ mm.activeMu.Unlock()
+ return nil
+ }
+
+ // Get a new address space. We must force unmapping by passing nil to
+ // NewAddressSpace if requested. (As in the nil interface object, not a
+ // typed nil.)
+ mappingsID := (interface{})(mm)
+ if mm.unmapAllOnActivate {
+ mappingsID = nil
+ }
+ as, c, err := mm.p.NewAddressSpace(mappingsID)
+ if err != nil {
+ mm.activeMu.Unlock()
+ return err
+ }
+ if as == nil {
+ // AddressSpace is unavailable, we must wait.
+ //
+ // activeMu must not be held while waiting, as the user of the address
+ // space we are waiting on may attempt to take activeMu.
+ mm.activeMu.Unlock()
+
+ sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+ if sleep {
+ // Mark this task sleeping while waiting for the address space to
+ // prevent the watchdog from reporting it as a stuck task.
+ ctx.UninterruptibleSleepStart(false)
+ }
+ <-c
+ if sleep {
+ ctx.UninterruptibleSleepFinish(false)
+ }
+ continue
+ }
+
+ // Okay, we could restore all mappings at this point.
+ // But forget that. Let's just let them fault in.
+ mm.as = as
+
+ // Unmapping is done, if necessary.
+ mm.unmapAllOnActivate = false
+
+ // Now that mm.as has been assigned, we can set mm.active to a non-zero
+ // value to enable the fast path.
+ atomic.StoreInt32(&mm.active, 1)
+
+ mm.activeMu.Unlock()
+ return nil
+ }
+}
+
+// Deactivate releases a reference to the MemoryManager.
+func (mm *MemoryManager) Deactivate() {
+ // Fast path: this is not the last goroutine to deactivate the
+ // MemoryManager.
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 1 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+ return
+ }
+ }
+
+ mm.activeMu.Lock()
+ // Same as Activate.
+
+ // Still active?
+ if atomic.AddInt32(&mm.active, -1) > 0 {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Can we hold on to the address space?
+ if !mm.p.CooperativelySchedulesAddressSpace() {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Release the address space.
+ mm.as.Release()
+
+ // Lost it.
+ mm.as = nil
+ mm.activeMu.Unlock()
+}
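+
+// A typical caller brackets use of the AddressSpace with Activate and
+// Deactivate; a rough sketch (assuming a task-provided context ctx):
+//
+//	if err := mm.Activate(ctx); err != nil {
+//		return err
+//	}
+//	defer mm.Deactivate()
+//	as := mm.AddressSpace() // valid until the matching Deactivate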
+
+// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings
+// for all addresses in ar should be precommitted.
+//
+// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
+// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
+func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
+ // By default, map entire pmas at a time, under the assumption that there
+ // is no cost to mapping more of a pma than necessary.
+ mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)}
+ if precommit {
+ // When explicitly precommitting, only map ar, since overmapping may
+ // incur unexpected resource usage.
+ mapAR = ar
+ } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 {
+ // Limit the range we map to ar, aligned to mapUnit.
+ mapMask := usermem.Addr(mapUnit - 1)
+ mapAR.Start = ar.Start &^ mapMask
+ // If rounding ar.End up overflows, just keep the existing mapAR.End.
+ if end := (ar.End + mapMask) &^ mapMask; end >= ar.End {
+ mapAR.End = end
+ }
+ }
+ if checkInvariants {
+ if !mapAR.IsSupersetOf(ar) {
+ panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar))
+ }
+ }
+
+ // Since this checks ar.End and not mapAR.End, we will never map a pma that
+ // is not required.
+ for pseg.Ok() && pseg.Start() < ar.End {
+ pma := pseg.ValuePtr()
+ pmaAR := pseg.Range()
+ pmaMapAR := pmaAR.Intersect(mapAR)
+ perms := pma.effectivePerms
+ if pma.needCOW {
+ perms.Write = false
+ }
+ if perms.Any() { // MapFile precondition
+ if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+ return err
+ }
+ }
+ pseg = pseg.NextSegment()
+ }
+ return nil
+}
+
+// unmapASLocked removes all AddressSpace mappings for addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked.
+func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) {
+ if mm.as == nil {
+ // No AddressSpace? Force all mappings to be unmapped on the next
+ // Activate.
+ mm.unmapAllOnActivate = true
+ return
+ }
+
+ // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be
+ // passed ranges that include addresses that can't be mapped by the
+ // application.
+ ar = ar.Intersect(mm.applicationAddrRange())
+
+ // Note that this AddressSpace may or may not be active. If the
+ // platform does not require cooperative sharing of AddressSpaces, they
+ // are retained between Deactivate/Activate calls. Despite not being
+ // active, it is still valid to perform operations on these address
+ // spaces.
+ mm.as.Unmap(ar.Start, uint64(ar.Length()))
+}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
new file mode 100644
index 000000000..379148903
--- /dev/null
+++ b/pkg/sentry/mm/aio_context.go
@@ -0,0 +1,429 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// aioManager creates and manages asynchronous I/O contexts.
+//
+// +stateify savable
+type aioManager struct {
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // aioContexts is the set of asynchronous I/O contexts.
+ contexts map[uint64]*AIOContext
+}
+
+func (a *aioManager) destroy() {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ for _, ctx := range a.contexts {
+ ctx.destroy()
+ }
+}
+
+// newAIOContext creates a new context for asynchronous I/O.
+//
+// Returns false if 'id' is currently in use.
+func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ if _, ok := a.contexts[id]; ok {
+ return false
+ }
+
+ a.contexts[id] = &AIOContext{
+ requestReady: make(chan struct{}, 1),
+ maxOutstanding: events,
+ }
+ return true
+}
+
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
+//
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ ctx, ok := a.contexts[id]
+ if !ok {
+ return nil
+ }
+ delete(a.contexts, id)
+ ctx.destroy()
+ return ctx
+}
+
+// lookupAIOContext looks up the given context.
+//
+// Returns false if context does not exist.
+func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ ctx, ok := a.contexts[id]
+ return ctx, ok
+}
+
+// ioResult is a completed I/O operation.
+//
+// +stateify savable
+type ioResult struct {
+ data interface{}
+ ioEntry
+}
+
+// AIOContext is a single asynchronous I/O context.
+//
+// +stateify savable
+type AIOContext struct {
+ // requestReady is the notification channel used for all requests.
+ requestReady chan struct{} `state:"nosave"`
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // results is the set of completed requests.
+ results ioList
+
+ // maxOutstanding is the maximum number of outstanding entries; this value
+ // is immutable.
+ maxOutstanding uint32
+
+ // outstanding is the number of requests outstanding; this is effectively
+ // the number of entries already in the results list plus the number of
+ // requests expected to be added to it.
+ outstanding uint32
+
+ // dead is set when the context is destroyed.
+ dead bool `state:"zerovalue"`
+}
+
+// destroy marks the context dead.
+func (ctx *AIOContext) destroy() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ ctx.dead = true
+ ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+ if ctx.dead && ctx.outstanding == 0 {
+ close(ctx.requestReady)
+ ctx.requestReady = nil
+ }
+}
+
+// Prepare reserves space for a new request, returning true if space was
+// reserved. Returns false if the context already has the maximum number of
+// outstanding requests.
+func (ctx *AIOContext) Prepare() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ if ctx.outstanding >= ctx.maxOutstanding {
+ return false
+ }
+ ctx.outstanding++
+ return true
+}
+
+// PopRequest pops a completed request if one is available; it does not
+// block. Returns false if no request is available.
+func (ctx *AIOContext) PopRequest() (interface{}, bool) {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ // Is there anything ready?
+ if e := ctx.results.Front(); e != nil {
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding--
+ ctx.results.Remove(e)
+ ctx.checkForDone()
+ return e.data, true
+ }
+ return nil, false
+}
+
+// FinishRequest finishes a pending request. It queues up the data
+// and notifies listeners.
+func (ctx *AIOContext) FinishRequest(data interface{}) {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ // Push to the list and notify opportunistically. The channel notify
+ // here is guaranteed to be safe because outstanding must be non-zero.
+ // The requestReady channel is only closed when outstanding reaches zero.
+ ctx.results.PushBack(&ioResult{data: data})
+
+ select {
+ case ctx.requestReady <- struct{}{}:
+ default:
+ }
+}
+
+// WaitChannel returns a channel that is notified when an AIO request is
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding--
+ ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ return
+ }
+ size := uint32(ctx.results.Len())
+ if ctx.outstanding < size {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding -= size
+ ctx.results.Reset()
+ ctx.checkForDone()
+}
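+
+// A rough sketch of how these methods fit together (the real callers are the
+// io_submit/io_getevents syscall paths; doIO and handle below are
+// hypothetical placeholders):
+//
+//	if !aioCtx.Prepare() {
+//		return syserror.EAGAIN // too many outstanding requests
+//	}
+//	go func() {
+//		aioCtx.FinishRequest(doIO()) // queue the completion, wake waiters
+//	}()
+//	for {
+//		if data, ok := aioCtx.PopRequest(); ok {
+//			handle(data)
+//			break
+//		}
+//		ch := aioCtx.WaitChannel()
+//		if ch == nil {
+//			break // context destroyed with nothing outstanding
+//		}
+//		<-ch
+//	}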
+
+// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
+// ring buffers.
+//
+// +stateify savable
+type aioMappable struct {
+ refs.AtomicRefCount
+
+ mfp pgalloc.MemoryFileProvider
+ fr platform.FileRange
+}
+
+var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp())
+
+func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
+ fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous)
+ if err != nil {
+ return nil, err
+ }
+ m := aioMappable{mfp: mfp, fr: fr}
+ m.EnableLeakCheck("mm.aioMappable")
+ return &m, nil
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+func (m *aioMappable) DecRef() {
+ m.AtomicRefCount.DecRefWithDestructor(func() {
+ m.mfp.MemoryFile().DecRef(m.fr)
+ })
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (m *aioMappable) MappedName(ctx context.Context) string {
+ return "[aio]"
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (m *aioMappable) DeviceID() uint64 {
+ return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (m *aioMappable) InodeID() uint64 {
+ return 0
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ // Linux: aio_ring_fops.fsync == NULL
+ return syserror.EINVAL
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error {
+ // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
+ // sets VM_DONTEXPAND).
+ if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
+ return syserror.EFAULT
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) {
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error {
+ // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
+ // sets VM_DONTEXPAND).
+ if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
+ return syserror.EFAULT
+ }
+ // Require that the mapping correspond to a live AIOContext. Compare
+ // Linux's fs/aio.c:aio_ring_mremap().
+ mm, ok := ms.(*MemoryManager)
+ if !ok {
+ return syserror.EINVAL
+ }
+ am := &mm.aioManager
+ am.mu.Lock()
+ defer am.mu.Unlock()
+ oldID := uint64(srcAR.Start)
+ aioCtx, ok := am.contexts[oldID]
+ if !ok {
+ return syserror.EINVAL
+ }
+ aioCtx.mu.Lock()
+ defer aioCtx.mu.Unlock()
+ if aioCtx.dead {
+ return syserror.EINVAL
+ }
+ // Use the new ID for the AIOContext.
+ am.contexts[uint64(dstAR.Start)] = aioCtx
+ delete(am.contexts, oldID)
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ var err error
+ if required.End > m.fr.Length() {
+ err = &memmap.BusError{syserror.EFAULT}
+ }
+ if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
+ return []memmap.Translation{
+ {
+ Source: source,
+ File: m.mfp.MemoryFile(),
+ Offset: m.fr.Start + source.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, err
+ }
+ return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
+ return nil
+}
+
+// NewAIOContext creates a new context for asynchronous I/O.
+//
+// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
+func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
+ // libaio get_ioevents() expects context "handle" to be a valid address.
+ // libaio peeks inside looking for a magic number. This function allocates
+ // a page per context and keeps it set to zeroes to ensure it will not
+ // match AIO_RING_MAGIC and make libaio happy.
+ m, err := newAIOMappable(mm.mfp)
+ if err != nil {
+ return 0, err
+ }
+ defer m.DecRef()
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: aioRingBufferSize,
+ MappingIdentity: m,
+ Mappable: m,
+ // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+ // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+ // user mode should not write to this page.
+ Perms: usermem.Read,
+ MaxPerms: usermem.Read,
+ })
+ if err != nil {
+ return 0, err
+ }
+ id := uint64(addr)
+ if !mm.aioManager.newAIOContext(events, id) {
+ mm.MUnmap(ctx, addr, aioRingBufferSize)
+ return 0, syserror.EINVAL
+ }
+ return id, nil
+}
+
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context, or nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
+ if _, ok := mm.LookupAIOContext(ctx, id); !ok {
+ return nil
+ }
+
+// Only unmap after it is assured that the address is a valid aio context, to
+// prevent random memory from being unmapped.
+//
+// Note: It's possible to unmap this address and map something else into
+// the same address. Then it would be unmapping memory that it doesn't own.
+// This is, however, the way Linux implements AIO. Keep the same [weird]
+// semantics in case anyone relies on it.
+ mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize)
+
+ return mm.aioManager.destroyAIOContext(id)
+}
+
+// LookupAIOContext looks up the given context. It returns false if the context
+// does not exist.
+func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
+ aioCtx, ok := mm.aioManager.lookupAIOContext(id)
+ if !ok {
+ return nil, false
+ }
+
+ // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes
+ // from id).
+ var buf [4]byte
+ _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{})
+ if err != nil {
+ return nil, false
+ }
+
+ return aioCtx, true
+}
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
new file mode 100644
index 000000000..3dabac1af
--- /dev/null
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+// afterLoad is invoked by stateify.
+func (a *AIOContext) afterLoad() {
+ a.requestReady = make(chan struct{}, 1)
+}
diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go
new file mode 100644
index 000000000..c273c982e
--- /dev/null
+++ b/pkg/sentry/mm/debug.go
@@ -0,0 +1,98 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+)
+
+const (
+ // If checkInvariants is true, perform runtime checks for invariants
+ // expected by the mm package. This is normally disabled since MM is a
+ // significant hot path in general, and some such checks (notably
+ // memmap.CheckTranslateResult) are very expensive.
+ checkInvariants = false
+
+ // If logIOErrors is true, log I/O errors that originate from MM before
+ // converting them to EFAULT.
+ logIOErrors = false
+)
+
+// String implements fmt.Stringer.String.
+func (mm *MemoryManager) String() string {
+ return mm.DebugString(context.Background())
+}
+
+// DebugString returns a string containing information about mm for debugging.
+func (mm *MemoryManager) DebugString(ctx context.Context) string {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.debugStringLocked(ctx)
+}
+
+// Preconditions: mm.mappingMu and mm.activeMu must be locked.
+func (mm *MemoryManager) debugStringLocked(ctx context.Context) string {
+ var b bytes.Buffer
+ b.WriteString("VMAs:\n")
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ b.Write(mm.vmaMapsEntryLocked(ctx, vseg))
+ }
+ b.WriteString("PMAs:\n")
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ b.Write(pseg.debugStringEntryLocked())
+ }
+ return string(b.Bytes())
+}
+
+// Preconditions: mm.activeMu must be locked.
+func (pseg pmaIterator) debugStringEntryLocked() []byte {
+ var b bytes.Buffer
+
+ fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End())
+
+ pma := pseg.ValuePtr()
+ if pma.effectivePerms.Read {
+ b.WriteByte('r')
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.effectivePerms.Write {
+ if pma.needCOW {
+ b.WriteByte('c')
+ } else {
+ b.WriteByte('w')
+ }
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.effectivePerms.Execute {
+ b.WriteByte('x')
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.private {
+ b.WriteByte('p')
+ } else {
+ b.WriteByte('s')
+ }
+
+ fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file)
+ return b.Bytes()
+}
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
new file mode 100644
index 000000000..fa776f9c6
--- /dev/null
+++ b/pkg/sentry/mm/io.go
@@ -0,0 +1,639 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// There are two supported ways to copy data to/from application virtual
+// memory:
+//
+// 1. Internally-mapped copying: Determine the platform.File that backs the
+// copied-to/from virtual address, obtain a mapping of its pages, and read or
+// write to the mapping.
+//
+// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is
+// true, AddressSpace permissions are applicable, and an AddressSpace is
+// available, copy directly through the AddressSpace, handling faults as
+// needed.
+//
+// (Given that internally-mapped copying requires that backing memory is always
+// implemented using a host file descriptor, we could also preadv/pwritev to it
+// instead. But this would incur a host syscall for each use of the mapped
+// page, whereas mmap is a one-time cost.)
+//
+// The fixed overhead of internally-mapped copying is expected to be higher
+// than that of AddressSpace copying since the former always needs to translate
+// addresses, whereas the latter only needs to do so when faults occur.
+// However, the throughput of internally-mapped copying is expected to be
+// somewhat higher than that of AddressSpace copying due to the high cost of
+// page faults and because implementations of the latter usually rely on
+// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace
+// copying (when available) for smaller copies, and switch to internally-mapped
+// copying once a size threshold is exceeded.
+const (
+ // copyMapMinBytes is the size threshold for switching to internally-mapped
+ // copying in CopyOut, CopyIn, and ZeroOut.
+ copyMapMinBytes = 32 << 10 // 32 KB
+
+ // rwMapMinBytes is the size threshold for switching to internally-mapped
+ // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
+ // since AddressSpace copying in this case requires additional buffering;
+ // see CopyOutFrom for details.
+ rwMapMinBytes = 512
+)
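+
+// MemoryManager implements usermem.IO, which is how syscall code moves data
+// to and from application memory. A rough usage sketch (assuming a task
+// context ctx and an application address addr):
+//
+//	buf := make([]byte, 8)
+//	if _, err := mm.CopyIn(ctx, addr, buf, usermem.IOOpts{AddressSpaceActive: true}); err != nil {
+//		return err // typically EFAULT
+//	}
+//	if _, err := mm.CopyOut(ctx, addr, buf, usermem.IOOpts{AddressSpaceActive: true}); err != nil {
+//		return err
+//	}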
+
+// CheckIORange is similar to usermem.Addr.ToRange, but applies bounds checks
+// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
+//
+// Preconditions: length >= 0.
+func (mm *MemoryManager) CheckIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) {
+ // Note that access_ok() constrains end even if length == 0.
+ ar, ok := addr.ToRange(uint64(length))
+ return ar, (ok && ar.End <= mm.layout.MaxAddr)
+}
+
+// checkIOVec applies bounds checks consistent with Linux's
+// arch/x86/include/asm/uaccess.h:access_ok() to ars.
+func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool {
+ for !ars.IsEmpty() {
+ ar := ars.Head()
+ if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok {
+ return false
+ }
+ ars = ars.Tail()
+ }
+ return true
+}
+
+func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
+ return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
+}
+
+// translateIOError converts errors to EFAULT, as is usually reported for all
+// I/O errors originating from MM in Linux.
+func translateIOError(ctx context.Context, err error) error {
+ if err == nil {
+ return nil
+ }
+ if logIOErrors {
+ ctx.Debugf("MM I/O error: %v", err)
+ }
+ return syserror.EFAULT
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+ ar, ok := mm.CheckIORange(addr, int64(len(src)))
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if len(src) == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && len(src) < copyMapMinBytes {
+ return mm.asCopyOut(ctx, addr, src)
+ }
+
+ // Go through internal mappings.
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
+ return n, translateIOError(ctx, err)
+ })
+ return int(n64), err
+}
+
+func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) {
+ var done int
+ for {
+ n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:])
+ done += n
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(len(src)))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+ ar, ok := mm.CheckIORange(addr, int64(len(dst)))
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes {
+ return mm.asCopyIn(ctx, addr, dst)
+ }
+
+ // Go through internal mappings.
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims)
+ return n, translateIOError(ctx, err)
+ })
+ return int(n64), err
+}
+
+func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) {
+ var done int
+ for {
+ n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:])
+ done += n
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(len(dst)))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+ ar, ok := mm.CheckIORange(addr, toZero)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if toZero == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && toZero < copyMapMinBytes {
+ return mm.asZeroOut(ctx, addr, toZero)
+ }
+
+ // Go through internal mappings.
+ return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.ZeroSeq(dsts)
+ return n, translateIOError(ctx, err)
+ })
+}
+
+func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) {
+ var done int64
+ for {
+ n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done))
+ done += int64(n)
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(toZero))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+ if !mm.checkIOVec(ars) {
+ return 0, syserror.EFAULT
+ }
+
+ if ars.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
+ // We have to introduce a buffered copy, instead of just passing a
+ // safemem.BlockSeq representing addresses in the AddressSpace to src.
+ // This is because usermem.IO.CopyOutFrom() guarantees that it calls
+ // src.ReadToBlocks() at most once, which is incompatible with handling
+ // faults between calls. In the future, this is probably best resolved
+ // by introducing a CopyOutFrom variant or option that allows it to
+ // call src.ReadToBlocks() any number of times.
+ //
+ // This issue applies to CopyInTo as well.
+ buf := make([]byte, int(ars.NumBytes()))
+ bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
+ var done int64
+ for done < int64(bufN) {
+ ar := ars.Head()
+ cplen := int64(ar.Length())
+ if cplen > int64(bufN)-done {
+ cplen = int64(bufN) - done
+ }
+ n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)])
+ done += int64(n)
+ if err != nil {
+ return done, err
+ }
+ ars = ars.Tail()
+ }
+ // Do not convert errors returned by src to EFAULT.
+ return done, bufErr
+ }
+
+ // Go through internal mappings.
+ return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks)
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+ if !mm.checkIOVec(ars) {
+ return 0, syserror.EFAULT
+ }
+
+ if ars.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
+ buf := make([]byte, int(ars.NumBytes()))
+ var done int
+ var bufErr error
+ for !ars.IsEmpty() {
+ ar := ars.Head()
+ var n int
+ n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())])
+ done += n
+ if bufErr != nil {
+ break
+ }
+ ars = ars.Tail()
+ }
+ n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done])))
+ if err != nil {
+ return int64(n), err
+ }
+ // Do not convert errors returned by dst to EFAULT.
+ return int64(n), bufErr
+ }
+
+ // Go through internal mappings.
+ return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks)
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ old, err := mm.as.SwapUint32(addr, new)
+ if err == nil {
+ return old, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var old uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ old, err = safemem.SwapUint32(im, new)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return old, err
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ prev, err := mm.as.CompareAndSwapUint32(addr, old, new)
+ if err == nil {
+ return prev, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var prev uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ prev, err = safemem.CompareAndSwapUint32(im, old, new)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return prev, err
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ val, err := mm.as.LoadUint32(addr)
+ if err == nil {
+ return val, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var val uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ val, err = safemem.LoadUint32(im)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return val, err
+}
+
+// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
+// operation spanning ioar.
+//
+// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
+ // Try to map all remaining pages in the I/O operation. This RoundUp can't
+ // overflow because otherwise it would have been caught by CheckIORange.
+ end, _ := ioar.End.RoundUp()
+ ar := usermem.AddrRange{addr.RoundDown(), end}
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have usable vmas. Here and below, only return early if we
+ // can't map the first (faulting) page; failures to map later pages are
+ // silently ignored. This maximizes partial success.
+ mm.mappingMu.RLock()
+ vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = pendaddr
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return translateIOError(ctx, err)
+}
+
+// withInternalMappings ensures that pmas exist for all addresses in ar,
+// support access of type (at, ignorePermissions), and have internal mappings
+// cached. It then calls f with mm.activeMu locked for reading, passing
+// internal mappings for the subrange of ar for which this property holds.
+//
+// withInternalMappings takes a function returning uint64 since many safemem
+// functions have this property, but returns an int64 since this is usually
+// more useful for usermem.IO methods.
+//
+// Preconditions: 0 < ar.Length() <= math.MaxInt64.
+func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
+ // If pmas are already available, we can do IO without touching mm.vmas or
+ // mm.mappingMu.
+ mm.activeMu.RLock()
+ if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
+ n, err := f(mm.internalMappingsLocked(pseg, ar))
+ mm.activeMu.RUnlock()
+ // Do not convert errors returned by f to EFAULT.
+ return int64(n), err
+ }
+ mm.activeMu.RUnlock()
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return 0, translateIOError(ctx, verr)
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return 0, translateIOError(ctx, perr)
+ }
+ ar.End = pendaddr
+ }
+ imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar)
+ mm.activeMu.DowngradeLock()
+ if imendaddr := imend.Start(); imendaddr < ar.End {
+ if imendaddr <= ar.Start {
+ mm.activeMu.RUnlock()
+ return 0, translateIOError(ctx, imerr)
+ }
+ ar.End = imendaddr
+ }
+
+ // Do I/O.
+ un, err := f(mm.internalMappingsLocked(pseg, ar))
+ mm.activeMu.RUnlock()
+ n := int64(un)
+
+ // Return the first error in order of progress through ar.
+ if err != nil {
+ // Do not convert errors returned by f to EFAULT.
+ return n, err
+ }
+ if imerr != nil {
+ return n, translateIOError(ctx, imerr)
+ }
+ if perr != nil {
+ return n, translateIOError(ctx, perr)
+ }
+ return n, translateIOError(ctx, verr)
+}
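+
+// withInternalMappings reports at most one error, chosen by how early in the
+// address range progress stopped: the callback f failed within the shortest
+// usable prefix, so its error wins, followed by the internal-mapping error,
+// the pma error, and finally the vma error. A minimal sketch of that
+// selection (firstError is illustrative, not a helper in this package):
+//
+//	func firstError(fErr, imErr, pErr, vErr error) error {
+//		switch {
+//		case fErr != nil:
+//			return fErr // returned unmodified, never converted to EFAULT
+//		case imErr != nil:
+//			return imErr
+//		case pErr != nil:
+//			return pErr
+//		default:
+//			return vErr
+//		}
+//	}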
+
+// withVecInternalMappings ensures that pmas exist for all addresses in ars,
+// support access of type (at, ignorePermissions), and have internal mappings
+// cached. It then calls f with mm.activeMu locked for reading, passing
+// internal mappings for the subset of ars for which this property holds.
+//
+// Preconditions: !ars.IsEmpty().
+func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
+ // withInternalMappings is faster than withVecInternalMappings because of
+ // iterator plumbing (this isn't generally practical in the vector case due
+ // to iterator invalidation between AddrRanges). Use it if possible.
+ if ars.NumRanges() == 1 {
+ return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
+ }
+
+ // If pmas are already available, we can do IO without touching mm.vmas or
+ // mm.mappingMu.
+ mm.activeMu.RLock()
+ if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
+ n, err := f(mm.vecInternalMappingsLocked(ars))
+ mm.activeMu.RUnlock()
+ // Do not convert errors returned by f to EFAULT.
+ return int64(n), err
+ }
+ mm.activeMu.RUnlock()
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
+ if vars.NumBytes() == 0 {
+ mm.mappingMu.RUnlock()
+ return 0, translateIOError(ctx, verr)
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
+ mm.mappingMu.RUnlock()
+ if pars.NumBytes() == 0 {
+ mm.activeMu.Unlock()
+ return 0, translateIOError(ctx, perr)
+ }
+ imars, imerr := mm.getVecPMAInternalMappingsLocked(pars)
+ mm.activeMu.DowngradeLock()
+ if imars.NumBytes() == 0 {
+ mm.activeMu.RUnlock()
+ return 0, translateIOError(ctx, imerr)
+ }
+
+ // Do I/O.
+ un, err := f(mm.vecInternalMappingsLocked(imars))
+ mm.activeMu.RUnlock()
+ n := int64(un)
+
+ // Return the first error in order of progress through ars.
+ if err != nil {
+ // Do not convert errors from f to EFAULT.
+ return n, err
+ }
+ if imerr != nil {
+ return n, translateIOError(ctx, imerr)
+ }
+ if perr != nil {
+ return n, translateIOError(ctx, perr)
+ }
+ return n, translateIOError(ctx, verr)
+}
+
+// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
+// at most address end on AddrRange arsit.Head(). It is used in vector I/O
+// paths to truncate usermem.AddrRangeSeq when errors occur.
+//
+// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
+func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
+ ar := arsit.Head()
+ if end <= ar.Start {
+ return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
+ }
+ return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
+}
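+
+// A worked example of the arithmetic above, with made-up numbers: suppose ars
+// spans 48 bytes in total, arsit still has 20 bytes remaining, and
+// arsit.Head() starts at address 0x1000. If the error occurred at
+// end = 0x1008, the truncated sequence keeps
+//
+//	48 - 20 + (0x1008 - 0x1000) = 36
+//
+// bytes; if end <= 0x1000, only the 48 - 20 = 28 bytes preceding arsit.Head()
+// are kept.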
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
new file mode 100644
index 000000000..aac56679b
--- /dev/null
+++ b/pkg/sentry/mm/lifecycle.go
@@ -0,0 +1,283 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
+ return &MemoryManager{
+ p: p,
+ mfp: mfp,
+ haveASIO: p.SupportsAddressSpaceIO(),
+ privateRefs: &privateRefs{},
+ users: 1,
+ auxv: arch.Auxv{},
+ dumpability: UserDumpable,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: sleepForActivation,
+ }
+}
+
+// SetMmapLayout initializes mm's layout from the given arch.Context.
+//
+// Preconditions: mm contains no mappings and is not used concurrently.
+func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
+ layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
+ if err != nil {
+ return arch.MmapLayout{}, err
+ }
+ mm.layout = layout
+ return layout, nil
+}
+
+// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
+// clone() (without CLONE_VM).
+func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm2 := &MemoryManager{
+ p: mm.p,
+ mfp: mm.mfp,
+ haveASIO: mm.haveASIO,
+ layout: mm.layout,
+ privateRefs: mm.privateRefs,
+ users: 1,
+ brk: mm.brk,
+ usageAS: mm.usageAS,
+ dataAS: mm.dataAS,
+ // "The child does not inherit its parent's memory locks (mlock(2),
+ // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
+ // MLockNone, both of which are zero values. vma.mlockMode is reset
+ // when copied below.
+ captureInvalidations: true,
+ argv: mm.argv,
+ envv: mm.envv,
+ auxv: append(arch.Auxv(nil), mm.auxv...),
+ // IncRef'd below, once we know that there isn't an error.
+ executable: mm.executable,
+ dumpability: mm.dumpability,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: mm.sleepForActivation,
+ vdsoSigReturnAddr: mm.vdsoSigReturnAddr,
+ }
+
+ // Copy vmas.
+ dontforks := false
+ dstvgap := mm2.vmas.FirstGap()
+ for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
+ vma := srcvseg.Value() // makes a copy of the vma
+ vmaAR := srcvseg.Range()
+
+ if vma.dontfork {
+ length := uint64(vmaAR.Length())
+ mm2.usageAS -= length
+ if vma.isPrivateDataLocked() {
+ mm2.dataAS -= length
+ }
+ dontforks = true
+ continue
+ }
+
+ // Inform the Mappable, if any, of the new mapping.
+ if vma.mappable != nil {
+ if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
+ mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
+ return nil, err
+ }
+ }
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ vma.mlockMode = memmap.MLockNone
+ dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
+ // We don't need to update mm2.usageAS since we copied it from mm
+ // above.
+ }
+
+ // Copy pmas. We have to lock mm.activeMu for writing to make existing
+ // private pmas copy-on-write. We also have to lock mm2.activeMu since
+ // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
+ // only copy private pmas, since in the common case where fork(2) is
+ // immediately followed by execve(2), copying non-private pmas that can be
+ // regenerated by calling memmap.Mappable.Translate is a waste of time.
+ // (Linux does the same; compare kernel/fork.c:dup_mmap() =>
+ // mm/memory.c:copy_page_range().)
+ mm2.activeMu.Lock()
+ defer mm2.activeMu.Unlock()
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ if dontforks {
+ defer mm.pmas.MergeRange(mm.applicationAddrRange())
+ }
+ srcvseg := mm.vmas.FirstSegment()
+ dstpgap := mm2.pmas.FirstGap()
+ var unmapAR usermem.AddrRange
+ for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
+ pma := srcpseg.ValuePtr()
+ if !pma.private {
+ continue
+ }
+
+ if dontforks {
+ // Find the 'vma' that contains the starting address
+ // associated with the 'pma' (there must be one).
+ srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
+ if checkInvariants {
+ if !srcvseg.Ok() {
+ panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
+ }
+ if srcpseg.Start() < srcvseg.Start() {
+ panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
+ }
+ }
+
+ srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
+ if srcvseg.ValuePtr().dontfork {
+ continue
+ }
+ pma = srcpseg.ValuePtr()
+ }
+
+ if !pma.needCOW {
+ pma.needCOW = true
+ if pma.effectivePerms.Write {
+ // We don't want to unmap the whole address space, even though
+ // doing so would reduce calls to unmapASLocked(), because mm
+ // will most likely continue to be used after the fork, so
+ // unmapping pmas unnecessarily will result in extra page
+ // faults. But we do want to merge consecutive AddrRanges
+ // across pma boundaries.
+ if unmapAR.End == srcpseg.Start() {
+ unmapAR.End = srcpseg.End()
+ } else {
+ if unmapAR.Length() != 0 {
+ mm.unmapASLocked(unmapAR)
+ }
+ unmapAR = srcpseg.Range()
+ }
+ pma.effectivePerms.Write = false
+ }
+ pma.maxPerms.Write = false
+ }
+ fr := srcpseg.fileRange()
+ mm2.incPrivateRef(fr)
+ srcpseg.ValuePtr().file.IncRef(fr)
+ addrRange := srcpseg.Range()
+ mm2.addRSSLocked(addrRange)
+ dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
+ }
+ if unmapAR.Length() != 0 {
+ mm.unmapASLocked(unmapAR)
+ }
+
+ // Between when we call memmap.Mappable.AddMapping while copying vmas and
+ // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
+ // ineffective because the pmas they invalidate haven't yet been copied,
+ // possibly allowing mm2 to get invalidated translations:
+ //
+ // Invalidating Mappable mm.Fork
+ // --------------------- -------
+ //
+ // mm2.Invalidate()
+ // mm.activeMu.Lock()
+ // mm.Invalidate() /* blocks */
+ // mm2.activeMu.Lock()
+ // (mm copies invalidated pma to mm2)
+ //
+ // This would technically be both safe (since we only copy private pmas,
+ // which will still hold a reference on their memory) and consistent with
+ // Linux, but we avoid it anyway by setting mm2.captureInvalidations during
+ // construction, causing calls to mm2.Invalidate() to be captured in
+ // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
+ // here.
+ mm2.captureInvalidations = false
+ for _, invArgs := range mm2.capturedInvalidations {
+ mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
+ }
+ mm2.capturedInvalidations = nil
+
+ if mm2.executable != nil {
+ mm2.executable.IncRef()
+ }
+ return mm2, nil
+}
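+
+// The capture-and-replay step at the end of Fork reduces to a small pattern:
+// while a flag is set, record requests instead of applying them, then clear
+// the flag and apply the backlog. A standalone sketch of that pattern (the
+// recorder type is hypothetical and unsynchronized; the real code does this
+// under activeMu):
+//
+//	type recorder struct {
+//		capturing bool
+//		captured  []int
+//	}
+//
+//	func (r *recorder) invalidate(x int, apply func(int)) {
+//		if r.capturing {
+//			r.captured = append(r.captured, x) // defer until replay
+//			return
+//		}
+//		apply(x)
+//	}
+//
+//	func (r *recorder) replay(apply func(int)) {
+//		r.capturing = false
+//		for _, x := range r.captured {
+//			apply(x)
+//		}
+//		r.captured = nil
+//	}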
+
+// IncUsers increments mm's user count and returns true. If the user count is
+// already 0, IncUsers does nothing and returns false.
+func (mm *MemoryManager) IncUsers() bool {
+ for {
+ users := atomic.LoadInt32(&mm.users)
+ if users == 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+ return true
+ }
+ }
+}
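+
+// IncUsers is the usual "increment unless zero" refcount loop: a new
+// reference may only be taken while at least one other reference still
+// exists. The same pattern over a bare counter, using only the standard
+// library (tryIncRef is illustrative, not part of this package):
+//
+//	func tryIncRef(count *int32) bool {
+//		for {
+//			n := atomic.LoadInt32(count)
+//			if n == 0 {
+//				return false // already released; do not resurrect
+//			}
+//			if atomic.CompareAndSwapInt32(count, n, n+1) {
+//				return true
+//			}
+//			// Lost a race with a concurrent update; reload and retry.
+//		}
+//	}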
+
+// DecUsers decrements mm's user count. If the user count reaches 0, all
+// mappings in mm are unmapped.
+func (mm *MemoryManager) DecUsers(ctx context.Context) {
+ if users := atomic.AddInt32(&mm.users, -1); users > 0 {
+ return
+ } else if users < 0 {
+ panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
+ }
+
+ mm.aioManager.destroy()
+
+ mm.metadataMu.Lock()
+ exe := mm.executable
+ mm.executable = nil
+ mm.metadataMu.Unlock()
+ if exe != nil {
+ exe.DecRef()
+ }
+
+ mm.activeMu.Lock()
+ // Sanity check.
+ if atomic.LoadInt32(&mm.active) != 0 {
+ panic("active address space lost?")
+ }
+ // Make sure the AddressSpace is returned.
+ if mm.as != nil {
+ mm.as.Release()
+ mm.as = nil
+ }
+ mm.activeMu.Unlock()
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // If mm is being dropped before mm.SetMmapLayout was called,
+ // mm.applicationAddrRange() will be empty.
+ if ar := mm.applicationAddrRange(); ar.Length() != 0 {
+ mm.unmapLocked(ctx, ar)
+ }
+}
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
new file mode 100644
index 000000000..28e5057f7
--- /dev/null
+++ b/pkg/sentry/mm/metadata.go
@@ -0,0 +1,183 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Dumpability describes if and how core dumps should be created.
+type Dumpability int
+
+const (
+ // NotDumpable indicates that core dumps should never be created.
+ NotDumpable Dumpability = iota
+
+ // UserDumpable indicates that core dumps should be created, owned by
+ // the current user.
+ UserDumpable
+
+ // RootDumpable indicates that core dumps should be created, owned by
+ // root.
+ RootDumpable
+)
+
+// Dumpability returns the dumpability.
+func (mm *MemoryManager) Dumpability() Dumpability {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.dumpability
+}
+
+// SetDumpability sets the dumpability.
+func (mm *MemoryManager) SetDumpability(d Dumpability) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.dumpability = d
+}
+
+// ArgvStart returns the start of the application argument vector.
+//
+// There is no guarantee that this value is sensible w.r.t. ArgvEnd.
+func (mm *MemoryManager) ArgvStart() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.argv.Start
+}
+
+// SetArgvStart sets the start of the application argument vector.
+func (mm *MemoryManager) SetArgvStart(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.argv.Start = a
+}
+
+// ArgvEnd returns the end of the application argument vector.
+//
+// There is no guarantee that this value is sensible w.r.t. ArgvStart.
+func (mm *MemoryManager) ArgvEnd() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.argv.End
+}
+
+// SetArgvEnd sets the end of the application argument vector.
+func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.argv.End = a
+}
+
+// EnvvStart returns the start of the application environment vector.
+//
+// There is no guarantee that this value is sensible w.r.t. EnvvEnd.
+func (mm *MemoryManager) EnvvStart() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.envv.Start
+}
+
+// SetEnvvStart sets the start of the application environment vector.
+func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.envv.Start = a
+}
+
+// EnvvEnd returns the end of the application environment vector.
+//
+// There is no guarantee that this value is sensible w.r.t. EnvvStart.
+func (mm *MemoryManager) EnvvEnd() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.envv.End
+}
+
+// SetEnvvEnd sets the end of the application environment vector.
+func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.envv.End = a
+}
+
+// Auxv returns a copy of the application's auxiliary vector.
+func (mm *MemoryManager) Auxv() arch.Auxv {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return append(arch.Auxv(nil), mm.auxv...)
+}
+
+// SetAuxv sets the application's auxiliary vector.
+func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.auxv = append(arch.Auxv(nil), auxv...)
+}
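+
+// Both Auxv and SetAuxv copy with append(arch.Auxv(nil), s...) so that the
+// caller and mm never share a backing array. The idiom in isolation, using
+// plain ints in place of the arch.Auxv element type:
+//
+//	orig := []int{1, 2, 3}
+//	cp := append([]int(nil), orig...)
+//	cp[0] = 99
+//	// orig is still [1 2 3]; only cp observed the write.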
+
+// Executable returns the executable, if available.
+//
+// An additional reference will be taken in the case of a non-nil executable,
+// which must be released by the caller.
+func (mm *MemoryManager) Executable() fsbridge.File {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+
+ if mm.executable == nil {
+ return nil
+ }
+
+ mm.executable.IncRef()
+ return mm.executable
+}
+
+// SetExecutable sets the executable.
+//
+// This takes a reference on file.
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
+ mm.metadataMu.Lock()
+
+ // Grab a new reference.
+ file.IncRef()
+
+ // Set the executable.
+ orig := mm.executable
+ mm.executable = file
+
+ mm.metadataMu.Unlock()
+
+ // Release the old reference.
+ //
+ // Do this without holding the lock, since it may wind up doing some
+ // I/O to sync the dirent, etc.
+ if orig != nil {
+ orig.DecRef()
+ }
+}
+
+// VDSOSigReturn returns the address of vdso_sigreturn.
+func (mm *MemoryManager) VDSOSigReturn() uint64 {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.vdsoSigReturnAddr
+}
+
+// SetVDSOSigReturn sets the address of vdso_sigreturn.
+func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.vdsoSigReturnAddr = addr
+}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
new file mode 100644
index 000000000..6db7c3d40
--- /dev/null
+++ b/pkg/sentry/mm/mm.go
@@ -0,0 +1,478 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mm provides a memory management subsystem. See README.md for a
+// detailed overview.
+//
+// Lock order:
+//
+// fs locks, except for memmap.Mappable locks
+// mm.MemoryManager.metadataMu
+// mm.MemoryManager.mappingMu
+// Locks taken by memmap.Mappable methods other than Translate
+// mm.MemoryManager.activeMu
+// Locks taken by memmap.Mappable.Translate
+// mm.privateRefs.mu
+// platform.AddressSpace locks
+// platform.File locks
+// mm.aioManager.mu
+// mm.AIOContext.mu
+//
+// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
+// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
+// child first).
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// MemoryManager implements a virtual address space.
+//
+// +stateify savable
+type MemoryManager struct {
+ // p and mfp are immutable.
+ p platform.Platform
+ mfp pgalloc.MemoryFileProvider
+
+ // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
+ // eliminating an indirect call in the hot I/O path, this makes
+ // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
+ //
+ // haveASIO is immutable.
+ haveASIO bool `state:"nosave"`
+
+ // layout is the memory layout.
+ //
+ // layout is set by the binary loader before the MemoryManager can be used.
+ layout arch.MmapLayout
+
+ // privateRefs stores reference counts for private memory (memory whose
+ // ownership is shared by one or more pmas instead of being owned by a
+ // memmap.Mappable).
+ //
+ // privateRefs is immutable.
+ privateRefs *privateRefs
+
+ // users is the number of dependencies on the mappings in the MemoryManager.
+ // When the number of references in users reaches zero, all mappings are
+ // unmapped.
+ //
+ // users is accessed using atomic memory operations.
+ users int32
+
+ // mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
+ mappingMu sync.RWMutex `state:"nosave"`
+
+ // vmas stores virtual memory areas. Since vmas are stored by value,
+ // clients should usually use vmaIterator.ValuePtr() instead of
+ // vmaIterator.Value() to get a pointer to the vma rather than a copy.
+ //
+ // Invariants: vmas are always page-aligned.
+ //
+ // vmas is protected by mappingMu.
+ vmas vmaSet
+
+ // brk is the mm's brk, which is manipulated using the brk(2) system call.
+ // The brk is initially set up by the loader which maps an executable
+ // binary into the mm.
+ //
+ // brk is protected by mappingMu.
+ brk usermem.AddrRange
+
+ // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
+ //
+ // usageAS is protected by mappingMu.
+ usageAS uint64
+
+ // lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
+ // memmap.MLockNone.
+ //
+ // lockedAS is protected by mappingMu.
+ lockedAS uint64
+
+ // dataAS is the size of private data segments, like mm_struct->data_vm.
+	// That is, it counts vmas that are private, writable, and not stack.
+ //
+ // dataAS is protected by mappingMu.
+ dataAS uint64
+
+ // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
+ // defMLockMode is greater.
+ //
+ // defMLockMode is protected by mappingMu.
+ defMLockMode memmap.MLockMode
+
+ // activeMu is loosely analogous to Linux's struct
+ // mm_struct::page_table_lock.
+ activeMu sync.RWMutex `state:"nosave"`
+
+ // pmas stores platform mapping areas used to implement vmas. Since pmas
+ // are stored by value, clients should usually use pmaIterator.ValuePtr()
+ // instead of pmaIterator.Value() to get a pointer to the pma rather than
+ // a copy.
+ //
+ // Inserting or removing segments from pmas should happen along with a
+	// call to mm.addRSSLocked or mm.removeRSSLocked.
+ //
+ // Invariants: pmas are always page-aligned. If a pma exists for a given
+ // address, a vma must also exist for that address.
+ //
+ // pmas is protected by activeMu.
+ pmas pmaSet
+
+ // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
+ // reported as the MemoryManager's RSS.
+ //
+	// curRSS should be modified only via addRSSLocked and removeRSSLocked,
+	// not directly.
+	//
+	// curRSS is protected by activeMu.
+ curRSS uint64
+
+ // maxRSS is the maximum resident set size in bytes of a MemoryManager.
+ // It is tracked as the application adds and removes mappings to pmas.
+ //
+	// maxRSS should be modified only via addRSSLocked, not directly.
+ //
+ // maxRSS is protected by activeMu.
+ maxRSS uint64
+
+ // as is the platform.AddressSpace that pmas are mapped into. active is the
+ // number of contexts that require as to be non-nil; if active == 0, as may
+ // be nil.
+ //
+ // as is protected by activeMu. active is manipulated with atomic memory
+ // operations; transitions to and from zero are additionally protected by
+ // activeMu. (This is because such transitions may need to be atomic with
+ // changes to as.)
+ as platform.AddressSpace `state:"nosave"`
+ active int32 `state:"zerovalue"`
+
+ // unmapAllOnActivate indicates that the next Activate call should activate
+ // an empty AddressSpace.
+ //
+ // This is used to ensure that an AddressSpace cached in
+ // NewAddressSpace is not used after some change in the MemoryManager
+ // or VMAs has made that AddressSpace stale.
+ //
+ // unmapAllOnActivate is protected by activeMu. It must only be set when
+ // there is no active or cached AddressSpace. If as != nil, then
+ // invalidations should be propagated immediately.
+ unmapAllOnActivate bool `state:"nosave"`
+
+ // If captureInvalidations is true, calls to MM.Invalidate() are recorded
+ // in capturedInvalidations rather than being applied immediately to pmas.
+ // This is to avoid a race condition in MM.Fork(); see that function for
+ // details.
+ //
+ // Both captureInvalidations and capturedInvalidations are protected by
+	// activeMu. Neither needs to be saved since captureInvalidations is only
+ // enabled during MM.Fork(), during which saving can't occur.
+ captureInvalidations bool `state:"zerovalue"`
+ capturedInvalidations []invalidateArgs `state:"nosave"`
+
+ metadataMu sync.Mutex `state:"nosave"`
+
+ // argv is the application argv. This is set up by the loader and may be
+ // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
+ // requirements apply to argv; we do not require that argv.WellFormed().
+ //
+ // argv is protected by metadataMu.
+ argv usermem.AddrRange
+
+ // envv is the application envv. This is set up by the loader and may be
+ // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
+ // requirements apply to envv; we do not require that envv.WellFormed().
+ //
+ // envv is protected by metadataMu.
+ envv usermem.AddrRange
+
+ // auxv is the ELF's auxiliary vector.
+ //
+ // auxv is protected by metadataMu.
+ auxv arch.Auxv
+
+ // executable is the executable for this MemoryManager. If executable
+ // is not nil, it holds a reference on the Dirent.
+ //
+ // executable is protected by metadataMu.
+ executable fsbridge.File
+
+ // dumpability describes if and how this MemoryManager may be dumped to
+ // userspace.
+ //
+ // dumpability is protected by metadataMu.
+ dumpability Dumpability
+
+ // aioManager keeps track of AIOContexts used for async IOs. AIOManager
+ // must be cloned when CLONE_VM is used.
+ aioManager aioManager
+
+ // sleepForActivation indicates whether the task should report to be sleeping
+ // before trying to activate the address space. When set to true, delays in
+ // activation are not reported as stuck tasks by the watchdog.
+ sleepForActivation bool
+
+ // vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
+ vdsoSigReturnAddr uint64
+}
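+
+// The lock order in the package comment above means that metadataMu is taken
+// before mappingMu, which is taken before activeMu. A caller that needs all
+// three, as Fork in lifecycle.go does, therefore acquires them in exactly
+// that sequence (abridged sketch):
+//
+//	mm.metadataMu.Lock()
+//	defer mm.metadataMu.Unlock()
+//	mm.mappingMu.RLock()
+//	defer mm.mappingMu.RUnlock()
+//	// ... copy vmas ...
+//	mm.activeMu.Lock()
+//	defer mm.activeMu.Unlock()
+//	// ... copy pmas ...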
+
+// vma represents a virtual memory area.
+//
+// +stateify savable
+type vma struct {
+ // mappable is the virtual memory object mapped by this vma. If mappable is
+ // nil, the vma represents a private anonymous mapping.
+ mappable memmap.Mappable
+
+ // off is the offset into mappable at which this vma begins. If mappable is
+ // nil, off is meaningless.
+ off uint64
+
+	// To speed up VMA save/restore, we group and save the following booleans
+ // as a single integer.
+
+ // realPerms are the memory permissions on this vma, as defined by the
+ // application.
+ realPerms usermem.AccessType `state:".(int)"`
+
+ // effectivePerms are the memory permissions on this vma which are
+ // actually used to control access.
+ //
+ // Invariant: effectivePerms == realPerms.Effective().
+ effectivePerms usermem.AccessType `state:"manual"`
+
+ // maxPerms limits the set of permissions that may ever apply to this
+ // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
+ // is true (e.g. ptrace(PTRACE_POKEDATA)).
+ //
+ // Invariant: maxPerms == maxPerms.Effective().
+ maxPerms usermem.AccessType `state:"manual"`
+
+ // private is true if this is a MAP_PRIVATE mapping, such that writes to
+ // the mapping are propagated to a copy.
+ private bool `state:"manual"`
+
+ // growsDown is true if the mapping may be automatically extended downward
+ // under certain conditions. If growsDown is true, mappable must be nil.
+ //
+ // There is currently no corresponding growsUp flag; in Linux, the only
+ // architectures that can have VM_GROWSUP mappings are ia64, parisc, and
+ // metag, none of which we currently support.
+ growsDown bool `state:"manual"`
+
+ // dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
+ dontfork bool
+
+ mlockMode memmap.MLockMode
+
+ // numaPolicy is the NUMA policy for this vma set by mbind().
+ numaPolicy linux.NumaPolicy
+
+ // numaNodemask is the NUMA nodemask for this vma set by mbind().
+ numaNodemask uint64
+
+ // If id is not nil, it controls the lifecycle of mappable and provides vma
+ // metadata shown in /proc/[pid]/maps, and the vma holds a reference.
+ id memmap.MappingIdentity
+
+ // If hint is non-empty, it is a description of the vma printed in
+ // /proc/[pid]/maps. hint takes priority over id.MappedName().
+ hint string
+}
+
+const (
+ vmaRealPermsRead = 1 << iota
+ vmaRealPermsWrite
+ vmaRealPermsExecute
+ vmaEffectivePermsRead
+ vmaEffectivePermsWrite
+ vmaEffectivePermsExecute
+ vmaMaxPermsRead
+ vmaMaxPermsWrite
+ vmaMaxPermsExecute
+ vmaPrivate
+ vmaGrowsDown
+)
+
+func (v *vma) saveRealPerms() int {
+ var b int
+ if v.realPerms.Read {
+ b |= vmaRealPermsRead
+ }
+ if v.realPerms.Write {
+ b |= vmaRealPermsWrite
+ }
+ if v.realPerms.Execute {
+ b |= vmaRealPermsExecute
+ }
+ if v.effectivePerms.Read {
+ b |= vmaEffectivePermsRead
+ }
+ if v.effectivePerms.Write {
+ b |= vmaEffectivePermsWrite
+ }
+ if v.effectivePerms.Execute {
+ b |= vmaEffectivePermsExecute
+ }
+ if v.maxPerms.Read {
+ b |= vmaMaxPermsRead
+ }
+ if v.maxPerms.Write {
+ b |= vmaMaxPermsWrite
+ }
+ if v.maxPerms.Execute {
+ b |= vmaMaxPermsExecute
+ }
+ if v.private {
+ b |= vmaPrivate
+ }
+ if v.growsDown {
+ b |= vmaGrowsDown
+ }
+ return b
+}
+
+func (v *vma) loadRealPerms(b int) {
+ if b&vmaRealPermsRead > 0 {
+ v.realPerms.Read = true
+ }
+ if b&vmaRealPermsWrite > 0 {
+ v.realPerms.Write = true
+ }
+ if b&vmaRealPermsExecute > 0 {
+ v.realPerms.Execute = true
+ }
+ if b&vmaEffectivePermsRead > 0 {
+ v.effectivePerms.Read = true
+ }
+ if b&vmaEffectivePermsWrite > 0 {
+ v.effectivePerms.Write = true
+ }
+ if b&vmaEffectivePermsExecute > 0 {
+ v.effectivePerms.Execute = true
+ }
+ if b&vmaMaxPermsRead > 0 {
+ v.maxPerms.Read = true
+ }
+ if b&vmaMaxPermsWrite > 0 {
+ v.maxPerms.Write = true
+ }
+ if b&vmaMaxPermsExecute > 0 {
+ v.maxPerms.Execute = true
+ }
+ if b&vmaPrivate > 0 {
+ v.private = true
+ }
+ if b&vmaGrowsDown > 0 {
+ v.growsDown = true
+ }
+}
+
+// pma represents a platform mapping area.
+//
+// +stateify savable
+type pma struct {
+ // file is the file mapped by this pma. Only pmas for which file ==
+ // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
+ // the corresponding file range while they exist.
+ file platform.File `state:"nosave"`
+
+ // off is the offset into file at which this pma begins.
+ //
+ // Note that pmas do *not* hold references on offsets in file! If private
+ // is true, MemoryManager.privateRefs holds the reference instead. If
+ // private is false, the corresponding memmap.Mappable holds the reference
+ // instead (per memmap.Mappable.Translate requirement).
+ off uint64
+
+ // translatePerms is the permissions returned by memmap.Mappable.Translate.
+ // If private is true, translatePerms is usermem.AnyAccess.
+ translatePerms usermem.AccessType
+
+ // effectivePerms is the permissions allowed for non-ignorePermissions
+ // accesses. maxPerms is the permissions allowed for ignorePermissions
+ // accesses. These are vma.effectivePerms and vma.maxPerms respectively,
+ // masked by pma.translatePerms and with Write disallowed if pma.needCOW is
+ // true.
+ //
+ // These are stored in the pma so that the IO implementation can avoid
+ // iterating mm.vmas when pmas already exist.
+ effectivePerms usermem.AccessType
+ maxPerms usermem.AccessType
+
+ // needCOW is true if writes to the mapping must be propagated to a copy.
+ needCOW bool
+
+ // private is true if this pma represents private memory.
+ //
+ // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
+ // holds a reference on the mapped memory that is tracked in privateRefs,
+ // and calls to Invalidate for which
+ // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
+ //
+ // If private is false, this pma caches a translation from the
+ // corresponding vma's memmap.Mappable.Translate.
+ private bool
+
+ // If internalMappings is not empty, it is the cached return value of
+ // file.MapInternal for the platform.FileRange mapped by this pma.
+ internalMappings safemem.BlockSeq `state:"nosave"`
+}
+
+// +stateify savable
+type privateRefs struct {
+ mu sync.Mutex `state:"nosave"`
+
+ // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
+ // pmas (or, equivalently, MemoryManagers) that share ownership of the
+ // memory at that offset.
+ refs fileRefcountSet
+}
+
+type invalidateArgs struct {
+ ar usermem.AddrRange
+ opts memmap.InvalidateOpts
+}
+
+// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
+type fileRefcountSetFunctions struct{}
+
+func (fileRefcountSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+func (fileRefcountSetFunctions) MaxKey() uint64 {
+ return ^uint64(0)
+}
+
+func (fileRefcountSetFunctions) ClearValue(_ *int32) {
+}
+
+func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
+ return rc1, rc1 == rc2
+}
+
+func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
+ return rc, rc
+}
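+
+// The Merge and Split rules above keep the refcount set canonical: adjacent
+// file ranges coalesce only when they are shared by the same number of pmas,
+// and splitting a range leaves that count on both halves. With made-up
+// offsets and counts:
+//
+//	Merge([0k,4k)=2, [4k,8k)=2) => (2, true)   // one segment [0k,8k)=2
+//	Merge([0k,4k)=2, [4k,8k)=3) => (2, false)  // counts differ; keep both
+//	Split([0k,8k)=2, at 4k)     => (2, 2)      // both halves keep count 2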
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
new file mode 100644
index 000000000..fdc308542
--- /dev/null
+++ b/pkg/sentry/mm/mm_test.go
@@ -0,0 +1,230 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func testMemoryManager(ctx context.Context) *MemoryManager {
+ p := platform.FromContext(ctx)
+ mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+ mm := NewMemoryManager(p, mfp, false)
+ mm.layout = arch.MmapLayout{
+ MinAddr: p.MinUserAddress(),
+ MaxAddr: p.MaxUserAddress(),
+ BottomUpBase: p.MinUserAddress(),
+ TopDownBase: p.MaxUserAddress(),
+ }
+ return mm
+}
+
+func (mm *MemoryManager) realUsageAS() uint64 {
+ return uint64(mm.vmas.Span())
+}
+
+func TestUsageASUpdates(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: 2 * usermem.PageSize,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+ realUsage := mm.realUsageAS()
+ if mm.usageAS != realUsage {
+ t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage)
+ }
+
+ mm.MUnmap(ctx, addr, usermem.PageSize)
+ realUsage = mm.realUsageAS()
+ if mm.usageAS != realUsage {
+ t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage)
+ }
+}
+
+func (mm *MemoryManager) realDataAS() uint64 {
+ var sz uint64
+ for seg := mm.vmas.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ vma := seg.Value()
+ if vma.isPrivateDataLocked() {
+ sz += uint64(seg.Range().Length())
+ }
+ }
+ return sz
+}
+
+func TestDataASUpdates(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: 3 * usermem.PageSize,
+ Private: true,
+ Perms: usermem.Write,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+ if mm.dataAS == 0 {
+ t.Fatalf("dataAS is 0, wanted not 0")
+ }
+ realDataAS := mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MUnmap(ctx, addr, usermem.PageSize)
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MProtect(addr+usermem.PageSize, usermem.PageSize, usermem.Read, false)
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MRemap(ctx, addr+2*usermem.PageSize, usermem.PageSize, 2*usermem.PageSize, MRemapOpts{
+ Move: MRemapMayMove,
+ })
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+}
+
+func TestBrkDataLimitUpdates(t *testing.T) {
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.Data, limits.Limit{}, true /* privileged */) // zero RLIMIT_DATA
+
+ ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ // Try to extend the brk by one page and expect doing so to fail.
+ oldBrk, _ := mm.Brk(ctx, 0)
+ if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk {
+		t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x)", oldBrk, newBrk)
+ }
+}
+
+// TestIOAfterUnmap ensures that IO fails after unmap.
+func TestIOAfterUnmap(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: usermem.PageSize,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+
+ // IO works before munmap.
+ b := make([]byte, 1)
+ n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
+ if err != nil {
+ t.Errorf("CopyIn got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyIn got %d want 1", n)
+ }
+
+ err = mm.MUnmap(ctx, addr, usermem.PageSize)
+ if err != nil {
+ t.Fatalf("MUnmap got err %v want nil", err)
+ }
+
+ n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
+ if err != syserror.EFAULT {
+ t.Errorf("CopyIn got err %v want EFAULT", err)
+ }
+ if n != 0 {
+ t.Errorf("CopyIn got %d want 0", n)
+ }
+}
+
+// TestIOAfterMProtect tests IO interaction with mprotect permissions.
+func TestIOAfterMProtect(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: usermem.PageSize,
+ Private: true,
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+
+ // Writing works before mprotect.
+ b := make([]byte, 1)
+ n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
+ if err != nil {
+ t.Errorf("CopyOut got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyOut got %d want 1", n)
+ }
+
+ err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false)
+ if err != nil {
+ t.Errorf("MProtect got err %v want nil", err)
+ }
+
+ // Without IgnorePermissions, CopyOut should no longer succeed.
+ n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
+ if err != syserror.EFAULT {
+ t.Errorf("CopyOut got err %v want EFAULT", err)
+ }
+ if n != 0 {
+ t.Errorf("CopyOut got %d want 0", n)
+ }
+
+ // With IgnorePermissions, CopyOut should succeed despite mprotect.
+ n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ if err != nil {
+ t.Errorf("CopyOut got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyOut got %d want 1", n)
+ }
+}
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
new file mode 100644
index 000000000..62e4c20af
--- /dev/null
+++ b/pkg/sentry/mm/pma.go
@@ -0,0 +1,1036 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safecopy"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// existingPMAsLocked checks that pmas exist for all addresses in ar, and
+// support access of type (at, ignorePermissions). If so, it returns an
+// iterator to the pma containing ar.Start. Otherwise it returns a terminal
+// iterator.
+//
+// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ first := mm.pmas.FindSegment(ar.Start)
+ pseg := first
+ for pseg.Ok() {
+ pma := pseg.ValuePtr()
+ perms := pma.effectivePerms
+ if ignorePermissions {
+ perms = pma.maxPerms
+ }
+ if !perms.SupersetOf(at) {
+ return pmaIterator{}
+ }
+ if needInternalMappings && pma.internalMappings.IsEmpty() {
+ return pmaIterator{}
+ }
+
+ if ar.End <= pseg.End() {
+ return first
+ }
+ pseg, _ = pseg.NextNonEmpty()
+ }
+
+ // Ran out of pmas before reaching ar.End.
+ return pmaIterator{}
+}
+
+// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
+// and support access of type (at, ignorePermissions).
+//
+// Preconditions: mm.activeMu must be locked.
+func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
+ for ; !ars.IsEmpty(); ars = ars.Tail() {
+ if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
+ return false
+ }
+ }
+ return true
+}
+
+// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
+// access of type at. It returns:
+//
+// - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last pma containing an address in ar. If
+// pmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if pmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
+// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist
+// for all addresses in ar, and support accesses of type at (i.e. permission
+// checks must have been performed against vmas).
+func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if !vseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
+ }
+ }
+
+ // Page-align ar so that all AddrRanges are aligned.
+ end, ok := ar.End.RoundUp()
+ var alignerr error
+ if !ok {
+ end = ar.End.RoundDown()
+ alignerr = syserror.EFAULT
+ }
+ ar = usermem.AddrRange{ar.Start.RoundDown(), end}
+
+ pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
+ if pend.Start() <= ar.Start {
+ return pmaIterator{}, pend, perr
+ }
+ // getPMAsInternalLocked may not have returned pstart due to iterator
+ // invalidation.
+ if !pstart.Ok() {
+ pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
+ }
+ if perr != nil {
+ return pstart, pend, perr
+ }
+ return pstart, pend, alignerr
+}
+
+// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
+// support access of type at. It returns the subset of ars for which pmas
+// exist. If this is not equal to ars, it returns a non-nil error explaining
+// why.
+//
+// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
+// writing. vmas must exist for all addresses in ars, and support accesses of
+// type at (i.e. permission checks must have been performed against vmas).
+func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if checkInvariants {
+ if !ar.WellFormed() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Page-align ar so that all AddrRanges are aligned.
+ end, ok := ar.End.RoundUp()
+ var alignerr error
+ if !ok {
+ end = ar.End.RoundDown()
+ alignerr = syserror.EFAULT
+ }
+ ar = usermem.AddrRange{ar.Start.RoundDown(), end}
+
+ _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
+ if perr != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
+ }
+ if alignerr != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
+ }
+ }
+
+ return ars, nil
+}
+
+// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
+// exceptions:
+//
+// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
+// is, the returned iterator may be terminal, even if a pma that contains
+// ar.Start exists). Returning this iterator on a best-effort basis allows
+// callers that require it to use it when it's cheaply available, while also
+// avoiding the overhead of retrieving it when it's not.
+//
+// - getPMAsInternalLocked additionally requires that ar is page-aligned.
+//
+// getPMAsInternalLocked is an implementation helper for getPMAsLocked and
+// getVecPMAsLocked; other clients should call one of those instead.
+func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if !vseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
+ }
+ }
+
+ mf := mm.mfp.MemoryFile()
+ // Limit the range we allocate to ar, aligned to privateAllocUnit.
+ maskAR := privateAligned(ar)
+ didUnmapAS := false
+ // The range in which we iterate vmas and pmas is still limited to ar, to
+ // ensure that we don't allocate or COW-break a pma we don't need.
+ pseg, pgap := mm.pmas.Find(ar.Start)
+ pstart := pseg
+ for {
+ // Get pmas for this vma.
+ vsegAR := vseg.Range().Intersect(ar)
+ vma := vseg.ValuePtr()
+ pmaLoop:
+ for {
+ switch {
+ case pgap.Ok() && pgap.Start() < vsegAR.End:
+ // Need a pma here.
+ optAR := vseg.Range().Intersect(pgap.Range())
+ if checkInvariants {
+ if optAR.Length() <= 0 {
+ panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
+ }
+ }
+ if vma.mappable == nil {
+ // Private anonymous mappings get pmas by allocating.
+ allocAR := optAR.Intersect(maskAR)
+ fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous)
+ if err != nil {
+ return pstart, pgap, err
+ }
+ if checkInvariants {
+ if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
+ panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
+ }
+ }
+ mm.addRSSLocked(allocAR)
+ mm.incPrivateRef(fr)
+ mf.IncRef(fr)
+ pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
+ file: mf,
+ off: fr.Start,
+ translatePerms: usermem.AnyAccess,
+ effectivePerms: vma.effectivePerms,
+ maxPerms: vma.maxPerms,
+ // Since we just allocated this memory and have the
+ // only reference, the new pma does not need
+ // copy-on-write.
+ private: true,
+ }).NextNonEmpty()
+ pstart = pmaIterator{} // iterators invalidated
+ } else {
+ // Other mappings get pmas by translating.
+ optMR := vseg.mappableRangeOf(optAR)
+ reqAR := optAR.Intersect(ar)
+ reqMR := vseg.mappableRangeOf(reqAR)
+ perms := at
+ if vma.private {
+ // This pma will be copy-on-write; don't require write
+ // permission, but do require read permission to
+ // facilitate the copy.
+ //
+ // If at.Write is true, we will need to break
+ // copy-on-write immediately, which occurs after
+ // translation below.
+ perms.Read = true
+ perms.Write = false
+ }
+ ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
+ if checkInvariants {
+ if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
+ panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
+ }
+ }
+ // Install a pma for each translation.
+ if len(ts) == 0 {
+ return pstart, pgap, err
+ }
+ pstart = pmaIterator{} // iterators invalidated
+ for _, t := range ts {
+ newpmaAR := vseg.addrRangeOf(t.Source)
+ newpma := pma{
+ file: t.File,
+ off: t.Offset,
+ translatePerms: t.Perms,
+ effectivePerms: vma.effectivePerms.Intersect(t.Perms),
+ maxPerms: vma.maxPerms.Intersect(t.Perms),
+ }
+ if vma.private {
+ newpma.effectivePerms.Write = false
+ newpma.maxPerms.Write = false
+ newpma.needCOW = true
+ }
+ mm.addRSSLocked(newpmaAR)
+ t.File.IncRef(t.FileRange())
+ // This is valid because memmap.Mappable.Translate is
+ // required to return Translations in increasing
+ // Translation.Source order.
+ pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
+ pgap = pseg.NextGap()
+ }
+ // The error returned by Translate is only significant if
+ // it occurred before ar.End.
+ if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
+ return pstart, pgap, err
+ }
+ // Rewind pseg to the first pma inserted and continue the
+ // loop to check if we need to break copy-on-write.
+ pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
+ continue
+ }
+
+ case pseg.Ok() && pseg.Start() < vsegAR.End:
+ oldpma := pseg.ValuePtr()
+ if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
+ // Break copy-on-write by copying.
+ if checkInvariants {
+ if !oldpma.maxPerms.Read {
+ panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
+ }
+ }
+ // The majority of copy-on-write breaks on executable pages
+ // come from:
+ //
+ // - The ELF loader, which must zero out bytes on the last
+ // page of each segment after the end of the segment.
+ //
+ // - gdb's use of ptrace to insert breakpoints.
+ //
+ // Neither of these cases has enough spatial locality to
+ // benefit from copying nearby pages, so if the vma is
+ // executable, only copy the pages required.
+ var copyAR usermem.AddrRange
+ if vseg.ValuePtr().effectivePerms.Execute {
+ copyAR = pseg.Range().Intersect(ar)
+ } else {
+ copyAR = pseg.Range().Intersect(maskAR)
+ }
+ // Get internal mappings from the pma to copy from.
+ if err := pseg.getInternalMappingsLocked(); err != nil {
+ return pstart, pseg.PrevGap(), err
+ }
+ // Copy contents.
+ fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
+ if _, ok := err.(safecopy.BusError); ok {
+ // If we got SIGBUS during the copy, deliver SIGBUS to
+ // userspace (instead of SIGSEGV) if we're breaking
+ // copy-on-write due to application page fault.
+ err = &memmap.BusError{err}
+ }
+ if fr.Length() == 0 {
+ return pstart, pseg.PrevGap(), err
+ }
+ // Unmap all of maskAR, not just copyAR, to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ if !didUnmapAS {
+ mm.unmapASLocked(maskAR)
+ didUnmapAS = true
+ }
+ // Replace the pma with a copy in the part of the address
+ // range where copying was successful. This doesn't change
+ // RSS.
+ copyAR.End = copyAR.Start + usermem.Addr(fr.Length())
+ if copyAR != pseg.Range() {
+ pseg = mm.pmas.Isolate(pseg, copyAR)
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ oldpma = pseg.ValuePtr()
+ if oldpma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ oldpma.file.DecRef(pseg.fileRange())
+ mm.incPrivateRef(fr)
+ mf.IncRef(fr)
+ oldpma.file = mf
+ oldpma.off = fr.Start
+ oldpma.translatePerms = usermem.AnyAccess
+ oldpma.effectivePerms = vma.effectivePerms
+ oldpma.maxPerms = vma.maxPerms
+ oldpma.needCOW = false
+ oldpma.private = true
+ oldpma.internalMappings = safemem.BlockSeq{}
+ // Try to merge the pma with its neighbors.
+ if prev := pseg.PrevSegment(); prev.Ok() {
+ if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
+ pseg = merged
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ }
+ if next := pseg.NextSegment(); next.Ok() {
+ if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
+ pseg = merged
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ }
+ // The error returned by AllocateAndFill is only
+ // significant if it occurred before ar.End.
+ if err != nil && pseg.End() < ar.End {
+ return pstart, pseg.NextGap(), err
+ }
+ // Ensure pseg and pgap are correct for the next iteration
+ // of the loop.
+ pseg, pgap = pseg.NextNonEmpty()
+ } else if !oldpma.translatePerms.SupersetOf(at) {
+ // Get new pmas (with sufficient permissions) by calling
+ // memmap.Mappable.Translate again.
+ if checkInvariants {
+ if oldpma.private {
+ panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
+ }
+ }
+ // Allow the entire pma to be replaced.
+ optAR := pseg.Range()
+ optMR := vseg.mappableRangeOf(optAR)
+ reqAR := optAR.Intersect(ar)
+ reqMR := vseg.mappableRangeOf(reqAR)
+ perms := oldpma.translatePerms.Union(at)
+ ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
+ if checkInvariants {
+ if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
+ panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
+ }
+ }
+ // Remove the part of the existing pma covered by new
+ // Translations, then insert new pmas. This doesn't change
+ // RSS. Note that we don't need to call unmapASLocked: any
+ // existing AddressSpace mappings are still valid (though
+ // less permissive than the new pmas indicate) until
+ // Invalidate is called, and will be replaced by future
+ // calls to mapASLocked.
+ if len(ts) == 0 {
+ return pstart, pseg.PrevGap(), err
+ }
+ transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
+ transAR := vseg.addrRangeOf(transMR)
+ pseg = mm.pmas.Isolate(pseg, transAR)
+ pseg.ValuePtr().file.DecRef(pseg.fileRange())
+ pgap = mm.pmas.Remove(pseg)
+ pstart = pmaIterator{} // iterators invalidated
+ for _, t := range ts {
+ newpmaAR := vseg.addrRangeOf(t.Source)
+ newpma := pma{
+ file: t.File,
+ off: t.Offset,
+ translatePerms: t.Perms,
+ effectivePerms: vma.effectivePerms.Intersect(t.Perms),
+ maxPerms: vma.maxPerms.Intersect(t.Perms),
+ }
+ if vma.private {
+ newpma.effectivePerms.Write = false
+ newpma.maxPerms.Write = false
+ newpma.needCOW = true
+ }
+ t.File.IncRef(t.FileRange())
+ pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
+ pgap = pseg.NextGap()
+ }
+ // The error returned by Translate is only significant if
+ // it occurred before ar.End.
+ if err != nil && pseg.End() < ar.End {
+ return pstart, pgap, err
+ }
+ // Ensure pseg and pgap are correct for the next iteration
+ // of the loop.
+ if pgap.Range().Length() == 0 {
+ pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
+ } else {
+ pseg = pmaIterator{}
+ }
+ } else {
+ // We have a usable pma; continue.
+ pseg, pgap = pseg.NextNonEmpty()
+ }
+
+ default:
+ break pmaLoop
+ }
+ }
+ // Go to the next vma.
+ if ar.End <= vseg.End() {
+ if pgap.Ok() {
+ return pstart, pgap, nil
+ }
+ return pstart, pseg.PrevGap(), nil
+ }
+ vseg = vseg.NextSegment()
+ }
+}
+
+const (
+ // When memory is allocated for a private pma, align the allocated address
+ // range to a privateAllocUnit boundary when possible. Larger values of
+ // privateAllocUnit may reduce page faults by allowing fewer, larger pmas
+ // to be mapped, but may result in larger amounts of wasted memory in the
+ // presence of fragmentation. privateAllocUnit must be a power-of-2
+ // multiple of usermem.PageSize.
+ privateAllocUnit = usermem.HugePageSize
+
+ privateAllocMask = privateAllocUnit - 1
+)
+
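+// privateAligned returns ar expanded to privateAllocUnit boundaries: Start is
+// rounded down unconditionally, and End is rounded up unless doing so would
+// overflow.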
+func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
+ aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End}
+ if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
+ aligned.End = end
+ }
+ if checkInvariants {
+ if !aligned.IsSupersetOf(ar) {
+ panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
+ }
+ }
+ return aligned
+}
+
+// isPMACopyOnWriteLocked returns true if the contents of the pma represented
+// by pseg must be copied to a new private pma to be written to.
+//
+// If the pma is a copy-on-write private pma, and holds the only reference on
+// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
+// and update the pma to indicate that it does not require copy-on-write.
+//
+// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
+// locked. mm.activeMu must be locked for writing.
+func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
+ pma := pseg.ValuePtr()
+ if !pma.needCOW {
+ return false
+ }
+ if !pma.private {
+ return true
+ }
+ // If we have the only reference on private memory to be copied, just take
+ // ownership of it instead of copying. If we do hold the only reference,
+ // additional references can only be taken by mm.Fork(), which is excluded
+ // by mm.activeMu, so this isn't racy.
+ mm.privateRefs.mu.Lock()
+ defer mm.privateRefs.mu.Unlock()
+ fr := pseg.fileRange()
+ // This check relies on mm.privateRefs.refs being kept fully merged.
+ rseg := mm.privateRefs.refs.FindSegment(fr.Start)
+ if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
+ pma.needCOW = false
+ // pma.private => pma.translatePerms == usermem.AnyAccess
+ vma := vseg.ValuePtr()
+ pma.effectivePerms = vma.effectivePerms
+ pma.maxPerms = vma.maxPerms
+ return false
+ }
+ return true
+}
+
+// Invalidate implements memmap.MappingSpace.Invalidate.
+func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ if mm.captureInvalidations {
+ mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
+ return
+ }
+ mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
+}
+
+// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
+// addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ var didUnmapAS bool
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ for pseg.Ok() && pseg.Start() < ar.End {
+ pma := pseg.ValuePtr()
+ if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
+ pseg = mm.pmas.Isolate(pseg, ar)
+ pma = pseg.ValuePtr()
+ if !didUnmapAS {
+ // Unmap all of ar, not just pseg.Range(), to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ if pma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ mm.removeRSSLocked(pseg.Range())
+ pma.file.DecRef(pseg.fileRange())
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ } else {
+ pseg = pseg.NextSegment()
+ }
+ }
+}
+
+// Pin returns the platform.File ranges currently mapped by addresses in ar in
+// mm, acquiring a reference on the returned ranges which the caller must
+// release by calling Unpin. If not all addresses are mapped, Pin returns a
+// non-nil error. Note that Pin may return both a non-empty slice of
+// PinnedRanges and a non-nil error.
+//
+// Pin does not prevent mapped ranges from changing, making it unsuitable for
+// most I/O. It should only be used in contexts that would use get_user_pages()
+// in the Linux kernel.
+//
+// Preconditions: ar.Length() != 0. ar must be page-aligned.
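+//
+// For illustration, a typical caller pins a range, uses the returned ranges,
+// and then releases them; a sketch (doSomethingWith is a placeholder):
+//
+//	prs, err := mm.Pin(ctx, ar, usermem.Read, false /* ignorePermissions */)
+//	defer Unpin(prs) // releases any partial pins even if err != nil
+//	if err != nil {
+//		return err
+//	}
+//	doSomethingWith(prs)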
+func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return nil, verr
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return nil, perr
+ }
+ ar.End = pendaddr
+ }
+
+ // Gather pmas.
+ var prs []PinnedRange
+ for pseg.Ok() && pseg.Start() < ar.End {
+ psar := pseg.Range().Intersect(ar)
+ f := pseg.ValuePtr().file
+ fr := pseg.fileRangeOf(psar)
+ f.IncRef(fr)
+ prs = append(prs, PinnedRange{
+ Source: psar,
+ File: f,
+ Offset: fr.Start,
+ })
+ pseg = pseg.NextSegment()
+ }
+ mm.activeMu.Unlock()
+
+ // Return the first error in order of progress through ar.
+ if perr != nil {
+ return prs, perr
+ }
+ return prs, verr
+}
+
+// PinnedRanges are returned by MemoryManager.Pin.
+type PinnedRange struct {
+ // Source is the corresponding range of addresses.
+ Source usermem.AddrRange
+
+ // File is the mapped file.
+ File platform.File
+
+ // Offset is the offset into File at which this PinnedRange begins.
+ Offset uint64
+}
+
+// FileRange returns the platform.File offsets mapped by pr.
+func (pr PinnedRange) FileRange() platform.FileRange {
+ return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
+}
+
+// Unpin releases the reference held by prs.
+func Unpin(prs []PinnedRange) {
+ for i := range prs {
+ prs[i].File.DecRef(prs[i].FileRange())
+ }
+}
+
+// movePMAsLocked moves all pmas in oldAR to newAR.
+//
+// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
+// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
+// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
+ if checkInvariants {
+ if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
+ panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
+ }
+ if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() {
+ panic(fmt.Sprintf("invalid newAR: %v", newAR))
+ }
+ if oldAR.Length() > newAR.Length() {
+ panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
+ }
+ if oldAR.Overlaps(newAR) {
+ panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
+ }
+ // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
+ }
+
+ type movedPMA struct {
+ oldAR usermem.AddrRange
+ pma pma
+ }
+ var movedPMAs []movedPMA
+ pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
+ for pseg.Ok() && pseg.Start() < oldAR.End {
+ pseg = mm.pmas.Isolate(pseg, oldAR)
+ movedPMAs = append(movedPMAs, movedPMA{
+ oldAR: pseg.Range(),
+ pma: pseg.Value(),
+ })
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ // No RSS change is needed since we're re-inserting the same pmas
+ // below.
+ }
+
+ off := newAR.Start - oldAR.Start
+ pgap := mm.pmas.FindGap(newAR.Start)
+ for i := range movedPMAs {
+ mpma := &movedPMAs[i]
+ pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
+ pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
+ }
+
+ mm.unmapASLocked(oldAR)
+}
+
+// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
+// cached internal mappings. It returns:
+//
+// - An iterator to the gap after the last pma with internal mappings
+// containing an address in ar. If internal mappings exist for no addresses in
+// ar, the iterator is to a gap that begins before ar.Start.
+//
+// - An error that is non-nil if internal mappings exist for only a subset of
+// ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
+// ar.Length() != 0.
+//
+// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+ }
+ }
+
+ for {
+ if err := pseg.getInternalMappingsLocked(); err != nil {
+ return pseg.PrevGap(), err
+ }
+ if ar.End <= pseg.End() {
+ return pseg.NextGap(), nil
+ }
+ pseg, _ = pseg.NextNonEmpty()
+ }
+}
+
+// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
+// have cached internal mappings. It returns the subset of ars for which
+// internal mappings exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
+// all addresses in ar.
+//
+// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
+ }
+ }
+ return ars, nil
+}
+
+// internalMappingsLocked returns internal mappings for addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ar. ar.Length() != 0.
+// pseg.Range().Contains(ar.Start).
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+ }
+ }
+
+ if ar.End <= pseg.End() {
+ // Since only one pma is involved, we can use pma.internalMappings
+ // directly, avoiding a slice allocation.
+ offset := uint64(ar.Start - pseg.Start())
+ return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
+ }
+
+ var ims []safemem.Block
+ for {
+ pr := pseg.Range().Intersect(ar)
+ for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
+ ims = append(ims, pims.Head())
+ }
+ if ar.End <= pseg.End() {
+ break
+ }
+ pseg = pseg.NextSegment()
+ }
+ return safemem.BlockSeqFromSlice(ims)
+}
+
+// vecInternalMappingsLocked returns internal mappings for addresses in ars.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ars.
+func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
+ var ims []safemem.Block
+ for ; !ars.IsEmpty(); ars = ars.Tail() {
+ ar := ars.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
+ ims = append(ims, pims.Head())
+ }
+ }
+ return safemem.BlockSeqFromSlice(ims)
+}
+
+// incPrivateRef acquires a reference on private pages in fr.
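+//
+// Private page reference counts track sharing of private memory between
+// MemoryManagers related by Fork; they determine both when copy-on-write can
+// be elided (see isPMACopyOnWriteLocked) and when the underlying memory can
+// be freed (see decPrivateRef).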
+func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+ mm.privateRefs.mu.Lock()
+ defer mm.privateRefs.mu.Unlock()
+ refSet := &mm.privateRefs.refs
+ seg, gap := refSet.Find(fr.Start)
+ for {
+ switch {
+ case seg.Ok() && seg.Start() < fr.End:
+ seg = refSet.Isolate(seg, fr)
+ seg.SetValue(seg.Value() + 1)
+ seg, gap = seg.NextNonEmpty()
+ case gap.Ok() && gap.Start() < fr.End:
+ seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
+ default:
+ refSet.MergeAdjacent(fr)
+ return
+ }
+ }
+}
+
+// decPrivateRef releases a reference on private pages in fr.
+func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
+ var freed []platform.FileRange
+
+ mm.privateRefs.mu.Lock()
+ refSet := &mm.privateRefs.refs
+ seg := refSet.LowerBoundSegment(fr.Start)
+ for seg.Ok() && seg.Start() < fr.End {
+ seg = refSet.Isolate(seg, fr)
+ if old := seg.Value(); old == 1 {
+ freed = append(freed, seg.Range())
+ seg = refSet.Remove(seg).NextSegment()
+ } else {
+ seg.SetValue(old - 1)
+ seg = seg.NextSegment()
+ }
+ }
+ refSet.MergeAdjacent(fr)
+ mm.privateRefs.mu.Unlock()
+
+ mf := mm.mfp.MemoryFile()
+ for _, fr := range freed {
+ mf.DecRef(fr)
+ }
+}
+
+// addRSSLocked updates the current and maximum resident set size of a
+// MemoryManager to reflect the insertion of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) {
+ mm.curRSS += uint64(ar.Length())
+ if mm.curRSS > mm.maxRSS {
+ mm.maxRSS = mm.curRSS
+ }
+}
+
+// removeRSSLocked updates the current resident set size of a MemoryManager to
+// reflect the removal of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) {
+ mm.curRSS -= uint64(ar.Length())
+}
+
+// pmaSetFunctions implements segment.Functions for pmaSet.
+type pmaSetFunctions struct{}
+
+func (pmaSetFunctions) MinKey() usermem.Addr {
+ return 0
+}
+
+func (pmaSetFunctions) MaxKey() usermem.Addr {
+ return ^usermem.Addr(0)
+}
+
+func (pmaSetFunctions) ClearValue(pma *pma) {
+ pma.file = nil
+ pma.internalMappings = safemem.BlockSeq{}
+}
+
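+// Merge implements segment.Functions.Merge. Two pmas merge only if they map
+// contiguous offsets in the same file with identical permissions and
+// copy-on-write state.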
+func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) {
+ if pma1.file != pma2.file ||
+ pma1.off+uint64(ar1.Length()) != pma2.off ||
+ pma1.translatePerms != pma2.translatePerms ||
+ pma1.effectivePerms != pma2.effectivePerms ||
+ pma1.maxPerms != pma2.maxPerms ||
+ pma1.needCOW != pma2.needCOW ||
+ pma1.private != pma2.private {
+ return pma{}, false
+ }
+
+ // Discard internal mappings instead of trying to merge them, since merging
+ // them requires an allocation and getting them again from the
+ // platform.File might not.
+ pma1.internalMappings = safemem.BlockSeq{}
+ return pma1, true
+}
+
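+// Split implements segment.Functions.Split. The second returned pma's file
+// offset and cached internal mappings are adjusted to begin at the split
+// point.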
+func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) {
+ newlen1 := uint64(split - ar.Start)
+ p2 := p
+ p2.off += newlen1
+ if !p.internalMappings.IsEmpty() {
+ p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
+ p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
+ }
+ return p, p2
+}
+
+// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
+// so by scanning linearly backward from pgap.
+//
+// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
+ if checkInvariants {
+ if !pgap.Ok() {
+ panic("terminal pma iterator")
+ }
+ if addr > pgap.Start() {
+ panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
+ }
+ }
+ // Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
+ // which is the case if findOrSeekPrevUpperBoundPMA is called to find the
+ // start of a range containing only a single PMA.
+ if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
+ return pseg
+ }
+ return mm.pmas.UpperBoundSegment(addr)
+}
+
+// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
+// non-empty.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (pseg pmaIterator) getInternalMappingsLocked() error {
+ pma := pseg.ValuePtr()
+ if pma.internalMappings.IsEmpty() {
+ // This must use maxPerms (instead of perms) because some permission
+ // constraints are only visible to vmas; for example, mappings of
+ // read-only files have vma.maxPerms.Write unset, but this may not be
+ // visible to the memmap.Mappable.
+ perms := pma.maxPerms
+ // We will never execute application code through an internal mapping.
+ perms.Execute = false
+ ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
+ if err != nil {
+ return err
+ }
+ pma.internalMappings = ims
+ }
+ return nil
+}
+
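+// fileRange returns the platform.File offsets mapped by the pma iterated by
+// pseg.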
+func (pseg pmaIterator) fileRange() platform.FileRange {
+ return pseg.fileRangeOf(pseg.Range())
+}
+
+// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+ if checkInvariants {
+ if !pseg.Ok() {
+ panic("terminal pma iterator")
+ }
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().IsSupersetOf(ar) {
+ panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
+ }
+ }
+
+ pma := pseg.ValuePtr()
+ pstart := pseg.Start()
+ return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+}
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
new file mode 100644
index 000000000..6efe5102b
--- /dev/null
+++ b/pkg/sentry/mm/procfs.go
@@ -0,0 +1,329 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "bytes"
+ "fmt"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // devMinorBits is the number of minor bits in a device number. Linux:
+ // include/linux/kdev_t.h:MINORBITS
+ devMinorBits = 20
+
+ vsyscallEnd = usermem.Addr(0xffffffffff601000)
+ vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"
+ vsyscallSmapsEntry = vsyscallMapsEntry +
+ "Size: 4 kB\n" +
+ "Rss: 0 kB\n" +
+ "Pss: 0 kB\n" +
+ "Shared_Clean: 0 kB\n" +
+ "Shared_Dirty: 0 kB\n" +
+ "Private_Clean: 0 kB\n" +
+ "Private_Dirty: 0 kB\n" +
+ "Referenced: 0 kB\n" +
+ "Anonymous: 0 kB\n" +
+ "AnonHugePages: 0 kB\n" +
+ "Shared_Hugetlb: 0 kB\n" +
+ "Private_Hugetlb: 0 kB\n" +
+ "Swap: 0 kB\n" +
+ "SwapPss: 0 kB\n" +
+ "KernelPageSize: 4 kB\n" +
+ "MMUPageSize: 4 kB\n" +
+ "Locked: 0 kB\n" +
+ "VmFlags: rd ex \n"
+)
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
+ return true
+}
+
+// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. Everything about a
+ // vsyscall region is static, so just hard code the maps entry since we
+ // don't have a real vma backing it. The vsyscall region is at the end of
+ // the virtual address space so nothing should be mapped after it (if
+ // something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+ // get the sorting on the maps file wrong at worst; but that's not possible
+ // on any current platform).
+ //
+ // Artificially adjust the seqfile handle so we only output the vsyscall entry once.
+ if start != vsyscallEnd {
+ buf.WriteString(vsyscallMapsEntry)
+ }
+}
+
+// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var data []seqfile.SeqData
+ var start usermem.Addr
+ if handle != nil {
+ start = *handle.(*usermem.Addr)
+ }
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ vmaAddr := vseg.End()
+ data = append(data, seqfile.SeqData{
+ Buf: mm.vmaMapsEntryLocked(ctx, vseg),
+ Handle: &vmaAddr,
+ })
+ }
+
+ // We always emulate vsyscall, so advertise it here. Everything about a
+ // vsyscall region is static, so just hard code the maps entry since we
+ // don't have a real vma backing it. The vsyscall region is at the end of
+ // the virtual address space so nothing should be mapped after it (if
+ // something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+ // get the sorting on the maps file wrong at worst; but that's not possible
+ // on any current platform).
+ //
+ // Artificially adjust the seqfile handle so we only output the vsyscall entry once.
+ if start != vsyscallEnd {
+ vmaAddr := vsyscallEnd
+ data = append(data, seqfile.SeqData{
+ Buf: []byte(vsyscallMapsEntry),
+ Handle: &vmaAddr,
+ })
+ }
+ return data, 1
+}
+
+// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by
+// vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+ var b bytes.Buffer
+ mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+ return b.Bytes()
+}
+
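+// appendVMAMapsEntryLocked appends a /proc/[pid]/maps entry for the vma
+// iterated by vseg to b, in the usual maps format, e.g. (illustrative values
+// only):
+//
+//	00400000-0040b000 r-xp 00000000 08:01 123456    /bin/cat
+//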
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+ vma := vseg.ValuePtr()
+ private := "p"
+ if !vma.private {
+ private = "s"
+ }
+
+ var dev, ino uint64
+ if vma.id != nil {
+ dev = vma.id.DeviceID()
+ ino = vma.id.InodeID()
+ }
+ devMajor := uint32(dev >> devMinorBits)
+ devMinor := uint32(dev & ((1 << devMinorBits) - 1))
+
+ // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
+ // stack_guard_page_start().
+ lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+ vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
+
+ // Figure out our filename or hint.
+ var s string
+ if vma.hint != "" {
+ s = vma.hint
+ } else if vma.id != nil {
+ // FIXME(jamieliu): We are holding mm.mappingMu here, which is
+ // consistent with Linux's holding mmap_sem in
+ // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path().
+ // However, it's not clear that fs.File.MappedName() is actually
+ // consistent with this lock order.
+ s = vma.id.MappedName(ctx)
+ }
+ if s != "" {
+ // Per linux, we pad until the 74th character.
+ if pad := 73 - lineLen; pad > 0 {
+ b.WriteString(strings.Repeat(" ", pad))
+ }
+ b.WriteString(s)
+ }
+ b.WriteString("\n")
+}
+
+// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. See
+ // ReadMapsSeqFileData for additional commentary.
+ if start != vsyscallEnd {
+ buf.WriteString(vsyscallSmapsEntry)
+ }
+}
+
+// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var data []seqfile.SeqData
+ var start usermem.Addr
+ if handle != nil {
+ start = *handle.(*usermem.Addr)
+ }
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ vmaAddr := vseg.End()
+ data = append(data, seqfile.SeqData{
+ Buf: mm.vmaSmapsEntryLocked(ctx, vseg),
+ Handle: &vmaAddr,
+ })
+ }
+
+ // We always emulate vsyscall, so advertise it here. See
+ // ReadMapsSeqFileData for additional commentary.
+ if start != vsyscallEnd {
+ vmaAddr := vsyscallEnd
+ data = append(data, seqfile.SeqData{
+ Buf: []byte(vsyscallSmapsEntry),
+ Handle: &vmaAddr,
+ })
+ }
+ return data, 1
+}
+
+// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated
+// by vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+ var b bytes.Buffer
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b)
+ return b.Bytes()
+}
+
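+// vmaSmapsEntryIntoLocked appends a /proc/[pid]/smaps entry for the vma
+// iterated by vseg to b, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.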
+func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+ mm.appendVMAMapsEntryLocked(ctx, vseg, b)
+ vma := vseg.ValuePtr()
+
+ // We take mm.activeMu here in each call, instead of requiring it to be
+ // locked as a precondition, to reduce the latency impact of reading
+ // /proc/[pid]/smaps on concurrent performance-sensitive operations that
+ // require activeMu for writing, such as faults.
+ mm.activeMu.RLock()
+ var rss uint64
+ var anon uint64
+ vsegAR := vseg.Range()
+ for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() {
+ psegAR := pseg.Range().Intersect(vsegAR)
+ size := uint64(psegAR.Length())
+ rss += size
+ if pseg.ValuePtr().private {
+ anon += size
+ }
+ }
+ mm.activeMu.RUnlock()
+
+ fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024)
+ fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024)
+ // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma
+ // is only mapped by that pma. This avoids having to query memmap.Mappables
+ // for reference count information on each page. As a corollary, all pages
+ // are accounted as "private" whether or not the vma is private; compare
+ // Linux's fs/proc/task_mmu.c:smaps_account().
+ fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0)
+ // Pretend that all pages are dirty if the vma is writable, and clean otherwise.
+ clean := rss
+ if vma.effectivePerms.Write {
+ clean = 0
+ }
+ fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024)
+ fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024)
+ // Pretend that all pages are "referenced" (recently touched).
+ fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024)
+ // Hugepages (hugetlb and THP) are not implemented.
+ fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0)
+ fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0)
+ // Swap is not implemented.
+ fmt.Fprintf(b, "Swap: %8d kB\n", 0)
+ fmt.Fprintf(b, "SwapPss: %8d kB\n", 0)
+ fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
+ fmt.Fprintf(b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024)
+ locked := rss
+ if vma.mlockMode == memmap.MLockNone {
+ locked = 0
+ }
+ fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024)
+
+ b.WriteString("VmFlags: ")
+ if vma.realPerms.Read {
+ b.WriteString("rd ")
+ }
+ if vma.realPerms.Write {
+ b.WriteString("wr ")
+ }
+ if vma.realPerms.Execute {
+ b.WriteString("ex ")
+ }
+ if vma.canWriteMappableLocked() { // VM_SHARED
+ b.WriteString("sh ")
+ }
+ if vma.maxPerms.Read {
+ b.WriteString("mr ")
+ }
+ if vma.maxPerms.Write {
+ b.WriteString("mw ")
+ }
+ if vma.maxPerms.Execute {
+ b.WriteString("me ")
+ }
+ if !vma.private { // VM_MAYSHARE
+ b.WriteString("ms ")
+ }
+ if vma.growsDown {
+ b.WriteString("gd ")
+ }
+ if vma.mlockMode != memmap.MLockNone { // VM_LOCKED
+ b.WriteString("lo ")
+ }
+ if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT
+ b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags()
+ }
+ if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT
+ b.WriteString("ac ")
+ }
+ b.WriteString("\n")
+}
diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go
new file mode 100644
index 000000000..f56215d9a
--- /dev/null
+++ b/pkg/sentry/mm/save_restore.go
@@ -0,0 +1,57 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+)
+
+// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all
+// Mappables mapped by mm.
+func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ if vma := vseg.ValuePtr(); vma.mappable != nil {
+ if err := vma.mappable.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// beforeSave is invoked by stateify.
+func (mm *MemoryManager) beforeSave() {
+ mf := mm.mfp.MemoryFile()
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ if pma := pseg.ValuePtr(); pma.file != mf {
+ // InvalidateUnsavable should have caused all such pmas to be
+ // invalidated.
+ panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm))
+ }
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (mm *MemoryManager) afterLoad() {
+ mm.haveASIO = mm.p.SupportsAddressSpaceIO()
+ mf := mm.mfp.MemoryFile()
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ pseg.ValuePtr().file = mf
+ }
+}
diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go
new file mode 100644
index 000000000..6432731d4
--- /dev/null
+++ b/pkg/sentry/mm/shm.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/shm"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// DetachShm unmaps a sysv shared memory segment.
+func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error {
+ if addr != addr.RoundDown() {
+ // "... shmaddr is not aligned on a page boundary." - man shmdt(2)
+ return syserror.EINVAL
+ }
+
+ var detached *shm.Shm
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ // Find and remove the first vma containing an address >= addr that maps a
+ // segment originally attached at addr.
+ vseg := mm.vmas.LowerBoundSegment(addr)
+ for vseg.Ok() {
+ vma := vseg.ValuePtr()
+ if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off {
+ detached = shm
+ vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment()
+ break
+ } else {
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ if detached == nil {
+ // There is no shared memory segment attached at addr.
+ return syserror.EINVAL
+ }
+
+ // Remove all vmas that could have been created by the same attach.
+ end := addr + usermem.Addr(detached.EffectiveSize())
+ for vseg.Ok() && vseg.End() <= end {
+ vma := vseg.ValuePtr()
+ if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off {
+ vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment()
+ } else {
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
new file mode 100644
index 000000000..9ad52082d
--- /dev/null
+++ b/pkg/sentry/mm/special_mappable.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with
+// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except
+// that SpecialMappable takes ownership of the memory that it represents
+// (_install_special_mapping() does not).
+//
+// +stateify savable
+type SpecialMappable struct {
+ refs.AtomicRefCount
+
+ mfp pgalloc.MemoryFileProvider
+ fr platform.FileRange
+ name string
+}
+
+// NewSpecialMappable returns a SpecialMappable that owns fr, which represents
+// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The
+// SpecialMappable will use the given name in /proc/[pid]/maps.
+//
+// Preconditions: fr.Length() != 0.
+func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
+ m := SpecialMappable{mfp: mfp, fr: fr, name: name}
+ m.EnableLeakCheck("mm.SpecialMappable")
+ return &m
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+func (m *SpecialMappable) DecRef() {
+ m.AtomicRefCount.DecRefWithDestructor(func() {
+ m.mfp.MemoryFile().DecRef(m.fr)
+ })
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (m *SpecialMappable) MappedName(ctx context.Context) string {
+ return m.name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (m *SpecialMappable) DeviceID() uint64 {
+ return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (m *SpecialMappable) InodeID() uint64 {
+ return 0
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ // Linux: vm_file is NULL, causing msync to skip it entirely.
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) error {
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) {
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error {
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ var err error
+ if required.End > m.fr.Length() {
+ err = &memmap.BusError{syserror.EFAULT}
+ }
+ if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
+ return []memmap.Translation{
+ {
+ Source: source,
+ File: m.mfp.MemoryFile(),
+ Offset: m.fr.Start + source.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, err
+ }
+ return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error {
+ // Since data is stored in pgalloc.MemoryFile, the contents of which are
+ // preserved across save/restore, we don't need to do anything.
+ return nil
+}
+
+// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores
+// the SpecialMappable's contents.
+func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider {
+ return m.mfp
+}
+
+// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that
+// store the SpecialMappable's contents.
+func (m *SpecialMappable) FileRange() platform.FileRange {
+ return m.fr
+}
+
+// Length returns the length of the SpecialMappable.
+func (m *SpecialMappable) Length() uint64 {
+ return m.fr.Length()
+}
+
+// NewSharedAnonMappable returns a SpecialMappable that implements the
+// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
+//
+// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
+// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
+// do the same to get non-zero device and inode IDs.
+func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
+ if length == 0 {
+ return nil, syserror.EINVAL
+ }
+ alignedLen, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return nil, syserror.EINVAL
+ }
+ fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous)
+ if err != nil {
+ return nil, err
+ }
+ return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil
+}
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
new file mode 100644
index 000000000..3f496aa9f
--- /dev/null
+++ b/pkg/sentry/mm/syscalls.go
@@ -0,0 +1,1286 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ mrand "math/rand"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// HandleUserFault handles an application page fault. sp is the faulting
+// application thread's stack pointer.
+//
+// Preconditions: mm.as != nil.
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
+ ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
+ if !ok {
+ return syserror.EFAULT
+ }
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have a usable vma. Here and below, since we are only
+ // asking for a single page, there is no possibility of partial success,
+ // and any error is immediately fatal.
+ mm.mappingMu.RLock()
+ vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if err != nil {
+ mm.mappingMu.RUnlock()
+ return err
+ }
+
+ // Ensure that we have a usable pma.
+ mm.activeMu.Lock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ mm.activeMu.Unlock()
+ return err
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // Map the faulted page into the active AddressSpace.
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return err
+}
+
+// MMap establishes a memory mapping.
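+//
+// For illustration, an anonymous private mapping such as the one produced by
+// mmap(MAP_PRIVATE|MAP_ANONYMOUS) might reach MMap as (a sketch showing only
+// a subset of MMapOpts fields):
+//
+//	addr, err := mm.MMap(ctx, memmap.MMapOpts{
+//		Length:   length,
+//		Private:  true,
+//		Perms:    usermem.ReadWrite,
+//		MaxPerms: usermem.AnyAccess,
+//	})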
+func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
+ if opts.Length == 0 {
+ return 0, syserror.EINVAL
+ }
+ length, ok := usermem.Addr(opts.Length).RoundUp()
+ if !ok {
+ return 0, syserror.ENOMEM
+ }
+ opts.Length = uint64(length)
+
+ if opts.Mappable != nil {
+ // Offset must be aligned.
+ if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
+ return 0, syserror.EINVAL
+ }
+ // Offset + length must not overflow.
+ if end := opts.Offset + opts.Length; end < opts.Offset {
+ return 0, syserror.ENOMEM
+ }
+ } else {
+ opts.Offset = 0
+ if !opts.Private {
+ if opts.MappingIdentity != nil {
+ return 0, syserror.EINVAL
+ }
+ m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+ if err != nil {
+ return 0, err
+ }
+ defer m.DecRef()
+ opts.MappingIdentity = m
+ opts.Mappable = m
+ }
+ }
+
+ if opts.Addr.RoundDown() != opts.Addr {
+ // MAP_FIXED requires addr to be page-aligned; non-fixed mappings
+ // don't.
+ if opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ opts.Addr = opts.Addr.RoundDown()
+ }
+
+ if !opts.MaxPerms.SupersetOf(opts.Perms) {
+ return 0, syserror.EACCES
+ }
+ if opts.Unmap && !opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ if opts.GrowsDown && opts.Mappable != nil {
+ return 0, syserror.EINVAL
+ }
+
+ // Get the new vma.
+ mm.mappingMu.Lock()
+ if opts.MLockMode < mm.defMLockMode {
+ opts.MLockMode = mm.defMLockMode
+ }
+ vseg, ar, err := mm.createVMALocked(ctx, opts)
+ if err != nil {
+ mm.mappingMu.Unlock()
+ return 0, err
+ }
+
+ // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new
+ // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
+ // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
+ // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
+ // populate_vma_page_range(). Confirm this behavior.
+ switch {
+ case opts.Precommit || opts.MLockMode == memmap.MLockEager:
+ // Get pmas and map with precommit as requested.
+ mm.populateVMAAndUnlock(ctx, vseg, ar, true)
+
+ case opts.Mappable == nil && length <= privateAllocUnit:
+ // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope
+ // that doing so will save on future page faults. We only do this for
+ // anonymous mappings, since otherwise the cost of
+ // memmap.Mappable.Translate is unknown; and only for small mappings,
+ // to avoid needing to allocate large amounts of memory that we may
+ // subsequently need to checkpoint.
+ mm.populateVMAAndUnlock(ctx, vseg, ar, false)
+
+ default:
+ mm.mappingMu.Unlock()
+ }
+
+ return ar.Start, nil
+}
+
+// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
+// into mm.as if it is active.
+//
+// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ // Linux doesn't populate inaccessible pages. See
+ // mm/gup.c:populate_vma_page_range.
+ return
+ }
+
+ mm.activeMu.Lock()
+ // Can't defer mm.activeMu.Unlock(); see below.
+
+ // Even if we get new pmas, we can't actually map them if we don't have an
+ // AddressSpace.
+ if mm.as == nil {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Ensure that we have usable pmas.
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
+ if err != nil {
+ // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
+ // mm/gup.c:mm_populate(). If it matters, we'll get it again when
+ // userspace actually tries to use the failing page.
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // As above, errors are silently ignored.
+ mm.mapASLocked(pseg, ar, precommit)
+ mm.activeMu.RUnlock()
+}
+
+// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
+// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
+// preferable to populateVMA since it unlocks mm.mappingMu before performing
+// expensive operations that don't require it to be locked.
+//
+// Preconditions: mm.mappingMu must be locked for writing.
+// vseg.Range().IsSupersetOf(ar).
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+ // See populateVMA above for commentary.
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ mm.activeMu.Lock()
+
+ if mm.as == nil {
+ mm.activeMu.Unlock()
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
+ // isn't needed at all for mapASLocked.
+ mm.mappingMu.DowngradeLock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ mm.activeMu.DowngradeLock()
+ mm.mapASLocked(pseg, ar, precommit)
+ mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+ // maxStackSize is the maximum supported process stack size in bytes.
+ //
+ // This limit exists because stack growing isn't implemented, so the entire
+ // process stack must be mapped up-front.
+ const maxStackSize = 128 << 20
+
+ stackSize := limits.FromContext(ctx).Get(limits.Stack)
+ r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+ sz := uint64(r)
+ if !ok {
+ // RLIM_INFINITY rounds up to 0.
+ sz = linux.DefaultStackSoftLimit
+ } else if sz > maxStackSize {
+ ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+ sz = maxStackSize
+ } else if sz == 0 {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ szaddr := usermem.Addr(sz)
+ ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+ // Determine the stack's desired location. Unlike Linux, address
+ // randomization can't be disabled.
+ stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+ if stackEnd < szaddr {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ stackStart := stackEnd - szaddr
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: sz,
+ Addr: stackStart,
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ GrowsDown: true,
+ MLockMode: mm.defMLockMode,
+ Hint: "[stack]",
+ })
+ return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+ if addr != addr.RoundDown() {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return syserror.EINVAL
+ }
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.EINVAL
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ mm.unmapLocked(ctx, ar)
+ return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+ // Move controls whether MRemap moves the remapped mapping to a new address.
+ Move MRemapMoveMode
+
+ // NewAddr is the new address for the remapping. NewAddr is ignored unless
+// Move is MRemapMustMove.
+ NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+ // MRemapNoMove prevents MRemap from moving the remapped mapping.
+ MRemapNoMove MRemapMoveMode = iota
+
+ // MRemapMayMove allows MRemap to move the remapped mapping.
+ MRemapMayMove
+
+ // MRemapMustMove requires MRemap to move the remapped mapping to
+ // MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+ // range.
+ MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
+ // "Note that old_address has to be page aligned." - mremap(2)
+ if oldAddr.RoundDown() != oldAddr {
+ return 0, syserror.EINVAL
+ }
+
+ // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
+ // valid size. However, new_size can't be 0 after rounding.
+ oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
+ oldSize = uint64(oldSizeAddr)
+ newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
+ if !ok || newSizeAddr == 0 {
+ return 0, syserror.EINVAL
+ }
+ newSize = uint64(newSizeAddr)
+
+ oldEnd, ok := oldAddr.AddLength(oldSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ // All cases require that a vma exists at oldAddr.
+ vseg := mm.vmas.FindSegment(oldAddr)
+ if !vseg.Ok() {
+ return 0, syserror.EFAULT
+ }
+
+ // Behavior matrix:
+ //
+ // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
+ // ---------+-------------+-------------------+-------------------+------------------
+ // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place
+ // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place
+ // | | move | |
+ // MustMove | Copy | Move and grow | Move | Shrink and move
+ //
+ // [1] In-place growth is impossible because the vma at oldAddr already
+ // occupies at least part of the destination. Thus the NoMove case always
+ // fails and the MayMove case always falls back to copying.
+
+ if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
+ // mremap in Linux does not check mm/mlock.c:can_do_mlock() and
+ // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
+ // !CAP_IPC_LOCK.
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
+ return 0, syserror.EAGAIN
+ }
+ }
+ }
+
+ if opts.Move != MRemapMustMove {
+ // Handle no-ops and in-place shrinking. These cases don't care if
+ // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
+ // (aside from oldAddr).
+ if newSize <= oldSize {
+ if newSize < oldSize {
+ // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
+ // either.
+ newEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
+ }
+ return oldAddr, nil
+ }
+
+ // Handle in-place growing.
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+ // "Grow" the existing vma by creating a new mergeable one.
+ vma := vseg.ValuePtr()
+ var newOffset uint64
+ if vma.mappable != nil {
+ newOffset = vseg.mappableRange().End
+ }
+ vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: newSize - oldSize,
+ MappingIdentity: vma.id,
+ Mappable: vma.mappable,
+ Offset: newOffset,
+ Addr: oldEnd,
+ Fixed: true,
+ Perms: vma.realPerms,
+ MaxPerms: vma.maxPerms,
+ Private: vma.private,
+ GrowsDown: vma.growsDown,
+ MLockMode: vma.mlockMode,
+ Hint: vma.hint,
+ })
+ if err == nil {
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, ar, true)
+ }
+ return oldAddr, nil
+ }
+ // In-place growth failed. In the MRemapMayMove case, fall through to
+ // copying/moving below.
+ if opts.Move == MRemapNoMove {
+ return 0, err
+ }
+ }
+
+ // Find a location for the new mapping.
+ var newAR usermem.AddrRange
+ switch opts.Move {
+ case MRemapMayMove:
+ newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
+ if err != nil {
+ return 0, err
+ }
+ newAR, _ = newAddr.ToRange(newSize)
+
+ case MRemapMustMove:
+ newAddr := opts.NewAddr
+ if newAddr.RoundDown() != newAddr {
+ return 0, syserror.EINVAL
+ }
+ var ok bool
+ newAR, ok = newAddr.ToRange(newSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that the new region is valid.
+ _, err := mm.findAvailableLocked(newSize, findAvailableOpts{
+ Addr: newAddr,
+ Fixed: true,
+ Unmap: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ // Unmap any mappings at the destination.
+ mm.unmapLocked(ctx, newAR)
+
+ // If the sizes specify shrinking, unmap everything between the new and
+ // old sizes at the source. Unmapping before the following checks is
+ // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
+ // vma_to_resize().
+ if newSize < oldSize {
+ oldNewEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
+ oldEnd = oldNewEnd
+ }
+
+ // unmapLocked may have invalidated vseg; look it up again.
+ vseg = mm.vmas.FindSegment(oldAddr)
+ }
+
+ oldAR := usermem.AddrRange{oldAddr, oldEnd}
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+
+ // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return 0, syserror.ENOMEM
+ }
+
+ if vma := vseg.ValuePtr(); vma.mappable != nil {
+ // Check that offset+length does not overflow.
+ if vma.off+uint64(newAR.Length()) < vma.off {
+ return 0, syserror.EINVAL
+ }
+ // Inform the Mappable, if any, of the new mapping.
+ if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
+ return 0, err
+ }
+ }
+
+ if oldSize == 0 {
+ // Handle copying.
+ //
+ // We can't use createVMALocked because it calls Mappable.AddMapping,
+ // whereas we've already called Mappable.CopyMapping (which is
+ // consistent with Linux). Call vseg.Value() (rather than
+ // vseg.ValuePtr()) to make a copy of the vma.
+ vma := vseg.Value()
+ if vma.mappable != nil {
+ vma.off = vseg.mappableOffsetAt(oldAR.Start)
+ }
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
+ mm.usageAS += uint64(newAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS += uint64(newAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS += uint64(newAR.Length())
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, newAR, true)
+ }
+ }
+ return newAR.Start, nil
+ }
+
+ // Handle moving.
+ //
+ // Remove the existing vma before inserting the new one to minimize
+ // iterator invalidation. We do this directly (instead of calling
+ // removeVMAsLocked) because:
+ //
+ // 1. We can't drop the reference on vma.id, which will be transferred to
+ // the new vma.
+ //
+ // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
+ // oldAR, so calling RemoveMapping could cause us to miss an invalidation
+ // overlapping oldAR.
+ //
+ // Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the
+ // vma.
+ vseg = mm.vmas.Isolate(vseg, oldAR)
+ vma := vseg.Value()
+ mm.vmas.Remove(vseg)
+ vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
+ mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ }
+
+ // Move pmas. This is technically optional for non-private pmas, which
+ // could just go through memmap.Mappable.Translate again, but it's required
+ // for private pmas.
+ mm.activeMu.Lock()
+ mm.movePMAsLocked(oldAR, newAR)
+ mm.activeMu.Unlock()
+
+ // Now that pmas have been moved to newAR, we can notify vma.mappable that
+ // oldAR is no longer mapped.
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
+ }
+
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, newAR, true)
+ }
+
+ return newAR.Start, nil
+}
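+
+// As a usage sketch (illustrative only: the options struct is assumed to be
+// the MRemapOpts type whose Move and NewAddr fields are used above, and the
+// linux.MREMAP_* constants are assumed from the abi/linux package), a
+// mremap(2) handler might translate its flags as follows:
+//
+//    move := MRemapNoMove
+//    if flags&linux.MREMAP_MAYMOVE != 0 {
+//        move = MRemapMayMove
+//    }
+//    if flags&linux.MREMAP_FIXED != 0 {
+//        move = MRemapMustMove
+//    }
+//    newAddr, err := mm.MRemap(ctx, oldAddr, oldSize, newSize, MRemapOpts{
+//        Move:    move,
+//        NewAddr: fixedAddr, // only consulted when Move == MRemapMustMove
+//    })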
+
+// MProtect implements the semantics of Linux's mprotect(2).
+func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
+ if addr.RoundDown() != addr {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return nil
+ }
+ rlength, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(rlength))
+ if !ok {
+ return syserror.ENOMEM
+ }
+ effectivePerms := realPerms.Effective()
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Non-growsDown mprotect requires that all of ar is mapped, and stops at
+ // the first non-empty gap. growsDown mprotect requires that the first vma
+ // be growsDown, but does not require it to extend all the way to ar.Start;
+ // vmas after the first must be contiguous but need not be growsDown, like
+ // the non-growsDown case.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ if growsDown {
+ if !vseg.ValuePtr().growsDown {
+ return syserror.EINVAL
+ }
+ if ar.End <= vseg.Start() {
+ return syserror.ENOMEM
+ }
+ ar.Start = vseg.Start()
+ } else {
+ if ar.Start < vseg.Start() {
+ return syserror.ENOMEM
+ }
+ }
+
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ mm.pmas.MergeRange(ar)
+ mm.pmas.MergeAdjacent(ar)
+ }()
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ var didUnmapAS bool
+ for {
+ // Check for permission validity before splitting vmas, for consistency
+ // with Linux.
+ if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
+ return syserror.EACCES
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+
+ // Update vma permissions.
+ vma := vseg.ValuePtr()
+ vmaLength := vseg.Range().Length()
+ if vma.isPrivateDataLocked() {
+ mm.dataAS -= uint64(vmaLength)
+ }
+
+ vma.realPerms = realPerms
+ vma.effectivePerms = effectivePerms
+ if vma.isPrivateDataLocked() {
+ mm.dataAS += uint64(vmaLength)
+ }
+
+ // Propagate vma permission changes to pmas.
+ for pseg.Ok() && pseg.Start() < vseg.End() {
+ if pseg.Range().Overlaps(vseg.Range()) {
+ pseg = mm.pmas.Isolate(pseg, vseg.Range())
+ pma := pseg.ValuePtr()
+ if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
+ // Unmap all of ar, not just vseg.Range(), to minimize host
+ // syscalls.
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
+ if pma.needCOW {
+ pma.effectivePerms.Write = false
+ }
+ }
+ pseg = pseg.NextSegment()
+ }
+
+ // Continue to the next vma.
+ if ar.End <= vseg.End() {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ }
+}
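+
+// A minimal sketch of how an mprotect(2) handler might build the arguments
+// (the linux.PROT_* constants are assumed from the abi/linux package):
+//
+//    realPerms := usermem.AccessType{
+//        Read:    prot&linux.PROT_READ != 0,
+//        Write:   prot&linux.PROT_WRITE != 0,
+//        Execute: prot&linux.PROT_EXEC != 0,
+//    }
+//    err := mm.MProtect(addr, length, realPerms, prot&linux.PROT_GROWSDOWN != 0)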
+
+// BrkSetup sets mm's brk address to addr and its brk size to 0.
+func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Unmap the existing brk.
+ if mm.brk.Length() != 0 {
+ mm.unmapLocked(ctx, mm.brk)
+ }
+ mm.brk = usermem.AddrRange{addr, addr}
+}
+
+// Brk implements the semantics of Linux's brk(2), except that it returns an
+// error on failure.
+func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if addr < mm.brk.Start {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.EINVAL
+ }
+
+ // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is
+ // slightly more permissive than the usual data limit. In particular,
+ // this only limits the size of the heap; a true RLIMIT_DATA limits the
+ // size of heap + data + bss. The segment sizes need to be plumbed from
+ // the loader package to fully enforce RLIMIT_DATA.
+ if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.ENOMEM
+ }
+
+ oldbrkpg, _ := mm.brk.End.RoundUp()
+ newbrkpg, ok := addr.RoundUp()
+ if !ok {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.EFAULT
+ }
+
+ switch {
+ case oldbrkpg < newbrkpg:
+ vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: uint64(newbrkpg - oldbrkpg),
+ Addr: oldbrkpg,
+ Fixed: true,
+ // Compare Linux's
+ // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
+ // mm->def_flags.
+ MLockMode: mm.defMLockMode,
+ Hint: "[heap]",
+ })
+ if err != nil {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, err
+ }
+ mm.brk.End = addr
+ if mm.defMLockMode == memmap.MLockEager {
+ mm.populateVMAAndUnlock(ctx, vseg, ar, true)
+ } else {
+ mm.mappingMu.Unlock()
+ }
+
+ case newbrkpg < oldbrkpg:
+ mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
+ fallthrough
+
+ default:
+ mm.brk.End = addr
+ mm.mappingMu.Unlock()
+ }
+
+ return addr, nil
+}
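+
+// Since Brk reports the resulting (possibly unchanged) program break even when
+// it fails, a brk(2) handler can mirror Linux by ignoring the error; a minimal
+// sketch, with requestedAddr standing in for the syscall argument:
+//
+//    newBrk, _ := mm.Brk(ctx, requestedAddr)
+//    // brk(2) reports the current break on both success and failure.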
+
+// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
+// depending on mode.
+func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
+ ar, ok := addr.RoundDown().ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if mode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ mm.mappingMu.Unlock()
+ return syserror.EPERM
+ }
+ if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+ }
+ }
+
+ // Check this after RLIMIT_MEMLOCK for consistency with Linux.
+ if ar.Length() == 0 {
+ mm.mappingMu.Unlock()
+ return nil
+ }
+
+ // Apply the new mlock mode to vmas.
+ var unmapped bool
+ vseg := mm.vmas.FindSegment(ar.Start)
+ for {
+ if !vseg.Ok() {
+ unmapped = true
+ break
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ prevMode := vma.mlockMode
+ vma.mlockMode = mode
+ if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+ mm.lockedAS += uint64(vseg.Range().Length())
+ } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vseg.Range().Length())
+ }
+ if ar.End <= vseg.End() {
+ break
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ if unmapped {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+
+ if mode == memmap.MLockEager {
+ // Ensure that we have usable pmas. Since we didn't return ENOMEM
+ // above, ar must be fully covered by vmas, so we can just use
+ // NextSegment below.
+ mm.activeMu.Lock()
+ mm.mappingMu.DowngradeLock()
+ for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
+ // case, which is converted to ENOMEM by mlock.
+ mm.activeMu.Unlock()
+ mm.mappingMu.RUnlock()
+ return syserror.ENOMEM
+ }
+ _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess)
+ if err != nil {
+ mm.activeMu.Unlock()
+ mm.mappingMu.RUnlock()
+ // Linux: mm/mlock.c:__mlock_posix_error_return()
+ if err == syserror.EFAULT {
+ return syserror.ENOMEM
+ }
+ if err == syserror.ENOMEM {
+ return syserror.EAGAIN
+ }
+ return err
+ }
+ }
+
+ // Map pmas into the active AddressSpace, if we have one.
+ mm.mappingMu.RUnlock()
+ if mm.as != nil {
+ mm.activeMu.DowngradeLock()
+ err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
+ mm.activeMu.RUnlock()
+ if err != nil {
+ return err
+ }
+ } else {
+ mm.activeMu.Unlock()
+ }
+ } else {
+ mm.mappingMu.Unlock()
+ }
+
+ return nil
+}
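+
+// The mode argument selects which Linux call this implements; a minimal sketch
+// (memmap.MLockLazy is assumed to be the mode used for MLOCK_ONFAULT):
+//
+//    mm.MLock(ctx, addr, length, memmap.MLockEager) // mlock(2)
+//    mm.MLock(ctx, addr, length, memmap.MLockLazy)  // mlock2(2) with MLOCK_ONFAULT
+//    mm.MLock(ctx, addr, length, memmap.MLockNone)  // munlock(2)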
+
+// MLockAllOpts holds options to MLockAll.
+type MLockAllOpts struct {
+ // If Current is true, change the memory-locking behavior of all mappings
+ // to Mode. If Future is true, upgrade the memory-locking behavior of all
+ // future mappings to Mode. At least one of Current or Future must be true.
+ Current bool
+ Future bool
+ Mode memmap.MLockMode
+}
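+
+// A minimal sketch of how an mlockall(2) handler might fill MLockAllOpts (the
+// linux.MCL_* constants are assumed from the abi/linux package):
+//
+//    err := mm.MLockAll(ctx, MLockAllOpts{
+//        Current: flags&linux.MCL_CURRENT != 0,
+//        Future:  flags&linux.MCL_FUTURE != 0,
+//        Mode:    memmap.MLockEager,
+//    })
+//    // munlockall(2): Current and Future true, Mode memmap.MLockNone.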
+
+// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
+// depending on opts.
+func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
+ if !opts.Current && !opts.Future {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if opts.Current {
+ if opts.Mode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ mm.mappingMu.Unlock()
+ return syserror.EPERM
+ }
+ if uint64(mm.vmas.Span()) > mlockLimit {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+ }
+ }
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ vma := vseg.ValuePtr()
+ prevMode := vma.mlockMode
+ vma.mlockMode = opts.Mode
+ if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+ mm.lockedAS += uint64(vseg.Range().Length())
+ } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vseg.Range().Length())
+ }
+ }
+ }
+
+ if opts.Future {
+ mm.defMLockMode = opts.Mode
+ }
+
+ if opts.Current && opts.Mode == memmap.MLockEager {
+ // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
+ // ignores the return value of __mm_populate(), so all errors below are
+ // ignored.
+ //
+ // Try to get usable pmas.
+ mm.activeMu.Lock()
+ mm.mappingMu.DowngradeLock()
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ if vseg.ValuePtr().effectivePerms.Any() {
+ mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess)
+ }
+ }
+
+ // Map all pmas into the active AddressSpace, if we have one.
+ mm.mappingMu.RUnlock()
+ if mm.as != nil {
+ mm.activeMu.DowngradeLock()
+ mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
+ mm.activeMu.RUnlock()
+ } else {
+ mm.activeMu.Unlock()
+ }
+ } else {
+ mm.mappingMu.Unlock()
+ }
+ return nil
+}
+
+// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg := mm.vmas.FindSegment(addr)
+ if !vseg.Ok() {
+ return 0, 0, syserror.EFAULT
+ }
+ vma := vseg.ValuePtr()
+ return vma.numaPolicy, vma.numaNodemask, nil
+}
+
+// SetNumaPolicy implements the semantics of Linux's mbind().
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
+ if !addr.IsPageAligned() {
+ return syserror.EINVAL
+ }
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length).RoundUp()
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ar.Length() == 0 {
+ return nil
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() || lastEnd < vseg.Start() {
+ // "EFAULT: ... there was an unmapped hole in the specified memory
+ // range specified [sic] by addr and len." - mbind(2)
+ return syserror.EFAULT
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.numaPolicy = policy
+ vma.numaNodemask = nodemask
+ lastEnd = vseg.End()
+ if ar.End <= lastEnd {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+}
+
+// SetDontFork implements the semantics of madvise MADV_DONTFORK.
+func (mm *MemoryManager) SetDontFork(addr usermem.Addr, length uint64, dontfork bool) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.dontfork = dontfork
+ }
+
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
+func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+
+ // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
+ // is analogous to our mm.invalidateLocked(ar, true, true). We inline this
+ // here, with the special case that we synchronously decommit
+ // uniquely-owned (non-copy-on-write) pages for private anonymous vmas,
+ // which is the common case for MADV_DONTNEED. Invalidating these pmas, and
+ // allowing them to be reallocated when touched again, increases pma
+ // fragmentation, which may significantly reduce performance for
+ // non-vectored I/O implementations. Also, decommitting synchronously
+ // ensures that Decommit immediately reduces host memory usage.
+ var didUnmapAS bool
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ mf := mm.mfp.MemoryFile()
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ vma := vseg.ValuePtr()
+ if vma.mlockMode != memmap.MLockNone {
+ return syserror.EINVAL
+ }
+ vsegAR := vseg.Range().Intersect(ar)
+ // pseg should already correspond to either this vma or a later one,
+ // since there can't be a pma without a corresponding vma.
+ if checkInvariants {
+ if pseg.Ok() && pseg.End() <= vsegAR.Start {
+ panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
+ }
+ }
+ for pseg.Ok() && pseg.Start() < vsegAR.End {
+ pma := pseg.ValuePtr()
+ if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) {
+ psegAR := pseg.Range().Intersect(ar)
+ if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
+ if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
+ pseg = pseg.NextSegment()
+ continue
+ }
+ // If an error occurs, fall through to the general
+ // invalidation case below.
+ }
+ }
+ pseg = mm.pmas.Isolate(pseg, vsegAR)
+ pma = pseg.ValuePtr()
+ if !didUnmapAS {
+ // Unmap all of ar, not just pseg.Range(), to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ if pma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ pma.file.DecRef(pseg.fileRange())
+ mm.removeRSSLocked(pseg.Range())
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ }
+ }
+
+ // "If there are some parts of the specified address space that are not
+ // mapped, the Linux version of madvise() ignores them and applies the call
+ // to the rest (but returns ENOMEM from the system call, as it should)." -
+ // madvise(2)
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
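+
+// A minimal sketch of the corresponding madvise(2) dispatch, with advice
+// standing in for the syscall argument (the linux.MADV_DONTNEED constant is
+// assumed from the abi/linux package):
+//
+//    if advice == linux.MADV_DONTNEED {
+//        return mm.Decommit(addr, length)
+//    }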
+
+// MSyncOpts holds options to MSync.
+type MSyncOpts struct {
+ // Sync has the semantics of MS_SYNC.
+ Sync bool
+
+ // Invalidate has the semantics of MS_INVALIDATE.
+ Invalidate bool
+}
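+
+// A minimal sketch of translating msync(2) flags into MSyncOpts (the
+// linux.MS_* constants are assumed from the abi/linux package):
+//
+//    err := mm.MSync(ctx, addr, length, MSyncOpts{
+//        Sync:       flags&linux.MS_SYNC != 0,
+//        Invalidate: flags&linux.MS_INVALIDATE != 0,
+//    })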
+
+// MSync implements the semantics of Linux's msync().
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
+ if addr != addr.RoundDown() {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return nil
+ }
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.ENOMEM
+ }
+
+ mm.mappingMu.RLock()
+ // Can't defer mm.mappingMu.RUnlock(); see below.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ return syserror.ENOMEM
+ }
+ var unmapped bool
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ unmapped = true
+ break
+ }
+ if lastEnd < vseg.Start() {
+ unmapped = true
+ }
+ lastEnd = vseg.End()
+ vma := vseg.ValuePtr()
+ if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
+ mm.mappingMu.RUnlock()
+ return syserror.EBUSY
+ }
+ // It's only possible to have dirtied the Mappable through a shared
+ // mapping. Don't check if the mapping is writable, because mprotect
+ // may have changed this, and also because Linux doesn't.
+ if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
+ // We can't call memmap.MappingIdentity.Msync while holding
+ // mm.mappingMu since it may take fs locks that precede it in the
+ // lock order.
+ id.IncRef()
+ mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
+ mm.mappingMu.RUnlock()
+ err := id.Msync(ctx, mr)
+ id.DecRef()
+ if err != nil {
+ return err
+ }
+ if lastEnd >= ar.End {
+ break
+ }
+ mm.mappingMu.RLock()
+ vseg = mm.vmas.LowerBoundSegment(lastEnd)
+ } else {
+ if lastEnd >= ar.End {
+ mm.mappingMu.RUnlock()
+ break
+ }
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ if unmapped {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// GetSharedFutexKey is used by kernel.Task.GetSharedKey.
+func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) {
+ ar, ok := addr.ToRange(4) // sizeof(int32).
+ if !ok {
+ return futex.Key{}, syserror.EFAULT
+ }
+
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false)
+ if err != nil {
+ return futex.Key{}, err
+ }
+ vma := vseg.ValuePtr()
+
+ if vma.private {
+ return futex.Key{
+ Kind: futex.KindSharedPrivate,
+ Offset: uint64(addr),
+ }, nil
+ }
+
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ return futex.Key{
+ Kind: futex.KindSharedMappable,
+ Mappable: vma.mappable,
+ MappingIdentity: vma.id,
+ Offset: vseg.mappableOffsetAt(addr),
+ }, nil
+}
+
+// VirtualMemorySize returns the combined length in bytes of all mappings in
+// mm.
+func (mm *MemoryManager) VirtualMemorySize() uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return mm.usageAS
+}
+
+// VirtualMemorySizeRange returns the combined length in bytes of all mappings
+// in ar in mm.
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return uint64(mm.vmas.SpanRange(ar))
+}
+
+// ResidentSetSize returns the value advertised as mm's RSS in bytes.
+func (mm *MemoryManager) ResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.curRSS
+}
+
+// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
+func (mm *MemoryManager) MaxResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.maxRSS
+}
+
+// VirtualDataSize returns the size of private data segments in mm.
+func (mm *MemoryManager) VirtualDataSize() uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return mm.dataAS
+}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
new file mode 100644
index 000000000..16d8207e9
--- /dev/null
+++ b/pkg/sentry/mm/vma.go
@@ -0,0 +1,568 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
+// as defined by the checks in MMap.
+func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
+ if opts.MaxPerms != opts.MaxPerms.Effective() {
+ panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
+ }
+
+ // Find a usable range.
+ addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{
+ Addr: opts.Addr,
+ Fixed: opts.Fixed,
+ Unmap: opts.Unmap,
+ Map32Bit: opts.Map32Bit,
+ })
+ if err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ ar, _ := addr.ToRange(opts.Length)
+
+ // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS + opts.Length
+ if opts.Unmap {
+ newUsageAS -= uint64(mm.vmas.SpanRange(ar))
+ }
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM
+ }
+
+ if opts.MLockMode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM
+ }
+ newLockedAS := mm.lockedAS + opts.Length
+ if opts.Unmap {
+ newLockedAS -= mm.mlockedBytesRangeLocked(ar)
+ }
+ if newLockedAS > mlockLimit {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN
+ }
+ }
+ }
+
+ // Remove overwritten mappings. This ordering is consistent with Linux:
+ // compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
+ // file->f_op->mmap().
+ var vgap vmaGapIterator
+ if opts.Unmap {
+ vgap = mm.unmapLocked(ctx, ar)
+ } else {
+ vgap = mm.vmas.FindGap(ar.Start)
+ }
+
+ // Inform the Mappable, if any, of the new mapping.
+ if opts.Mappable != nil {
+ // The expression for writable is vma.canWriteMappableLocked(), but we
+ // don't yet have a vma.
+ if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ }
+
+ // Take a reference on opts.MappingIdentity before inserting the vma since
+ // vma merging can drop the reference.
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.IncRef()
+ }
+
+ // Finally insert the vma.
+ v := vma{
+ mappable: opts.Mappable,
+ off: opts.Offset,
+ realPerms: opts.Perms,
+ effectivePerms: opts.Perms.Effective(),
+ maxPerms: opts.MaxPerms,
+ private: opts.Private,
+ growsDown: opts.GrowsDown,
+ mlockMode: opts.MLockMode,
+ numaPolicy: linux.MPOL_DEFAULT,
+ id: opts.MappingIdentity,
+ hint: opts.Hint,
+ }
+
+ vseg := mm.vmas.Insert(vgap, ar, v)
+ mm.usageAS += opts.Length
+ if v.isPrivateDataLocked() {
+ mm.dataAS += opts.Length
+ }
+ if opts.MLockMode != memmap.MLockNone {
+ mm.lockedAS += opts.Length
+ }
+
+ return vseg, ar, nil
+}
+
+type findAvailableOpts struct {
+ // These fields are equivalent to those in memmap.MMapOpts, except that:
+ //
+ // - Addr must be page-aligned.
+ //
+ // - Unmap allows existing guard pages in the returned range.
+
+ Addr usermem.Addr
+ Fixed bool
+ Unmap bool
+ Map32Bit bool
+}
+
+// map32Start/End are the bounds to which MAP_32BIT mappings are constrained,
+// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively.
+const (
+ map32Start = 0x40000000
+ map32End = 0x80000000
+)
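+
+// The resulting MAP_32BIT window is [0x40000000, 0x80000000), i.e. 1 GiB of
+// address space starting at the 1 GiB boundary.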
+
+// findAvailableLocked finds an allocatable range.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) {
+ if opts.Fixed {
+ opts.Map32Bit = false
+ }
+ allowedAR := mm.applicationAddrRange()
+ if opts.Map32Bit {
+ allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End})
+ }
+
+ // Does the provided suggestion work?
+ if ar, ok := opts.Addr.ToRange(length); ok {
+ if allowedAR.IsSupersetOf(ar) {
+ if opts.Unmap {
+ return ar.Start, nil
+ }
+ // Check for the presence of an existing vma or guard page.
+ if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) {
+ return ar.Start, nil
+ }
+ }
+ }
+
+ // Fixed mappings accept only the requested address.
+ if opts.Fixed {
+ return 0, syserror.ENOMEM
+ }
+
+ // Prefer hugepage alignment if a hugepage or more is requested.
+ alignment := uint64(usermem.PageSize)
+ if length >= usermem.HugePageSize {
+ alignment = usermem.HugePageSize
+ }
+
+ if opts.Map32Bit {
+ return mm.findLowestAvailableLocked(length, alignment, allowedAR)
+ }
+ if mm.layout.DefaultDirection == arch.MmapBottomUp {
+ return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr})
+ }
+ return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase})
+}
+
+func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
+ return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr}
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift up to match the alignment?
+ if offset := uint64(gr.Start) % alignment; offset != 0 {
+ if uint64(gr.Length()) >= length+alignment-offset {
+ // Yes, we're aligned.
+ return gr.Start + usermem.Addr(alignment-offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return gr.Start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
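+
+// Worked example of the alignment shift above: with alignment = 2 MiB
+// (hugepage) and a gap whose usable range gr starts at 0x1234000, offset =
+// 0x1234000 % 0x200000 = 0x34000, so the candidate address is bumped to
+// 0x1234000 + (0x200000 - 0x34000) = 0x1400000, provided the gap still has
+// length + alignment - offset bytes available.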
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift down to match the alignment?
+ start := gr.End - usermem.Addr(length)
+ if offset := uint64(start) % alignment; offset != 0 {
+ if gr.Start <= start-usermem.Addr(offset) {
+ // Yes, we're aligned.
+ return start - usermem.Addr(offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
+ var total uint64
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ if vseg.ValuePtr().mlockMode != memmap.MLockNone {
+ total += uint64(vseg.Range().Intersect(ar).Length())
+ }
+ }
+ return total
+}
+
+// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
+// access of type (at, ignorePermissions). It returns:
+//
+// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last vma containing an address in ar. If
+// vmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if vmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked. ar.Length() != 0.
+func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
+ // !vbegin.Ok().
+ vbegin, vgap := mm.vmas.Find(ar.Start)
+ if !vbegin.Ok() {
+ vbegin = vgap.NextSegment()
+ // vseg.Ok() is checked before entering the following loop.
+ } else {
+ vgap = vbegin.PrevGap()
+ }
+
+ addr := ar.Start
+ vseg := vbegin
+ for vseg.Ok() {
+ // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
+ vma := vseg.ValuePtr()
+ if addr < vseg.Start() {
+ // TODO(jamieliu): Implement vma.growsDown here.
+ return vbegin, vgap, syserror.EFAULT
+ }
+
+ perms := vma.effectivePerms
+ if ignorePermissions {
+ perms = vma.maxPerms
+ }
+ if !perms.SupersetOf(at) {
+ return vbegin, vgap, syserror.EPERM
+ }
+
+ addr = vseg.End()
+ vgap = vseg.NextGap()
+ if addr >= ar.End {
+ return vbegin, vgap, nil
+ }
+ vseg = vgap.NextSegment()
+ }
+
+ // Ran out of vmas before ar.End.
+ return vbegin, vgap, syserror.EFAULT
+}
+
+// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
+// support access of type (at, ignorePermissions). It returns the subset of
+// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked.
+//
+// Postconditions: ars is not mutated.
+func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
+ return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
+ }
+ }
+ return ars, nil
+}
+
+// vma extension will not shrink the number of unmapped bytes between the start
+// of a growsDown vma and the end of its predecessor non-growsDown vma below
+// guardBytes.
+//
+// guardBytes is equivalent to Linux's stack_guard_gap after upstream
+// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
+const guardBytes = 256 * usermem.PageSize
+
+// unmapLocked unmaps all addresses in ar and returns the resulting gap in
+// mm.vmas.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // AddressSpace mappings and pmas must be invalidated before
+ // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping().
+ mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
+ return mm.removeVMAsLocked(ctx, ar)
+}
+
+// removeVMAsLocked removes vmas for addresses in ar and returns the resulting
+// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
+// must do so before calling removeVMAsLocked.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ vseg, vgap := mm.vmas.Find(ar.Start)
+ if vgap.Ok() {
+ vseg = vgap.NextSegment()
+ }
+ for vseg.Ok() && vseg.Start() < ar.End {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vmaAR := vseg.Range()
+ vma := vseg.ValuePtr()
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
+ }
+ if vma.id != nil {
+ vma.id.DecRef()
+ }
+ mm.usageAS -= uint64(vmaAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS -= uint64(vmaAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vmaAR.Length())
+ }
+ vgap = mm.vmas.Remove(vseg)
+ vseg = vgap.NextSegment()
+ }
+ return vgap
+}
+
+// canWriteMappableLocked returns true if it is possible for vma.mappable to be
+// written to via this vma, i.e. if it is possible that
+// vma.mappable.Translate(at.Write=true) may be called as a result of this vma.
+// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as
+// PTRACE_POKEDATA.
+//
+// canWriteMappableLocked is equivalent to Linux's VM_SHARED.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) canWriteMappableLocked() bool {
+ return !vma.private && vma.maxPerms.Write
+}
+
+// isPrivateDataLocked identifies the data segments: private, writable, not stack.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) isPrivateDataLocked() bool {
+ return vma.realPerms.Write && vma.private && !vma.growsDown
+}
+
+// vmaSetFunctions implements segment.Functions for vmaSet.
+type vmaSetFunctions struct{}
+
+func (vmaSetFunctions) MinKey() usermem.Addr {
+ return 0
+}
+
+func (vmaSetFunctions) MaxKey() usermem.Addr {
+ return ^usermem.Addr(0)
+}
+
+func (vmaSetFunctions) ClearValue(vma *vma) {
+ vma.mappable = nil
+ vma.id = nil
+ vma.hint = ""
+}
+
+func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) {
+ if vma1.mappable != vma2.mappable ||
+ (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) ||
+ vma1.realPerms != vma2.realPerms ||
+ vma1.maxPerms != vma2.maxPerms ||
+ vma1.private != vma2.private ||
+ vma1.growsDown != vma2.growsDown ||
+ vma1.mlockMode != vma2.mlockMode ||
+ vma1.numaPolicy != vma2.numaPolicy ||
+ vma1.numaNodemask != vma2.numaNodemask ||
+ vma1.dontfork != vma2.dontfork ||
+ vma1.id != vma2.id ||
+ vma1.hint != vma2.hint {
+ return vma{}, false
+ }
+
+ if vma2.id != nil {
+ vma2.id.DecRef()
+ }
+ return vma1, true
+}
+
+func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) {
+ v2 := v
+ if v2.mappable != nil {
+ v2.off += uint64(split - ar.Start)
+ }
+ if v2.id != nil {
+ v2.id.IncRef()
+ }
+ return v, v2
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("Mappable offset is meaningless for anonymous vma")
+ }
+ if !vseg.Range().Contains(addr) {
+ panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return vma.off + uint64(addr-vstart)
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+func (vseg vmaIterator) mappableRange() memmap.MappableRange {
+ return vseg.mappableRangeOf(vseg.Range())
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Range().IsSupersetOf(ar) {
+ panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !mr.WellFormed() || mr.Length() <= 0 {
+ panic(fmt.Sprintf("invalid mr: %v", mr))
+ }
+ if !vseg.mappableRange().IsSupersetOf(mr) {
+ panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)}
+}
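+
+// Worked example of the translations above: for a vma spanning
+// [0x400000, 0x500000) with vma.off = 0x3000, mappableOffsetAt(0x401000)
+// returns 0x3000 + (0x401000 - 0x400000) = 0x4000, and addrRangeOf applied to
+// the mappable range [0x4000, 0x5000) maps back to [0x401000, 0x402000).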
+
+// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
+// scanning linearly forward from vseg.
+//
+// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if addr < vseg.Start() {
+ panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start()))
+ }
+ }
+ for vseg.Ok() && addr >= vseg.End() {
+ vseg = vseg.NextSegment()
+ }
+ return vseg
+}
+
+// availableRange returns the subset of vgap.Range() in which new vmas may be
+// created without MMapOpts.Unmap == true.
+func (vgap vmaGapIterator) availableRange() usermem.AddrRange {
+ ar := vgap.Range()
+ next := vgap.NextSegment()
+ if !next.Ok() || !next.ValuePtr().growsDown {
+ return ar
+ }
+ // Exclude guard pages.
+ if ar.Length() < guardBytes {
+ return usermem.AddrRange{ar.Start, ar.Start}
+ }
+ ar.End -= guardBytes
+ return ar
+}
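+
+// Worked example: with 4 KiB pages, guardBytes is 256 * 4 KiB = 1 MiB. For a
+// gap spanning [0x7f0000000000, 0x7f0000400000) whose next vma is growsDown,
+// availableRange returns [0x7f0000000000, 0x7f0000300000); if the gap were
+// smaller than 1 MiB, the available range would be empty.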