Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r--  pkg/sentry/mm/BUILD                  142
-rw-r--r--  pkg/sentry/mm/README.md              280
-rw-r--r--  pkg/sentry/mm/address_space.go       236
-rw-r--r--  pkg/sentry/mm/aio_context.go         429
-rw-r--r--  pkg/sentry/mm/aio_context_state.go    20
-rw-r--r--  pkg/sentry/mm/debug.go                98
-rw-r--r--  pkg/sentry/mm/io.go                  639
-rw-r--r--  pkg/sentry/mm/lifecycle.go           283
-rw-r--r--  pkg/sentry/mm/metadata.go            183
-rw-r--r--  pkg/sentry/mm/mm.go                  478
-rw-r--r--  pkg/sentry/mm/mm_test.go             230
-rw-r--r--  pkg/sentry/mm/pma.go                1036
-rw-r--r--  pkg/sentry/mm/procfs.go              329
-rw-r--r--  pkg/sentry/mm/save_restore.go         57
-rw-r--r--  pkg/sentry/mm/shm.go                  66
-rw-r--r--  pkg/sentry/mm/special_mappable.go    157
-rw-r--r--  pkg/sentry/mm/syscalls.go           1286
-rw-r--r--  pkg/sentry/mm/vma.go                 568
18 files changed, 6517 insertions, 0 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
new file mode 100644
index 000000000..a036ce53c
--- /dev/null
+++ b/pkg/sentry/mm/BUILD
@@ -0,0 +1,142 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+ name = "file_refcount_set",
+ out = "file_refcount_set.go",
+ imports = {
+ "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+ },
+ package = "mm",
+ prefix = "fileRefcount",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "platform.FileRange",
+ "Value": "int32",
+ "Functions": "fileRefcountSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "vma_set",
+ out = "vma_set.go",
+ consts = {
+ "minDegree": "8",
+ "trackGaps": "1",
+ },
+ imports = {
+ "usermem": "gvisor.dev/gvisor/pkg/usermem",
+ },
+ package = "mm",
+ prefix = "vma",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "usermem.Addr",
+ "Range": "usermem.AddrRange",
+ "Value": "vma",
+ "Functions": "vmaSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "pma_set",
+ out = "pma_set.go",
+ consts = {
+ "minDegree": "8",
+ },
+ imports = {
+ "usermem": "gvisor.dev/gvisor/pkg/usermem",
+ },
+ package = "mm",
+ prefix = "pma",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "usermem.Addr",
+ "Range": "usermem.AddrRange",
+ "Value": "pma",
+ "Functions": "pmaSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "io_list",
+ out = "io_list.go",
+ package = "mm",
+ prefix = "io",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*ioResult",
+ "Linker": "*ioResult",
+ },
+)
+
+go_library(
+ name = "mm",
+ srcs = [
+ "address_space.go",
+ "aio_context.go",
+ "aio_context_state.go",
+ "debug.go",
+ "file_refcount_set.go",
+ "io.go",
+ "io_list.go",
+ "lifecycle.go",
+ "metadata.go",
+ "mm.go",
+ "pma.go",
+ "pma_set.go",
+ "procfs.go",
+ "save_restore.go",
+ "shm.go",
+ "special_mappable.go",
+ "syscalls.go",
+ "vma.go",
+ "vma_set.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/atomicbitops",
+ "//pkg/context",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/safecopy",
+ "//pkg/safemem",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/fs/proc/seqfile",
+ "//pkg/sentry/fsbridge",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/futex",
+ "//pkg/sentry/kernel/shm",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/usage",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/tcpip/buffer",
+ "//pkg/usermem",
+ ],
+)
+
+go_test(
+ name = "mm_test",
+ size = "small",
+ srcs = ["mm_test.go"],
+ library = ":mm",
+ deps = [
+ "//pkg/context",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/contexttest",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md
new file mode 100644
index 000000000..f4d43d927
--- /dev/null
+++ b/pkg/sentry/mm/README.md
@@ -0,0 +1,280 @@
+This package provides an emulation of Linux semantics for application virtual
+memory mappings.
+
+For completeness, this document also describes aspects of the memory management
+subsystem defined outside this package.
+
+# Background
+
+We begin by describing semantics for virtual memory in Linux.
+
+A virtual address space is defined as a collection of mappings from virtual
+addresses to physical memory. However, userspace applications do not configure
+mappings to physical memory directly. Instead, applications configure memory
+mappings from virtual addresses to offsets into a file using the `mmap` system
+call.[^mmap-anon] For example, a call to:
+
+ mmap(
+ /* addr = */ 0x400000,
+ /* length = */ 0x1000,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ /* fd = */ 3,
+ /* offset = */ 0);
+
+creates a mapping of length 0x1000 bytes, starting at virtual address (VA)
+0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within
+the Linux kernel, virtual memory mappings are represented by *virtual memory
+areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the
+virtual memory subsystem after the `mmap` call may be depicted as:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+
+Establishing a virtual memory area does not necessarily establish a mapping to a
+physical address, because Linux has not necessarily provisioned physical memory
+to store the file's contents. Thus, if the application attempts to read the
+contents of VA 0x400000, it may incur a *page fault*, a CPU exception that
+forces the kernel to create such a mapping to service the read.
+
+For a file, doing so consists of several logical phases:
+
+1. The kernel allocates physical memory to store the contents of the required
+ part of the file, and copies file contents to the allocated memory.
+ Supposing that the kernel chooses the physical memory at physical address
+ (PA) 0x2fb000, the resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+
+ (In Linux the state of the mapping from file offset to physical memory is
+ stored in `struct address_space`, but to avoid confusion with other notions
+ of address space we will refer to this system as filemap, named after Linux
+ kernel source file `mm/filemap.c`.)
+
+2. The kernel stores the effective mapping from virtual to physical address in
+ a *page table entry* (PTE) in the application's *page tables*, which are
+ used by the CPU's virtual memory hardware to perform address translation.
+ The resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x2fb000
+
+ The PTE is required for the application to actually use the contents of the
+ mapped file as virtual memory. However, the PTE is derived from the VMA and
+ filemap state, both of which are independently mutable, such that mutations
+ to either will affect the PTE. For example:
+
+ - The application may remove the VMA using the `munmap` system call. This
+ breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently
+ the mapping from VA:0x400000 to PA:0x2fb000. However, it does not
+ necessarily break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a
+ future mapping of the same file offset may reuse this physical memory.
+
+ - The application may invalidate the file's contents by passing a length
+ of 0 to the `ftruncate` system call. This breaks the mapping from
+ /tmp/foo:0x0 to PA:0x2fb000, and consequently the mapping from
+ VA:0x400000 to PA:0x2fb000. However, it does not break the mapping from
+ VA:0x400000 to /tmp/foo:0x0, so future changes to the file's contents
+ may again be made visible at VA:0x400000 after another page fault
+ results in the allocation of a new physical address.
+
+ Note that, in order to correctly break the mapping from VA:0x400000 to
+ PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping*
+ from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE.
+
+[^mmap-anon]: Memory mappings to non-files are discussed in later sections.
+
+## Private Mappings
+
+The preceding example considered VMAs created using the `MAP_SHARED` flag, which
+means that PTEs derived from the mapping should always use physical memory that
+represents the current state of the mapped file.[^mmap-dev-zero] Applications
+can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*.
+Private mappings are *copy-on-write*.
+
+Suppose that the application instead created a private mapping in the previous
+example. In Linux, the state of the system after a read page fault would be:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only)
+
+Now suppose the application attempts to write to VA:0x400000. For a shared
+mapping, the write would be propagated to PA:0x2fb000, and the kernel would be
+responsible for ensuring that the write is later propagated to the mapped file.
+For a private mapping, the write incurs another page fault since the PTE is
+marked read-only. In response, the kernel allocates physical memory to store the
+mapping's *private copy* of the file's contents, copies file contents to the
+allocated memory, and changes the PTE to map to the private copy. Supposing that
+the kernel chooses the physical memory at physical address (PA) 0x5ea000, the
+resulting state of the system is:
+
+ VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Filemap: /tmp/foo:0x0 -> PA:0x2fb000
+ PTE: VA:0x400000 -----------------> PA:0x5ea000
+
+Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist,
+but is now irrelevant to this mapping.
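+
+This behavior is observable from an ordinary program. The following Go sketch
+(illustrative only; error handling is omitted) maps /tmp/foo privately and
+shows that a write through the private mapping does not reach the file:
+
+    package main
+
+    import (
+        "fmt"
+        "os"
+        "syscall"
+    )
+
+    func main() {
+        f, _ := os.OpenFile("/tmp/foo", os.O_RDWR|os.O_CREATE, 0644)
+        f.Truncate(0x1000)
+
+        // MAP_PRIVATE: writes trigger copy-on-write rather than reaching the file.
+        priv, _ := syscall.Mmap(int(f.Fd()), 0, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE)
+
+        priv[0] = 'x' // first write faults; the kernel installs a private copy
+
+        buf := make([]byte, 1)
+        f.ReadAt(buf, 0)
+        fmt.Printf("mapping sees %q, file still contains %q\n", priv[0], buf[0])
+
+        syscall.Munmap(priv)
+        f.Close()
+    }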
+
+[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`.
+
+## Anonymous Mappings
+
+Instead of passing a file to the `mmap` system call, applications can instead
+request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag.
+Semantically, an anonymous mapping is essentially a mapping to an ephemeral file
+initially filled with zero bytes. Practically speaking, this is how shared
+anonymous mappings are implemented, but private anonymous mappings do not result
+in the creation of an ephemeral file; since there would be no way to modify the
+contents of the underlying file through a private mapping, all private anonymous
+mappings use a single shared page filled with zero bytes until copy-on-write
+occurs.
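+
+A minimal Go sketch of requesting an anonymous mapping directly (the fd
+argument is ignored for `MAP_ANONYMOUS` and is -1 by convention):
+
+    package main
+
+    import (
+        "fmt"
+        "syscall"
+    )
+
+    func main() {
+        mem, err := syscall.Mmap(-1, 0, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE,
+            syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS)
+        if err != nil {
+            panic(err)
+        }
+        fmt.Println(mem[0], mem[0xfff]) // 0 0: anonymous memory reads as zeroes
+        mem[0] = 1                      // private copy-on-write from the zero page
+        syscall.Munmap(mem)
+    }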
+
+# Virtual Memory in the Sentry
+
+The sentry implements application virtual memory atop a host kernel, introducing
+an additional level of indirection to the above.
+
+Consider the same scenario as in the previous section. Since the sentry handles
+application system calls, the effect of an application `mmap` system call is to
+create a VMA in the sentry (as opposed to the host kernel):
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+
+When the application first incurs a page fault on this address, the host kernel
+delivers information about the page fault to the sentry in a platform-dependent
+manner, and the sentry handles the fault:
+
+1. The sentry allocates memory to store the contents of the required part of
+ the file, and copies file contents to the allocated memory. However, since
+ the sentry is implemented atop a host kernel, it does not configure mappings
+ to physical memory directly. Instead, mappable "memory" in the sentry is
+ represented by a host file descriptor and offset, since (as noted in
+ "Background") this is the memory mapping primitive provided by the host
+ kernel. In general, memory is allocated from a temporary host file using the
+ `pgalloc` package. Supposing that the sentry allocates offset 0x3000 from
+ host file "memory-file", the resulting state is:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+
+2. The sentry stores the effective mapping from virtual address to host file in
+ a host VMA by invoking the `mmap` system call:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000
+
+3. The sentry returns control to the application, which immediately incurs the
+ page fault again.[^mmap-populate] However, since a host VMA now exists for
+ the faulting virtual address, the host kernel now handles the page fault as
+ described in "Background":
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000
+ Host filemap: host:memory-file:0x3000 -> PA:0x2fb000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000
+
+Thus, from an implementation standpoint, host VMAs serve the same purpose in the
+sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is
+independently mutable, and the desired state of host VMAs is derived from that
+state.
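+
+The host-level primitive used here can be sketched outside the sentry as well:
+allocate a range of a temporary "memory file" and map it with the host `mmap`,
+which is roughly what the `pgalloc` package and the platform's address space
+management do on the sentry's behalf (simplified Go sketch; error handling
+omitted):
+
+    package main
+
+    import (
+        "os"
+        "syscall"
+    )
+
+    func main() {
+        // A temporary host file standing in for the sentry's memory file.
+        memFile, _ := os.CreateTemp("", "memory-file")
+        defer os.Remove(memFile.Name())
+        memFile.Truncate(1 << 20)
+
+        // "Allocate" offset 0x3000 of the memory file and establish a host
+        // mapping for it, analogous to a host VMA backing a sentry mapping.
+        mem, _ := syscall.Mmap(int(memFile.Fd()), 0x3000, 0x1000,
+            syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+        mem[0] = 42 // backed by memory-file:0x3000 on the host
+        syscall.Munmap(mem)
+    }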
+
+[^mmap-populate]: The sentry could force the host kernel to establish PTEs when
+ it creates the host VMA by passing the `MAP_POPULATE` flag to
+ the `mmap` system call, but usually does not. This is because,
+ to reduce the number of page faults that require handling by
+ the sentry and (correspondingly) the number of host `mmap`
+ system calls, the sentry usually creates host VMAs that are
+ much larger than the single faulting page.
+
+## Private Mappings
+
+The sentry implements private mappings consistently with Linux. Before
+copy-on-write, the private mapping example given in the Background results in:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only)
+ Host filemap: host:memory-file:0x3000 -> PA:0x2fb000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only)
+
+When the application attempts to write to this address, the host kernel delivers
+information about the resulting page fault to the sentry. Analogous to Linux,
+the sentry allocates memory to store the mapping's private copy of the file's
+contents, copies file contents to the allocated memory, and changes the host VMA
+to map to the private copy. Supposing that the sentry chooses the offset 0x4000
+in host file `memory-file` to store the private copy, the state of the system
+after copy-on-write is:
+
+ Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private)
+ Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000
+ Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000
+ Host filemap: host:memory-file:0x4000 -> PA:0x5ea000
+ Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000
+
+However, this highlights an important difference between Linux and the sentry.
+In Linux, page tables are concrete (architecture-dependent) data structures
+owned by the kernel. Conversely, the sentry has the ability to create and
+destroy host VMAs using host system calls, but it does not have direct access to
+their state. Thus, as written, if the application invokes the `munmap` system
+call to remove the sentry VMA, it is non-trivial for the sentry to determine
+that it should deallocate `host:memory-file:0x4000`. This implies that the
+sentry must retain information about the host VMAs that it has created.
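+
+Conceptually, what must be retained for each mapped range is which host file
+range currently backs it and with what permissions. A simplified, hypothetical
+record (the real structure is the pma described under "Implementation
+Constructs" below) might look like:
+
+    package sketch
+
+    import (
+        "gvisor.dev/gvisor/pkg/sentry/platform"
+        "gvisor.dev/gvisor/pkg/usermem"
+    )
+
+    // hostMapping is an illustrative stand-in for the per-range bookkeeping
+    // the sentry keeps about the host VMAs it has created.
+    type hostMapping struct {
+        file    platform.File      // host file backing this virtual range
+        off     uint64             // offset of the range's start within file
+        perms   usermem.AccessType // effective permissions of the host VMA
+        private bool               // true if this range holds a private copy
+    }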
+
+## Anonymous Mappings
+
+The sentry implements anonymous mappings consistently with Linux, except that
+there is no shared zero page.
+
+# Implementation Constructs
+
+In Linux:
+
+- A virtual address space is represented by `struct mm_struct`.
+
+- VMAs are represented by `struct vm_area_struct`, stored in `struct
+ mm_struct::mmap`.
+
+- Mappings from file offsets to physical memory are stored in `struct
+ address_space`.
+
+- Reverse mappings from file offsets to virtual mappings are stored in `struct
+ address_space::i_mmap`.
+
+- Physical memory pages are represented by a pointer to `struct page` or an
+ index called a *page frame number* (PFN), represented by `pfn_t`.
+
+- PTEs are represented by architecture-dependent type `pte_t`, stored in a
+ table hierarchy rooted at `struct mm_struct::pgd`.
+
+In the sentry:
+
+- A virtual address space is represented by type [`mm.MemoryManager`][mm].
+
+- Sentry VMAs are represented by type [`mm.vma`][mm], stored in
+ `mm.MemoryManager.vmas`.
+
+- Mappings from sentry file offsets to host file offsets are abstracted
+ through interface method [`memmap.Mappable.Translate`][memmap].
+
+- Reverse mappings from sentry file offsets to virtual mappings are abstracted
+ through interface methods
+ [`memmap.Mappable.AddMapping` and `memmap.Mappable.RemoveMapping`][memmap].
+
+- Host files that may be mapped into host VMAs are represented by type
+ [`platform.File`][platform].
+
+- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform
+ mapping area"), stored in `mm.MemoryManager.pmas`.
+
+- Creation and destruction of host VMAs is abstracted through interface
+ methods
+ [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform].
+
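+To make the correspondence concrete, the following is a deliberately
+simplified Go sketch of how a single faulting page could be resolved in terms
+of these interfaces. The sentry's real fault path (in `mm.MemoryManager`) also
+maintains the vma and pma sets, permission checks, and copy-on-write; the
+helper below exists only for illustration:
+
+    package sketch
+
+    import (
+        "gvisor.dev/gvisor/pkg/context"
+        "gvisor.dev/gvisor/pkg/sentry/memmap"
+        "gvisor.dev/gvisor/pkg/sentry/platform"
+        "gvisor.dev/gvisor/pkg/usermem"
+    )
+
+    // handleFault resolves a fault at addr within a vma that maps m starting
+    // at mappableOffset, by translating the faulting offset to a host file
+    // range and installing a host mapping for it.
+    func handleFault(ctx context.Context, as platform.AddressSpace, m memmap.Mappable, addr, vmaStart usermem.Addr, mappableOffset uint64, at usermem.AccessType) error {
+        // Offset of the faulting page within the Mappable.
+        off := mappableOffset + uint64(addr.RoundDown()-vmaStart)
+        required := memmap.MappableRange{Start: off, End: off + usermem.PageSize}
+
+        ts, err := m.Translate(ctx, required, required, at)
+        if err != nil {
+            return err
+        }
+        for _, t := range ts {
+            fr := platform.FileRange{Start: t.Offset, End: t.Offset + t.Source.Length()}
+            va := vmaStart + usermem.Addr(t.Source.Start-mappableOffset)
+            if err := as.MapFile(va, t.File, fr, t.Perms, false /* precommit */); err != nil {
+                return err
+            }
+        }
+        return nil
+    }
+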
+[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go
+[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go
+[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go
+[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
new file mode 100644
index 000000000..5c667117c
--- /dev/null
+++ b/pkg/sentry/mm/address_space.go
@@ -0,0 +1,236 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// AddressSpace returns the platform.AddressSpace bound to mm.
+//
+// Preconditions: The caller must have called mm.Activate().
+func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
+ if atomic.LoadInt32(&mm.active) == 0 {
+ panic("trying to use inactive address space?")
+ }
+ return mm.as
+}
+
+// Activate ensures this MemoryManager has a platform.AddressSpace.
+//
+// The caller must not hold any locks when calling Activate.
+//
+// When this MemoryManager is no longer needed by a task, it should call
+// Deactivate to release the reference.
+func (mm *MemoryManager) Activate(ctx context.Context) error {
+ // Fast path: the MemoryManager already has an active
+ // platform.AddressSpace, and we just need to indicate that we need it too.
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 0 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+ return nil
+ }
+ }
+
+ for {
+ // Slow path: may need to synchronize with other goroutines changing
+ // mm.active to or from zero.
+ mm.activeMu.Lock()
+ // Inline Unlock instead of using a defer for performance since this
+ // method is commonly in the hot-path.
+
+ // Check if we raced with another goroutine performing activation.
+ if atomic.LoadInt32(&mm.active) > 0 {
+ // This can't race; Deactivate can't decrease mm.active from 1 to 0
+ // without holding activeMu.
+ atomic.AddInt32(&mm.active, 1)
+ mm.activeMu.Unlock()
+ return nil
+ }
+
+ // Do we have a context? If so, then we never unmapped it. This can
+ // only be the case if !mm.p.CooperativelySchedulesAddressSpace().
+ if mm.as != nil {
+ atomic.StoreInt32(&mm.active, 1)
+ mm.activeMu.Unlock()
+ return nil
+ }
+
+ // Get a new address space. We must force unmapping by passing nil to
+ // NewAddressSpace if requested. (As in the nil interface object, not a
+ // typed nil.)
+ mappingsID := (interface{})(mm)
+ if mm.unmapAllOnActivate {
+ mappingsID = nil
+ }
+ as, c, err := mm.p.NewAddressSpace(mappingsID)
+ if err != nil {
+ mm.activeMu.Unlock()
+ return err
+ }
+ if as == nil {
+ // AddressSpace is unavailable, we must wait.
+ //
+ // activeMu must not be held while waiting, as the user of the address
+ // space we are waiting on may attempt to take activeMu.
+ mm.activeMu.Unlock()
+
+ sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+ if sleep {
+ // Mark this task sleeping while waiting for the address space to
+ // prevent the watchdog from reporting it as a stuck task.
+ ctx.UninterruptibleSleepStart(false)
+ }
+ <-c
+ if sleep {
+ ctx.UninterruptibleSleepFinish(false)
+ }
+ continue
+ }
+
+ // Okay, we could restore all mappings at this point.
+ // But forget that. Let's just let them fault in.
+ mm.as = as
+
+ // Unmapping is done, if necessary.
+ mm.unmapAllOnActivate = false
+
+ // Now that mm.as has been assigned, we can set mm.active to a non-zero
+ // value to enable the fast path.
+ atomic.StoreInt32(&mm.active, 1)
+
+ mm.activeMu.Unlock()
+ return nil
+ }
+}
+
+// Deactivate releases a reference to the MemoryManager.
+func (mm *MemoryManager) Deactivate() {
+ // Fast path: this is not the last goroutine to deactivate the
+ // MemoryManager.
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 1 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+ return
+ }
+ }
+
+ mm.activeMu.Lock()
+ // Same as Activate.
+
+ // Still active?
+ if atomic.AddInt32(&mm.active, -1) > 0 {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Can we hold on to the address space?
+ if !mm.p.CooperativelySchedulesAddressSpace() {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Release the address space.
+ mm.as.Release()
+
+ // Lost it.
+ mm.as = nil
+ mm.activeMu.Unlock()
+}
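+
+// A typical caller brackets use of the AddressSpace with Activate and
+// Deactivate; a rough sketch (assuming a task-provided context ctx):
+//
+//	if err := mm.Activate(ctx); err != nil {
+//		return err
+//	}
+//	defer mm.Deactivate()
+//	as := mm.AddressSpace() // valid until the matching Deactivate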
+
+// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings
+// for all addresses in ar should be precommitted.
+//
+// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
+// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
+func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
+ // By default, map entire pmas at a time, under the assumption that there
+ // is no cost to mapping more of a pma than necessary.
+ mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)}
+ if precommit {
+ // When explicitly precommitting, only map ar, since overmapping may
+ // incur unexpected resource usage.
+ mapAR = ar
+ } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 {
+ // Limit the range we map to ar, aligned to mapUnit.
+ mapMask := usermem.Addr(mapUnit - 1)
+ mapAR.Start = ar.Start &^ mapMask
+ // If rounding ar.End up overflows, just keep the existing mapAR.End.
+ if end := (ar.End + mapMask) &^ mapMask; end >= ar.End {
+ mapAR.End = end
+ }
+ }
+ if checkInvariants {
+ if !mapAR.IsSupersetOf(ar) {
+ panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar))
+ }
+ }
+
+ // Since this checks ar.End and not mapAR.End, we will never map a pma that
+ // is not required.
+ for pseg.Ok() && pseg.Start() < ar.End {
+ pma := pseg.ValuePtr()
+ pmaAR := pseg.Range()
+ pmaMapAR := pmaAR.Intersect(mapAR)
+ perms := pma.effectivePerms
+ if pma.needCOW {
+ perms.Write = false
+ }
+ if perms.Any() { // MapFile precondition
+ if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+ return err
+ }
+ }
+ pseg = pseg.NextSegment()
+ }
+ return nil
+}
+
+// unmapASLocked removes all AddressSpace mappings for addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked.
+func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) {
+ if mm.as == nil {
+ // No AddressSpace? Force all mappings to be unmapped on the next
+ // Activate.
+ mm.unmapAllOnActivate = true
+ return
+ }
+
+ // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be
+ // passed ranges that include addresses that can't be mapped by the
+ // application.
+ ar = ar.Intersect(mm.applicationAddrRange())
+
+ // Note that this AddressSpace may or may not be active. If the
+ // platform does not require cooperative sharing of AddressSpaces, they
+ // are retained between Deactivate/Activate calls. Despite not being
+ // active, it is still valid to perform operations on these address
+ // spaces.
+ mm.as.Unmap(ar.Start, uint64(ar.Length()))
+}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
new file mode 100644
index 000000000..379148903
--- /dev/null
+++ b/pkg/sentry/mm/aio_context.go
@@ -0,0 +1,429 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// aioManager creates and manages asynchronous I/O contexts.
+//
+// +stateify savable
+type aioManager struct {
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // aioContexts is the set of asynchronous I/O contexts.
+ contexts map[uint64]*AIOContext
+}
+
+func (a *aioManager) destroy() {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ for _, ctx := range a.contexts {
+ ctx.destroy()
+ }
+}
+
+// newAIOContext creates a new context for asynchronous I/O.
+//
+// Returns false if 'id' is currently in use.
+func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ if _, ok := a.contexts[id]; ok {
+ return false
+ }
+
+ a.contexts[id] = &AIOContext{
+ requestReady: make(chan struct{}, 1),
+ maxOutstanding: events,
+ }
+ return true
+}
+
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
+//
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ ctx, ok := a.contexts[id]
+ if !ok {
+ return nil
+ }
+ delete(a.contexts, id)
+ ctx.destroy()
+ return ctx
+}
+
+// lookupAIOContext looks up the given context.
+//
+// Returns false if context does not exist.
+func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+ ctx, ok := a.contexts[id]
+ return ctx, ok
+}
+
+// ioResult is a completed I/O operation.
+//
+// +stateify savable
+type ioResult struct {
+ data interface{}
+ ioEntry
+}
+
+// AIOContext is a single asynchronous I/O context.
+//
+// +stateify savable
+type AIOContext struct {
+ // requestReady is the notification channel used for all requests.
+ requestReady chan struct{} `state:"nosave"`
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // results is the set of completed requests.
+ results ioList
+
+ // maxOutstanding is the maximum number of outstanding entries; this value
+ // is immutable.
+ maxOutstanding uint32
+
+ // outstanding is the number of requests outstanding; this is effectively
+ // the number of entries already in the results list plus the number of
+ // requests expected to be added to it.
+ outstanding uint32
+
+ // dead is set when the context is destroyed.
+ dead bool `state:"zerovalue"`
+}
+
+// destroy marks the context dead.
+func (ctx *AIOContext) destroy() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ ctx.dead = true
+ ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+ if ctx.dead && ctx.outstanding == 0 {
+ close(ctx.requestReady)
+ ctx.requestReady = nil
+ }
+}
+
+// Prepare reserves space for a new request, returning true if space was
+// reserved. Returns false if the context already has the maximum number of
+// outstanding requests.
+func (ctx *AIOContext) Prepare() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ if ctx.outstanding >= ctx.maxOutstanding {
+ return false
+ }
+ ctx.outstanding++
+ return true
+}
+
+// PopRequest pops a completed request if one is available; it does not
+// block. Returns false if no request is available.
+func (ctx *AIOContext) PopRequest() (interface{}, bool) {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ // Is there anything ready?
+ if e := ctx.results.Front(); e != nil {
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding--
+ ctx.results.Remove(e)
+ ctx.checkForDone()
+ return e.data, true
+ }
+ return nil, false
+}
+
+// FinishRequest finishes a pending request. It queues up the data
+// and notifies listeners.
+func (ctx *AIOContext) FinishRequest(data interface{}) {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ // Push to the list and notify opportunistically. The channel notify
+ // here is guaranteed to be safe because outstanding must be non-zero.
+ // The requestReady channel is only closed when outstanding reaches zero.
+ ctx.results.PushBack(&ioResult{data: data})
+
+ select {
+ case ctx.requestReady <- struct{}{}:
+ default:
+ }
+}
+
+// WaitChannel returns a channel that is notified when an AIO request is
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding--
+ ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ return
+ }
+ size := uint32(ctx.results.Len())
+ if ctx.outstanding < size {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding -= size
+ ctx.results.Reset()
+ ctx.checkForDone()
+}
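+
+// A rough sketch of how these methods fit together (the real callers are the
+// io_submit/io_getevents syscall paths; doIO and handle below are
+// hypothetical placeholders):
+//
+//	if !aioCtx.Prepare() {
+//		return syserror.EAGAIN // too many outstanding requests
+//	}
+//	go func() {
+//		aioCtx.FinishRequest(doIO()) // queue the completion, wake waiters
+//	}()
+//	for {
+//		if data, ok := aioCtx.PopRequest(); ok {
+//			handle(data)
+//			break
+//		}
+//		ch := aioCtx.WaitChannel()
+//		if ch == nil {
+//			break // context destroyed with nothing outstanding
+//		}
+//		<-ch
+//	}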
+
+// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
+// ring buffers.
+//
+// +stateify savable
+type aioMappable struct {
+ refs.AtomicRefCount
+
+ mfp pgalloc.MemoryFileProvider
+ fr platform.FileRange
+}
+
+var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp())
+
+func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
+ fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous)
+ if err != nil {
+ return nil, err
+ }
+ m := aioMappable{mfp: mfp, fr: fr}
+ m.EnableLeakCheck("mm.aioMappable")
+ return &m, nil
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+func (m *aioMappable) DecRef() {
+ m.AtomicRefCount.DecRefWithDestructor(func() {
+ m.mfp.MemoryFile().DecRef(m.fr)
+ })
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (m *aioMappable) MappedName(ctx context.Context) string {
+ return "[aio]"
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (m *aioMappable) DeviceID() uint64 {
+ return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (m *aioMappable) InodeID() uint64 {
+ return 0
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ // Linux: aio_ring_fops.fsync == NULL
+ return syserror.EINVAL
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error {
+ // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
+ // sets VM_DONTEXPAND).
+ if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
+ return syserror.EFAULT
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) {
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error {
+ // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
+ // sets VM_DONTEXPAND).
+ if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
+ return syserror.EFAULT
+ }
+ // Require that the mapping correspond to a live AIOContext. Compare
+ // Linux's fs/aio.c:aio_ring_mremap().
+ mm, ok := ms.(*MemoryManager)
+ if !ok {
+ return syserror.EINVAL
+ }
+ am := &mm.aioManager
+ am.mu.Lock()
+ defer am.mu.Unlock()
+ oldID := uint64(srcAR.Start)
+ aioCtx, ok := am.contexts[oldID]
+ if !ok {
+ return syserror.EINVAL
+ }
+ aioCtx.mu.Lock()
+ defer aioCtx.mu.Unlock()
+ if aioCtx.dead {
+ return syserror.EINVAL
+ }
+ // Use the new ID for the AIOContext.
+ am.contexts[uint64(dstAR.Start)] = aioCtx
+ delete(am.contexts, oldID)
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ var err error
+ if required.End > m.fr.Length() {
+ err = &memmap.BusError{syserror.EFAULT}
+ }
+ if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
+ return []memmap.Translation{
+ {
+ Source: source,
+ File: m.mfp.MemoryFile(),
+ Offset: m.fr.Start + source.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, err
+ }
+ return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
+ return nil
+}
+
+// NewAIOContext creates a new context for asynchronous I/O.
+//
+// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
+func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
+ // libaio get_ioevents() expects context "handle" to be a valid address.
+ // libaio peeks inside looking for a magic number. This function allocates
+ // a page per context and keeps it set to zeroes to ensure it will not
+ // match AIO_RING_MAGIC and make libaio happy.
+ m, err := newAIOMappable(mm.mfp)
+ if err != nil {
+ return 0, err
+ }
+ defer m.DecRef()
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: aioRingBufferSize,
+ MappingIdentity: m,
+ Mappable: m,
+ // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+ // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+ // user mode should not write to this page.
+ Perms: usermem.Read,
+ MaxPerms: usermem.Read,
+ })
+ if err != nil {
+ return 0, err
+ }
+ id := uint64(addr)
+ if !mm.aioManager.newAIOContext(events, id) {
+ mm.MUnmap(ctx, addr, aioRingBufferSize)
+ return 0, syserror.EINVAL
+ }
+ return id, nil
+}
+
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context, or nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
+ if _, ok := mm.LookupAIOContext(ctx, id); !ok {
+ return nil
+ }
+
+// Only unmap after it is assured that the address is a valid aio context, to
+// prevent random memory from being unmapped.
+//
+// Note: It's possible to unmap this address and map something else into
+// the same address. Then it would be unmapping memory that it doesn't own.
+// This is, however, the way Linux implements AIO. Keep the same [weird]
+// semantics in case anyone relies on it.
+ mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize)
+
+ return mm.aioManager.destroyAIOContext(id)
+}
+
+// LookupAIOContext looks up the given context. It returns false if the context
+// does not exist.
+func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
+ aioCtx, ok := mm.aioManager.lookupAIOContext(id)
+ if !ok {
+ return nil, false
+ }
+
+ // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes
+ // from id).
+ var buf [4]byte
+ _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{})
+ if err != nil {
+ return nil, false
+ }
+
+ return aioCtx, true
+}
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
new file mode 100644
index 000000000..3dabac1af
--- /dev/null
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+// afterLoad is invoked by stateify.
+func (a *AIOContext) afterLoad() {
+ a.requestReady = make(chan struct{}, 1)
+}
diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go
new file mode 100644
index 000000000..c273c982e
--- /dev/null
+++ b/pkg/sentry/mm/debug.go
@@ -0,0 +1,98 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+)
+
+const (
+ // If checkInvariants is true, perform runtime checks for invariants
+ // expected by the mm package. This is normally disabled since MM is a
+ // significant hot path in general, and some such checks (notably
+ // memmap.CheckTranslateResult) are very expensive.
+ checkInvariants = false
+
+ // If logIOErrors is true, log I/O errors that originate from MM before
+ // converting them to EFAULT.
+ logIOErrors = false
+)
+
+// String implements fmt.Stringer.String.
+func (mm *MemoryManager) String() string {
+ return mm.DebugString(context.Background())
+}
+
+// DebugString returns a string containing information about mm for debugging.
+func (mm *MemoryManager) DebugString(ctx context.Context) string {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.debugStringLocked(ctx)
+}
+
+// Preconditions: mm.mappingMu and mm.activeMu must be locked.
+func (mm *MemoryManager) debugStringLocked(ctx context.Context) string {
+ var b bytes.Buffer
+ b.WriteString("VMAs:\n")
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ b.Write(mm.vmaMapsEntryLocked(ctx, vseg))
+ }
+ b.WriteString("PMAs:\n")
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ b.Write(pseg.debugStringEntryLocked())
+ }
+ return string(b.Bytes())
+}
+
+// Preconditions: mm.activeMu must be locked.
+func (pseg pmaIterator) debugStringEntryLocked() []byte {
+ var b bytes.Buffer
+
+ fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End())
+
+ pma := pseg.ValuePtr()
+ if pma.effectivePerms.Read {
+ b.WriteByte('r')
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.effectivePerms.Write {
+ if pma.needCOW {
+ b.WriteByte('c')
+ } else {
+ b.WriteByte('w')
+ }
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.effectivePerms.Execute {
+ b.WriteByte('x')
+ } else {
+ b.WriteByte('-')
+ }
+ if pma.private {
+ b.WriteByte('p')
+ } else {
+ b.WriteByte('s')
+ }
+
+ fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file)
+ return b.Bytes()
+}
diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go
new file mode 100644
index 000000000..fa776f9c6
--- /dev/null
+++ b/pkg/sentry/mm/io.go
@@ -0,0 +1,639 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// There are two supported ways to copy data to/from application virtual
+// memory:
+//
+// 1. Internally-mapped copying: Determine the platform.File that backs the
+// copied-to/from virtual address, obtain a mapping of its pages, and read or
+// write to the mapping.
+//
+// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is
+// true, AddressSpace permissions are applicable, and an AddressSpace is
+// available, copy directly through the AddressSpace, handling faults as
+// needed.
+//
+// (Given that internally-mapped copying requires that backing memory is always
+// implemented using a host file descriptor, we could also preadv/pwritev to it
+// instead. But this would incur a host syscall for each use of the mapped
+// page, whereas mmap is a one-time cost.)
+//
+// The fixed overhead of internally-mapped copying is expected to be higher
+// than that of AddressSpace copying since the former always needs to translate
+// addresses, whereas the latter only needs to do so when faults occur.
+// However, the throughput of internally-mapped copying is expected to be
+// somewhat higher than that of AddressSpace copying due to the high cost of
+// page faults and because implementations of the latter usually rely on
+// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace
+// copying (when available) for smaller copies, and switch to internally-mapped
+// copying once a size threshold is exceeded.
+const (
+ // copyMapMinBytes is the size threshold for switching to internally-mapped
+ // copying in CopyOut, CopyIn, and ZeroOut.
+ copyMapMinBytes = 32 << 10 // 32 KB
+
+ // rwMapMinBytes is the size threshold for switching to internally-mapped
+ // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
+ // since AddressSpace copying in this case requires additional buffering;
+ // see CopyOutFrom for details.
+ rwMapMinBytes = 512
+)
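+
+// MemoryManager implements usermem.IO, which is how syscall code moves data
+// to and from application memory. A rough usage sketch (assuming a task
+// context ctx and an application address addr):
+//
+//	buf := make([]byte, 8)
+//	if _, err := mm.CopyIn(ctx, addr, buf, usermem.IOOpts{AddressSpaceActive: true}); err != nil {
+//		return err // typically EFAULT
+//	}
+//	if _, err := mm.CopyOut(ctx, addr, buf, usermem.IOOpts{AddressSpaceActive: true}); err != nil {
+//		return err
+//	}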
+
+// CheckIORange is similar to usermem.Addr.ToRange, but applies bounds checks
+// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
+//
+// Preconditions: length >= 0.
+func (mm *MemoryManager) CheckIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) {
+ // Note that access_ok() constrains end even if length == 0.
+ ar, ok := addr.ToRange(uint64(length))
+ return ar, (ok && ar.End <= mm.layout.MaxAddr)
+}
+
+// checkIOVec applies bounds checks consistent with Linux's
+// arch/x86/include/asm/uaccess.h:access_ok() to ars.
+func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool {
+ for !ars.IsEmpty() {
+ ar := ars.Head()
+ if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok {
+ return false
+ }
+ ars = ars.Tail()
+ }
+ return true
+}
+
+func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
+ return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
+}
+
+// translateIOError converts errors to EFAULT, as is usually reported for all
+// I/O errors originating from MM in Linux.
+func translateIOError(ctx context.Context, err error) error {
+ if err == nil {
+ return nil
+ }
+ if logIOErrors {
+ ctx.Debugf("MM I/O error: %v", err)
+ }
+ return syserror.EFAULT
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+ ar, ok := mm.CheckIORange(addr, int64(len(src)))
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if len(src) == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && len(src) < copyMapMinBytes {
+ return mm.asCopyOut(ctx, addr, src)
+ }
+
+ // Go through internal mappings.
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
+ return n, translateIOError(ctx, err)
+ })
+ return int(n64), err
+}
+
+func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) {
+ var done int
+ for {
+ n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:])
+ done += n
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(len(src)))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+ ar, ok := mm.CheckIORange(addr, int64(len(dst)))
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes {
+ return mm.asCopyIn(ctx, addr, dst)
+ }
+
+ // Go through internal mappings.
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims)
+ return n, translateIOError(ctx, err)
+ })
+ return int(n64), err
+}
+
+func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) {
+ var done int
+ for {
+ n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:])
+ done += n
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(len(dst)))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+ ar, ok := mm.CheckIORange(addr, toZero)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ if toZero == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && toZero < copyMapMinBytes {
+ return mm.asZeroOut(ctx, addr, toZero)
+ }
+
+ // Go through internal mappings.
+ return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) {
+ n, err := safemem.ZeroSeq(dsts)
+ return n, translateIOError(ctx, err)
+ })
+}
+
+func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) {
+ var done int64
+ for {
+ n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done))
+ done += int64(n)
+ if err == nil {
+ return done, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ ar, _ := addr.ToRange(uint64(toZero))
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil {
+ return done, err
+ }
+ continue
+ }
+ return done, translateIOError(ctx, err)
+ }
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+ if !mm.checkIOVec(ars) {
+ return 0, syserror.EFAULT
+ }
+
+ if ars.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
+ // We have to introduce a buffered copy, instead of just passing a
+ // safemem.BlockSeq representing addresses in the AddressSpace to src.
+ // This is because usermem.IO.CopyOutFrom() guarantees that it calls
+ // src.ReadToBlocks() at most once, which is incompatible with handling
+ // faults between calls. In the future, this is probably best resolved
+ // by introducing a CopyOutFrom variant or option that allows it to
+ // call src.ReadToBlocks() any number of times.
+ //
+ // This issue applies to CopyInTo as well.
+ buf := make([]byte, int(ars.NumBytes()))
+ bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
+ var done int64
+ for done < int64(bufN) {
+ ar := ars.Head()
+ cplen := int64(ar.Length())
+ if cplen > int64(bufN)-done {
+ cplen = int64(bufN) - done
+ }
+ n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)])
+ done += int64(n)
+ if err != nil {
+ return done, err
+ }
+ ars = ars.Tail()
+ }
+ // Do not convert errors returned by src to EFAULT.
+ return done, bufErr
+ }
+
+ // Go through internal mappings.
+ return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks)
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+ if !mm.checkIOVec(ars) {
+ return 0, syserror.EFAULT
+ }
+
+ if ars.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
+ buf := make([]byte, int(ars.NumBytes()))
+ var done int
+ var bufErr error
+ for !ars.IsEmpty() {
+ ar := ars.Head()
+ var n int
+ n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())])
+ done += n
+ if bufErr != nil {
+ break
+ }
+ ars = ars.Tail()
+ }
+ n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done])))
+ if err != nil {
+ return int64(n), err
+ }
+ // Do not convert errors returned by dst to EFAULT.
+ return int64(n), bufErr
+ }
+
+ // Go through internal mappings.
+ return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks)
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ old, err := mm.as.SwapUint32(addr, new)
+ if err == nil {
+ return old, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var old uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ old, err = safemem.SwapUint32(im, new)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return old, err
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ prev, err := mm.as.CompareAndSwapUint32(addr, old, new)
+ if err == nil {
+ return prev, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var prev uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ prev, err = safemem.CompareAndSwapUint32(im, old, new)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return prev, err
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ val, err := mm.as.LoadUint32(addr)
+ if err == nil {
+ return val, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var val uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ val, err = safemem.LoadUint32(im)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return val, err
+}
+
+// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
+// operation spanning ioar.
+//
+// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
+ // Try to map all remaining pages in the I/O operation. This RoundUp can't
+ // overflow because otherwise it would have been caught by CheckIORange.
+ end, _ := ioar.End.RoundUp()
+ ar := usermem.AddrRange{addr.RoundDown(), end}
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have usable vmas. Here and below, only return early if we
+ // can't map the first (faulting) page; failures to map later pages are
+ // silently ignored. This maximizes partial success.
+ mm.mappingMu.RLock()
+ vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = pendaddr
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return translateIOError(ctx, err)
+}
+
+// withInternalMappings ensures that pmas exist for all addresses in ar,
+// support access of type (at, ignorePermissions), and have internal mappings
+// cached. It then calls f with mm.activeMu locked for reading, passing
+// internal mappings for the subrange of ar for which this property holds.
+//
+// withInternalMappings takes a function returning uint64 since many safemem
+// functions have this property, but returns an int64 since this is usually
+// more useful for usermem.IO methods.
+//
+// Preconditions: 0 < ar.Length() <= math.MaxInt64.
+func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
+ // If pmas are already available, we can do IO without touching mm.vmas or
+ // mm.mappingMu.
+ mm.activeMu.RLock()
+ if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
+ n, err := f(mm.internalMappingsLocked(pseg, ar))
+ mm.activeMu.RUnlock()
+ // Do not convert errors returned by f to EFAULT.
+ return int64(n), err
+ }
+ mm.activeMu.RUnlock()
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return 0, translateIOError(ctx, verr)
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return 0, translateIOError(ctx, perr)
+ }
+ ar.End = pendaddr
+ }
+ imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar)
+ mm.activeMu.DowngradeLock()
+ if imendaddr := imend.Start(); imendaddr < ar.End {
+ if imendaddr <= ar.Start {
+ mm.activeMu.RUnlock()
+ return 0, translateIOError(ctx, imerr)
+ }
+ ar.End = imendaddr
+ }
+
+ // Do I/O.
+ un, err := f(mm.internalMappingsLocked(pseg, ar))
+ mm.activeMu.RUnlock()
+ n := int64(un)
+
+ // Return the first error in order of progress through ar.
+ if err != nil {
+ // Do not convert errors returned by f to EFAULT.
+ return n, err
+ }
+ if imerr != nil {
+ return n, translateIOError(ctx, imerr)
+ }
+ if perr != nil {
+ return n, translateIOError(ctx, perr)
+ }
+ return n, translateIOError(ctx, verr)
+}
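+
+// withInternalMappings reports at most one error, chosen by how early in the
+// address range progress stopped: the callback f failed within the shortest
+// usable prefix, so its error wins, followed by the internal-mapping error,
+// the pma error, and finally the vma error. A minimal sketch of that
+// selection (firstError is illustrative, not a helper in this package):
+//
+//	func firstError(fErr, imErr, pErr, vErr error) error {
+//		switch {
+//		case fErr != nil:
+//			return fErr // returned unmodified, never converted to EFAULT
+//		case imErr != nil:
+//			return imErr
+//		case pErr != nil:
+//			return pErr
+//		default:
+//			return vErr
+//		}
+//	}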
+
+// withVecInternalMappings ensures that pmas exist for all addresses in ars,
+// support access of type (at, ignorePermissions), and have internal mappings
+// cached. It then calls f with mm.activeMu locked for reading, passing
+// internal mappings for the subset of ars for which this property holds.
+//
+// Preconditions: !ars.IsEmpty().
+func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
+ // withInternalMappings is faster than withVecInternalMappings because of
+ // iterator plumbing (this isn't generally practical in the vector case due
+ // to iterator invalidation between AddrRanges). Use it if possible.
+ if ars.NumRanges() == 1 {
+ return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
+ }
+
+ // If pmas are already available, we can do IO without touching mm.vmas or
+ // mm.mappingMu.
+ mm.activeMu.RLock()
+ if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
+ n, err := f(mm.vecInternalMappingsLocked(ars))
+ mm.activeMu.RUnlock()
+ // Do not convert errors returned by f to EFAULT.
+ return int64(n), err
+ }
+ mm.activeMu.RUnlock()
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
+ if vars.NumBytes() == 0 {
+ mm.mappingMu.RUnlock()
+ return 0, translateIOError(ctx, verr)
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
+ mm.mappingMu.RUnlock()
+ if pars.NumBytes() == 0 {
+ mm.activeMu.Unlock()
+ return 0, translateIOError(ctx, perr)
+ }
+ imars, imerr := mm.getVecPMAInternalMappingsLocked(pars)
+ mm.activeMu.DowngradeLock()
+ if imars.NumBytes() == 0 {
+ mm.activeMu.RUnlock()
+ return 0, translateIOError(ctx, imerr)
+ }
+
+ // Do I/O.
+ un, err := f(mm.vecInternalMappingsLocked(imars))
+ mm.activeMu.RUnlock()
+ n := int64(un)
+
+ // Return the first error in order of progress through ars.
+ if err != nil {
+ // Do not convert errors from f to EFAULT.
+ return n, err
+ }
+ if imerr != nil {
+ return n, translateIOError(ctx, imerr)
+ }
+ if perr != nil {
+ return n, translateIOError(ctx, perr)
+ }
+ return n, translateIOError(ctx, verr)
+}
+
+// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
+// at most address end on AddrRange arsit.Head(). It is used in vector I/O
+// paths to truncate usermem.AddrRangeSeq when errors occur.
+//
+// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
+func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
+ ar := arsit.Head()
+ if end <= ar.Start {
+ return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
+ }
+ return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
+}
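+
+// A worked example of the arithmetic above, with made-up numbers: suppose ars
+// spans 48 bytes in total, arsit still has 20 bytes remaining, and
+// arsit.Head() starts at address 0x1000. If the error occurred at
+// end = 0x1008, the truncated sequence keeps
+//
+//	48 - 20 + (0x1008 - 0x1000) = 36
+//
+// bytes; if end <= 0x1000, only the 48 - 20 = 28 bytes preceding arsit.Head()
+// are kept.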
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
new file mode 100644
index 000000000..aac56679b
--- /dev/null
+++ b/pkg/sentry/mm/lifecycle.go
@@ -0,0 +1,283 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
+ return &MemoryManager{
+ p: p,
+ mfp: mfp,
+ haveASIO: p.SupportsAddressSpaceIO(),
+ privateRefs: &privateRefs{},
+ users: 1,
+ auxv: arch.Auxv{},
+ dumpability: UserDumpable,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: sleepForActivation,
+ }
+}
+
+// SetMmapLayout initializes mm's layout from the given arch.Context.
+//
+// Preconditions: mm contains no mappings and is not used concurrently.
+func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
+ layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
+ if err != nil {
+ return arch.MmapLayout{}, err
+ }
+ mm.layout = layout
+ return layout, nil
+}
+
+// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
+// clone() (without CLONE_VM).
+func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm2 := &MemoryManager{
+ p: mm.p,
+ mfp: mm.mfp,
+ haveASIO: mm.haveASIO,
+ layout: mm.layout,
+ privateRefs: mm.privateRefs,
+ users: 1,
+ brk: mm.brk,
+ usageAS: mm.usageAS,
+ dataAS: mm.dataAS,
+ // "The child does not inherit its parent's memory locks (mlock(2),
+ // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
+ // MLockNone, both of which are zero values. vma.mlockMode is reset
+ // when copied below.
+ captureInvalidations: true,
+ argv: mm.argv,
+ envv: mm.envv,
+ auxv: append(arch.Auxv(nil), mm.auxv...),
+ // IncRef'd below, once we know that there isn't an error.
+ executable: mm.executable,
+ dumpability: mm.dumpability,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: mm.sleepForActivation,
+ vdsoSigReturnAddr: mm.vdsoSigReturnAddr,
+ }
+
+ // Copy vmas.
+ dontforks := false
+ dstvgap := mm2.vmas.FirstGap()
+ for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
+ vma := srcvseg.Value() // makes a copy of the vma
+ vmaAR := srcvseg.Range()
+
+ if vma.dontfork {
+ length := uint64(vmaAR.Length())
+ mm2.usageAS -= length
+ if vma.isPrivateDataLocked() {
+ mm2.dataAS -= length
+ }
+ dontforks = true
+ continue
+ }
+
+ // Inform the Mappable, if any, of the new mapping.
+ if vma.mappable != nil {
+ if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
+ mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
+ return nil, err
+ }
+ }
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ vma.mlockMode = memmap.MLockNone
+ dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
+ // We don't need to update mm2.usageAS since we copied it from mm
+ // above.
+ }
+
+ // Copy pmas. We have to lock mm.activeMu for writing to make existing
+ // private pmas copy-on-write. We also have to lock mm2.activeMu since
+ // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
+ // only copy private pmas, since in the common case where fork(2) is
+ // immediately followed by execve(2), copying non-private pmas that can be
+ // regenerated by calling memmap.Mappable.Translate is a waste of time.
+ // (Linux does the same; compare kernel/fork.c:dup_mmap() =>
+ // mm/memory.c:copy_page_range().)
+ mm2.activeMu.Lock()
+ defer mm2.activeMu.Unlock()
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ if dontforks {
+ defer mm.pmas.MergeRange(mm.applicationAddrRange())
+ }
+ srcvseg := mm.vmas.FirstSegment()
+ dstpgap := mm2.pmas.FirstGap()
+ var unmapAR usermem.AddrRange
+ for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
+ pma := srcpseg.ValuePtr()
+ if !pma.private {
+ continue
+ }
+
+ if dontforks {
+ // Find the 'vma' that contains the starting address
+ // associated with the 'pma' (there must be one).
+ srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
+ if checkInvariants {
+ if !srcvseg.Ok() {
+ panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
+ }
+ if srcpseg.Start() < srcvseg.Start() {
+ panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
+ }
+ }
+
+ srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
+ if srcvseg.ValuePtr().dontfork {
+ continue
+ }
+ pma = srcpseg.ValuePtr()
+ }
+
+ if !pma.needCOW {
+ pma.needCOW = true
+ if pma.effectivePerms.Write {
+ // We don't want to unmap the whole address space, even though
+ // doing so would reduce calls to unmapASLocked(), because mm
+ // will most likely continue to be used after the fork, so
+ // unmapping pmas unnecessarily will result in extra page
+ // faults. But we do want to merge consecutive AddrRanges
+ // across pma boundaries.
+ if unmapAR.End == srcpseg.Start() {
+ unmapAR.End = srcpseg.End()
+ } else {
+ if unmapAR.Length() != 0 {
+ mm.unmapASLocked(unmapAR)
+ }
+ unmapAR = srcpseg.Range()
+ }
+ pma.effectivePerms.Write = false
+ }
+ pma.maxPerms.Write = false
+ }
+ fr := srcpseg.fileRange()
+ mm2.incPrivateRef(fr)
+ srcpseg.ValuePtr().file.IncRef(fr)
+ addrRange := srcpseg.Range()
+ mm2.addRSSLocked(addrRange)
+ dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
+ }
+ if unmapAR.Length() != 0 {
+ mm.unmapASLocked(unmapAR)
+ }
+
+ // Between when we call memmap.Mappable.AddMapping while copying vmas and
+ // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
+ // ineffective because the pmas they invalidate haven't yet been copied,
+ // possibly allowing mm2 to get invalidated translations:
+ //
+ // Invalidating Mappable mm.Fork
+ // --------------------- -------
+ //
+ // mm2.Invalidate()
+ // mm.activeMu.Lock()
+ // mm.Invalidate() /* blocks */
+ // mm2.activeMu.Lock()
+ // (mm copies invalidated pma to mm2)
+ //
+ // This would technically be both safe (since we only copy private pmas,
+ // which will still hold a reference on their memory) and consistent with
+ // Linux, but we avoid it anyway by setting mm2.captureInvalidations during
+ // construction, causing calls to mm2.Invalidate() to be captured in
+ // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
+ // here.
+ mm2.captureInvalidations = false
+ for _, invArgs := range mm2.capturedInvalidations {
+ mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
+ }
+ mm2.capturedInvalidations = nil
+
+ if mm2.executable != nil {
+ mm2.executable.IncRef()
+ }
+ return mm2, nil
+}
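+
+// The capture-and-replay step at the end of Fork reduces to a small pattern:
+// while a flag is set, record requests instead of applying them, then clear
+// the flag and apply the backlog. A standalone sketch of that pattern (the
+// recorder type is hypothetical and unsynchronized; the real code does this
+// under activeMu):
+//
+//	type recorder struct {
+//		capturing bool
+//		captured  []int
+//	}
+//
+//	func (r *recorder) invalidate(x int, apply func(int)) {
+//		if r.capturing {
+//			r.captured = append(r.captured, x) // defer until replay
+//			return
+//		}
+//		apply(x)
+//	}
+//
+//	func (r *recorder) replay(apply func(int)) {
+//		r.capturing = false
+//		for _, x := range r.captured {
+//			apply(x)
+//		}
+//		r.captured = nil
+//	}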
+
+// IncUsers increments mm's user count and returns true. If the user count is
+// already 0, IncUsers does nothing and returns false.
+func (mm *MemoryManager) IncUsers() bool {
+ for {
+ users := atomic.LoadInt32(&mm.users)
+ if users == 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+ return true
+ }
+ }
+}
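+
+// IncUsers is the usual "increment unless zero" refcount loop: a new
+// reference may only be taken while at least one other reference still
+// exists. The same pattern over a bare counter, using only the standard
+// library (tryIncRef is illustrative, not part of this package):
+//
+//	func tryIncRef(count *int32) bool {
+//		for {
+//			n := atomic.LoadInt32(count)
+//			if n == 0 {
+//				return false // already released; do not resurrect
+//			}
+//			if atomic.CompareAndSwapInt32(count, n, n+1) {
+//				return true
+//			}
+//			// Lost a race with a concurrent update; reload and retry.
+//		}
+//	}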
+
+// DecUsers decrements mm's user count. If the user count reaches 0, all
+// mappings in mm are unmapped.
+func (mm *MemoryManager) DecUsers(ctx context.Context) {
+ if users := atomic.AddInt32(&mm.users, -1); users > 0 {
+ return
+ } else if users < 0 {
+ panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
+ }
+
+ mm.aioManager.destroy()
+
+ mm.metadataMu.Lock()
+ exe := mm.executable
+ mm.executable = nil
+ mm.metadataMu.Unlock()
+ if exe != nil {
+ exe.DecRef()
+ }
+
+ mm.activeMu.Lock()
+ // Sanity check.
+ if atomic.LoadInt32(&mm.active) != 0 {
+ panic("active address space lost?")
+ }
+ // Make sure the AddressSpace is returned.
+ if mm.as != nil {
+ mm.as.Release()
+ mm.as = nil
+ }
+ mm.activeMu.Unlock()
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // If mm is being dropped before mm.SetMmapLayout was called,
+ // mm.applicationAddrRange() will be empty.
+ if ar := mm.applicationAddrRange(); ar.Length() != 0 {
+ mm.unmapLocked(ctx, ar)
+ }
+}
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
new file mode 100644
index 000000000..28e5057f7
--- /dev/null
+++ b/pkg/sentry/mm/metadata.go
@@ -0,0 +1,183 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Dumpability describes if and how core dumps should be created.
+type Dumpability int
+
+const (
+ // NotDumpable indicates that core dumps should never be created.
+ NotDumpable Dumpability = iota
+
+ // UserDumpable indicates that core dumps should be created, owned by
+ // the current user.
+ UserDumpable
+
+ // RootDumpable indicates that core dumps should be created, owned by
+ // root.
+ RootDumpable
+)
+
+// Dumpability returns the dumpability.
+func (mm *MemoryManager) Dumpability() Dumpability {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.dumpability
+}
+
+// SetDumpability sets the dumpability.
+func (mm *MemoryManager) SetDumpability(d Dumpability) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.dumpability = d
+}
+
+// ArgvStart returns the start of the application argument vector.
+//
+// There is no guarantee that this value is sensible w.r.t. ArgvEnd.
+func (mm *MemoryManager) ArgvStart() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.argv.Start
+}
+
+// SetArgvStart sets the start of the application argument vector.
+func (mm *MemoryManager) SetArgvStart(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.argv.Start = a
+}
+
+// ArgvEnd returns the end of the application argument vector.
+//
+// There is no guarantee that this value is sensible w.r.t. ArgvStart.
+func (mm *MemoryManager) ArgvEnd() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.argv.End
+}
+
+// SetArgvEnd sets the end of the application argument vector.
+func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.argv.End = a
+}
+
+// EnvvStart returns the start of the application environment vector.
+//
+// There is no guarantee that this value is sensible w.r.t. EnvvEnd.
+func (mm *MemoryManager) EnvvStart() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.envv.Start
+}
+
+// SetEnvvStart sets the start of the application environment vector.
+func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.envv.Start = a
+}
+
+// EnvvEnd returns the end of the application environment vector.
+//
+// There is no guarantee that this value is sensible w.r.t. EnvvStart.
+func (mm *MemoryManager) EnvvEnd() usermem.Addr {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.envv.End
+}
+
+// SetEnvvEnd sets the end of the application environment vector.
+func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.envv.End = a
+}
+
+// Auxv returns a copy of the application's auxiliary vector.
+func (mm *MemoryManager) Auxv() arch.Auxv {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return append(arch.Auxv(nil), mm.auxv...)
+}
+
+// SetAuxv sets the application's auxiliary vector.
+func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.auxv = append(arch.Auxv(nil), auxv...)
+}
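+
+// Both Auxv and SetAuxv copy with append(arch.Auxv(nil), s...) so that the
+// caller and mm never share a backing array. The idiom in isolation, using
+// plain ints in place of the arch.Auxv element type:
+//
+//	orig := []int{1, 2, 3}
+//	cp := append([]int(nil), orig...)
+//	cp[0] = 99
+//	// orig is still [1 2 3]; only cp observed the write.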
+
+// Executable returns the executable, if available.
+//
+// An additional reference will be taken in the case of a non-nil executable,
+// which must be released by the caller.
+func (mm *MemoryManager) Executable() fsbridge.File {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+
+ if mm.executable == nil {
+ return nil
+ }
+
+ mm.executable.IncRef()
+ return mm.executable
+}
+
+// SetExecutable sets the executable.
+//
+// This takes a reference on file.
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
+ mm.metadataMu.Lock()
+
+ // Grab a new reference.
+ file.IncRef()
+
+ // Set the executable.
+ orig := mm.executable
+ mm.executable = file
+
+ mm.metadataMu.Unlock()
+
+ // Release the old reference.
+ //
+ // Do this without holding the lock, since it may wind up doing some
+ // I/O to sync the dirent, etc.
+ if orig != nil {
+ orig.DecRef()
+ }
+}
+
+// VDSOSigReturn returns the address of vdso_sigreturn.
+func (mm *MemoryManager) VDSOSigReturn() uint64 {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.vdsoSigReturnAddr
+}
+
+// SetVDSOSigReturn sets the address of vdso_sigreturn.
+func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.vdsoSigReturnAddr = addr
+}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
new file mode 100644
index 000000000..6db7c3d40
--- /dev/null
+++ b/pkg/sentry/mm/mm.go
@@ -0,0 +1,478 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mm provides a memory management subsystem. See README.md for a
+// detailed overview.
+//
+// Lock order:
+//
+// fs locks, except for memmap.Mappable locks
+// mm.MemoryManager.metadataMu
+// mm.MemoryManager.mappingMu
+// Locks taken by memmap.Mappable methods other than Translate
+// mm.MemoryManager.activeMu
+// Locks taken by memmap.Mappable.Translate
+// mm.privateRefs.mu
+// platform.AddressSpace locks
+// platform.File locks
+// mm.aioManager.mu
+// mm.AIOContext.mu
+//
+// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
+// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
+// child first).
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// MemoryManager implements a virtual address space.
+//
+// +stateify savable
+type MemoryManager struct {
+ // p and mfp are immutable.
+ p platform.Platform
+ mfp pgalloc.MemoryFileProvider
+
+ // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
+ // eliminating an indirect call in the hot I/O path, this makes
+ // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
+ //
+ // haveASIO is immutable.
+ haveASIO bool `state:"nosave"`
+
+ // layout is the memory layout.
+ //
+ // layout is set by the binary loader before the MemoryManager can be used.
+ layout arch.MmapLayout
+
+ // privateRefs stores reference counts for private memory (memory whose
+ // ownership is shared by one or more pmas instead of being owned by a
+ // memmap.Mappable).
+ //
+ // privateRefs is immutable.
+ privateRefs *privateRefs
+
+ // users is the number of dependencies on the mappings in the MemoryManager.
+ // When the number of references in users reaches zero, all mappings are
+ // unmapped.
+ //
+ // users is accessed using atomic memory operations.
+ users int32
+
+ // mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
+ mappingMu sync.RWMutex `state:"nosave"`
+
+ // vmas stores virtual memory areas. Since vmas are stored by value,
+ // clients should usually use vmaIterator.ValuePtr() instead of
+ // vmaIterator.Value() to get a pointer to the vma rather than a copy.
+ //
+ // Invariants: vmas are always page-aligned.
+ //
+ // vmas is protected by mappingMu.
+ vmas vmaSet
+
+ // brk is the mm's brk, which is manipulated using the brk(2) system call.
+ // The brk is initially set up by the loader which maps an executable
+ // binary into the mm.
+ //
+ // brk is protected by mappingMu.
+ brk usermem.AddrRange
+
+ // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
+ //
+ // usageAS is protected by mappingMu.
+ usageAS uint64
+
+ // lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
+ // memmap.MLockNone.
+ //
+ // lockedAS is protected by mappingMu.
+ lockedAS uint64
+
+ // dataAS is the size of private data segments, like mm_struct->data_vm.
+	// That is, it counts vmas that are private, writable, and not stack.
+ //
+ // dataAS is protected by mappingMu.
+ dataAS uint64
+
+ // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
+ // defMLockMode is greater.
+ //
+ // defMLockMode is protected by mappingMu.
+ defMLockMode memmap.MLockMode
+
+ // activeMu is loosely analogous to Linux's struct
+ // mm_struct::page_table_lock.
+ activeMu sync.RWMutex `state:"nosave"`
+
+ // pmas stores platform mapping areas used to implement vmas. Since pmas
+ // are stored by value, clients should usually use pmaIterator.ValuePtr()
+ // instead of pmaIterator.Value() to get a pointer to the pma rather than
+ // a copy.
+ //
+ // Inserting or removing segments from pmas should happen along with a
+	// call to mm.addRSSLocked or mm.removeRSSLocked.
+ //
+ // Invariants: pmas are always page-aligned. If a pma exists for a given
+ // address, a vma must also exist for that address.
+ //
+ // pmas is protected by activeMu.
+ pmas pmaSet
+
+ // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
+ // reported as the MemoryManager's RSS.
+ //
+	// curRSS should be modified only via addRSSLocked and removeRSSLocked,
+	// not directly.
+	//
+	// curRSS is protected by activeMu.
+ curRSS uint64
+
+ // maxRSS is the maximum resident set size in bytes of a MemoryManager.
+ // It is tracked as the application adds and removes mappings to pmas.
+ //
+	// maxRSS should be modified only via addRSSLocked, not directly.
+ //
+ // maxRSS is protected by activeMu.
+ maxRSS uint64
+
+ // as is the platform.AddressSpace that pmas are mapped into. active is the
+ // number of contexts that require as to be non-nil; if active == 0, as may
+ // be nil.
+ //
+ // as is protected by activeMu. active is manipulated with atomic memory
+ // operations; transitions to and from zero are additionally protected by
+ // activeMu. (This is because such transitions may need to be atomic with
+ // changes to as.)
+ as platform.AddressSpace `state:"nosave"`
+ active int32 `state:"zerovalue"`
+
+ // unmapAllOnActivate indicates that the next Activate call should activate
+ // an empty AddressSpace.
+ //
+ // This is used to ensure that an AddressSpace cached in
+ // NewAddressSpace is not used after some change in the MemoryManager
+ // or VMAs has made that AddressSpace stale.
+ //
+ // unmapAllOnActivate is protected by activeMu. It must only be set when
+ // there is no active or cached AddressSpace. If as != nil, then
+ // invalidations should be propagated immediately.
+ unmapAllOnActivate bool `state:"nosave"`
+
+ // If captureInvalidations is true, calls to MM.Invalidate() are recorded
+ // in capturedInvalidations rather than being applied immediately to pmas.
+ // This is to avoid a race condition in MM.Fork(); see that function for
+ // details.
+ //
+ // Both captureInvalidations and capturedInvalidations are protected by
+	// activeMu. Neither needs to be saved since captureInvalidations is only
+ // enabled during MM.Fork(), during which saving can't occur.
+ captureInvalidations bool `state:"zerovalue"`
+ capturedInvalidations []invalidateArgs `state:"nosave"`
+
+ metadataMu sync.Mutex `state:"nosave"`
+
+ // argv is the application argv. This is set up by the loader and may be
+ // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
+ // requirements apply to argv; we do not require that argv.WellFormed().
+ //
+ // argv is protected by metadataMu.
+ argv usermem.AddrRange
+
+ // envv is the application envv. This is set up by the loader and may be
+ // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
+ // requirements apply to envv; we do not require that envv.WellFormed().
+ //
+ // envv is protected by metadataMu.
+ envv usermem.AddrRange
+
+ // auxv is the ELF's auxiliary vector.
+ //
+ // auxv is protected by metadataMu.
+ auxv arch.Auxv
+
+ // executable is the executable for this MemoryManager. If executable
+ // is not nil, it holds a reference on the Dirent.
+ //
+ // executable is protected by metadataMu.
+ executable fsbridge.File
+
+ // dumpability describes if and how this MemoryManager may be dumped to
+ // userspace.
+ //
+ // dumpability is protected by metadataMu.
+ dumpability Dumpability
+
+ // aioManager keeps track of AIOContexts used for async IOs. AIOManager
+ // must be cloned when CLONE_VM is used.
+ aioManager aioManager
+
+ // sleepForActivation indicates whether the task should report to be sleeping
+ // before trying to activate the address space. When set to true, delays in
+ // activation are not reported as stuck tasks by the watchdog.
+ sleepForActivation bool
+
+ // vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
+ vdsoSigReturnAddr uint64
+}
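+
+// The lock order in the package comment above means that metadataMu is taken
+// before mappingMu, which is taken before activeMu. A caller that needs all
+// three, as Fork in lifecycle.go does, therefore acquires them in exactly
+// that sequence (abridged sketch):
+//
+//	mm.metadataMu.Lock()
+//	defer mm.metadataMu.Unlock()
+//	mm.mappingMu.RLock()
+//	defer mm.mappingMu.RUnlock()
+//	// ... copy vmas ...
+//	mm.activeMu.Lock()
+//	defer mm.activeMu.Unlock()
+//	// ... copy pmas ...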
+
+// vma represents a virtual memory area.
+//
+// +stateify savable
+type vma struct {
+ // mappable is the virtual memory object mapped by this vma. If mappable is
+ // nil, the vma represents a private anonymous mapping.
+ mappable memmap.Mappable
+
+ // off is the offset into mappable at which this vma begins. If mappable is
+ // nil, off is meaningless.
+ off uint64
+
+	// To speed up VMA save/restore, we group and save the following booleans
+ // as a single integer.
+
+ // realPerms are the memory permissions on this vma, as defined by the
+ // application.
+ realPerms usermem.AccessType `state:".(int)"`
+
+ // effectivePerms are the memory permissions on this vma which are
+ // actually used to control access.
+ //
+ // Invariant: effectivePerms == realPerms.Effective().
+ effectivePerms usermem.AccessType `state:"manual"`
+
+ // maxPerms limits the set of permissions that may ever apply to this
+ // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions
+ // is true (e.g. ptrace(PTRACE_POKEDATA)).
+ //
+ // Invariant: maxPerms == maxPerms.Effective().
+ maxPerms usermem.AccessType `state:"manual"`
+
+ // private is true if this is a MAP_PRIVATE mapping, such that writes to
+ // the mapping are propagated to a copy.
+ private bool `state:"manual"`
+
+ // growsDown is true if the mapping may be automatically extended downward
+ // under certain conditions. If growsDown is true, mappable must be nil.
+ //
+ // There is currently no corresponding growsUp flag; in Linux, the only
+ // architectures that can have VM_GROWSUP mappings are ia64, parisc, and
+ // metag, none of which we currently support.
+ growsDown bool `state:"manual"`
+
+ // dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
+ dontfork bool
+
+ mlockMode memmap.MLockMode
+
+ // numaPolicy is the NUMA policy for this vma set by mbind().
+ numaPolicy linux.NumaPolicy
+
+ // numaNodemask is the NUMA nodemask for this vma set by mbind().
+ numaNodemask uint64
+
+ // If id is not nil, it controls the lifecycle of mappable and provides vma
+ // metadata shown in /proc/[pid]/maps, and the vma holds a reference.
+ id memmap.MappingIdentity
+
+ // If hint is non-empty, it is a description of the vma printed in
+ // /proc/[pid]/maps. hint takes priority over id.MappedName().
+ hint string
+}
+
+const (
+ vmaRealPermsRead = 1 << iota
+ vmaRealPermsWrite
+ vmaRealPermsExecute
+ vmaEffectivePermsRead
+ vmaEffectivePermsWrite
+ vmaEffectivePermsExecute
+ vmaMaxPermsRead
+ vmaMaxPermsWrite
+ vmaMaxPermsExecute
+ vmaPrivate
+ vmaGrowsDown
+)
+
+func (v *vma) saveRealPerms() int {
+ var b int
+ if v.realPerms.Read {
+ b |= vmaRealPermsRead
+ }
+ if v.realPerms.Write {
+ b |= vmaRealPermsWrite
+ }
+ if v.realPerms.Execute {
+ b |= vmaRealPermsExecute
+ }
+ if v.effectivePerms.Read {
+ b |= vmaEffectivePermsRead
+ }
+ if v.effectivePerms.Write {
+ b |= vmaEffectivePermsWrite
+ }
+ if v.effectivePerms.Execute {
+ b |= vmaEffectivePermsExecute
+ }
+ if v.maxPerms.Read {
+ b |= vmaMaxPermsRead
+ }
+ if v.maxPerms.Write {
+ b |= vmaMaxPermsWrite
+ }
+ if v.maxPerms.Execute {
+ b |= vmaMaxPermsExecute
+ }
+ if v.private {
+ b |= vmaPrivate
+ }
+ if v.growsDown {
+ b |= vmaGrowsDown
+ }
+ return b
+}
+
+func (v *vma) loadRealPerms(b int) {
+ if b&vmaRealPermsRead > 0 {
+ v.realPerms.Read = true
+ }
+ if b&vmaRealPermsWrite > 0 {
+ v.realPerms.Write = true
+ }
+ if b&vmaRealPermsExecute > 0 {
+ v.realPerms.Execute = true
+ }
+ if b&vmaEffectivePermsRead > 0 {
+ v.effectivePerms.Read = true
+ }
+ if b&vmaEffectivePermsWrite > 0 {
+ v.effectivePerms.Write = true
+ }
+ if b&vmaEffectivePermsExecute > 0 {
+ v.effectivePerms.Execute = true
+ }
+ if b&vmaMaxPermsRead > 0 {
+ v.maxPerms.Read = true
+ }
+ if b&vmaMaxPermsWrite > 0 {
+ v.maxPerms.Write = true
+ }
+ if b&vmaMaxPermsExecute > 0 {
+ v.maxPerms.Execute = true
+ }
+ if b&vmaPrivate > 0 {
+ v.private = true
+ }
+ if b&vmaGrowsDown > 0 {
+ v.growsDown = true
+ }
+}
+
+// pma represents a platform mapping area.
+//
+// +stateify savable
+type pma struct {
+ // file is the file mapped by this pma. Only pmas for which file ==
+ // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
+ // the corresponding file range while they exist.
+ file platform.File `state:"nosave"`
+
+ // off is the offset into file at which this pma begins.
+ //
+ // Note that pmas do *not* hold references on offsets in file! If private
+ // is true, MemoryManager.privateRefs holds the reference instead. If
+ // private is false, the corresponding memmap.Mappable holds the reference
+ // instead (per memmap.Mappable.Translate requirement).
+ off uint64
+
+ // translatePerms is the permissions returned by memmap.Mappable.Translate.
+ // If private is true, translatePerms is usermem.AnyAccess.
+ translatePerms usermem.AccessType
+
+ // effectivePerms is the permissions allowed for non-ignorePermissions
+ // accesses. maxPerms is the permissions allowed for ignorePermissions
+ // accesses. These are vma.effectivePerms and vma.maxPerms respectively,
+ // masked by pma.translatePerms and with Write disallowed if pma.needCOW is
+ // true.
+ //
+ // These are stored in the pma so that the IO implementation can avoid
+ // iterating mm.vmas when pmas already exist.
+ effectivePerms usermem.AccessType
+ maxPerms usermem.AccessType
+
+ // needCOW is true if writes to the mapping must be propagated to a copy.
+ needCOW bool
+
+ // private is true if this pma represents private memory.
+ //
+ // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
+ // holds a reference on the mapped memory that is tracked in privateRefs,
+ // and calls to Invalidate for which
+ // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
+ //
+ // If private is false, this pma caches a translation from the
+ // corresponding vma's memmap.Mappable.Translate.
+ private bool
+
+ // If internalMappings is not empty, it is the cached return value of
+ // file.MapInternal for the platform.FileRange mapped by this pma.
+ internalMappings safemem.BlockSeq `state:"nosave"`
+}
+
+// +stateify savable
+type privateRefs struct {
+ mu sync.Mutex `state:"nosave"`
+
+ // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
+ // pmas (or, equivalently, MemoryManagers) that share ownership of the
+ // memory at that offset.
+ refs fileRefcountSet
+}
+
+type invalidateArgs struct {
+ ar usermem.AddrRange
+ opts memmap.InvalidateOpts
+}
+
+// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
+type fileRefcountSetFunctions struct{}
+
+func (fileRefcountSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+func (fileRefcountSetFunctions) MaxKey() uint64 {
+ return ^uint64(0)
+}
+
+func (fileRefcountSetFunctions) ClearValue(_ *int32) {
+}
+
+func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
+ return rc1, rc1 == rc2
+}
+
+func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
+ return rc, rc
+}
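+
+// The Merge and Split rules above keep the refcount set canonical: adjacent
+// file ranges coalesce only when they are shared by the same number of pmas,
+// and splitting a range leaves that count on both halves. With made-up
+// offsets and counts:
+//
+//	Merge([0k,4k)=2, [4k,8k)=2) => (2, true)   // one segment [0k,8k)=2
+//	Merge([0k,4k)=2, [4k,8k)=3) => (2, false)  // counts differ; keep both
+//	Split([0k,8k)=2, at 4k)     => (2, 2)      // both halves keep count 2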
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
new file mode 100644
index 000000000..fdc308542
--- /dev/null
+++ b/pkg/sentry/mm/mm_test.go
@@ -0,0 +1,230 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+func testMemoryManager(ctx context.Context) *MemoryManager {
+ p := platform.FromContext(ctx)
+ mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+ mm := NewMemoryManager(p, mfp, false)
+ mm.layout = arch.MmapLayout{
+ MinAddr: p.MinUserAddress(),
+ MaxAddr: p.MaxUserAddress(),
+ BottomUpBase: p.MinUserAddress(),
+ TopDownBase: p.MaxUserAddress(),
+ }
+ return mm
+}
+
+func (mm *MemoryManager) realUsageAS() uint64 {
+ return uint64(mm.vmas.Span())
+}
+
+func TestUsageASUpdates(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: 2 * usermem.PageSize,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+ realUsage := mm.realUsageAS()
+ if mm.usageAS != realUsage {
+ t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage)
+ }
+
+ mm.MUnmap(ctx, addr, usermem.PageSize)
+ realUsage = mm.realUsageAS()
+ if mm.usageAS != realUsage {
+ t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage)
+ }
+}
+
+func (mm *MemoryManager) realDataAS() uint64 {
+ var sz uint64
+ for seg := mm.vmas.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ vma := seg.Value()
+ if vma.isPrivateDataLocked() {
+ sz += uint64(seg.Range().Length())
+ }
+ }
+ return sz
+}
+
+func TestDataASUpdates(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: 3 * usermem.PageSize,
+ Private: true,
+ Perms: usermem.Write,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+ if mm.dataAS == 0 {
+ t.Fatalf("dataAS is 0, wanted not 0")
+ }
+ realDataAS := mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MUnmap(ctx, addr, usermem.PageSize)
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MProtect(addr+usermem.PageSize, usermem.PageSize, usermem.Read, false)
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+
+ mm.MRemap(ctx, addr+2*usermem.PageSize, usermem.PageSize, 2*usermem.PageSize, MRemapOpts{
+ Move: MRemapMayMove,
+ })
+ realDataAS = mm.realDataAS()
+ if mm.dataAS != realDataAS {
+ t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS)
+ }
+}
+
+func TestBrkDataLimitUpdates(t *testing.T) {
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.Data, limits.Limit{}, true /* privileged */) // zero RLIMIT_DATA
+
+ ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ // Try to extend the brk by one page and expect doing so to fail.
+ oldBrk, _ := mm.Brk(ctx, 0)
+ if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk {
+		t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x)", oldBrk, newBrk)
+ }
+}
+
+// TestIOAfterUnmap ensures that IO fails after unmap.
+func TestIOAfterUnmap(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: usermem.PageSize,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+
+ // IO works before munmap.
+ b := make([]byte, 1)
+ n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
+ if err != nil {
+ t.Errorf("CopyIn got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyIn got %d want 1", n)
+ }
+
+ err = mm.MUnmap(ctx, addr, usermem.PageSize)
+ if err != nil {
+ t.Fatalf("MUnmap got err %v want nil", err)
+ }
+
+ n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
+ if err != syserror.EFAULT {
+ t.Errorf("CopyIn got err %v want EFAULT", err)
+ }
+ if n != 0 {
+ t.Errorf("CopyIn got %d want 0", n)
+ }
+}
+
+// TestIOAfterMProtect tests IO interaction with mprotect permissions.
+func TestIOAfterMProtect(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mm := testMemoryManager(ctx)
+ defer mm.DecUsers(ctx)
+
+ addr, err := mm.MMap(ctx, memmap.MMapOpts{
+ Length: usermem.PageSize,
+ Private: true,
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ t.Fatalf("MMap got err %v want nil", err)
+ }
+
+ // Writing works before mprotect.
+ b := make([]byte, 1)
+ n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
+ if err != nil {
+ t.Errorf("CopyOut got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyOut got %d want 1", n)
+ }
+
+ err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false)
+ if err != nil {
+ t.Errorf("MProtect got err %v want nil", err)
+ }
+
+ // Without IgnorePermissions, CopyOut should no longer succeed.
+ n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
+ if err != syserror.EFAULT {
+ t.Errorf("CopyOut got err %v want EFAULT", err)
+ }
+ if n != 0 {
+ t.Errorf("CopyOut got %d want 0", n)
+ }
+
+ // With IgnorePermissions, CopyOut should succeed despite mprotect.
+ n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ if err != nil {
+ t.Errorf("CopyOut got err %v want nil", err)
+ }
+ if n != 1 {
+ t.Errorf("CopyOut got %d want 1", n)
+ }
+}
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
new file mode 100644
index 000000000..62e4c20af
--- /dev/null
+++ b/pkg/sentry/mm/pma.go
@@ -0,0 +1,1036 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/safecopy"
+ "gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// existingPMAsLocked checks that pmas exist for all addresses in ar, and
+// support access of type (at, ignorePermissions). If so, it returns an
+// iterator to the pma containing ar.Start. Otherwise it returns a terminal
+// iterator.
+//
+// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ first := mm.pmas.FindSegment(ar.Start)
+ pseg := first
+ for pseg.Ok() {
+ pma := pseg.ValuePtr()
+ perms := pma.effectivePerms
+ if ignorePermissions {
+ perms = pma.maxPerms
+ }
+ if !perms.SupersetOf(at) {
+ return pmaIterator{}
+ }
+ if needInternalMappings && pma.internalMappings.IsEmpty() {
+ return pmaIterator{}
+ }
+
+ if ar.End <= pseg.End() {
+ return first
+ }
+ pseg, _ = pseg.NextNonEmpty()
+ }
+
+ // Ran out of pmas before reaching ar.End.
+ return pmaIterator{}
+}
+
+// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
+// and support access of type (at, ignorePermissions).
+//
+// Preconditions: mm.activeMu must be locked.
+func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
+ for ; !ars.IsEmpty(); ars = ars.Tail() {
+ if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
+ return false
+ }
+ }
+ return true
+}
+
+// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
+// access of type at. It returns:
+//
+// - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last pma containing an address in ar. If
+// pmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if pmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
+// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist
+// for all addresses in ar, and support accesses of type at (i.e. permission
+// checks must have been performed against vmas).
+func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if !vseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
+ }
+ }
+
+ // Page-align ar so that all AddrRanges are aligned.
+ end, ok := ar.End.RoundUp()
+ var alignerr error
+ if !ok {
+ end = ar.End.RoundDown()
+ alignerr = syserror.EFAULT
+ }
+ ar = usermem.AddrRange{ar.Start.RoundDown(), end}
+
+ pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
+ if pend.Start() <= ar.Start {
+ return pmaIterator{}, pend, perr
+ }
+ // getPMAsInternalLocked may not have returned pstart due to iterator
+ // invalidation.
+ if !pstart.Ok() {
+ pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
+ }
+ if perr != nil {
+ return pstart, pend, perr
+ }
+ return pstart, pend, alignerr
+}
+
+// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
+// support access of type at. It returns the subset of ars for which pmas
+// exist. If this is not equal to ars, it returns a non-nil error explaining
+// why.
+//
+// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
+// writing. vmas must exist for all addresses in ars, and support accesses of
+// type at (i.e. permission checks must have been performed against vmas).
+func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if checkInvariants {
+ if !ar.WellFormed() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Page-align ar so that all AddrRanges are aligned.
+ end, ok := ar.End.RoundUp()
+ var alignerr error
+ if !ok {
+ end = ar.End.RoundDown()
+ alignerr = syserror.EFAULT
+ }
+ ar = usermem.AddrRange{ar.Start.RoundDown(), end}
+
+ _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
+ if perr != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
+ }
+ if alignerr != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
+ }
+ }
+
+ return ars, nil
+}
+
+// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
+// exceptions:
+//
+// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
+// is, the returned iterator may be terminal, even if a pma that contains
+// ar.Start exists). Returning this iterator on a best-effort basis allows
+// callers that require it to use it when it's cheaply available, while also
+// avoiding the overhead of retrieving it when it's not.
+//
+// - getPMAsInternalLocked additionally requires that ar is page-aligned.
+//
+// getPMAsInternalLocked is an implementation helper for getPMAsLocked and
+// getVecPMAsLocked; other clients should call one of those instead.
+func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if !vseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
+ }
+ }
+
+ mf := mm.mfp.MemoryFile()
+ // Limit the range we allocate to ar, aligned to privateAllocUnit.
+ maskAR := privateAligned(ar)
+ didUnmapAS := false
+ // The range in which we iterate vmas and pmas is still limited to ar, to
+ // ensure that we don't allocate or COW-break a pma we don't need.
+ pseg, pgap := mm.pmas.Find(ar.Start)
+ pstart := pseg
+ for {
+ // Get pmas for this vma.
+ vsegAR := vseg.Range().Intersect(ar)
+ vma := vseg.ValuePtr()
+ pmaLoop:
+ for {
+ switch {
+ case pgap.Ok() && pgap.Start() < vsegAR.End:
+ // Need a pma here.
+ optAR := vseg.Range().Intersect(pgap.Range())
+ if checkInvariants {
+ if optAR.Length() <= 0 {
+ panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
+ }
+ }
+ if vma.mappable == nil {
+ // Private anonymous mappings get pmas by allocating.
+ allocAR := optAR.Intersect(maskAR)
+ fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous)
+ if err != nil {
+ return pstart, pgap, err
+ }
+ if checkInvariants {
+ if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
+ panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
+ }
+ }
+ mm.addRSSLocked(allocAR)
+ mm.incPrivateRef(fr)
+ mf.IncRef(fr)
+ pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
+ file: mf,
+ off: fr.Start,
+ translatePerms: usermem.AnyAccess,
+ effectivePerms: vma.effectivePerms,
+ maxPerms: vma.maxPerms,
+ // Since we just allocated this memory and have the
+ // only reference, the new pma does not need
+ // copy-on-write.
+ private: true,
+ }).NextNonEmpty()
+ pstart = pmaIterator{} // iterators invalidated
+ } else {
+ // Other mappings get pmas by translating.
+ optMR := vseg.mappableRangeOf(optAR)
+ reqAR := optAR.Intersect(ar)
+ reqMR := vseg.mappableRangeOf(reqAR)
+ perms := at
+ if vma.private {
+ // This pma will be copy-on-write; don't require write
+ // permission, but do require read permission to
+ // facilitate the copy.
+ //
+ // If at.Write is true, we will need to break
+ // copy-on-write immediately, which occurs after
+ // translation below.
+ perms.Read = true
+ perms.Write = false
+ }
+ ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
+ if checkInvariants {
+ if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
+ panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
+ }
+ }
+ // Install a pma for each translation.
+ if len(ts) == 0 {
+ return pstart, pgap, err
+ }
+ pstart = pmaIterator{} // iterators invalidated
+ for _, t := range ts {
+ newpmaAR := vseg.addrRangeOf(t.Source)
+ newpma := pma{
+ file: t.File,
+ off: t.Offset,
+ translatePerms: t.Perms,
+ effectivePerms: vma.effectivePerms.Intersect(t.Perms),
+ maxPerms: vma.maxPerms.Intersect(t.Perms),
+ }
+ if vma.private {
+ newpma.effectivePerms.Write = false
+ newpma.maxPerms.Write = false
+ newpma.needCOW = true
+ }
+ mm.addRSSLocked(newpmaAR)
+ t.File.IncRef(t.FileRange())
+ // This is valid because memmap.Mappable.Translate is
+ // required to return Translations in increasing
+ // Translation.Source order.
+ pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
+ pgap = pseg.NextGap()
+ }
+ // The error returned by Translate is only significant if
+ // it occurred before ar.End.
+ if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
+ return pstart, pgap, err
+ }
+ // Rewind pseg to the first pma inserted and continue the
+ // loop to check if we need to break copy-on-write.
+ pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
+ continue
+ }
+
+ case pseg.Ok() && pseg.Start() < vsegAR.End:
+ oldpma := pseg.ValuePtr()
+ if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
+ // Break copy-on-write by copying.
+ if checkInvariants {
+ if !oldpma.maxPerms.Read {
+ panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
+ }
+ }
+ // The majority of copy-on-write breaks on executable pages
+ // come from:
+ //
+ // - The ELF loader, which must zero out bytes on the last
+ // page of each segment after the end of the segment.
+ //
+ // - gdb's use of ptrace to insert breakpoints.
+ //
+ // Neither of these cases has enough spatial locality to
+ // benefit from copying nearby pages, so if the vma is
+ // executable, only copy the pages required.
+ var copyAR usermem.AddrRange
+ if vseg.ValuePtr().effectivePerms.Execute {
+ copyAR = pseg.Range().Intersect(ar)
+ } else {
+ copyAR = pseg.Range().Intersect(maskAR)
+ }
+ // Get internal mappings from the pma to copy from.
+ if err := pseg.getInternalMappingsLocked(); err != nil {
+ return pstart, pseg.PrevGap(), err
+ }
+ // Copy contents.
+ fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
+ if _, ok := err.(safecopy.BusError); ok {
+ // If we got SIGBUS during the copy, deliver SIGBUS to
+ // userspace (instead of SIGSEGV) if we're breaking
+ // copy-on-write due to application page fault.
+ err = &memmap.BusError{err}
+ }
+ if fr.Length() == 0 {
+ return pstart, pseg.PrevGap(), err
+ }
+ // Unmap all of maskAR, not just copyAR, to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ if !didUnmapAS {
+ mm.unmapASLocked(maskAR)
+ didUnmapAS = true
+ }
+ // Replace the pma with a copy in the part of the address
+ // range where copying was successful. This doesn't change
+ // RSS.
+ copyAR.End = copyAR.Start + usermem.Addr(fr.Length())
+ if copyAR != pseg.Range() {
+ pseg = mm.pmas.Isolate(pseg, copyAR)
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ oldpma = pseg.ValuePtr()
+ if oldpma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ oldpma.file.DecRef(pseg.fileRange())
+ mm.incPrivateRef(fr)
+ mf.IncRef(fr)
+ oldpma.file = mf
+ oldpma.off = fr.Start
+ oldpma.translatePerms = usermem.AnyAccess
+ oldpma.effectivePerms = vma.effectivePerms
+ oldpma.maxPerms = vma.maxPerms
+ oldpma.needCOW = false
+ oldpma.private = true
+ oldpma.internalMappings = safemem.BlockSeq{}
+ // Try to merge the pma with its neighbors.
+ if prev := pseg.PrevSegment(); prev.Ok() {
+ if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
+ pseg = merged
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ }
+ if next := pseg.NextSegment(); next.Ok() {
+ if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
+ pseg = merged
+ pstart = pmaIterator{} // iterators invalidated
+ }
+ }
+ // The error returned by AllocateAndFill is only
+ // significant if it occurred before ar.End.
+ if err != nil && pseg.End() < ar.End {
+ return pstart, pseg.NextGap(), err
+ }
+ // Ensure pseg and pgap are correct for the next iteration
+ // of the loop.
+ pseg, pgap = pseg.NextNonEmpty()
+ } else if !oldpma.translatePerms.SupersetOf(at) {
+ // Get new pmas (with sufficient permissions) by calling
+ // memmap.Mappable.Translate again.
+ if checkInvariants {
+ if oldpma.private {
+ panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
+ }
+ }
+ // Allow the entire pma to be replaced.
+ optAR := pseg.Range()
+ optMR := vseg.mappableRangeOf(optAR)
+ reqAR := optAR.Intersect(ar)
+ reqMR := vseg.mappableRangeOf(reqAR)
+ perms := oldpma.translatePerms.Union(at)
+ ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
+ if checkInvariants {
+ if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
+ panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
+ }
+ }
+ // Remove the part of the existing pma covered by new
+ // Translations, then insert new pmas. This doesn't change
+ // RSS. Note that we don't need to call unmapASLocked: any
+ // existing AddressSpace mappings are still valid (though
+ // less permissive than the new pmas indicate) until
+ // Invalidate is called, and will be replaced by future
+ // calls to mapASLocked.
+ if len(ts) == 0 {
+ return pstart, pseg.PrevGap(), err
+ }
+ transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
+ transAR := vseg.addrRangeOf(transMR)
+ pseg = mm.pmas.Isolate(pseg, transAR)
+ pseg.ValuePtr().file.DecRef(pseg.fileRange())
+ pgap = mm.pmas.Remove(pseg)
+ pstart = pmaIterator{} // iterators invalidated
+ for _, t := range ts {
+ newpmaAR := vseg.addrRangeOf(t.Source)
+ newpma := pma{
+ file: t.File,
+ off: t.Offset,
+ translatePerms: t.Perms,
+ effectivePerms: vma.effectivePerms.Intersect(t.Perms),
+ maxPerms: vma.maxPerms.Intersect(t.Perms),
+ }
+ if vma.private {
+ newpma.effectivePerms.Write = false
+ newpma.maxPerms.Write = false
+ newpma.needCOW = true
+ }
+ t.File.IncRef(t.FileRange())
+ pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
+ pgap = pseg.NextGap()
+ }
+ // The error returned by Translate is only significant if
+ // it occurred before ar.End.
+ if err != nil && pseg.End() < ar.End {
+ return pstart, pgap, err
+ }
+ // Ensure pseg and pgap are correct for the next iteration
+ // of the loop.
+ if pgap.Range().Length() == 0 {
+ pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
+ } else {
+ pseg = pmaIterator{}
+ }
+ } else {
+ // We have a usable pma; continue.
+ pseg, pgap = pseg.NextNonEmpty()
+ }
+
+ default:
+ break pmaLoop
+ }
+ }
+ // Go to the next vma.
+ if ar.End <= vseg.End() {
+ if pgap.Ok() {
+ return pstart, pgap, nil
+ }
+ return pstart, pseg.PrevGap(), nil
+ }
+ vseg = vseg.NextSegment()
+ }
+}
+
+const (
+ // When memory is allocated for a private pma, align the allocated address
+ // range to a privateAllocUnit boundary when possible. Larger values of
+ // privateAllocUnit may reduce page faults by allowing fewer, larger pmas
+ // to be mapped, but may result in larger amounts of wasted memory in the
+ // presence of fragmentation. privateAllocUnit must be a power-of-2
+ // multiple of usermem.PageSize.
+ privateAllocUnit = usermem.HugePageSize
+
+ privateAllocMask = privateAllocUnit - 1
+)
+
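+// privateAligned returns ar expanded to privateAllocUnit boundaries: Start is
+// rounded down unconditionally, and End is rounded up unless doing so would
+// overflow.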
+func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
+ aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End}
+ if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
+ aligned.End = end
+ }
+ if checkInvariants {
+ if !aligned.IsSupersetOf(ar) {
+ panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
+ }
+ }
+ return aligned
+}
+
+// isPMACopyOnWriteLocked returns true if the contents of the pma represented
+// by pseg must be copied to a new private pma to be written to.
+//
+// If the pma is a copy-on-write private pma, and holds the only reference on
+// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
+// and update the pma to indicate that it does not require copy-on-write.
+//
+// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
+// locked. mm.activeMu must be locked for writing.
+func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
+ pma := pseg.ValuePtr()
+ if !pma.needCOW {
+ return false
+ }
+ if !pma.private {
+ return true
+ }
+ // If we have the only reference on private memory to be copied, just take
+ // ownership of it instead of copying. If we do hold the only reference,
+ // additional references can only be taken by mm.Fork(), which is excluded
+ // by mm.activeMu, so this isn't racy.
+ mm.privateRefs.mu.Lock()
+ defer mm.privateRefs.mu.Unlock()
+ fr := pseg.fileRange()
+ // This check relies on mm.privateRefs.refs being kept fully merged.
+ rseg := mm.privateRefs.refs.FindSegment(fr.Start)
+ if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
+ pma.needCOW = false
+ // pma.private => pma.translatePerms == usermem.AnyAccess
+ vma := vseg.ValuePtr()
+ pma.effectivePerms = vma.effectivePerms
+ pma.maxPerms = vma.maxPerms
+ return false
+ }
+ return true
+}
+
+// Invalidate implements memmap.MappingSpace.Invalidate.
+func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ if mm.captureInvalidations {
+ mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
+ return
+ }
+ mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
+}
+
+// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
+// addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ var didUnmapAS bool
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ for pseg.Ok() && pseg.Start() < ar.End {
+ pma := pseg.ValuePtr()
+ if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
+ pseg = mm.pmas.Isolate(pseg, ar)
+ pma = pseg.ValuePtr()
+ if !didUnmapAS {
+ // Unmap all of ar, not just pseg.Range(), to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ if pma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ mm.removeRSSLocked(pseg.Range())
+ pma.file.DecRef(pseg.fileRange())
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ } else {
+ pseg = pseg.NextSegment()
+ }
+ }
+}
+
+// Pin returns the platform.File ranges currently mapped by addresses in ar in
+// mm, acquiring a reference on the returned ranges which the caller must
+// release by calling Unpin. If not all addresses are mapped, Pin returns a
+// non-nil error. Note that Pin may return both a non-empty slice of
+// PinnedRanges and a non-nil error.
+//
+// Pin does not prevent mapped ranges from changing, making it unsuitable for
+// most I/O. It should only be used in contexts that would use get_user_pages()
+// in the Linux kernel.
+//
+// Preconditions: ar.Length() != 0. ar must be page-aligned.
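+//
+// For illustration, a typical caller pins a range, uses the returned ranges,
+// and then releases them; a sketch (doSomethingWith is a placeholder):
+//
+//	prs, err := mm.Pin(ctx, ar, usermem.Read, false /* ignorePermissions */)
+//	defer Unpin(prs) // releases any partial pins even if err != nil
+//	if err != nil {
+//		return err
+//	}
+//	doSomethingWith(prs)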
+func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Ensure that we have usable vmas.
+ mm.mappingMu.RLock()
+ vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return nil, verr
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return nil, perr
+ }
+ ar.End = pendaddr
+ }
+
+ // Gather pmas.
+ var prs []PinnedRange
+ for pseg.Ok() && pseg.Start() < ar.End {
+ psar := pseg.Range().Intersect(ar)
+ f := pseg.ValuePtr().file
+ fr := pseg.fileRangeOf(psar)
+ f.IncRef(fr)
+ prs = append(prs, PinnedRange{
+ Source: psar,
+ File: f,
+ Offset: fr.Start,
+ })
+ pseg = pseg.NextSegment()
+ }
+ mm.activeMu.Unlock()
+
+ // Return the first error in order of progress through ar.
+ if perr != nil {
+ return prs, perr
+ }
+ return prs, verr
+}
+
+// PinnedRanges are returned by MemoryManager.Pin.
+type PinnedRange struct {
+ // Source is the corresponding range of addresses.
+ Source usermem.AddrRange
+
+ // File is the mapped file.
+ File platform.File
+
+ // Offset is the offset into File at which this PinnedRange begins.
+ Offset uint64
+}
+
+// FileRange returns the platform.File offsets mapped by pr.
+func (pr PinnedRange) FileRange() platform.FileRange {
+ return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
+}
+
+// Unpin releases the reference held by prs.
+func Unpin(prs []PinnedRange) {
+ for i := range prs {
+ prs[i].File.DecRef(prs[i].FileRange())
+ }
+}
+
+// movePMAsLocked moves all pmas in oldAR to newAR.
+//
+// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
+// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
+// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
+ if checkInvariants {
+ if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
+ panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
+ }
+ if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() {
+ panic(fmt.Sprintf("invalid newAR: %v", newAR))
+ }
+ if oldAR.Length() > newAR.Length() {
+ panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
+ }
+ if oldAR.Overlaps(newAR) {
+ panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
+ }
+ // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
+ }
+
+ type movedPMA struct {
+ oldAR usermem.AddrRange
+ pma pma
+ }
+ var movedPMAs []movedPMA
+ pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
+ for pseg.Ok() && pseg.Start() < oldAR.End {
+ pseg = mm.pmas.Isolate(pseg, oldAR)
+ movedPMAs = append(movedPMAs, movedPMA{
+ oldAR: pseg.Range(),
+ pma: pseg.Value(),
+ })
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ // No RSS change is needed since we're re-inserting the same pmas
+ // below.
+ }
+
+ off := newAR.Start - oldAR.Start
+ pgap := mm.pmas.FindGap(newAR.Start)
+ for i := range movedPMAs {
+ mpma := &movedPMAs[i]
+ pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
+ pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
+ }
+
+ mm.unmapASLocked(oldAR)
+}
+
+// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
+// cached internal mappings. It returns:
+//
+// - An iterator to the gap after the last pma with internal mappings
+// containing an address in ar. If internal mappings exist for no addresses in
+// ar, the iterator is to a gap that begins before ar.Start.
+//
+// - An error that is non-nil if internal mappings exist for only a subset of
+// ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
+// ar.Length() != 0.
+//
+// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+ }
+ }
+
+ for {
+ if err := pseg.getInternalMappingsLocked(); err != nil {
+ return pseg.PrevGap(), err
+ }
+ if ar.End <= pseg.End() {
+ return pseg.NextGap(), nil
+ }
+ pseg, _ = pseg.NextNonEmpty()
+ }
+}
+
+// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
+// have cached internal mappings. It returns the subset of ars for which
+// internal mappings exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
+// all addresses in ar.
+//
+// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
+ return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
+ }
+ }
+ return ars, nil
+}
+
+// internalMappingsLocked returns internal mappings for addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ar. ar.Length() != 0.
+// pseg.Range().Contains(ar.Start).
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().Contains(ar.Start) {
+ panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+ }
+ }
+
+ if ar.End <= pseg.End() {
+ // Since only one pma is involved, we can use pma.internalMappings
+ // directly, avoiding a slice allocation.
+ offset := uint64(ar.Start - pseg.Start())
+ return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
+ }
+
+ var ims []safemem.Block
+ for {
+ pr := pseg.Range().Intersect(ar)
+ for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
+ ims = append(ims, pims.Head())
+ }
+ if ar.End <= pseg.End() {
+ break
+ }
+ pseg = pseg.NextSegment()
+ }
+ return safemem.BlockSeqFromSlice(ims)
+}
+
+// vecInternalMappingsLocked returns internal mappings for addresses in ars.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ars.
+func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
+ var ims []safemem.Block
+ for ; !ars.IsEmpty(); ars = ars.Tail() {
+ ar := ars.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
+ ims = append(ims, pims.Head())
+ }
+ }
+ return safemem.BlockSeqFromSlice(ims)
+}
+
+// incPrivateRef acquires a reference on private pages in fr.
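+//
+// Private page reference counts track sharing of private memory between
+// MemoryManagers related by Fork; they determine both when copy-on-write can
+// be elided (see isPMACopyOnWriteLocked) and when the underlying memory can
+// be freed (see decPrivateRef).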
+func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+ mm.privateRefs.mu.Lock()
+ defer mm.privateRefs.mu.Unlock()
+ refSet := &mm.privateRefs.refs
+ seg, gap := refSet.Find(fr.Start)
+ for {
+ switch {
+ case seg.Ok() && seg.Start() < fr.End:
+ seg = refSet.Isolate(seg, fr)
+ seg.SetValue(seg.Value() + 1)
+ seg, gap = seg.NextNonEmpty()
+ case gap.Ok() && gap.Start() < fr.End:
+ seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
+ default:
+ refSet.MergeAdjacent(fr)
+ return
+ }
+ }
+}
+
+// decPrivateRef releases a reference on private pages in fr.
+func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
+ var freed []platform.FileRange
+
+ mm.privateRefs.mu.Lock()
+ refSet := &mm.privateRefs.refs
+ seg := refSet.LowerBoundSegment(fr.Start)
+ for seg.Ok() && seg.Start() < fr.End {
+ seg = refSet.Isolate(seg, fr)
+ if old := seg.Value(); old == 1 {
+ freed = append(freed, seg.Range())
+ seg = refSet.Remove(seg).NextSegment()
+ } else {
+ seg.SetValue(old - 1)
+ seg = seg.NextSegment()
+ }
+ }
+ refSet.MergeAdjacent(fr)
+ mm.privateRefs.mu.Unlock()
+
+ mf := mm.mfp.MemoryFile()
+ for _, fr := range freed {
+ mf.DecRef(fr)
+ }
+}
+
+// addRSSLocked updates the current and maximum resident set size of a
+// MemoryManager to reflect the insertion of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) {
+ mm.curRSS += uint64(ar.Length())
+ if mm.curRSS > mm.maxRSS {
+ mm.maxRSS = mm.curRSS
+ }
+}
+
+// removeRSSLocked updates the current resident set size of a MemoryManager to
+// reflect the removal of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) {
+ mm.curRSS -= uint64(ar.Length())
+}
+
+// pmaSetFunctions implements segment.Functions for pmaSet.
+type pmaSetFunctions struct{}
+
+func (pmaSetFunctions) MinKey() usermem.Addr {
+ return 0
+}
+
+func (pmaSetFunctions) MaxKey() usermem.Addr {
+ return ^usermem.Addr(0)
+}
+
+func (pmaSetFunctions) ClearValue(pma *pma) {
+ pma.file = nil
+ pma.internalMappings = safemem.BlockSeq{}
+}
+
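+// Merge implements segment.Functions.Merge. Two pmas merge only if they map
+// contiguous offsets in the same file with identical permissions and
+// copy-on-write state.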
+func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) {
+ if pma1.file != pma2.file ||
+ pma1.off+uint64(ar1.Length()) != pma2.off ||
+ pma1.translatePerms != pma2.translatePerms ||
+ pma1.effectivePerms != pma2.effectivePerms ||
+ pma1.maxPerms != pma2.maxPerms ||
+ pma1.needCOW != pma2.needCOW ||
+ pma1.private != pma2.private {
+ return pma{}, false
+ }
+
+ // Discard internal mappings instead of trying to merge them, since merging
+ // them requires an allocation and getting them again from the
+ // platform.File might not.
+ pma1.internalMappings = safemem.BlockSeq{}
+ return pma1, true
+}
+
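+// Split implements segment.Functions.Split. The second returned pma's file
+// offset and cached internal mappings are adjusted to begin at the split
+// point.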
+func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) {
+ newlen1 := uint64(split - ar.Start)
+ p2 := p
+ p2.off += newlen1
+ if !p.internalMappings.IsEmpty() {
+ p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
+ p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
+ }
+ return p, p2
+}
+
+// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
+// so by scanning linearly backward from pgap.
+//
+// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
+ if checkInvariants {
+ if !pgap.Ok() {
+ panic("terminal pma iterator")
+ }
+ if addr > pgap.Start() {
+ panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
+ }
+ }
+ // Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
+ // which is the case if findOrSeekPrevUpperBoundPMA is called to find the
+ // start of a range containing only a single PMA.
+ if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
+ return pseg
+ }
+ return mm.pmas.UpperBoundSegment(addr)
+}
+
+// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
+// non-empty.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (pseg pmaIterator) getInternalMappingsLocked() error {
+ pma := pseg.ValuePtr()
+ if pma.internalMappings.IsEmpty() {
+ // This must use maxPerms (instead of perms) because some permission
+ // constraints are only visible to vmas; for example, mappings of
+ // read-only files have vma.maxPerms.Write unset, but this may not be
+ // visible to the memmap.Mappable.
+ perms := pma.maxPerms
+ // We will never execute application code through an internal mapping.
+ perms.Execute = false
+ ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
+ if err != nil {
+ return err
+ }
+ pma.internalMappings = ims
+ }
+ return nil
+}
+
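+// fileRange returns the platform.File offsets mapped by the pma iterated by
+// pseg.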
+func (pseg pmaIterator) fileRange() platform.FileRange {
+ return pseg.fileRangeOf(pseg.Range())
+}
+
+// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+ if checkInvariants {
+ if !pseg.Ok() {
+ panic("terminal pma iterator")
+ }
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !pseg.Range().IsSupersetOf(ar) {
+ panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
+ }
+ }
+
+ pma := pseg.ValuePtr()
+ pstart := pseg.Start()
+ return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+}
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
new file mode 100644
index 000000000..6efe5102b
--- /dev/null
+++ b/pkg/sentry/mm/procfs.go
@@ -0,0 +1,329 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "bytes"
+ "fmt"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+ // devMinorBits is the number of minor bits in a device number. Linux:
+ // include/linux/kdev_t.h:MINORBITS
+ devMinorBits = 20
+
+ vsyscallEnd = usermem.Addr(0xffffffffff601000)
+ vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"
+ vsyscallSmapsEntry = vsyscallMapsEntry +
+ "Size: 4 kB\n" +
+ "Rss: 0 kB\n" +
+ "Pss: 0 kB\n" +
+ "Shared_Clean: 0 kB\n" +
+ "Shared_Dirty: 0 kB\n" +
+ "Private_Clean: 0 kB\n" +
+ "Private_Dirty: 0 kB\n" +
+ "Referenced: 0 kB\n" +
+ "Anonymous: 0 kB\n" +
+ "AnonHugePages: 0 kB\n" +
+ "Shared_Hugetlb: 0 kB\n" +
+ "Private_Hugetlb: 0 kB\n" +
+ "Swap: 0 kB\n" +
+ "SwapPss: 0 kB\n" +
+ "KernelPageSize: 4 kB\n" +
+ "MMUPageSize: 4 kB\n" +
+ "Locked: 0 kB\n" +
+ "VmFlags: rd ex \n"
+)
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
+ return true
+}
+
+// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. Everything about a
+ // vsyscall region is static, so just hard code the maps entry since we
+ // don't have a real vma backing it. The vsyscall region is at the end of
+ // the virtual address space so nothing should be mapped after it (if
+ // something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+ // get the sorting on the maps file wrong at worst; but that's not possible
+ // on any current platform).
+ //
+ // Artificially adjust the seqfile handle so we only output the vsyscall entry once.
+ if start != vsyscallEnd {
+ buf.WriteString(vsyscallMapsEntry)
+ }
+}
+
+// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var data []seqfile.SeqData
+ var start usermem.Addr
+ if handle != nil {
+ start = *handle.(*usermem.Addr)
+ }
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ vmaAddr := vseg.End()
+ data = append(data, seqfile.SeqData{
+ Buf: mm.vmaMapsEntryLocked(ctx, vseg),
+ Handle: &vmaAddr,
+ })
+ }
+
+ // We always emulate vsyscall, so advertise it here. Everything about a
+ // vsyscall region is static, so just hard code the maps entry since we
+ // don't have a real vma backing it. The vsyscall region is at the end of
+ // the virtual address space so nothing should be mapped after it (if
+ // something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+ // get the sorting on the maps file wrong at worst; but that's not possible
+ // on any current platform).
+ //
+ // Artificially adjust the seqfile handle so we only output the vsyscall entry once.
+ if start != vsyscallEnd {
+ vmaAddr := vsyscallEnd
+ data = append(data, seqfile.SeqData{
+ Buf: []byte(vsyscallMapsEntry),
+ Handle: &vmaAddr,
+ })
+ }
+ return data, 1
+}
+
+// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by
+// vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+ var b bytes.Buffer
+ mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+ return b.Bytes()
+}
+
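+// appendVMAMapsEntryLocked appends a /proc/[pid]/maps entry for the vma
+// iterated by vseg to b, in the usual maps format, e.g. (illustrative values
+// only):
+//
+//	00400000-0040b000 r-xp 00000000 08:01 123456    /bin/cat
+//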
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+ vma := vseg.ValuePtr()
+ private := "p"
+ if !vma.private {
+ private = "s"
+ }
+
+ var dev, ino uint64
+ if vma.id != nil {
+ dev = vma.id.DeviceID()
+ ino = vma.id.InodeID()
+ }
+ devMajor := uint32(dev >> devMinorBits)
+ devMinor := uint32(dev & ((1 << devMinorBits) - 1))
+
+ // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
+ // stack_guard_page_start().
+ lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+ vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
+
+ // Figure out our filename or hint.
+ var s string
+ if vma.hint != "" {
+ s = vma.hint
+ } else if vma.id != nil {
+ // FIXME(jamieliu): We are holding mm.mappingMu here, which is
+ // consistent with Linux's holding mmap_sem in
+ // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path().
+ // However, it's not clear that fs.File.MappedName() is actually
+ // consistent with this lock order.
+ s = vma.id.MappedName(ctx)
+ }
+ if s != "" {
+ // Per linux, we pad until the 74th character.
+ if pad := 73 - lineLen; pad > 0 {
+ b.WriteString(strings.Repeat(" ", pad))
+ }
+ b.WriteString(s)
+ }
+ b.WriteString("\n")
+}
+
+// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. See
+ // ReadMapsSeqFileData for additional commentary.
+ if start != vsyscallEnd {
+ buf.WriteString(vsyscallSmapsEntry)
+ }
+}
+
+// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var data []seqfile.SeqData
+ var start usermem.Addr
+ if handle != nil {
+ start = *handle.(*usermem.Addr)
+ }
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ vmaAddr := vseg.End()
+ data = append(data, seqfile.SeqData{
+ Buf: mm.vmaSmapsEntryLocked(ctx, vseg),
+ Handle: &vmaAddr,
+ })
+ }
+
+ // We always emulate vsyscall, so advertise it here. See
+ // ReadMapsSeqFileData for additional commentary.
+ if start != vsyscallEnd {
+ vmaAddr := vsyscallEnd
+ data = append(data, seqfile.SeqData{
+ Buf: []byte(vsyscallSmapsEntry),
+ Handle: &vmaAddr,
+ })
+ }
+ return data, 1
+}
+
+// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated
+// by vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+ var b bytes.Buffer
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b)
+ return b.Bytes()
+}
+
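+// vmaSmapsEntryIntoLocked appends a /proc/[pid]/smaps entry for the vma
+// iterated by vseg to b, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.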
+func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+ mm.appendVMAMapsEntryLocked(ctx, vseg, b)
+ vma := vseg.ValuePtr()
+
+ // We take mm.activeMu here in each call, instead of requiring it to be
+ // locked as a precondition, to reduce the latency impact of reading
+ // /proc/[pid]/smaps on concurrent performance-sensitive operations that
+ // require activeMu for writing, such as faults.
+ mm.activeMu.RLock()
+ var rss uint64
+ var anon uint64
+ vsegAR := vseg.Range()
+ for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() {
+ psegAR := pseg.Range().Intersect(vsegAR)
+ size := uint64(psegAR.Length())
+ rss += size
+ if pseg.ValuePtr().private {
+ anon += size
+ }
+ }
+ mm.activeMu.RUnlock()
+
+ fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024)
+ fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024)
+ // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma
+ // is only mapped by that pma. This avoids having to query memmap.Mappables
+ // for reference count information on each page. As a corollary, all pages
+ // are accounted as "private" whether or not the vma is private; compare
+ // Linux's fs/proc/task_mmu.c:smaps_account().
+ fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0)
+ // Pretend that all pages are dirty if the vma is writable, and clean otherwise.
+ clean := rss
+ if vma.effectivePerms.Write {
+ clean = 0
+ }
+ fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024)
+ fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024)
+ // Pretend that all pages are "referenced" (recently touched).
+ fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024)
+ // Hugepages (hugetlb and THP) are not implemented.
+ fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0)
+ fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0)
+ // Swap is not implemented.
+ fmt.Fprintf(b, "Swap: %8d kB\n", 0)
+ fmt.Fprintf(b, "SwapPss: %8d kB\n", 0)
+ fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
+ fmt.Fprintf(b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024)
+ locked := rss
+ if vma.mlockMode == memmap.MLockNone {
+ locked = 0
+ }
+ fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024)
+
+ b.WriteString("VmFlags: ")
+ if vma.realPerms.Read {
+ b.WriteString("rd ")
+ }
+ if vma.realPerms.Write {
+ b.WriteString("wr ")
+ }
+ if vma.realPerms.Execute {
+ b.WriteString("ex ")
+ }
+ if vma.canWriteMappableLocked() { // VM_SHARED
+ b.WriteString("sh ")
+ }
+ if vma.maxPerms.Read {
+ b.WriteString("mr ")
+ }
+ if vma.maxPerms.Write {
+ b.WriteString("mw ")
+ }
+ if vma.maxPerms.Execute {
+ b.WriteString("me ")
+ }
+ if !vma.private { // VM_MAYSHARE
+ b.WriteString("ms ")
+ }
+ if vma.growsDown {
+ b.WriteString("gd ")
+ }
+ if vma.mlockMode != memmap.MLockNone { // VM_LOCKED
+ b.WriteString("lo ")
+ }
+ if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT
+ b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags()
+ }
+ if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT
+ b.WriteString("ac ")
+ }
+ b.WriteString("\n")
+}
diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go
new file mode 100644
index 000000000..f56215d9a
--- /dev/null
+++ b/pkg/sentry/mm/save_restore.go
@@ -0,0 +1,57 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+)
+
+// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all
+// Mappables mapped by mm.
+func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ if vma := vseg.ValuePtr(); vma.mappable != nil {
+ if err := vma.mappable.InvalidateUnsavable(ctx); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// beforeSave is invoked by stateify.
+func (mm *MemoryManager) beforeSave() {
+ mf := mm.mfp.MemoryFile()
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ if pma := pseg.ValuePtr(); pma.file != mf {
+ // InvalidateUnsavable should have caused all such pmas to be
+ // invalidated.
+ panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm))
+ }
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (mm *MemoryManager) afterLoad() {
+ mm.haveASIO = mm.p.SupportsAddressSpaceIO()
+ mf := mm.mfp.MemoryFile()
+ for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+ pseg.ValuePtr().file = mf
+ }
+}
diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go
new file mode 100644
index 000000000..6432731d4
--- /dev/null
+++ b/pkg/sentry/mm/shm.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/shm"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// DetachShm unmaps a sysv shared memory segment.
+func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error {
+ if addr != addr.RoundDown() {
+ // "... shmaddr is not aligned on a page boundary." - man shmdt(2)
+ return syserror.EINVAL
+ }
+
+ var detached *shm.Shm
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ // Find and remove the first vma containing an address >= addr that maps a
+ // segment originally attached at addr.
+ vseg := mm.vmas.LowerBoundSegment(addr)
+ for vseg.Ok() {
+ vma := vseg.ValuePtr()
+ if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off {
+ detached = shm
+ vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment()
+ break
+ } else {
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ if detached == nil {
+ // There is no shared memory segment attached at addr.
+ return syserror.EINVAL
+ }
+
+ // Remove all vmas that could have been created by the same attach.
+ end := addr + usermem.Addr(detached.EffectiveSize())
+ for vseg.Ok() && vseg.End() <= end {
+ vma := vseg.ValuePtr()
+ if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off {
+ vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment()
+ } else {
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
new file mode 100644
index 000000000..9ad52082d
--- /dev/null
+++ b/pkg/sentry/mm/special_mappable.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with
+// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except
+// that SpecialMappable takes ownership of the memory that it represents
+// (_install_special_mapping() does not).
+//
+// +stateify savable
+type SpecialMappable struct {
+ refs.AtomicRefCount
+
+ mfp pgalloc.MemoryFileProvider
+ fr platform.FileRange
+ name string
+}
+
+// NewSpecialMappable returns a SpecialMappable that owns fr, which represents
+// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The
+// SpecialMappable will use the given name in /proc/[pid]/maps.
+//
+// Preconditions: fr.Length() != 0.
+func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable {
+ m := SpecialMappable{mfp: mfp, fr: fr, name: name}
+ m.EnableLeakCheck("mm.SpecialMappable")
+ return &m
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+func (m *SpecialMappable) DecRef() {
+ m.AtomicRefCount.DecRefWithDestructor(func() {
+ m.mfp.MemoryFile().DecRef(m.fr)
+ })
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (m *SpecialMappable) MappedName(ctx context.Context) string {
+ return m.name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (m *SpecialMappable) DeviceID() uint64 {
+ return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (m *SpecialMappable) InodeID() uint64 {
+ return 0
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ // Linux: vm_file is NULL, causing msync to skip it entirely.
+ return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) error {
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) {
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error {
+ return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ var err error
+ if required.End > m.fr.Length() {
+ err = &memmap.BusError{syserror.EFAULT}
+ }
+ if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
+ return []memmap.Translation{
+ {
+ Source: source,
+ File: m.mfp.MemoryFile(),
+ Offset: m.fr.Start + source.Start,
+ Perms: usermem.AnyAccess,
+ },
+ }, err
+ }
+ return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error {
+ // Since data is stored in pgalloc.MemoryFile, the contents of which are
+ // preserved across save/restore, we don't need to do anything.
+ return nil
+}
+
+// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores
+// the SpecialMappable's contents.
+func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider {
+ return m.mfp
+}
+
+// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that
+// store the SpecialMappable's contents.
+func (m *SpecialMappable) FileRange() platform.FileRange {
+ return m.fr
+}
+
+// Length returns the length of the SpecialMappable.
+func (m *SpecialMappable) Length() uint64 {
+ return m.fr.Length()
+}
+
+// NewSharedAnonMappable returns a SpecialMappable that implements the
+// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
+//
+// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
+// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
+// do the same to get non-zero device and inode IDs.
+func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
+ if length == 0 {
+ return nil, syserror.EINVAL
+ }
+ alignedLen, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return nil, syserror.EINVAL
+ }
+ fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous)
+ if err != nil {
+ return nil, err
+ }
+ return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil
+}
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
new file mode 100644
index 000000000..3f496aa9f
--- /dev/null
+++ b/pkg/sentry/mm/syscalls.go
@@ -0,0 +1,1286 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ mrand "math/rand"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// HandleUserFault handles an application page fault. sp is the faulting
+// application thread's stack pointer.
+//
+// Preconditions: mm.as != nil.
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
+ ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
+ if !ok {
+ return syserror.EFAULT
+ }
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have a usable vma. Here and below, since we are only
+ // asking for a single page, there is no possibility of partial success,
+ // and any error is immediately fatal.
+ mm.mappingMu.RLock()
+ vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if err != nil {
+ mm.mappingMu.RUnlock()
+ return err
+ }
+
+ // Ensure that we have a usable pma.
+ mm.activeMu.Lock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ mm.activeMu.Unlock()
+ return err
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // Map the faulted page into the active AddressSpace.
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return err
+}
+
+// MMap establishes a memory mapping.
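+//
+// For illustration, an anonymous private mapping such as the one produced by
+// mmap(MAP_PRIVATE|MAP_ANONYMOUS) might reach MMap as (a sketch showing only
+// a subset of MMapOpts fields):
+//
+//	addr, err := mm.MMap(ctx, memmap.MMapOpts{
+//		Length:   length,
+//		Private:  true,
+//		Perms:    usermem.ReadWrite,
+//		MaxPerms: usermem.AnyAccess,
+//	})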
+func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
+ if opts.Length == 0 {
+ return 0, syserror.EINVAL
+ }
+ length, ok := usermem.Addr(opts.Length).RoundUp()
+ if !ok {
+ return 0, syserror.ENOMEM
+ }
+ opts.Length = uint64(length)
+
+ if opts.Mappable != nil {
+ // Offset must be aligned.
+ if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
+ return 0, syserror.EINVAL
+ }
+ // Offset + length must not overflow.
+ if end := opts.Offset + opts.Length; end < opts.Offset {
+ return 0, syserror.ENOMEM
+ }
+ } else {
+ opts.Offset = 0
+ if !opts.Private {
+ if opts.MappingIdentity != nil {
+ return 0, syserror.EINVAL
+ }
+ m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+ if err != nil {
+ return 0, err
+ }
+ defer m.DecRef()
+ opts.MappingIdentity = m
+ opts.Mappable = m
+ }
+ }
+
+ if opts.Addr.RoundDown() != opts.Addr {
+ // MAP_FIXED requires addr to be page-aligned; non-fixed mappings
+ // don't.
+ if opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ opts.Addr = opts.Addr.RoundDown()
+ }
+
+ if !opts.MaxPerms.SupersetOf(opts.Perms) {
+ return 0, syserror.EACCES
+ }
+ if opts.Unmap && !opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ if opts.GrowsDown && opts.Mappable != nil {
+ return 0, syserror.EINVAL
+ }
+
+ // Get the new vma.
+ mm.mappingMu.Lock()
+ if opts.MLockMode < mm.defMLockMode {
+ opts.MLockMode = mm.defMLockMode
+ }
+ vseg, ar, err := mm.createVMALocked(ctx, opts)
+ if err != nil {
+ mm.mappingMu.Unlock()
+ return 0, err
+ }
+
+ // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new
+ // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
+ // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
+ // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
+ // populate_vma_page_range(). Confirm this behavior.
+ switch {
+ case opts.Precommit || opts.MLockMode == memmap.MLockEager:
+ // Get pmas and map with precommit as requested.
+ mm.populateVMAAndUnlock(ctx, vseg, ar, true)
+
+ case opts.Mappable == nil && length <= privateAllocUnit:
+ // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope
+ // that doing so will save on future page faults. We only do this for
+ // anonymous mappings, since otherwise the cost of
+ // memmap.Mappable.Translate is unknown; and only for small mappings,
+ // to avoid needing to allocate large amounts of memory that we may
+ // subsequently need to checkpoint.
+ mm.populateVMAAndUnlock(ctx, vseg, ar, false)
+
+ default:
+ mm.mappingMu.Unlock()
+ }
+
+ return ar.Start, nil
+}
+
+// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
+// into mm.as if it is active.
+//
+// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ // Linux doesn't populate inaccessible pages. See
+ // mm/gup.c:populate_vma_page_range.
+ return
+ }
+
+ mm.activeMu.Lock()
+ // Can't defer mm.activeMu.Unlock(); see below.
+
+ // Even if we get new pmas, we can't actually map them if we don't have an
+ // AddressSpace.
+ if mm.as == nil {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Ensure that we have usable pmas.
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
+ if err != nil {
+ // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
+ // mm/gup.c:mm_populate(). If it matters, we'll get it again when
+ // userspace actually tries to use the failing page.
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // As above, errors are silently ignored.
+ mm.mapASLocked(pseg, ar, precommit)
+ mm.activeMu.RUnlock()
+}
+
+// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
+// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
+// preferable to populateVMA since it unlocks mm.mappingMu before performing
+// expensive operations that don't require it to be locked.
+//
+// Preconditions: mm.mappingMu must be locked for writing.
+// vseg.Range().IsSupersetOf(ar).
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+ // See populateVMA above for commentary.
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ mm.activeMu.Lock()
+
+ if mm.as == nil {
+ mm.activeMu.Unlock()
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
+ // isn't needed at all for mapASLocked.
+ mm.mappingMu.DowngradeLock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ mm.activeMu.Unlock()
+ return
+ }
+
+ mm.activeMu.DowngradeLock()
+ mm.mapASLocked(pseg, ar, precommit)
+ mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+ // maxStackSize is the maximum supported process stack size in bytes.
+ //
+ // This limit exists because stack growing isn't implemented, so the entire
+ // process stack must be mapped up-front.
+ const maxStackSize = 128 << 20
+
+ stackSize := limits.FromContext(ctx).Get(limits.Stack)
+ r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+ sz := uint64(r)
+ if !ok {
+ // RLIM_INFINITY rounds up to 0.
+ sz = linux.DefaultStackSoftLimit
+ } else if sz > maxStackSize {
+ ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+ sz = maxStackSize
+ } else if sz == 0 {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ szaddr := usermem.Addr(sz)
+ ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+ // Determine the stack's desired location. Unlike Linux, address
+ // randomization can't be disabled.
+ stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+ if stackEnd < szaddr {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ stackStart := stackEnd - szaddr
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: sz,
+ Addr: stackStart,
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ GrowsDown: true,
+ MLockMode: mm.defMLockMode,
+ Hint: "[stack]",
+ })
+ return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+ if addr != addr.RoundDown() {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return syserror.EINVAL
+ }
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.EINVAL
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ mm.unmapLocked(ctx, ar)
+ return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+ // Move controls whether MRemap moves the remapped mapping to a new address.
+ Move MRemapMoveMode
+
+ // NewAddr is the new address for the remapping. NewAddr is ignored unless
+// Move is MRemapMustMove.
+ NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+ // MRemapNoMove prevents MRemap from moving the remapped mapping.
+ MRemapNoMove MRemapMoveMode = iota
+
+ // MRemapMayMove allows MRemap to move the remapped mapping.
+ MRemapMayMove
+
+ // MRemapMustMove requires MRemap to move the remapped mapping to
+ // MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+ // range.
+ MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
+ // "Note that old_address has to be page aligned." - mremap(2)
+ if oldAddr.RoundDown() != oldAddr {
+ return 0, syserror.EINVAL
+ }
+
+ // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
+ // valid size. However, new_size can't be 0 after rounding.
+ oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
+ oldSize = uint64(oldSizeAddr)
+ newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
+ if !ok || newSizeAddr == 0 {
+ return 0, syserror.EINVAL
+ }
+ newSize = uint64(newSizeAddr)
+
+ oldEnd, ok := oldAddr.AddLength(oldSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ // All cases require that a vma exists at oldAddr.
+ vseg := mm.vmas.FindSegment(oldAddr)
+ if !vseg.Ok() {
+ return 0, syserror.EFAULT
+ }
+
+ // Behavior matrix:
+ //
+ // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
+ // ---------+-------------+-------------------+-------------------+------------------
+ // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place
+ // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place
+ // | | move | |
+ // MustMove | Copy | Move and grow | Move | Shrink and move
+ //
+ // [1] In-place growth is impossible because the vma at oldAddr already
+ // occupies at least part of the destination. Thus the NoMove case always
+ // fails and the MayMove case always falls back to copying.
+
+ if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
+ // mremap in Linux does not check mm/mlock.c:can_do_mlock() and
+ // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
+ // !CAP_IPC_LOCK.
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
+ return 0, syserror.EAGAIN
+ }
+ }
+ }
+
+ if opts.Move != MRemapMustMove {
+ // Handle no-ops and in-place shrinking. These cases don't care if
+ // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
+ // (aside from oldAddr).
+ if newSize <= oldSize {
+ if newSize < oldSize {
+ // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
+ // either.
+ newEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
+ }
+ return oldAddr, nil
+ }
+
+ // Handle in-place growing.
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+ // "Grow" the existing vma by creating a new mergeable one.
+ vma := vseg.ValuePtr()
+ var newOffset uint64
+ if vma.mappable != nil {
+ newOffset = vseg.mappableRange().End
+ }
+ vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: newSize - oldSize,
+ MappingIdentity: vma.id,
+ Mappable: vma.mappable,
+ Offset: newOffset,
+ Addr: oldEnd,
+ Fixed: true,
+ Perms: vma.realPerms,
+ MaxPerms: vma.maxPerms,
+ Private: vma.private,
+ GrowsDown: vma.growsDown,
+ MLockMode: vma.mlockMode,
+ Hint: vma.hint,
+ })
+ if err == nil {
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, ar, true)
+ }
+ return oldAddr, nil
+ }
+ // In-place growth failed. In the MRemapMayMove case, fall through to
+ // copying/moving below.
+ if opts.Move == MRemapNoMove {
+ return 0, err
+ }
+ }
+
+ // Find a location for the new mapping.
+ var newAR usermem.AddrRange
+ switch opts.Move {
+ case MRemapMayMove:
+ newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
+ if err != nil {
+ return 0, err
+ }
+ newAR, _ = newAddr.ToRange(newSize)
+
+ case MRemapMustMove:
+ newAddr := opts.NewAddr
+ if newAddr.RoundDown() != newAddr {
+ return 0, syserror.EINVAL
+ }
+ var ok bool
+ newAR, ok = newAddr.ToRange(newSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
+ return 0, syserror.EINVAL
+ }
+
+ // Check that the new region is valid.
+ _, err := mm.findAvailableLocked(newSize, findAvailableOpts{
+ Addr: newAddr,
+ Fixed: true,
+ Unmap: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
+ // Unmap any mappings at the destination.
+ mm.unmapLocked(ctx, newAR)
+
+ // If the sizes specify shrinking, unmap everything between the new and
+ // old sizes at the source. Unmapping before the following checks is
+ // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
+ // vma_to_resize().
+ if newSize < oldSize {
+ oldNewEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
+ oldEnd = oldNewEnd
+ }
+
+ // unmapLocked may have invalidated vseg; look it up again.
+ vseg = mm.vmas.FindSegment(oldAddr)
+ }
+
+ oldAR := usermem.AddrRange{oldAddr, oldEnd}
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+
+ // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return 0, syserror.ENOMEM
+ }
+
+ if vma := vseg.ValuePtr(); vma.mappable != nil {
+ // Check that offset+length does not overflow.
+ if vma.off+uint64(newAR.Length()) < vma.off {
+ return 0, syserror.EINVAL
+ }
+ // Inform the Mappable, if any, of the new mapping.
+ if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
+ return 0, err
+ }
+ }
+
+ if oldSize == 0 {
+ // Handle copying.
+ //
+ // We can't use createVMALocked because it calls Mappable.AddMapping,
+ // whereas we've already called Mappable.CopyMapping (which is
+ // consistent with Linux). Call vseg.Value() (rather than
+ // vseg.ValuePtr()) to make a copy of the vma.
+ vma := vseg.Value()
+ if vma.mappable != nil {
+ vma.off = vseg.mappableOffsetAt(oldAR.Start)
+ }
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
+ mm.usageAS += uint64(newAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS += uint64(newAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS += uint64(newAR.Length())
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, newAR, true)
+ }
+ }
+ return newAR.Start, nil
+ }
+
+ // Handle moving.
+ //
+ // Remove the existing vma before inserting the new one to minimize
+ // iterator invalidation. We do this directly (instead of calling
+ // removeVMAsLocked) because:
+ //
+ // 1. We can't drop the reference on vma.id, which will be transferred to
+ // the new vma.
+ //
+ // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
+ // oldAR, so calling RemoveMapping could cause us to miss an invalidation
+ // overlapping oldAR.
+ //
+ // Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the
+ // vma.
+ vseg = mm.vmas.Isolate(vseg, oldAR)
+ vma := vseg.Value()
+ mm.vmas.Remove(vseg)
+ vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
+ mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ }
+
+ // Move pmas. This is technically optional for non-private pmas, which
+ // could just go through memmap.Mappable.Translate again, but it's required
+ // for private pmas.
+ mm.activeMu.Lock()
+ mm.movePMAsLocked(oldAR, newAR)
+ mm.activeMu.Unlock()
+
+ // Now that pmas have been moved to newAR, we can notify vma.mappable that
+ // oldAR is no longer mapped.
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
+ }
+
+ if vma.mlockMode == memmap.MLockEager {
+ mm.populateVMA(ctx, vseg, newAR, true)
+ }
+
+ return newAR.Start, nil
+}
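+
+// As a usage sketch (illustrative only: the options struct is assumed to be
+// the MRemapOpts type whose Move and NewAddr fields are used above, and the
+// linux.MREMAP_* constants are assumed from the abi/linux package), a
+// mremap(2) handler might translate its flags as follows:
+//
+//    move := MRemapNoMove
+//    if flags&linux.MREMAP_MAYMOVE != 0 {
+//        move = MRemapMayMove
+//    }
+//    if flags&linux.MREMAP_FIXED != 0 {
+//        move = MRemapMustMove
+//    }
+//    newAddr, err := mm.MRemap(ctx, oldAddr, oldSize, newSize, MRemapOpts{
+//        Move:    move,
+//        NewAddr: fixedAddr, // only consulted when Move == MRemapMustMove
+//    })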
+
+// MProtect implements the semantics of Linux's mprotect(2).
+func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
+ if addr.RoundDown() != addr {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return nil
+ }
+ rlength, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(rlength))
+ if !ok {
+ return syserror.ENOMEM
+ }
+ effectivePerms := realPerms.Effective()
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Non-growsDown mprotect requires that all of ar is mapped, and stops at
+ // the first non-empty gap. growsDown mprotect requires that the first vma
+ // be growsDown, but does not require it to extend all the way to ar.Start;
+ // vmas after the first must be contiguous but need not be growsDown, like
+ // the non-growsDown case.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ if growsDown {
+ if !vseg.ValuePtr().growsDown {
+ return syserror.EINVAL
+ }
+ if ar.End <= vseg.Start() {
+ return syserror.ENOMEM
+ }
+ ar.Start = vseg.Start()
+ } else {
+ if ar.Start < vseg.Start() {
+ return syserror.ENOMEM
+ }
+ }
+
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ mm.pmas.MergeRange(ar)
+ mm.pmas.MergeAdjacent(ar)
+ }()
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ var didUnmapAS bool
+ for {
+ // Check for permission validity before splitting vmas, for consistency
+ // with Linux.
+ if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
+ return syserror.EACCES
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+
+ // Update vma permissions.
+ vma := vseg.ValuePtr()
+ vmaLength := vseg.Range().Length()
+ if vma.isPrivateDataLocked() {
+ mm.dataAS -= uint64(vmaLength)
+ }
+
+ vma.realPerms = realPerms
+ vma.effectivePerms = effectivePerms
+ if vma.isPrivateDataLocked() {
+ mm.dataAS += uint64(vmaLength)
+ }
+
+ // Propagate vma permission changes to pmas.
+ for pseg.Ok() && pseg.Start() < vseg.End() {
+ if pseg.Range().Overlaps(vseg.Range()) {
+ pseg = mm.pmas.Isolate(pseg, vseg.Range())
+ pma := pseg.ValuePtr()
+ if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
+ // Unmap all of ar, not just vseg.Range(), to minimize host
+ // syscalls.
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
+ if pma.needCOW {
+ pma.effectivePerms.Write = false
+ }
+ }
+ pseg = pseg.NextSegment()
+ }
+
+ // Continue to the next vma.
+ if ar.End <= vseg.End() {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ }
+}
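+
+// A minimal sketch of how an mprotect(2) handler might build the arguments
+// (the linux.PROT_* constants are assumed from the abi/linux package):
+//
+//    realPerms := usermem.AccessType{
+//        Read:    prot&linux.PROT_READ != 0,
+//        Write:   prot&linux.PROT_WRITE != 0,
+//        Execute: prot&linux.PROT_EXEC != 0,
+//    }
+//    err := mm.MProtect(addr, length, realPerms, prot&linux.PROT_GROWSDOWN != 0)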
+
+// BrkSetup sets mm's brk address to addr and its brk size to 0.
+func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Unmap the existing brk.
+ if mm.brk.Length() != 0 {
+ mm.unmapLocked(ctx, mm.brk)
+ }
+ mm.brk = usermem.AddrRange{addr, addr}
+}
+
+// Brk implements the semantics of Linux's brk(2), except that it returns an
+// error on failure.
+func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if addr < mm.brk.Start {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.EINVAL
+ }
+
+ // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is
+ // slightly more permissive than the usual data limit. In particular,
+ // this only limits the size of the heap; a true RLIMIT_DATA limits the
+ // size of heap + data + bss. The segment sizes need to be plumbed from
+ // the loader package to fully enforce RLIMIT_DATA.
+ if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.ENOMEM
+ }
+
+ oldbrkpg, _ := mm.brk.End.RoundUp()
+ newbrkpg, ok := addr.RoundUp()
+ if !ok {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, syserror.EFAULT
+ }
+
+ switch {
+ case oldbrkpg < newbrkpg:
+ vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: uint64(newbrkpg - oldbrkpg),
+ Addr: oldbrkpg,
+ Fixed: true,
+ // Compare Linux's
+ // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
+ // mm->def_flags.
+ MLockMode: mm.defMLockMode,
+ Hint: "[heap]",
+ })
+ if err != nil {
+ addr = mm.brk.End
+ mm.mappingMu.Unlock()
+ return addr, err
+ }
+ mm.brk.End = addr
+ if mm.defMLockMode == memmap.MLockEager {
+ mm.populateVMAAndUnlock(ctx, vseg, ar, true)
+ } else {
+ mm.mappingMu.Unlock()
+ }
+
+ case newbrkpg < oldbrkpg:
+ mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
+ fallthrough
+
+ default:
+ mm.brk.End = addr
+ mm.mappingMu.Unlock()
+ }
+
+ return addr, nil
+}
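+
+// Since Brk reports the resulting (possibly unchanged) program break even when
+// it fails, a brk(2) handler can mirror Linux by ignoring the error; a minimal
+// sketch, with requestedAddr standing in for the syscall argument:
+//
+//    newBrk, _ := mm.Brk(ctx, requestedAddr)
+//    // brk(2) reports the current break on both success and failure.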
+
+// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
+// depending on mode.
+func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
+ ar, ok := addr.RoundDown().ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if mode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ mm.mappingMu.Unlock()
+ return syserror.EPERM
+ }
+ if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+ }
+ }
+
+ // Check this after RLIMIT_MEMLOCK for consistency with Linux.
+ if ar.Length() == 0 {
+ mm.mappingMu.Unlock()
+ return nil
+ }
+
+ // Apply the new mlock mode to vmas.
+ var unmapped bool
+ vseg := mm.vmas.FindSegment(ar.Start)
+ for {
+ if !vseg.Ok() {
+ unmapped = true
+ break
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ prevMode := vma.mlockMode
+ vma.mlockMode = mode
+ if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+ mm.lockedAS += uint64(vseg.Range().Length())
+ } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vseg.Range().Length())
+ }
+ if ar.End <= vseg.End() {
+ break
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ if unmapped {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+
+ if mode == memmap.MLockEager {
+ // Ensure that we have usable pmas. Since we didn't return ENOMEM
+ // above, ar must be fully covered by vmas, so we can just use
+ // NextSegment below.
+ mm.activeMu.Lock()
+ mm.mappingMu.DowngradeLock()
+ for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
+ // case, which is converted to ENOMEM by mlock.
+ mm.activeMu.Unlock()
+ mm.mappingMu.RUnlock()
+ return syserror.ENOMEM
+ }
+ _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess)
+ if err != nil {
+ mm.activeMu.Unlock()
+ mm.mappingMu.RUnlock()
+ // Linux: mm/mlock.c:__mlock_posix_error_return()
+ if err == syserror.EFAULT {
+ return syserror.ENOMEM
+ }
+ if err == syserror.ENOMEM {
+ return syserror.EAGAIN
+ }
+ return err
+ }
+ }
+
+ // Map pmas into the active AddressSpace, if we have one.
+ mm.mappingMu.RUnlock()
+ if mm.as != nil {
+ mm.activeMu.DowngradeLock()
+ err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
+ mm.activeMu.RUnlock()
+ if err != nil {
+ return err
+ }
+ } else {
+ mm.activeMu.Unlock()
+ }
+ } else {
+ mm.mappingMu.Unlock()
+ }
+
+ return nil
+}
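+
+// The mode argument selects which Linux call this implements; a minimal sketch
+// (memmap.MLockLazy is assumed to be the mode used for MLOCK_ONFAULT):
+//
+//    mm.MLock(ctx, addr, length, memmap.MLockEager) // mlock(2)
+//    mm.MLock(ctx, addr, length, memmap.MLockLazy)  // mlock2(2) with MLOCK_ONFAULT
+//    mm.MLock(ctx, addr, length, memmap.MLockNone)  // munlock(2)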
+
+// MLockAllOpts holds options to MLockAll.
+type MLockAllOpts struct {
+ // If Current is true, change the memory-locking behavior of all mappings
+ // to Mode. If Future is true, upgrade the memory-locking behavior of all
+ // future mappings to Mode. At least one of Current or Future must be true.
+ Current bool
+ Future bool
+ Mode memmap.MLockMode
+}
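+
+// A minimal sketch of how an mlockall(2) handler might fill MLockAllOpts (the
+// linux.MCL_* constants are assumed from the abi/linux package):
+//
+//    err := mm.MLockAll(ctx, MLockAllOpts{
+//        Current: flags&linux.MCL_CURRENT != 0,
+//        Future:  flags&linux.MCL_FUTURE != 0,
+//        Mode:    memmap.MLockEager,
+//    })
+//    // munlockall(2): Current and Future true, Mode memmap.MLockNone.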
+
+// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
+// depending on opts.
+func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
+ if !opts.Current && !opts.Future {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ // Can't defer mm.mappingMu.Unlock(); see below.
+
+ if opts.Current {
+ if opts.Mode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ mm.mappingMu.Unlock()
+ return syserror.EPERM
+ }
+ if uint64(mm.vmas.Span()) > mlockLimit {
+ mm.mappingMu.Unlock()
+ return syserror.ENOMEM
+ }
+ }
+ }
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ vma := vseg.ValuePtr()
+ prevMode := vma.mlockMode
+ vma.mlockMode = opts.Mode
+ if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+ mm.lockedAS += uint64(vseg.Range().Length())
+ } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vseg.Range().Length())
+ }
+ }
+ }
+
+ if opts.Future {
+ mm.defMLockMode = opts.Mode
+ }
+
+ if opts.Current && opts.Mode == memmap.MLockEager {
+ // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
+ // ignores the return value of __mm_populate(), so all errors below are
+ // ignored.
+ //
+ // Try to get usable pmas.
+ mm.activeMu.Lock()
+ mm.mappingMu.DowngradeLock()
+ for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+ if vseg.ValuePtr().effectivePerms.Any() {
+ mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess)
+ }
+ }
+
+ // Map all pmas into the active AddressSpace, if we have one.
+ mm.mappingMu.RUnlock()
+ if mm.as != nil {
+ mm.activeMu.DowngradeLock()
+ mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
+ mm.activeMu.RUnlock()
+ } else {
+ mm.activeMu.Unlock()
+ }
+ } else {
+ mm.mappingMu.Unlock()
+ }
+ return nil
+}
+
+// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg := mm.vmas.FindSegment(addr)
+ if !vseg.Ok() {
+ return 0, 0, syserror.EFAULT
+ }
+ vma := vseg.ValuePtr()
+ return vma.numaPolicy, vma.numaNodemask, nil
+}
+
+// SetNumaPolicy implements the semantics of Linux's mbind().
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
+ if !addr.IsPageAligned() {
+ return syserror.EINVAL
+ }
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length).RoundUp()
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ar.Length() == 0 {
+ return nil
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() || lastEnd < vseg.Start() {
+ // "EFAULT: ... there was an unmapped hole in the specified memory
+ // range specified [sic] by addr and len." - mbind(2)
+ return syserror.EFAULT
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.numaPolicy = policy
+ vma.numaNodemask = nodemask
+ lastEnd = vseg.End()
+ if ar.End <= lastEnd {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+}
+
+// SetDontFork implements the semantics of madvise MADV_DONTFORK.
+func (mm *MemoryManager) SetDontFork(addr usermem.Addr, length uint64, dontfork bool) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.dontfork = dontfork
+ }
+
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
+func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+
+ // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
+ // is analogous to our mm.invalidateLocked(ar, true, true). We inline this
+ // here, with the special case that we synchronously decommit
+ // uniquely-owned (non-copy-on-write) pages for private anonymous vmas,
+ // which is the common case for MADV_DONTNEED. Invalidating these pmas, and
+ // allowing them to be reallocated when touched again, increases pma
+ // fragmentation, which may significantly reduce performance for
+ // non-vectored I/O implementations. Also, decommitting synchronously
+ // ensures that Decommit immediately reduces host memory usage.
+ var didUnmapAS bool
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ mf := mm.mfp.MemoryFile()
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ vma := vseg.ValuePtr()
+ if vma.mlockMode != memmap.MLockNone {
+ return syserror.EINVAL
+ }
+ vsegAR := vseg.Range().Intersect(ar)
+ // pseg should already correspond to either this vma or a later one,
+ // since there can't be a pma without a corresponding vma.
+ if checkInvariants {
+ if pseg.Ok() && pseg.End() <= vsegAR.Start {
+ panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
+ }
+ }
+ for pseg.Ok() && pseg.Start() < vsegAR.End {
+ pma := pseg.ValuePtr()
+ if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) {
+ psegAR := pseg.Range().Intersect(ar)
+ if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
+ if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
+ pseg = pseg.NextSegment()
+ continue
+ }
+ // If an error occurs, fall through to the general
+ // invalidation case below.
+ }
+ }
+ pseg = mm.pmas.Isolate(pseg, vsegAR)
+ pma = pseg.ValuePtr()
+ if !didUnmapAS {
+ // Unmap all of ar, not just pseg.Range(), to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ if pma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ pma.file.DecRef(pseg.fileRange())
+ mm.removeRSSLocked(pseg.Range())
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ }
+ }
+
+ // "If there are some parts of the specified address space that are not
+ // mapped, the Linux version of madvise() ignores them and applies the call
+ // to the rest (but returns ENOMEM from the system call, as it should)." -
+ // madvise(2)
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
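+
+// A minimal sketch of the corresponding madvise(2) dispatch, with advice
+// standing in for the syscall argument (the linux.MADV_DONTNEED constant is
+// assumed from the abi/linux package):
+//
+//    if advice == linux.MADV_DONTNEED {
+//        return mm.Decommit(addr, length)
+//    }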
+
+// MSyncOpts holds options to MSync.
+type MSyncOpts struct {
+ // Sync has the semantics of MS_SYNC.
+ Sync bool
+
+ // Invalidate has the semantics of MS_INVALIDATE.
+ Invalidate bool
+}
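+
+// A minimal sketch of translating msync(2) flags into MSyncOpts (the
+// linux.MS_* constants are assumed from the abi/linux package):
+//
+//    err := mm.MSync(ctx, addr, length, MSyncOpts{
+//        Sync:       flags&linux.MS_SYNC != 0,
+//        Invalidate: flags&linux.MS_INVALIDATE != 0,
+//    })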
+
+// MSync implements the semantics of Linux's msync().
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
+ if addr != addr.RoundDown() {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return nil
+ }
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.ENOMEM
+ }
+
+ mm.mappingMu.RLock()
+ // Can't defer mm.mappingMu.RUnlock(); see below.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ return syserror.ENOMEM
+ }
+ var unmapped bool
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ unmapped = true
+ break
+ }
+ if lastEnd < vseg.Start() {
+ unmapped = true
+ }
+ lastEnd = vseg.End()
+ vma := vseg.ValuePtr()
+ if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
+ mm.mappingMu.RUnlock()
+ return syserror.EBUSY
+ }
+ // It's only possible to have dirtied the Mappable through a shared
+ // mapping. Don't check if the mapping is writable, because mprotect
+ // may have changed this, and also because Linux doesn't.
+ if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
+ // We can't call memmap.MappingIdentity.Msync while holding
+ // mm.mappingMu since it may take fs locks that precede it in the
+ // lock order.
+ id.IncRef()
+ mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
+ mm.mappingMu.RUnlock()
+ err := id.Msync(ctx, mr)
+ id.DecRef()
+ if err != nil {
+ return err
+ }
+ if lastEnd >= ar.End {
+ break
+ }
+ mm.mappingMu.RLock()
+ vseg = mm.vmas.LowerBoundSegment(lastEnd)
+ } else {
+ if lastEnd >= ar.End {
+ mm.mappingMu.RUnlock()
+ break
+ }
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ if unmapped {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// GetSharedFutexKey is used by kernel.Task.GetSharedKey.
+func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) {
+ ar, ok := addr.ToRange(4) // sizeof(int32).
+ if !ok {
+ return futex.Key{}, syserror.EFAULT
+ }
+
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false)
+ if err != nil {
+ return futex.Key{}, err
+ }
+ vma := vseg.ValuePtr()
+
+ if vma.private {
+ return futex.Key{
+ Kind: futex.KindSharedPrivate,
+ Offset: uint64(addr),
+ }, nil
+ }
+
+ if vma.id != nil {
+ vma.id.IncRef()
+ }
+ return futex.Key{
+ Kind: futex.KindSharedMappable,
+ Mappable: vma.mappable,
+ MappingIdentity: vma.id,
+ Offset: vseg.mappableOffsetAt(addr),
+ }, nil
+}
+
+// VirtualMemorySize returns the combined length in bytes of all mappings in
+// mm.
+func (mm *MemoryManager) VirtualMemorySize() uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return mm.usageAS
+}
+
+// VirtualMemorySizeRange returns the combined length in bytes of all mappings
+// in ar in mm.
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return uint64(mm.vmas.SpanRange(ar))
+}
+
+// ResidentSetSize returns the value advertised as mm's RSS in bytes.
+func (mm *MemoryManager) ResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.curRSS
+}
+
+// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
+func (mm *MemoryManager) MaxResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return mm.maxRSS
+}
+
+// VirtualDataSize returns the size of private data segments in mm.
+func (mm *MemoryManager) VirtualDataSize() uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return mm.dataAS
+}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
new file mode 100644
index 000000000..16d8207e9
--- /dev/null
+++ b/pkg/sentry/mm/vma.go
@@ -0,0 +1,568 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
+// as defined by the checks in MMap.
+func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
+ if opts.MaxPerms != opts.MaxPerms.Effective() {
+ panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
+ }
+
+ // Find a usable range.
+ addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{
+ Addr: opts.Addr,
+ Fixed: opts.Fixed,
+ Unmap: opts.Unmap,
+ Map32Bit: opts.Map32Bit,
+ })
+ if err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ ar, _ := addr.ToRange(opts.Length)
+
+ // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS + opts.Length
+ if opts.Unmap {
+ newUsageAS -= uint64(mm.vmas.SpanRange(ar))
+ }
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM
+ }
+
+ if opts.MLockMode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM
+ }
+ newLockedAS := mm.lockedAS + opts.Length
+ if opts.Unmap {
+ newLockedAS -= mm.mlockedBytesRangeLocked(ar)
+ }
+ if newLockedAS > mlockLimit {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN
+ }
+ }
+ }
+
+ // Remove overwritten mappings. This ordering is consistent with Linux:
+ // compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
+ // file->f_op->mmap().
+ var vgap vmaGapIterator
+ if opts.Unmap {
+ vgap = mm.unmapLocked(ctx, ar)
+ } else {
+ vgap = mm.vmas.FindGap(ar.Start)
+ }
+
+ // Inform the Mappable, if any, of the new mapping.
+ if opts.Mappable != nil {
+ // The expression for writable is vma.canWriteMappableLocked(), but we
+ // don't yet have a vma.
+ if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ }
+
+ // Take a reference on opts.MappingIdentity before inserting the vma since
+ // vma merging can drop the reference.
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.IncRef()
+ }
+
+ // Finally insert the vma.
+ v := vma{
+ mappable: opts.Mappable,
+ off: opts.Offset,
+ realPerms: opts.Perms,
+ effectivePerms: opts.Perms.Effective(),
+ maxPerms: opts.MaxPerms,
+ private: opts.Private,
+ growsDown: opts.GrowsDown,
+ mlockMode: opts.MLockMode,
+ numaPolicy: linux.MPOL_DEFAULT,
+ id: opts.MappingIdentity,
+ hint: opts.Hint,
+ }
+
+ vseg := mm.vmas.Insert(vgap, ar, v)
+ mm.usageAS += opts.Length
+ if v.isPrivateDataLocked() {
+ mm.dataAS += opts.Length
+ }
+ if opts.MLockMode != memmap.MLockNone {
+ mm.lockedAS += opts.Length
+ }
+
+ return vseg, ar, nil
+}
+
+type findAvailableOpts struct {
+ // These fields are equivalent to those in memmap.MMapOpts, except that:
+ //
+ // - Addr must be page-aligned.
+ //
+ // - Unmap allows existing guard pages in the returned range.
+
+ Addr usermem.Addr
+ Fixed bool
+ Unmap bool
+ Map32Bit bool
+}
+
+// map32Start/End are the bounds to which MAP_32BIT mappings are constrained,
+// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively.
+const (
+ map32Start = 0x40000000
+ map32End = 0x80000000
+)
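+
+// The resulting MAP_32BIT window is [0x40000000, 0x80000000), i.e. 1 GiB of
+// address space starting at the 1 GiB boundary.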
+
+// findAvailableLocked finds an allocatable range.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) {
+ if opts.Fixed {
+ opts.Map32Bit = false
+ }
+ allowedAR := mm.applicationAddrRange()
+ if opts.Map32Bit {
+ allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End})
+ }
+
+ // Does the provided suggestion work?
+ if ar, ok := opts.Addr.ToRange(length); ok {
+ if allowedAR.IsSupersetOf(ar) {
+ if opts.Unmap {
+ return ar.Start, nil
+ }
+ // Check for the presence of an existing vma or guard page.
+ if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) {
+ return ar.Start, nil
+ }
+ }
+ }
+
+ // Fixed mappings accept only the requested address.
+ if opts.Fixed {
+ return 0, syserror.ENOMEM
+ }
+
+ // Prefer hugepage alignment if a hugepage or more is requested.
+ alignment := uint64(usermem.PageSize)
+ if length >= usermem.HugePageSize {
+ alignment = usermem.HugePageSize
+ }
+
+ if opts.Map32Bit {
+ return mm.findLowestAvailableLocked(length, alignment, allowedAR)
+ }
+ if mm.layout.DefaultDirection == arch.MmapBottomUp {
+ return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr})
+ }
+ return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase})
+}
+
+func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
+ return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr}
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift up to match the alignment?
+ if offset := uint64(gr.Start) % alignment; offset != 0 {
+ if uint64(gr.Length()) >= length+alignment-offset {
+ // Yes, we're aligned.
+ return gr.Start + usermem.Addr(alignment-offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return gr.Start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
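+
+// Worked example of the alignment shift above: with alignment = 2 MiB
+// (hugepage) and a gap whose usable range gr starts at 0x1234000, offset =
+// 0x1234000 % 0x200000 = 0x34000, so the candidate address is bumped to
+// 0x1234000 + (0x200000 - 0x34000) = 0x1400000, provided the gap still has
+// length + alignment - offset bytes available.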
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift down to match the alignment?
+ start := gr.End - usermem.Addr(length)
+ if offset := uint64(start) % alignment; offset != 0 {
+ if gr.Start <= start-usermem.Addr(offset) {
+ // Yes, we're aligned.
+ return start - usermem.Addr(offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
+ var total uint64
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ if vseg.ValuePtr().mlockMode != memmap.MLockNone {
+ total += uint64(vseg.Range().Intersect(ar).Length())
+ }
+ }
+ return total
+}
+
+// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
+// access of type (at, ignorePermissions). It returns:
+//
+// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last vma containing an address in ar. If
+// vmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if vmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked. ar.Length() != 0.
+func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
+ // !vbegin.Ok().
+ vbegin, vgap := mm.vmas.Find(ar.Start)
+ if !vbegin.Ok() {
+ vbegin = vgap.NextSegment()
+ // vseg.Ok() is checked before entering the following loop.
+ } else {
+ vgap = vbegin.PrevGap()
+ }
+
+ addr := ar.Start
+ vseg := vbegin
+ for vseg.Ok() {
+ // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
+ vma := vseg.ValuePtr()
+ if addr < vseg.Start() {
+ // TODO(jamieliu): Implement vma.growsDown here.
+ return vbegin, vgap, syserror.EFAULT
+ }
+
+ perms := vma.effectivePerms
+ if ignorePermissions {
+ perms = vma.maxPerms
+ }
+ if !perms.SupersetOf(at) {
+ return vbegin, vgap, syserror.EPERM
+ }
+
+ addr = vseg.End()
+ vgap = vseg.NextGap()
+ if addr >= ar.End {
+ return vbegin, vgap, nil
+ }
+ vseg = vgap.NextSegment()
+ }
+
+ // Ran out of vmas before ar.End.
+ return vbegin, vgap, syserror.EFAULT
+}
+
+// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
+// support access of type (at, ignorePermissions). It returns the subset of
+// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked.
+//
+// Postconditions: ars is not mutated.
+func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
+ return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
+ }
+ }
+ return ars, nil
+}
+
+// vma extension will not shrink the number of unmapped bytes between the start
+// of a growsDown vma and the end of its predecessor non-growsDown vma below
+// guardBytes.
+//
+// guardBytes is equivalent to Linux's stack_guard_gap after upstream
+// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
+const guardBytes = 256 * usermem.PageSize
+
+// unmapLocked unmaps all addresses in ar and returns the resulting gap in
+// mm.vmas.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // AddressSpace mappings and pmas must be invalidated before
+ // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping().
+ mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
+ return mm.removeVMAsLocked(ctx, ar)
+}
+
+// removeVMAsLocked removes vmas for addresses in ar and returns the resulting
+// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
+// must do so before calling removeVMAsLocked.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ vseg, vgap := mm.vmas.Find(ar.Start)
+ if vgap.Ok() {
+ vseg = vgap.NextSegment()
+ }
+ for vseg.Ok() && vseg.Start() < ar.End {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vmaAR := vseg.Range()
+ vma := vseg.ValuePtr()
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
+ }
+ if vma.id != nil {
+ vma.id.DecRef()
+ }
+ mm.usageAS -= uint64(vmaAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS -= uint64(vmaAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vmaAR.Length())
+ }
+ vgap = mm.vmas.Remove(vseg)
+ vseg = vgap.NextSegment()
+ }
+ return vgap
+}
+
+// canWriteMappableLocked returns true if it is possible for vma.mappable to be
+// written to via this vma, i.e. if it is possible that
+// vma.mappable.Translate(at.Write=true) may be called as a result of this vma.
+// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as
+// PTRACE_POKEDATA.
+//
+// canWriteMappableLocked is equivalent to Linux's VM_SHARED.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) canWriteMappableLocked() bool {
+ return !vma.private && vma.maxPerms.Write
+}
+
+// isPrivateDataLocked identifies the data segments: private, writable, not stack.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) isPrivateDataLocked() bool {
+ return vma.realPerms.Write && vma.private && !vma.growsDown
+}
+
+// vmaSetFunctions implements segment.Functions for vmaSet.
+type vmaSetFunctions struct{}
+
+func (vmaSetFunctions) MinKey() usermem.Addr {
+ return 0
+}
+
+func (vmaSetFunctions) MaxKey() usermem.Addr {
+ return ^usermem.Addr(0)
+}
+
+func (vmaSetFunctions) ClearValue(vma *vma) {
+ vma.mappable = nil
+ vma.id = nil
+ vma.hint = ""
+}
+
+func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) {
+ if vma1.mappable != vma2.mappable ||
+ (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) ||
+ vma1.realPerms != vma2.realPerms ||
+ vma1.maxPerms != vma2.maxPerms ||
+ vma1.private != vma2.private ||
+ vma1.growsDown != vma2.growsDown ||
+ vma1.mlockMode != vma2.mlockMode ||
+ vma1.numaPolicy != vma2.numaPolicy ||
+ vma1.numaNodemask != vma2.numaNodemask ||
+ vma1.dontfork != vma2.dontfork ||
+ vma1.id != vma2.id ||
+ vma1.hint != vma2.hint {
+ return vma{}, false
+ }
+
+ if vma2.id != nil {
+ vma2.id.DecRef()
+ }
+ return vma1, true
+}
+
+func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) {
+ v2 := v
+ if v2.mappable != nil {
+ v2.off += uint64(split - ar.Start)
+ }
+ if v2.id != nil {
+ v2.id.IncRef()
+ }
+ return v, v2
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("Mappable offset is meaningless for anonymous vma")
+ }
+ if !vseg.Range().Contains(addr) {
+ panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return vma.off + uint64(addr-vstart)
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+func (vseg vmaIterator) mappableRange() memmap.MappableRange {
+ return vseg.mappableRangeOf(vseg.Range())
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Range().IsSupersetOf(ar) {
+ panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !mr.WellFormed() || mr.Length() <= 0 {
+ panic(fmt.Sprintf("invalid mr: %v", mr))
+ }
+ if !vseg.mappableRange().IsSupersetOf(mr) {
+ panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)}
+}
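+
+// Worked example of the translations above: for a vma spanning
+// [0x400000, 0x500000) with vma.off = 0x3000, mappableOffsetAt(0x401000)
+// returns 0x3000 + (0x401000 - 0x400000) = 0x4000, and addrRangeOf applied to
+// the mappable range [0x4000, 0x5000) maps back to [0x401000, 0x402000).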
+
+// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
+// scanning linearly forward from vseg.
+//
+// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if addr < vseg.Start() {
+ panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start()))
+ }
+ }
+ for vseg.Ok() && addr >= vseg.End() {
+ vseg = vseg.NextSegment()
+ }
+ return vseg
+}
+
+// availableRange returns the subset of vgap.Range() in which new vmas may be
+// created without MMapOpts.Unmap == true.
+func (vgap vmaGapIterator) availableRange() usermem.AddrRange {
+ ar := vgap.Range()
+ next := vgap.NextSegment()
+ if !next.Ok() || !next.ValuePtr().growsDown {
+ return ar
+ }
+ // Exclude guard pages.
+ if ar.Length() < guardBytes {
+ return usermem.AddrRange{ar.Start, ar.Start}
+ }
+ ar.End -= guardBytes
+ return ar
+}
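+
+// Worked example: with 4 KiB pages, guardBytes is 256 * 4 KiB = 1 MiB. For a
+// gap spanning [0x7f0000000000, 0x7f0000400000) whose next vma is growsDown,
+// availableRange returns [0x7f0000000000, 0x7f0000300000); if the gap were
+// smaller than 1 MiB, the available range would be empty.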