Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r--  pkg/sentry/mm/BUILD                  155
-rw-r--r--  pkg/sentry/mm/README.md              279
-rw-r--r--  pkg/sentry/mm/address_space.go       223
-rw-r--r--  pkg/sentry/mm/aio_context.go         377
-rw-r--r--  pkg/sentry/mm/aio_context_state.go    20
-rw-r--r--  pkg/sentry/mm/debug.go                98
-rw-r--r--  pkg/sentry/mm/io.go                  604
-rw-r--r--  pkg/sentry/mm/lifecycle.go           218
-rw-r--r--  pkg/sentry/mm/metadata.go            139
-rw-r--r--  pkg/sentry/mm/mm.go                  417
-rw-r--r--  pkg/sentry/mm/mm_test.go             174
-rw-r--r--  pkg/sentry/mm/pma.go                 928
-rw-r--r--  pkg/sentry/mm/proc_pid_maps.go       105
-rw-r--r--  pkg/sentry/mm/save_restore.go         57
-rw-r--r--  pkg/sentry/mm/special_mappable.go    147
-rw-r--r--  pkg/sentry/mm/syscalls.go            794
-rw-r--r--  pkg/sentry/mm/vma.go                 476
17 files changed, 5211 insertions, 0 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD new file mode 100644 index 000000000..39bde2be3 --- /dev/null +++ b/pkg/sentry/mm/BUILD @@ -0,0 +1,155 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "mm_state", + srcs = [ + "aio_context.go", + "aio_context_state.go", + "file_refcount_set.go", + "io_list.go", + "mm.go", + "pma_set.go", + "save_restore.go", + "special_mappable.go", + "vma_set.go", + ], + out = "mm_state.go", + package = "mm", +) + +go_template_instance( + name = "file_refcount_set", + out = "file_refcount_set.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "mm", + prefix = "fileRefcount", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "int32", + "Functions": "fileRefcountSetFunctions", + }, +) + +go_template_instance( + name = "vma_set", + out = "vma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "vma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "vma", + "Functions": "vmaSetFunctions", + }, +) + +go_template_instance( + name = "pma_set", + out = "pma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "pma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "pma", + "Functions": "pmaSetFunctions", + }, +) + +go_template_instance( + name = "io_list", + out = "io_list.go", + package = "mm", + prefix = "io", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ioResult", + }, +) + +go_library( + name = "mm", + srcs = [ + "address_space.go", + "aio_context.go", + "aio_context_state.go", + "debug.go", + "file_refcount_set.go", + "io.go", + "io_list.go", + "lifecycle.go", + "metadata.go", + "mm.go", + "mm_state.go", + "pma.go", + "pma_set.go", + "proc_pid_maps.go", + "save_restore.go", + "special_mappable.go", + "syscalls.go", + "vma.go", + "vma_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/mm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip/buffer", + ], +) + +go_test( + name = "mm_test", + size = "small", + srcs = ["mm_test.go"], + embed = [":mm"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md new file mode 100644 index 000000000..067733475 --- /dev/null +++ b/pkg/sentry/mm/README.md @@ -0,0 +1,279 @@ +This package provides an emulation of Linux semantics for 
application virtual +memory mappings. + +For completeness, this document also describes aspects of the memory management +subsystem defined outside this package. + +# Background + +We begin by describing semantics for virtual memory in Linux. + +A virtual address space is defined as a collection of mappings from virtual +addresses to physical memory. However, userspace applications do not configure +mappings to physical memory directly. Instead, applications configure memory +mappings from virtual addresses to offsets into a file using the `mmap` system +call.[^mmap-anon] For example, a call to: + + mmap( + /* addr = */ 0x400000, + /* length = */ 0x1000, + PROT_READ | PROT_WRITE, + MAP_SHARED, + /* fd = */ 3, + /* offset = */ 0); + +creates a mapping of length 0x1000 bytes, starting at virtual address (VA) +0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within +the Linux kernel, virtual memory mappings are represented by *virtual memory +areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the +virtual memory subsystem after the `mmap` call may be depicted as: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + +Establishing a virtual memory area does not necessarily establish a mapping to a +physical address, because Linux has not necessarily provisioned physical memory +to store the file's contents. Thus, if the application attempts to read the +contents of VA 0x400000, it may incur a *page fault*, a CPU exception that +forces the kernel to create such a mapping to service the read. + +For a file, doing so consists of several logical phases: + +1. The kernel allocates physical memory to store the contents of the required + part of the file, and copies file contents to the allocated memory. Supposing + that the kernel chooses the physical memory at physical address (PA) + 0x2fb000, the resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + + (In Linux the state of the mapping from file offset to physical memory is + stored in `struct address_space`, but to avoid confusion with other notions + of address space we will refer to this system as filemap, named after Linux + kernel source file `mm/filemap.c`.) + +2. The kernel stores the effective mapping from virtual to physical address in a + *page table entry* (PTE) in the application's *page tables*, which are used + by the CPU's virtual memory hardware to perform address translation. The + resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 + + The PTE is required for the application to actually use the contents of the + mapped file as virtual memory. However, the PTE is derived from the VMA and + filemap state, both of which are independently mutable, such that mutations + to either will affect the PTE. For example: + + - The application may remove the VMA using the `munmap` system call. This + breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently the + mapping from VA:0x400000 to PA:0x2fb000. However, it does not necessarily + break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a future mapping of + the same file offset may reuse this physical memory. + + - The application may invalidate the file's contents by passing a length of 0 + to the `ftruncate` system call. This breaks the mapping from /tmp/foo:0x0 + to PA:0x2fb000, and consequently the mapping from VA:0x400000 to + PA:0x2fb000. 
However, it does not break the mapping from VA:0x400000 to + /tmp/foo:0x0, so future changes to the file's contents may again be made + visible at VA:0x400000 after another page fault results in the allocation + of a new physical address. + + Note that, in order to correctly break the mapping from VA:0x400000 to + PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* + from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. + +[^mmap-anon]: Memory mappings to non-files are discussed in later sections. + +## Private Mappings + +The preceding example considered VMAs created using the `MAP_SHARED` flag, which +means that PTEs derived from the mapping should always use physical memory that +represents the current state of the mapped file.[^mmap-dev-zero] Applications +can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*. +Private mappings are *copy-on-write*. + +Suppose that the application instead created a private mapping in the previous +example. In Linux, the state of the system after a read page fault would be: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only) + +Now suppose the application attempts to write to VA:0x400000. For a shared +mapping, the write would be propagated to PA:0x2fb000, and the kernel would be +responsible for ensuring that the write is later propagated to the mapped file. +For a private mapping, the write incurs another page fault since the PTE is +marked read-only. In response, the kernel allocates physical memory to store the +mapping's *private copy* of the file's contents, copies file contents to the +allocated memory, and changes the PTE to map to the private copy. Supposing that +the kernel chooses the physical memory at physical address (PA) 0x5ea000, the +resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x5ea000 + +Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist, +but is now irrelevant to this mapping. + +[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`. + +## Anonymous Mappings + +Instead of passing a file to the `mmap` system call, applications can instead +request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag. +Semantically, an anonymous mapping is essentially a mapping to an ephemeral file +initially filled with zero bytes. Practically speaking, this is how shared +anonymous mappings are implemented, but private anonymous mappings do not result +in the creation of an ephemeral file; since there would be no way to modify the +contents of the underlying file through a private mapping, all private anonymous +mappings use a single shared page filled with zero bytes until copy-on-write +occurs. + +# Virtual Memory in the Sentry + +The sentry implements application virtual memory atop a host kernel, introducing +an additional level of indirection to the above. + +Consider the same scenario as in the previous section. 
Since the sentry handles +application system calls, the effect of an application `mmap` system call is to +create a VMA in the sentry (as opposed to the host kernel): + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + +When the application first incurs a page fault on this address, the host kernel +delivers information about the page fault to the sentry in a platform-dependent +manner, and the sentry handles the fault: + +1. The sentry allocates memory to store the contents of the required part of the + file, and copies file contents to the allocated memory. However, since the + sentry is implemented atop a host kernel, it does not configure mappings to + physical memory directly. Instead, mappable "memory" in the sentry is + represented by a host file descriptor and offset, since (as noted in + "Background") this is the memory mapping primitive provided by the host + kernel. In general, memory is allocated from a temporary host file using the + `filemem` package. Supposing that the sentry allocates offset 0x3000 from + host file "memory-file", the resulting state is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + +2. The sentry stores the effective mapping from virtual address to host file in + a host VMA by invoking the `mmap` system call: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + +3. The sentry returns control to the application, which immediately incurs the + page fault again.[^mmap-populate] However, since a host VMA now exists for + the faulting virtual address, the host kernel now handles the page fault as + described in "Background": + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 + +Thus, from an implementation standpoint, host VMAs serve the same purpose in the +sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is +independently mutable, and the desired state of host VMAs is derived from that +state. + +[^mmap-populate]: The sentry could force the host kernel to establish PTEs when + it creates the host VMA by passing the `MAP_POPULATE` flag to + the `mmap` system call, but usually does not. This is because, + to reduce the number of page faults that require handling by + the sentry and (correspondingly) the number of host `mmap` + system calls, the sentry usually creates host VMAs that are + much larger than the single faulting page. + +## Private Mappings + +The sentry implements private mappings consistently with Linux. Before +copy-on-write, the private mapping example given in the Background results in: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only) + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only) + +When the application attempts to write to this address, the host kernel delivers +information about the resulting page fault to the sentry. 
Analogous to Linux, +the sentry allocates memory to store the mapping's private copy of the file's +contents, copies file contents to the allocated memory, and changes the host VMA +to map to the private copy. Supposing that the sentry chooses the offset 0x4000 +in host file `memory-file` to store the private copy, the state of the system +after copy-on-write is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000 + Host filemap: host:memory-file:0x4000 -> PA:0x5ea000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000 + +However, this highlights an important difference between Linux and the sentry. +In Linux, page tables are concrete (architecture-dependent) data structures +owned by the kernel. Conversely, the sentry has the ability to create and +destroy host VMAs using host system calls, but it does not have direct access to +their state. Thus, as written, if the application invokes the `munmap` system +call to remove the sentry VMA, it is non-trivial for the sentry to determine +that it should deallocate `host:memory-file:0x4000`. This implies that the +sentry must retain information about the host VMAs that it has created. + +## Anonymous Mappings + +The sentry implements anonymous mappings consistently with Linux, except that +there is no shared zero page. + +# Implementation Constructs + +In Linux: + +- A virtual address space is represented by `struct mm_struct`. + +- VMAs are represented by `struct vm_area_struct`, stored in `struct + mm_struct::mmap`. + +- Mappings from file offsets to physical memory are stored in `struct + address_space`. + +- Reverse mappings from file offsets to virtual mappings are stored in `struct + address_space::i_mmap`. + +- Physical memory pages are represented by a pointer to `struct page` or an + index called a *page frame number* (PFN), represented by `pfn_t`. + +- PTEs are represented by architecture-dependent type `pte_t`, stored in a table + hierarchy rooted at `struct mm_struct::pgd`. + +In the sentry: + +- A virtual address space is represented by type [`mm.MemoryManager`][mm]. + +- Sentry VMAs are represented by type [`mm.vma`][mm], stored in + `mm.MemoryManager.vmas`. + +- Mappings from sentry file offsets to host file offsets are abstracted through + interface method [`memmap.Mappable.Translate`][memmap]. + +- Reverse mappings from sentry file offsets to virtual mappings are abstracted + through interface methods [`memmap.Mappable.AddMapping` and + `memmap.Mappable.RemoveMapping`][memmap]. + +- Host files that may be mapped into host VMAs are represented by type + [`platform.File`][platform]. + +- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform + mapping area"), stored in `mm.MemoryManager.pmas`. + +- Creation and destruction of host VMAs is abstracted through interface methods + [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. 
+ +[filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go +[memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go +[mm]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/mm/mm.go +[platform]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go new file mode 100644 index 000000000..4dd67b1ea --- /dev/null +++ b/pkg/sentry/mm/address_space.go @@ -0,0 +1,223 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// AddressSpace returns the platform.AddressSpace bound to mm. +// +// Preconditions: The caller must have called mm.Activate(). +func (mm *MemoryManager) AddressSpace() platform.AddressSpace { + if atomic.LoadInt32(&mm.active) == 0 { + panic("trying to use inactive address space?") + } + return mm.as +} + +// Activate ensures this MemoryManager has a platform.AddressSpace. +// +// The caller must not hold any locks when calling Activate. +// +// When this MemoryManager is no longer needed by a task, it should call +// Deactivate to release the reference. +func (mm *MemoryManager) Activate() error { + // Fast path: the MemoryManager already has an active + // platform.AddressSpace, and we just need to indicate that we need it too. + if atomicbitops.IncUnlessZeroInt32(&mm.active) { + return nil + } + + for { + // Slow path: may need to synchronize with other goroutines changing + // mm.active to or from zero. + mm.activeMu.Lock() + // Inline Unlock instead of using a defer for performance since this + // method is commonly in the hot-path. + + // Check if we raced with another goroutine performing activation. + if atomic.LoadInt32(&mm.active) > 0 { + // This can't race; Deactivate can't decrease mm.active from 1 to 0 + // without holding activeMu. + atomic.AddInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Do we have a context? If so, then we never unmapped it. This can + // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). + if mm.as != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Get a new address space. We must force unmapping by passing nil to + // NewAddressSpace if requested. (As in the nil interface object, not a + // typed nil.) + mappingsID := (interface{})(mm) + if mm.unmapAllOnActivate { + mappingsID = nil + } + as, c, err := mm.p.NewAddressSpace(mappingsID) + if err != nil { + mm.activeMu.Unlock() + return err + } + if as == nil { + // AddressSpace is unavailable, we must wait. + // + // activeMu must not be held while waiting, as the user + // of the address space we are waiting on may attempt + // to take activeMu. 
+ // + // Don't call UninterruptibleSleepStart to register the + // wait to allow the watchdog stuck task to trigger in + // case a process is starved waiting for the address + // space. + mm.activeMu.Unlock() + <-c + continue + } + + // Okay, we could restore all mappings at this point. + // But forget that. Let's just let them fault in. + mm.as = as + + // Unmapping is done, if necessary. + mm.unmapAllOnActivate = false + + // Now that m.as has been assigned, we can set m.active to a non-zero value + // to enable the fast path. + atomic.StoreInt32(&mm.active, 1) + + mm.activeMu.Unlock() + return nil + } +} + +// Deactivate releases a release to the MemoryManager. +func (mm *MemoryManager) Deactivate() error { + // Fast path: this is not the last goroutine to deactivate the + // MemoryManager. + if atomicbitops.DecUnlessOneInt32(&mm.active) { + return nil + } + + mm.activeMu.Lock() + // Same as Activate. + + // Still active? + if atomic.AddInt32(&mm.active, -1) > 0 { + mm.activeMu.Unlock() + return nil + } + + // Can we hold on to the address space? + if !mm.p.CooperativelySchedulesAddressSpace() { + mm.activeMu.Unlock() + return nil + } + + // Release the address space. + if err := mm.as.Release(); err != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return err + } + + // Lost it. + mm.as = nil + mm.activeMu.Unlock() + return nil +} + +// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings +// for all addresses in ar should be precommitted. +// +// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. +// ar must be page-aligned. pseg.Range().Contains(ar.Start). +func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { + // By default, map entire pmas at a time, under the assumption that there + // is no cost to mapping more of a pma than necessary. + mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)} + if precommit { + // When explicitly precommitting, only map ar, since overmapping may + // incur unexpected resource usage. + mapAR = ar + } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { + // Limit the range we map to ar, aligned to mapUnit. + mapMask := usermem.Addr(mapUnit - 1) + mapAR.Start = ar.Start &^ mapMask + // If rounding ar.End up overflows, just keep the existing mapAR.End. + if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { + mapAR.End = end + } + } + if checkInvariants { + if !mapAR.IsSupersetOf(ar) { + panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) + } + } + + for { + pma := pseg.ValuePtr() + pmaAR := pseg.Range() + pmaMapAR := pmaAR.Intersect(mapAR) + perms := pma.vmaEffectivePerms + if pma.needCOW { + perms.Write = false + } + if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + return err + } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } + pseg = pseg.NextSegment() + } +} + +// unmapASLocked removes all AddressSpace mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) { + if mm.as == nil { + // No AddressSpace? Force all mappings to be unmapped on the next + // Activate. 
+ mm.unmapAllOnActivate = true + return + } + + // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be + // passed ranges that include addresses that can't be mapped by the + // application. + ar = ar.Intersect(mm.applicationAddrRange()) + + // Note that this AddressSpace may or may not be active. If the + // platform does not require cooperative sharing of AddressSpaces, they + // are retained between Deactivate/Activate calls. Despite not being + // active, it is still valid to perform operations on these address + // spaces. + mm.as.Unmap(ar.Start, uint64(ar.Length())) +} diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..992bde5a5 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,377 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// aioManager creates and manages asynchronous I/O contexts. +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // aioContexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. +func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + done: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. +// +// False is returned if the context does not exist. +func (a *aioManager) destroyAIOContext(id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return false + } + delete(a.contexts, id) + ctx.destroy() + return true +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +type AIOContext struct { + // done is the notification channel used for all requests. + done chan struct{} `state:"nosave"` + + // mu protects below. 
+ mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. + outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. +func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + if ctx.outstanding == 0 { + close(ctx.done) + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. +func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + ctx.results.Remove(e) + ctx.outstanding-- + if ctx.outstanding == 0 && ctx.dead { + close(ctx.done) + } + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The done channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.done <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. +// +// The boolean return value indicates whether or not the context is active. +func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding == 0 && ctx.dead { + return nil, false + } + return ctx.done, true +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +type aioMappable struct { + refs.AtomicRefCount + + p platform.Platform + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(p platform.Platform) (*aioMappable, error) { + fr, err := p.Memory().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + return &aioMappable{p: p, fr: fr}, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.p.Memory().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. 
+func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. + am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.p.Memory(), + Offset: m.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.p) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // TODO: Linux does "do_mmap_pgoff(..., PROT_READ | + // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this + // mapping read-only? 
+ Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. It returns false if +// the context does not exist. +func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return false + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go new file mode 100644 index 000000000..1a5e56f8e --- /dev/null +++ b/pkg/sentry/mm/aio_context_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +// afterLoad is invoked by stateify. +func (a *AIOContext) afterLoad() { + a.done = make(chan struct{}, 1) +} diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go new file mode 100644 index 000000000..56d0490f0 --- /dev/null +++ b/pkg/sentry/mm/debug.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the mm package. 
This is normally disabled since MM is a + // significant hot path in general, and some such checks (notably + // memmap.CheckTranslateResult) are very expensive. + checkInvariants = false + + // If logIOErrors is true, log I/O errors that originate from MM before + // converting them to EFAULT. + logIOErrors = false +) + +// String implements fmt.Stringer.String. +func (mm *MemoryManager) String() string { + return mm.DebugString(context.Background()) +} + +// DebugString returns a string containing information about mm for debugging. +func (mm *MemoryManager) DebugString(ctx context.Context) string { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.debugStringLocked(ctx) +} + +// Preconditions: mm.mappingMu and mm.activeMu must be locked. +func (mm *MemoryManager) debugStringLocked(ctx context.Context) string { + var b bytes.Buffer + b.WriteString("VMAs:\n") + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) + } + b.WriteString("PMAs:\n") + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + b.Write(pseg.debugStringEntryLocked()) + } + return string(b.Bytes()) +} + +// Preconditions: mm.activeMu must be locked. +func (pseg pmaIterator) debugStringEntryLocked() []byte { + var b bytes.Buffer + + fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) + + pma := pseg.ValuePtr() + if pma.vmaEffectivePerms.Read { + b.WriteByte('r') + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Write { + if pma.needCOW { + b.WriteByte('c') + } else { + b.WriteByte('w') + } + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Execute { + b.WriteByte('x') + } else { + b.WriteByte('-') + } + if pma.private { + b.WriteByte('p') + } else { + b.WriteByte('s') + } + + fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + return b.Bytes() +} diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go new file mode 100644 index 000000000..cac81a59d --- /dev/null +++ b/pkg/sentry/mm/io.go @@ -0,0 +1,604 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// There are two supported ways to copy data to/from application virtual +// memory: +// +// 1. Internally-mapped copying: Determine the platform.File that backs the +// copied-to/from virtual address, obtain a mapping of its pages, and read or +// write to the mapping. +// +// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is +// true, AddressSpace permissions are applicable, and an AddressSpace is +// available, copy directly through the AddressSpace, handling faults as +// needed. 
+// +// (Given that internally-mapped copying requires that backing memory is always +// implemented using a host file descriptor, we could also preadv/pwritev to it +// instead. But this would incur a host syscall for each use of the mapped +// page, whereas mmap is a one-time cost.) +// +// The fixed overhead of internally-mapped copying is expected to be higher +// than that of AddressSpace copying since the former always needs to translate +// addresses, whereas the latter only needs to do so when faults occur. +// However, the throughput of internally-mapped copying is expected to be +// somewhat higher than that of AddressSpace copying due to the high cost of +// page faults and because implementations of the latter usually rely on +// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace +// copying (when available) for smaller copies, and switch to internally-mapped +// copying once a size threshold is exceeded. +const ( + // copyMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOut, CopyIn, and ZeroOut. + copyMapMinBytes = 32 << 10 // 32 KB + + // rwMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes + // since AddressSpace copying in this case requires additional buffering; + // see CopyOutFrom for details. + rwMapMinBytes = 512 +) + +// checkIORange is similar to usermem.Addr.ToRange, but applies bounds checks +// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). +// +// Preconditions: length >= 0. +func (mm *MemoryManager) checkIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { + // Note that access_ok() constrains end even if length == 0. + ar, ok := addr.ToRange(uint64(length)) + return ar, (ok && ar.End <= mm.layout.MaxAddr) +} + +// checkIOVec applies bound checks consistent with Linux's +// arch/x86/include/asm/uaccess.h:access_ok() to ars. +func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool { + for !ars.IsEmpty() { + ar := ars.Head() + if _, ok := mm.checkIORange(ar.Start, int64(ar.Length())); !ok { + return false + } + ars = ars.Tail() + } + return true +} + +func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool { + return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive +} + +// translateIOError converts errors to EFAULT, as is usually reported for all +// I/O errors originating from MM in Linux. +func translateIOError(ctx context.Context, err error) error { + if err == nil { + return nil + } + if logIOErrors { + ctx.Debugf("MM I/O error: %v", err) + } + return syserror.EFAULT +} + +// CopyOut implements usermem.IO.CopyOut. +func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(src))) + if !ok { + return 0, syserror.EFAULT + } + + if len(src) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { + return mm.asCopyOut(ctx, addr, src) + } + + // Go through internal mappings. 
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(src))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(dst))) + if !ok { + return 0, syserror.EFAULT + } + + if len(dst) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { + return mm.asCopyIn(ctx, addr, dst) + } + + // Go through internal mappings. + n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(dst))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + ar, ok := mm.checkIORange(addr, toZero) + if !ok { + return 0, syserror.EFAULT + } + + if toZero == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && toZero < copyMapMinBytes { + return mm.asZeroOut(ctx, addr, toZero) + } + + // Go through internal mappings. + return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := safemem.ZeroSeq(dsts) + return n, translateIOError(ctx, err) + }) +} + +func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) { + var done int64 + for { + n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done)) + done += int64(n) + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(toZero)) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. 
+func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + // We have to introduce a buffered copy, instead of just passing a + // safemem.BlockSeq representing addresses in the AddressSpace to src. + // This is because usermem.IO.CopyOutFrom() guarantees that it calls + // src.ReadToBlocks() at most once, which is incompatible with handling + // faults between calls. In the future, this is probably best resolved + // by introducing a CopyOutFrom variant or option that allows it to + // call src.ReadToBlocks() any number of times. + // + // This issue applies to CopyInTo as well. + buf := make([]byte, int(ars.NumBytes())) + bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) + var done int64 + for done < int64(bufN) { + ar := ars.Head() + cplen := int64(ar.Length()) + if cplen > int64(bufN)-done { + cplen = int64(bufN) - done + } + n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) + done += int64(n) + if err != nil { + return done, err + } + ars = ars.Tail() + } + // Do not convert errors returned by src to EFAULT. + return done, bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks) +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + buf := make([]byte, int(ars.NumBytes())) + var done int + var bufErr error + for !ars.IsEmpty() { + ar := ars.Head() + var n int + n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) + done += n + if bufErr != nil { + break + } + ars = ars.Tail() + } + n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) + if err != nil { + return int64(n), err + } + // Do not convert errors returned by dst to EFAULT. + return int64(n), bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks) +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + old, err := mm.as.SwapUint32(addr, new) + if err == nil { + return old, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var old uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. 
+ return 0, syserror.EFAULT + } + im := ims.Head() + var err error + old, err = safemem.SwapUint32(im, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return old, err +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + prev, err := mm.as.CompareAndSwapUint32(addr, old, new) + if err == nil { + return prev, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var prev uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + prev, err = safemem.CompareAndSwapUint32(im, old, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return prev, err +} + +// handleASIOFault handles a page fault at address addr for an AddressSpaceIO +// operation spanning ioar. +// +// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). +func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { + // Try to map all remaining pages in the I/O operation. This RoundUp can't + // overflow because otherwise it would have been caught by checkIORange. + end, _ := ioar.End.RoundUp() + ar := usermem.AddrRange{addr.RoundDown(), end} + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have usable vmas. Here and below, only return early if we + // can't map the first (faulting) page; failure to map later pages are + // silently ignored. This maximizes partial success. + mm.mappingMu.RLock() + vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return translateIOError(ctx, err) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return translateIOError(ctx, err) + } + ar.End = pendaddr + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return translateIOError(ctx, err) +} + +// withInternalMappings ensures that pmas exist for all addresses in ar, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subrange of ar for which this property holds. 
+// +// withInternalMappings takes a function returning uint64 since many safemem +// functions have this property, but returns an int64 since this is usually +// more useful for usermem.IO methods. +// +// Preconditions: 0 < ar.Length() <= math.MaxInt64. +func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, po, true /* needInternalMappings */); pseg.Ok() { + n, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, po) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + ar.End = pendaddr + } + imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar) + mm.activeMu.DowngradeLock() + if imendaddr := imend.Start(); imendaddr < ar.End { + if imendaddr <= ar.Start { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + ar.End = imendaddr + } + + // Do I/O. + un, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ar. + if err != nil { + // Do not convert errors returned by f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// withVecInternalMappings ensures that pmas exist for all addresses in ars, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subset of ars for which this property holds. +// +// Preconditions: !ars.IsEmpty(). +func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + // withInternalMappings is faster than withVecInternalMappings because of + // iterator plumbing (this isn't generally practical in the vector case due + // to iterator invalidation between AddrRanges). Use it if possible. + if ars.NumRanges() == 1 { + return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) + } + + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if mm.existingVecPMAsLocked(ars, at, ignorePermissions, po, true /* needInternalMappings */) { + n, err := f(mm.vecInternalMappingsLocked(ars)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. 
+ return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions) + if vars.NumBytes() == 0 { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pars, perr := mm.getVecPMAsLocked(ctx, vars, po) + mm.mappingMu.RUnlock() + if pars.NumBytes() == 0 { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + imars, imerr := mm.getVecPMAInternalMappingsLocked(pars) + mm.activeMu.DowngradeLock() + if imars.NumBytes() == 0 { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + + // Do I/O. + un, err := f(mm.vecInternalMappingsLocked(imars)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ars. + if err != nil { + // Do not convert errors from f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to +// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to +// truncate usermem.AddrRangeSeq when errors occur. +// +// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { + ar := arsit.Head() + if end <= ar.Start { + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes()) + } + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start)) +} diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go new file mode 100644 index 000000000..de7f29b04 --- /dev/null +++ b/pkg/sentry/mm/lifecycle.go @@ -0,0 +1,218 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewMemoryManager returns a new MemoryManager with no mappings and 1 user. +func NewMemoryManager(p platform.Platform) *MemoryManager { + return &MemoryManager{ + p: p, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } +} + +// SetMmapLayout initializes mm's layout from the given arch.Context. +// +// Preconditions: mm contains no mappings and is not used concurrently. 
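+//
+// The loader is expected to call this before creating any mappings, e.g.
+// (sketch only; p, ac, and r stand for the task's platform, arch.Context,
+// and limits):
+//
+//	mm := NewMemoryManager(p)
+//	layout, err := mm.SetMmapLayout(ac, r)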
+func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) { + layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) + if err != nil { + return arch.MmapLayout{}, err + } + mm.layout = layout + return layout, nil +} + +// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or +// clone() (without CLONE_VM). +func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm2 := &MemoryManager{ + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, + captureInvalidations: true, + argv: mm.argv, + envv: mm.envv, + auxv: append(arch.Auxv(nil), mm.auxv...), + // IncRef'd below, once we know that there isn't an error. + executable: mm.executable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } + + // Copy vmas. + dstvgap := mm2.vmas.FirstGap() + for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { + vma := srcvseg.ValuePtr() + vmaAR := srcvseg.Range() + // Inform the Mappable, if any, of the new mapping. + if vma.mappable != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off); err != nil { + mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) + return nil, err + } + } + if vma.id != nil { + vma.id.IncRef() + } + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + // We don't need to update mm2.usageAS since we copied it from mm + // above. + } + + // Copy pmas. We have to lock mm.activeMu for writing to make existing + // private pmas copy-on-write. We also have to lock mm2.activeMu since + // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We + // only copy private pmas, since in the common case where fork(2) is + // immediately followed by execve(2), copying non-private pmas that can be + // regenerated by calling memmap.Mappable.Translate is a waste of time. + // (Linux does the same; compare kernel/fork.c:dup_mmap() => + // mm/memory.c:copy_page_range().) + mm2.activeMu.Lock() + defer mm2.activeMu.Unlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + dstpgap := mm2.pmas.FirstGap() + var unmapAR usermem.AddrRange + for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { + pma := srcpseg.ValuePtr() + if !pma.private { + continue + } + if !pma.needCOW { + pma.needCOW = true + if pma.vmaEffectivePerms.Write { + // We don't want to unmap the whole address space, even though + // doing so would reduce calls to unmapASLocked(), because mm + // will most likely continue to be used after the fork, so + // unmapping pmas unnecessarily will result in extra page + // faults. But we do want to merge consecutive AddrRanges + // across pma boundaries. 
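+					// For example, two adjacent writable private pmas
+					// covering [0x1000, 0x2000) and [0x2000, 0x3000)
+					// (hypothetical addresses) result in a single
+					// unmapASLocked call on [0x1000, 0x3000) rather than
+					// two.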
+ if unmapAR.End == srcpseg.Start() { + unmapAR.End = srcpseg.End() + } else { + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + unmapAR = srcpseg.Range() + } + } + } + fr := srcpseg.fileRange() + mm2.incPrivateRef(fr) + srcpseg.ValuePtr().file.IncRef(fr) + addrRange := srcpseg.Range() + mm2.addRSSLocked(addrRange) + dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() + } + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + + // Between when we call memmap.Mappable.AddMapping while copying vmas and + // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are + // ineffective because the pmas they invalidate haven't yet been copied, + // possibly allowing mm2 to get invalidated translations: + // + // Invalidating Mappable mm.Fork + // --------------------- ------- + // + // mm2.Invalidate() + // mm.activeMu.Lock() + // mm.Invalidate() /* blocks */ + // mm2.activeMu.Lock() + // (mm copies invalidated pma to mm2) + // + // This would technically be both safe (since we only copy private pmas, + // which will still hold a reference on their memory) and consistent with + // Linux, but we avoid it anyway by setting mm2.captureInvalidations during + // construction, causing calls to mm2.Invalidate() to be captured in + // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. + // here. + mm2.captureInvalidations = false + for _, invArgs := range mm2.capturedInvalidations { + mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) + } + mm2.capturedInvalidations = nil + + if mm2.executable != nil { + mm2.executable.IncRef() + } + return mm2, nil +} + +// IncUsers increments mm's user count and returns true. If the user count is +// already 0, IncUsers does nothing and returns false. +func (mm *MemoryManager) IncUsers() bool { + return atomicbitops.IncUnlessZeroInt32(&mm.users) +} + +// DecUsers decrements mm's user count. If the user count reaches 0, all +// mappings in mm are unmapped. +func (mm *MemoryManager) DecUsers(ctx context.Context) { + if users := atomic.AddInt32(&mm.users, -1); users > 0 { + return + } else if users < 0 { + panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) + } + + mm.aioManager.destroy() + + mm.metadataMu.Lock() + exe := mm.executable + mm.executable = nil + mm.metadataMu.Unlock() + if exe != nil { + exe.DecRef() + } + + mm.activeMu.Lock() + // Sanity check. + if atomic.LoadInt32(&mm.active) != 0 { + panic("active address space lost?") + } + // Make sure the AddressSpace is returned. + if mm.as != nil { + mm.as.Release() + mm.as = nil + } + mm.activeMu.Unlock() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + mm.unmapLocked(ctx, mm.applicationAddrRange()) +} diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go new file mode 100644 index 000000000..32d5e2ff6 --- /dev/null +++ b/pkg/sentry/mm/metadata.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
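+
+// This file provides accessors for MemoryManager metadata (argv, envv, auxv,
+// and the executable), all protected by metadataMu.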
+ +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ArgvStart returns the start of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvEnd. +func (mm *MemoryManager) ArgvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.Start +} + +// SetArgvStart sets the start of the application argument vector. +func (mm *MemoryManager) SetArgvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.Start = a +} + +// ArgvEnd returns the end of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvStart. +func (mm *MemoryManager) ArgvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.End +} + +// SetArgvEnd sets the end of the application argument vector. +func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.End = a +} + +// EnvvStart returns the start of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvEnd. +func (mm *MemoryManager) EnvvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.Start +} + +// SetEnvvStart sets the start of the application environment vector. +func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.Start = a +} + +// EnvvEnd returns the end of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvStart. +func (mm *MemoryManager) EnvvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.End +} + +// SetEnvvEnd sets the end of the application environment vector. +func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.End = a +} + +// Auxv returns the current map of auxiliary vectors. +func (mm *MemoryManager) Auxv() arch.Auxv { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return append(arch.Auxv(nil), mm.auxv...) +} + +// SetAuxv sets the entire map of auxiliary vectors. +func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.auxv = append(arch.Auxv(nil), auxv...) +} + +// Executable returns the executable, if available. +// +// An additional reference will be taken in the case of a non-nil executable, +// which must be released by the caller. +func (mm *MemoryManager) Executable() *fs.Dirent { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + + if mm.executable == nil { + return nil + } + + mm.executable.IncRef() + return mm.executable +} + +// SetExecutable sets the executable. +// +// This takes a reference on d. +func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { + mm.metadataMu.Lock() + + // Grab a new reference. + d.IncRef() + + // Set the executable. + orig := mm.executable + mm.executable = d + + mm.metadataMu.Unlock() + + // Release the old reference. + // + // Do this without holding the lock, since it may wind up doing some + // I/O to sync the dirent, etc. 
+ if orig != nil { + orig.DecRef() + } +} diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go new file mode 100644 index 000000000..ce8097b7f --- /dev/null +++ b/pkg/sentry/mm/mm.go @@ -0,0 +1,417 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mm provides a memory management subsystem. See README.md for a +// detailed overview. +// +// Lock order: +// +// fs locks, except for memmap.Mappable locks +// mm.MemoryManager.metadataMu +// mm.MemoryManager.mappingMu +// Locks taken by memmap.Mappable methods other than Translate +// mm.MemoryManager.activeMu +// Locks taken by memmap.Mappable.Translate +// mm.privateRefs.mu +// platform.File locks +// mm.aioManager.mu +// mm.AIOContext.mu +// +// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in +// multiple mm.MemoryManagers, as it does so in a well-defined order (forked +// child first). +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// MemoryManager implements a virtual address space. +type MemoryManager struct { + // p is the platform. + // + // p is immutable. + p platform.Platform + + // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from + // eliminating an indirect call in the hot I/O path, this makes + // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. + // + // haveASIO is immutable. + haveASIO bool `state:"nosave"` + + // layout is the memory layout. + // + // layout is set by the binary loader before the MemoryManager can be used. + layout arch.MmapLayout + + // privateRefs stores reference counts for private memory (memory whose + // ownership is shared by one or more pmas instead of being owned by a + // memmap.Mappable). + // + // NOTE: This should be replaced using refcounts on + // platform.File. + // + // privateRefs is immutable. + privateRefs *privateRefs + + // users is the number of dependences on the mappings in the MemoryManager. + // When the number of references in users reaches zero, all mappings are + // unmapped. + // + // users is accessed using atomic memory operations. + users int32 + + // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. + mappingMu ssync.DowngradableRWMutex `state:"nosave"` + + // vmas stores virtual memory areas. Since vmas are stored by value, + // clients should usually use vmaIterator.ValuePtr() instead of + // vmaIterator.Value() to get a pointer to the vma rather than a copy. + // + // Invariants: vmas are always page-aligned. + // + // vmas is protected by mappingMu. + vmas vmaSet + + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. 
+	usageAS uint64
+
+	// brk is the mm's brk, which is manipulated using the brk(2) system call.
+	// The brk is initially set up by the loader which maps an executable
+	// binary into the mm.
+	//
+	// brk is protected by mappingMu.
+	brk usermem.AddrRange
+
+	// activeMu is loosely analogous to Linux's struct
+	// mm_struct::page_table_lock.
+	activeMu ssync.DowngradableRWMutex `state:"nosave"`
+
+	// pmas stores platform mapping areas used to implement vmas. Since pmas
+	// are stored by value, clients should usually use pmaIterator.ValuePtr()
+	// instead of pmaIterator.Value() to get a pointer to the pma rather than
+	// a copy.
+	//
+	// Inserting or removing segments from pmas should happen along with a
+	// call to mm.addRSSLocked or mm.removeRSSLocked.
+	//
+	// Invariants: pmas are always page-aligned. If a pma exists for a given
+	// address, a vma must also exist for that address.
+	//
+	// pmas is protected by activeMu.
+	pmas pmaSet
+
+	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
+	// reported as the MemoryManager's RSS.
+	//
+	// curRSS should be modified only via addRSSLocked and removeRSSLocked,
+	// not directly.
+	//
+	// curRSS is protected by activeMu.
+	curRSS uint64
+
+	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
+	// It is tracked as the application adds and removes mappings to pmas.
+	//
+	// maxRSS should be modified only via addRSSLocked, not directly.
+	//
+	// maxRSS is protected by activeMu.
+	maxRSS uint64
+
+	// as is the platform.AddressSpace that pmas are mapped into. active is the
+	// number of contexts that require as to be non-nil; if active == 0, as may
+	// be nil.
+	//
+	// as is protected by activeMu. active is manipulated with atomic memory
+	// operations; transitions to and from zero are additionally protected by
+	// activeMu. (This is because such transitions may need to be atomic with
+	// changes to as.)
+	as platform.AddressSpace `state:"nosave"`
+	active int32 `state:"zerovalue"`
+
+	// unmapAllOnActivate indicates that the next Activate call should activate
+	// an empty AddressSpace.
+	//
+	// This is used to ensure that an AddressSpace cached in
+	// NewAddressSpace is not used after some change in the MemoryManager
+	// or VMAs has made that AddressSpace stale.
+	//
+	// unmapAllOnActivate is protected by activeMu. It must only be set when
+	// there is no active or cached AddressSpace. If as != nil, then
+	// invalidations should be propagated immediately.
+	unmapAllOnActivate bool `state:"nosave"`
+
+	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
+	// in capturedInvalidations rather than being applied immediately to pmas.
+	// This is to avoid a race condition in MM.Fork(); see that function for
+	// details.
+	//
+	// Both captureInvalidations and capturedInvalidations are protected by
+	// activeMu. Neither needs to be saved since captureInvalidations is only
+	// enabled during MM.Fork(), during which saving can't occur.
+	captureInvalidations bool `state:"zerovalue"`
+	capturedInvalidations []invalidateArgs `state:"nosave"`
+
+	metadataMu sync.Mutex `state:"nosave"`
+
+	// argv is the application argv. This is set up by the loader and may be
+	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
+	// requirements apply to argv; we do not require that argv.WellFormed().
+	//
+	// argv is protected by metadataMu.
+	argv usermem.AddrRange
+
+	// envv is the application envv. 
This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No + // requirements apply to envv; we do not require that envv.WellFormed(). + // + // envv is protected by metadataMu. + envv usermem.AddrRange + + // auxv is the ELF's auxiliary vector. + // + // auxv is protected by metadataMu. + auxv arch.Auxv + + // executable is the executable for this MemoryManager. If executable + // is not nil, it holds a reference on the Dirent. + // + // executable is protected by metadataMu. + executable *fs.Dirent + + // aioManager keeps track of AIOContexts used for async IOs. AIOManager + // must be cloned when CLONE_VM is used. + aioManager aioManager +} + +// vma represents a virtual memory area. +type vma struct { + // mappable is the virtual memory object mapped by this vma. If mappable is + // nil, the vma represents a private anonymous mapping. + mappable memmap.Mappable + + // off is the offset into mappable at which this vma begins. If mappable is + // nil, off is meaningless. + off uint64 + + // To speedup VMA save/restore, we group and save the following booleans + // as a single integer. + + // realPerms are the memory permissions on this vma, as defined by the + // application. + realPerms usermem.AccessType `state:".(int)"` + + // effectivePerms are the memory permissions on this vma which are + // actually used to control access. + // + // Invariant: effectivePerms == realPerms.Effective(). + effectivePerms usermem.AccessType `state:"manual"` + + // maxPerms limits the set of permissions that may ever apply to this + // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions + // is true (e.g. ptrace(PTRACE_POKEDATA)). + // + // Invariant: maxPerms == maxPerms.Effective(). + maxPerms usermem.AccessType `state:"manual"` + + // private is true if this is a MAP_PRIVATE mapping, such that writes to + // the mapping are propagated to a copy. + private bool `state:"manual"` + + // growsDown is true if the mapping may be automatically extended downward + // under certain conditions. If growsDown is true, mappable must be nil. + // + // There is currently no corresponding growsUp flag; in Linux, the only + // architectures that can have VM_GROWSUP mappings are ia64, parisc, and + // metag, none of which we currently support. + growsDown bool `state:"manual"` + + // If id is not nil, it controls the lifecycle of mappable and provides vma + // metadata shown in /proc/[pid]/maps, and the vma holds a reference. + id memmap.MappingIdentity + + // If hint is non-empty, it is a description of the vma printed in + // /proc/[pid]/maps. hint takes priority over id.MappedName(). 
+ hint string +} + +const ( + vmaRealPermsRead = 1 << iota + vmaRealPermsWrite + vmaRealPermsExecute + vmaEffectivePermsRead + vmaEffectivePermsWrite + vmaEffectivePermsExecute + vmaMaxPermsRead + vmaMaxPermsWrite + vmaMaxPermsExecute + vmaPrivate + vmaGrowsDown +) + +func (v *vma) saveRealPerms() int { + var b int + if v.realPerms.Read { + b |= vmaRealPermsRead + } + if v.realPerms.Write { + b |= vmaRealPermsWrite + } + if v.realPerms.Execute { + b |= vmaRealPermsExecute + } + if v.effectivePerms.Read { + b |= vmaEffectivePermsRead + } + if v.effectivePerms.Write { + b |= vmaEffectivePermsWrite + } + if v.effectivePerms.Execute { + b |= vmaEffectivePermsExecute + } + if v.maxPerms.Read { + b |= vmaMaxPermsRead + } + if v.maxPerms.Write { + b |= vmaMaxPermsWrite + } + if v.maxPerms.Execute { + b |= vmaMaxPermsExecute + } + if v.private { + b |= vmaPrivate + } + if v.growsDown { + b |= vmaGrowsDown + } + return b +} + +func (v *vma) loadRealPerms(b int) { + if b&vmaRealPermsRead > 0 { + v.realPerms.Read = true + } + if b&vmaRealPermsWrite > 0 { + v.realPerms.Write = true + } + if b&vmaRealPermsExecute > 0 { + v.realPerms.Execute = true + } + if b&vmaEffectivePermsRead > 0 { + v.effectivePerms.Read = true + } + if b&vmaEffectivePermsWrite > 0 { + v.effectivePerms.Write = true + } + if b&vmaEffectivePermsExecute > 0 { + v.effectivePerms.Execute = true + } + if b&vmaMaxPermsRead > 0 { + v.maxPerms.Read = true + } + if b&vmaMaxPermsWrite > 0 { + v.maxPerms.Write = true + } + if b&vmaMaxPermsExecute > 0 { + v.maxPerms.Execute = true + } + if b&vmaPrivate > 0 { + v.private = true + } + if b&vmaGrowsDown > 0 { + v.growsDown = true + } +} + +// pma represents a platform mapping area. +type pma struct { + // file is the file mapped by this pma. Only pmas for which file == + // platform.Platform.Memory() may be saved. pmas hold a reference to the + // corresponding file range while they exist. + file platform.File `state:"nosave"` + + // off is the offset into file at which this pma begins. + off uint64 + + // vmaEffectivePerms and vmaMaxPerms are duplicated from the + // corresponding vma so that the IO implementation can avoid iterating + // mm.vmas when pmas already exist. + vmaEffectivePerms usermem.AccessType + vmaMaxPerms usermem.AccessType + + // needCOW is true if writes to the mapping must be propagated to a copy. + needCOW bool + + // private is true if this pma represents private memory. + // + // If private is true, file must be platform.Platform.Memory(), the pma + // holds a reference on the mapped memory that is tracked in privateRefs, + // and calls to Invalidate for which + // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. + // + // If private is false, this pma caches a translation from the + // corresponding vma's memmap.Mappable.Translate. + private bool + + // If internalMappings is not empty, it is the cached return value of + // file.MapInternal for the platform.FileRange mapped by this pma. + internalMappings safemem.BlockSeq `state:"nosave"` +} + +type privateRefs struct { + mu sync.Mutex `state:"nosave"` + + // refs maps offsets into Platform.Memory() to the number of pmas (or, + // equivalently, MemoryManagers) that share ownership of the memory at that + // offset. + refs fileRefcountSet +} + +type invalidateArgs struct { + ar usermem.AddrRange + opts memmap.InvalidateOpts +} + +// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. 
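+//
+// The refcounts record how many MemoryManagers share ownership of each range
+// of private memory: for example, after Fork copies a private pma, the
+// corresponding file range has a refcount of 2, and once one sharer unmaps it
+// or breaks copy-on-write the count returns to 1, letting the remaining owner
+// reclaim the pages instead of copying (see isPMACopyOnWriteLocked).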
+type fileRefcountSetFunctions struct{} + +func (fileRefcountSetFunctions) MinKey() uint64 { + return 0 +} + +func (fileRefcountSetFunctions) MaxKey() uint64 { + return ^uint64(0) +} + +func (fileRefcountSetFunctions) ClearValue(_ *int32) { +} + +func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { + return rc1, rc1 == rc2 +} + +func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { + return rc, rc +} diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go new file mode 100644 index 000000000..b47aa7263 --- /dev/null +++ b/pkg/sentry/mm/mm_test.go @@ -0,0 +1,174 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func testMemoryManager(ctx context.Context) *MemoryManager { + p := platform.FromContext(ctx) + mm := NewMemoryManager(p) + mm.layout = arch.MmapLayout{ + MinAddr: p.MinUserAddress(), + MaxAddr: p.MaxUserAddress(), + BottomUpBase: p.MinUserAddress(), + TopDownBase: p.MaxUserAddress(), + } + return mm +} + +func (mm *MemoryManager) realUsageAS() uint64 { + return uint64(mm.vmas.Span()) +} + +func TestUsageASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 2 * usermem.PageSize, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + realUsage := mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realUsage = mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } +} + +func TestBrkDataLimitUpdates(t *testing.T) { + limitSet := limits.NewLimitSet() + limitSet.Set(limits.Data, limits.Limit{}) // zero RLIMIT_DATA + + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + // Try to extend the brk by one page and expect doing so to fail. + oldBrk, _ := mm.Brk(ctx, 0) + if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk { + t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x", oldBrk, newBrk) + } +} + +// TestIOAfterUnmap ensures that IO fails after unmap. 
+func TestIOAfterUnmap(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.Read, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // IO works before munmap. + b := make([]byte, 1) + n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyIn got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyIn got %d want 1", n) + } + + err = mm.MUnmap(ctx, addr, usermem.PageSize) + if err != nil { + t.Fatalf("MUnmap got err %v want nil", err) + } + + n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyIn got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyIn got %d want 0", n) + } +} + +// TestIOAfterMProtect tests IO interaction with mprotect permissions. +func TestIOAfterMProtect(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // Writing works before mprotect. + b := make([]byte, 1) + n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } + + err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false) + if err != nil { + t.Errorf("MProtect got err %v want nil", err) + } + + // Without IgnorePermissions, CopyOut should no longer succeed. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyOut got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyOut got %d want 0", n) + } + + // With IgnorePermissions, CopyOut should succeed despite mprotect. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{ + IgnorePermissions: true, + }) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } +} diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go new file mode 100644 index 000000000..35e873762 --- /dev/null +++ b/pkg/sentry/mm/pma.go @@ -0,0 +1,928 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
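+
+// This file implements pmas, the "platform mapping areas" that back vmas with
+// ranges of platform.File memory (see the pma type in mm.go).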
+ +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type pmaOpts struct { + // If breakCOW is true, pmas must not be copy-on-write. + breakCOW bool +} + +// existingPMAsLocked checks that pmas exist for all addresses in ar, and +// support access of type (at, ignorePermissions). If so, it returns an +// iterator to the pma containing ar.Start. Otherwise it returns a terminal +// iterator. +// +// Preconditions: mm.activeMu must be locked. ar.Length() != 0. +func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) pmaIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + first := mm.pmas.FindSegment(ar.Start) + pseg := first + for pseg.Ok() { + pma := pseg.ValuePtr() + perms := pma.vmaEffectivePerms + if ignorePermissions { + perms = pma.vmaMaxPerms + } + if !perms.SupersetOf(at) { + // These are the vma's permissions, so the caller will get an error + // when they try to get new pmas. + return pmaIterator{} + } + if opts.breakCOW && pma.needCOW { + return pmaIterator{} + } + if needInternalMappings && pma.internalMappings.IsEmpty() { + return pmaIterator{} + } + + if ar.End <= pseg.End() { + return first + } + pseg, _ = pseg.NextNonEmpty() + } + + // Ran out of pmas before reaching ar.End. + return pmaIterator{} +} + +// existingVecPMAsLocked returns true if pmas exist for all addresses in ars, +// and support access of type (at, ignorePermissions). +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) bool { + for ; !ars.IsEmpty(); ars = ars.Tail() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, opts, needInternalMappings).Ok() { + return false + } + } + return true +} + +// getPMAsLocked ensures that pmas exist for all addresses in ar, subject to +// opts. It returns: +// +// - An iterator to the pma containing ar.Start. If no pma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist +// for all addresses in ar. 
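+//
+// Callers usually obtain vseg from a preceding getVMAsLocked call, as in
+// handleASIOFault (sketch, ignoring partial-success handling):
+//
+//	vseg, _, _ := mm.getVMAsLocked(ctx, ar, at, false)
+//	pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{breakCOW: at.Write})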
+func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, opts pmaOpts) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, vseg, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, perr + } + // ensurePMAsLocked may not have pstart due to iterator invalidation. We + // need it, either to return it immediately or to pass to + // breakCopyOnWriteLocked. + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + + var cowerr error + if opts.breakCOW { + var invalidated bool + pend, invalidated, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, cowerr + } + if invalidated { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + } + + if cowerr != nil { + return pstart, pend, cowerr + } + if perr != nil { + return pstart, pend, perr + } + return pstart, pend, alignerr +} + +// getVecPMAsLocked ensures that pmas exist for all addresses in ars. It +// returns the subset of ars for which pmas exist. If this is not equal to ars, +// it returns a non-nil error explaining why. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vmas must exist for all addresses in ars. +func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, opts pmaOpts) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, mm.vmas.FindSegment(ar.Start), ar) + if pend.Start() <= ar.Start { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + + var cowerr error + if opts.breakCOW { + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + pend, _, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + } + + if cowerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), cowerr + } + if perr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + if alignerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr + } + } + + return ars, nil +} + +// ensurePMAsLocked ensures that pmas exist for all addresses in ar. It returns: +// +// - An iterator to the pma containing ar.Start, on a best-effort basis (that +// is, the returned iterator may be terminal, even if such a pma exists). +// Returning this iterator on a best-effort basis allows callers that require +// it to use it when it's cheaply available, while also avoiding the overhead +// of retrieving it when it's not. +// +// - An iterator to the gap after the last pma containing an address in ar. 
If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. ar must be page-aligned. +// vseg.Range().Contains(ar.Start). vmas must exist for all addresses in ar. +func (mm *MemoryManager) ensurePMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + pstart, pgap := mm.pmas.Find(ar.Start) + if pstart.Ok() { + pgap = pstart.NextGap() + } + for pgap.Start() < ar.End { + if pgap.Range().Length() == 0 { + pgap = pgap.NextGap() + continue + } + // A single pgap might be spanned by multiple vmas. Insert pmas to + // cover the first (vma, pgap) pair. + pgapAR := pgap.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(pgapAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", pgapAR.Start)) + } + if pgapAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", pgapAR.Start, vseg.Start())) + } + } + var err error + pgap, err = mm.insertPMAsLocked(ctx, vseg, pgap, ar) + // insertPMAsLocked most likely invalidated iterators, so pstart is now + // unknown. + pstart = pmaIterator{} + if err != nil { + return pstart, pgap, err + } + } + return pstart, pgap, nil +} + +const ( + // When memory is allocated for a private pma, align the allocated address + // range to a privateAllocUnit boundary when possible. Larger values of + // privateAllocUnit may reduce page faults by allowing fewer, larger pmas + // to be mapped, but may result in larger amounts of wasted memory in the + // presence of fragmentation. privateAllocUnit must be a power-of-2 + // multiple of usermem.PageSize. + privateAllocUnit = usermem.HugePageSize + + privateAllocMask = privateAllocUnit - 1 +) + +func privateAligned(ar usermem.AddrRange) usermem.AddrRange { + aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End} + if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End { + aligned.End = end + } + if checkInvariants { + if !aligned.IsSupersetOf(ar) { + panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar)) + } + } + return aligned +} + +// insertPMAsLocked inserts pmas into pgap corresponding to the vma iterated by +// vseg, spanning at least ar. It returns: +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vseg.Range().Intersect(pgap.Range()).Intersect(ar).Length() != 0. +// ar must be page-aligned. 
+func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, pgap pmaGapIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + optAR := vseg.Range().Intersect(pgap.Range()) + if checkInvariants { + if optAR.Length() <= 0 { + panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) + } + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar %v", ar)) + } + } + vma := vseg.ValuePtr() + + // Private anonymous mappings get pmas by allocating. + if vma.mappable == nil { + // Limit the range we allocate to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + allocAR := optAR.Intersect(maskAR) + mem := mm.p.Memory() + fr, err := mem.Allocate(uint64(allocAR.Length()), usage.Anonymous) + if err != nil { + return pgap, err + } + mm.incPrivateRef(fr) + + if checkInvariants { + if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { + panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) + } + } + + mm.addRSSLocked(allocAR) + mem.IncRef(fr) + + return mm.pmas.Insert(pgap, allocAR, pma{ + file: mem, + off: fr.Start, + vmaEffectivePerms: vma.effectivePerms, + vmaMaxPerms: vma.maxPerms, + private: true, + // Since we just allocated this memory and have the only reference, + // the new pma does not need copy-on-write. + }).NextGap(), nil + } + + // Other mappings get pmas by translating. Limit the required range + // to ar. + optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := vma.maxPerms + if vma.private { + perms.Write = false + } + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v): %v", vma.mappable, reqMR, optMR, err)) + } + } + + // Install a pma for each Translation. + for _, t := range ts { + // This is valid because memmap.Mappable.Translate is required to + // return Translations in increasing Translation.Source order. + addrRange := vseg.addrRangeOf(t.Source) + mm.addRSSLocked(addrRange) + pseg := mm.pmas.Insert(pgap, addrRange, pma{ + file: t.File, + off: t.Offset, + vmaEffectivePerms: vma.effectivePerms, + vmaMaxPerms: vma.maxPerms, + needCOW: vma.private, + }) + // The new pseg may have been merged with existing segments, only take a + // ref on the inserted range. + t.File.IncRef(pseg.fileRangeOf(addrRange)) + pgap = pseg.NextGap() + } + + // Even if Translate returned an error, if we got to ar.End, + // insertPMAsLocked succeeded. + if ar.End <= pgap.Start() { + return pgap, nil + } + return pgap, err +} + +// breakCopyOnWriteLocked ensures that pmas in ar are not copy-on-write. It +// returns: +// +// - An iterator to the gap after the last non-COW pma containing an address in +// ar. If non-COW pmas exist for no addresses in ar, the iterator is to a gap +// that begins before ar.Start. +// +// - A boolean that is true if iterators into mm.pmas may have been +// invalidated. +// +// - An error that is non-nil if non-COW pmas exist for only a subset of ar. +// +// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. pseg.Range().Contains(ar.Start). pmas must exist for +// all addresses in ar. 
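+//
+// This is reached from getPMAsLocked/getVecPMAsLocked when pmaOpts.breakCOW
+// is set, i.e. when the pmas are being established for a write (the
+// "breakCOW: at.Write" callers in io.go).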
+func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, bool, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + // Limit the range we copy to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + var invalidatedIterators, didUnmapAS bool + mem := mm.p.Memory() + for { + if mm.isPMACopyOnWriteLocked(pseg) { + // Determine the range to copy. + copyAR := pseg.Range().Intersect(maskAR) + + // Get internal mappings from the pma to copy from. + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), invalidatedIterators, err + } + + // Copy contents. + fr, err := platform.AllocateAndFill(mem, uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + if _, ok := err.(safecopy.BusError); ok { + // If we got SIGBUS during the copy, deliver SIGBUS to + // userspace (instead of SIGSEGV) if we're breaking + // copy-on-write due to application page fault. + err = &memmap.BusError{err} + } + if fr.Length() == 0 { + return pseg.PrevGap(), invalidatedIterators, err + } + mm.incPrivateRef(fr) + mem.IncRef(fr) + + // Unmap all of maskAR, not just copyAR, to minimize host syscalls. + // AddressSpace mappings must be removed before mm.decPrivateRef(). + if !didUnmapAS { + mm.unmapASLocked(maskAR) + didUnmapAS = true + } + + // Replace the pma with a copy in the part of the address range + // where copying was successful. + copyAR.End = copyAR.Start + usermem.Addr(fr.Length()) + if copyAR != pseg.Range() { + pseg = mm.pmas.Isolate(pseg, copyAR) + invalidatedIterators = true + } + pma := pseg.ValuePtr() + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + + pma.file = mem + pma.off = fr.Start + pma.private = true + pma.needCOW = false + pma.internalMappings = safemem.BlockSeq{} + + // Try to merge pma with its neighbors. + if prev := pseg.PrevSegment(); prev.Ok() { + if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { + pseg = merged + invalidatedIterators = true + } + } + if next := pseg.NextSegment(); next.Ok() { + if merged := mm.pmas.Merge(pseg, next); merged.Ok() { + pseg = merged + invalidatedIterators = true + } + } + + // If an error occurred after ar.End, breakCopyOnWriteLocked still + // did its job, so discard the error. + if err != nil && pseg.End() < ar.End { + return pseg.NextGap(), invalidatedIterators, err + } + } + // This checks against ar.End, not maskAR.End, so we will never break + // COW on a pma that does not intersect ar. + if ar.End <= pseg.End() { + return pseg.NextGap(), invalidatedIterators, nil + } + pseg = pseg.NextSegment() + } +} + +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) isPMACopyOnWriteLocked(pseg pmaIterator) bool { + pma := pseg.ValuePtr() + if !pma.needCOW { + return false + } + if !pma.private { + return true + } + // If we have the only reference on private memory to be copied, just take + // ownership of it instead of copying. If we do hold the only reference, + // additional references can only be taken by mm.Fork(), which is excluded + // by mm.activeMu, so this isn't racy. 
+ mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + fr := pseg.fileRange() + // This check relies on mm.privateRefs.refs being kept fully merged. + rseg := mm.privateRefs.refs.FindSegment(fr.Start) + if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() { + pma.needCOW = false + return false + } + return true +} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + if mm.captureInvalidations { + mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts}) + return + } + mm.invalidateLocked(ar, opts.InvalidatePrivate, true) +} + +// invalidateLocked removes pmas and AddressSpace mappings of those pmas for +// addresses in ar. +// +// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) { + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + mm.removeRSSLocked(pseg.Range()) + pma.file.DecRef(pseg.fileRange()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } else { + pseg = pseg.NextSegment() + } + } +} + +// movePMAsLocked moves all pmas in oldAR to newAR. +// +// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. +// oldAR.Length() == newAR.Length(). !oldAR.Overlaps(newAR). +// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. +func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { + if checkInvariants { + if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) + } + if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + panic(fmt.Sprintf("invalid newAR: %v", newAR)) + } + if oldAR.Length() != newAR.Length() { + panic(fmt.Sprintf("old and new address ranges have different lengths: %v, %v", oldAR, newAR)) + } + if oldAR.Overlaps(newAR) { + panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) + } + // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. 
+ } + + type movedPMA struct { + oldAR usermem.AddrRange + pma pma + } + var movedPMAs []movedPMA + pseg := mm.pmas.LowerBoundSegment(oldAR.Start) + for pseg.Ok() && pseg.Start() < oldAR.End { + pseg = mm.pmas.Isolate(pseg, oldAR) + movedPMAs = append(movedPMAs, movedPMA{ + oldAR: pseg.Range(), + pma: pseg.Value(), + }) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } + + off := newAR.Start - oldAR.Start + pgap := mm.pmas.FindGap(newAR.Start) + for i := range movedPMAs { + mpma := &movedPMAs[i] + pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} + mm.addRSSLocked(pmaNewAR) + pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() + } + + mm.unmapASLocked(oldAR) +} + +// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have +// cached internal mappings. It returns: +// +// - An iterator to the gap after the last pma with internal mappings +// containing an address in ar. If internal mappings exist for no addresses in +// ar, the iterator is to a gap that begins before ar.Start. +// +// - An error that is non-nil if internal mappings exist for only a subset of +// ar. +// +// Preconditions: mm.activeMu must be locked for writing. +// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar. +// ar.Length() != 0. +// +// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + for { + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), err + } + if ar.End <= pseg.End() { + return pseg.NextGap(), nil + } + pseg, _ = pseg.NextNonEmpty() + } +} + +// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars +// have cached internal mappings. It returns the subset of ars for which +// internal mappings exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.activeMu must be locked for writing. pmas must exist for +// all addresses in ar. +// +// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err + } + } + return ars, nil +} + +// internalMappingsLocked returns internal mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ar. ar.Length() != 0. +// pseg.Range().Contains(ar.Start). 
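+//
+// The result is typically passed straight to an I/O callback, as in
+// withInternalMappings:
+//
+//	n, err := f(mm.internalMappingsLocked(pseg, ar))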
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + if ar.End <= pseg.End() { + // Since only one pma is involved, we can use pma.internalMappings + // directly, avoiding a slice allocation. + offset := uint64(ar.Start - pseg.Start()) + return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) + } + + var ims []safemem.Block + for { + pr := pseg.Range().Intersect(ar) + for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + if ar.End <= pseg.End() { + break + } + pseg = pseg.NextSegment() + } + return safemem.BlockSeqFromSlice(ims) +} + +// vecInternalMappingsLocked returns internal mappings for addresses in ars. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ars. +func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { + var ims []safemem.Block + for ; !ars.IsEmpty(); ars = ars.Tail() { + ar := ars.Head() + if ar.Length() == 0 { + continue + } + for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + } + return safemem.BlockSeqFromSlice(ims) +} + +// incPrivateRef acquires a reference on private pages in fr. +func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { + mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + refSet := &mm.privateRefs.refs + seg, gap := refSet.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refSet.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty() + default: + refSet.MergeAdjacent(fr) + return + } + } +} + +// decPrivateRef releases a reference on private pages in fr. +func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { + var freed []platform.FileRange + + mm.privateRefs.mu.Lock() + refSet := &mm.privateRefs.refs + seg := refSet.LowerBoundSegment(fr.Start) + for seg.Ok() && seg.Start() < fr.End { + seg = refSet.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + freed = append(freed, seg.Range()) + seg = refSet.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refSet.MergeAdjacent(fr) + mm.privateRefs.mu.Unlock() + + mem := mm.p.Memory() + for _, fr := range freed { + mem.DecRef(fr) + } +} + +// addRSSLocked updates the current and maximum resident set size of a +// MemoryManager to reflect the insertion of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) { + mm.curRSS += uint64(ar.Length()) + if mm.curRSS > mm.maxRSS { + mm.maxRSS = mm.curRSS + } +} + +// removeRSSLocked updates the current resident set size of a MemoryManager to +// reflect the removal of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. 
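+//
+// Note that maxRSS is a high-water mark and is deliberately left unchanged
+// here.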
+func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) { + mm.curRSS -= uint64(ar.Length()) +} + +// pmaSetFunctions implements segment.Functions for pmaSet. +type pmaSetFunctions struct{} + +func (pmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (pmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (pmaSetFunctions) ClearValue(pma *pma) { + pma.file = nil + pma.internalMappings = safemem.BlockSeq{} +} + +func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) { + if pma1.file != pma2.file || + pma1.off+uint64(ar1.Length()) != pma2.off || + pma1.vmaEffectivePerms != pma2.vmaEffectivePerms || + pma1.vmaMaxPerms != pma2.vmaMaxPerms || + pma1.needCOW != pma2.needCOW || + pma1.private != pma2.private { + return pma{}, false + } + + // Discard internal mappings instead of trying to merge them, since merging + // them requires an allocation and getting them again from the + // platform.File might not. + pma1.internalMappings = safemem.BlockSeq{} + return pma1, true +} + +func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) { + newlen1 := uint64(split - ar.Start) + p2 := p + p2.off += newlen1 + if !p.internalMappings.IsEmpty() { + p.internalMappings = p.internalMappings.TakeFirst64(newlen1) + p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) + } + return p, p2 +} + +// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do +// so by scanning linearly backward from pgap. +// +// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). +func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { + if checkInvariants { + if !pgap.Ok() { + panic("terminal pma iterator") + } + if addr > pgap.Start() { + panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) + } + } + // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, + // which is the case if findOrSeekPrevUpperBoundPMA is called to find the + // start of a range containing only a single PMA. + if pseg := pgap.PrevSegment(); pseg.Start() <= addr { + return pseg + } + return mm.pmas.UpperBoundSegment(addr) +} + +// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is +// non-empty. +// +// Preconditions: mm.activeMu must be locked for writing. +func (pseg pmaIterator) getInternalMappingsLocked() error { + pma := pseg.ValuePtr() + if pma.internalMappings.IsEmpty() { + // Internal mappings are used for ignorePermissions accesses, + // so we need to use vma.maxPerms instead of + // vma.effectivePerms. However, we will never execute + // application code through an internal mapping, and we don't + // actually need a writable mapping if copy-on-write is in + // effect. (But get a writable mapping anyway if the pma is + // private, so that if breakCopyOnWriteLocked => + // isPMACopyOnWriteLocked takes ownership of the pma instead of + // copying, it doesn't need to get a new mapping.) + perms := pma.vmaMaxPerms + perms.Execute = false + if pma.needCOW && !pma.private { + perms.Write = false + } + ims, err := pma.file.MapInternal(pseg.fileRange(), perms) + if err != nil { + return err + } + pma.internalMappings = ims + } + return nil +} + +func (pseg pmaIterator) fileRange() platform.FileRange { + return pseg.fileRangeOf(pseg.Range()) +} + +// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. 
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { + if checkInvariants { + if !pseg.Ok() { + panic("terminal pma iterator") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) + } + } + + pma := pseg.ValuePtr() + pstart := pseg.Start() + return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} +} diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go new file mode 100644 index 000000000..5840b257c --- /dev/null +++ b/pkg/sentry/mm/proc_pid_maps.go @@ -0,0 +1,105 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // devMinorBits is the number of minor bits in a device number. Linux: + // include/linux/kdev_t.h:MINORBITS + devMinorBits = 20 +) + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (mm *MemoryManager) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData is called by fs/proc.mapsData.ReadSeqFileData. +func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaMapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by +// vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + vma := vseg.ValuePtr() + private := "p" + if !vma.private { + private = "s" + } + + var dev, ino uint64 + if vma.id != nil { + dev = vma.id.DeviceID() + ino = vma.id.InodeID() + } + devMajor := uint32(dev >> devMinorBits) + devMinor := uint32(dev & ((1 << devMinorBits) - 1)) + + var b bytes.Buffer + // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => + // stack_guard_page_start(). + fmt.Fprintf(&b, "%08x-%08x %s%s %08x %02x:%02x %d ", + vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) + + // Figure out our filename or hint. 
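	// [Editor's note: with example values only, a typical /proc/[pid]/maps
	// entry produced by this format looks like:
	//   00400000-00452000 r-xp 00000000 08:02 173521    /usr/bin/dbus-daemon
	// where the final column is the filename or hint chosen below.]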
+ var s string + if vma.hint != "" { + s = vma.hint + } else if vma.id != nil { + // FIXME: We are holding mm.mappingMu here, which is + // consistent with Linux's holding mmap_sem in + // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). + // However, it's not clear that fs.File.MappedName() is actually + // consistent with this lock order. + s = vma.id.MappedName(ctx) + } + if s != "" { + // Per linux, we pad until the 74th character. + if pad := 73 - b.Len(); pad > 0 { + b.WriteString(strings.Repeat(" ", pad)) + } + b.WriteString(s) + } + b.WriteString("\n") + return b.Bytes() +} diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go new file mode 100644 index 000000000..36fed8f1c --- /dev/null +++ b/pkg/sentry/mm/save_restore.go @@ -0,0 +1,57 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all +// Mappables mapped by mm. +func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vma := vseg.ValuePtr(); vma.mappable != nil { + if err := vma.mappable.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +// beforeSave is invoked by stateify. +func (mm *MemoryManager) beforeSave() { + mem := mm.p.Memory() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + if pma := pseg.ValuePtr(); pma.file != mem { + // InvalidateUnsavable should have caused all such pmas to be + // invalidated. + panic(fmt.Sprintf("Can't save pma %#v with non-Memory file of type %T:\n%s", pseg.Range(), pma.file, mm)) + } + } +} + +// afterLoad is invoked by stateify. +func (mm *MemoryManager) afterLoad() { + mm.haveASIO = mm.p.SupportsAddressSpaceIO() + mem := mm.p.Memory() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + pseg.ValuePtr().file = mem + } +} diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go new file mode 100644 index 000000000..9d3614034 --- /dev/null +++ b/pkg/sentry/mm/special_mappable.go @@ -0,0 +1,147 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with +// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except +// that SpecialMappable takes ownership of the memory that it represents +// (_install_special_mapping() does not.) +type SpecialMappable struct { + refs.AtomicRefCount + + p platform.Platform + fr platform.FileRange + name string +} + +// NewSpecialMappable returns a SpecialMappable that owns fr, which represents +// offsets in p.Memory() that contain the SpecialMappable's data. The +// SpecialMappable will use the given name in /proc/[pid]/maps. +// +// Preconditions: fr.Length() != 0. +func NewSpecialMappable(name string, p platform.Platform, fr platform.FileRange) *SpecialMappable { + return &SpecialMappable{p: p, fr: fr, name: name} +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *SpecialMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.p.Memory().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *SpecialMappable) MappedName(ctx context.Context) string { + return m.name +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *SpecialMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *SpecialMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: vm_file is NULL, causing msync to skip it entirely. + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *SpecialMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *SpecialMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *SpecialMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.p.Memory(), + Offset: m.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { + // Since data is stored in platform.Platform.Memory(), the contents of + // which are preserved across save/restore, we don't need to do anything. + return nil +} + +// Platform returns the Platform whose Memory stores the SpecialMappable's +// contents. 
+func (m *SpecialMappable) Platform() platform.Platform { + return m.p +} + +// FileRange returns the offsets into Platform().Memory() that store the +// SpecialMappable's contents. +func (m *SpecialMappable) FileRange() platform.FileRange { + return m.fr +} + +// Length returns the length of the SpecialMappable. +func (m *SpecialMappable) Length() uint64 { + return m.fr.Length() +} + +// NewSharedAnonMappable returns a SpecialMappable that implements the +// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. +// +// TODO: The use of SpecialMappable is a lazy code reuse hack. Linux +// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should +// do the same to get non-zero device and inode IDs. +func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) { + if length == 0 || length != uint64(usermem.Addr(length).RoundDown()) { + return nil, syserror.EINVAL + } + fr, err := p.Memory().Allocate(length, usage.Anonymous) + if err != nil { + return nil, err + } + return NewSpecialMappable("/dev/zero (deleted)", p, fr), nil +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go new file mode 100644 index 000000000..0730be65b --- /dev/null +++ b/pkg/sentry/mm/syscalls.go @@ -0,0 +1,794 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + mrand "math/rand" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// HandleUserFault handles an application page fault. sp is the faulting +// application thread's stack pointer. +// +// Preconditions: mm.as != nil. +func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error { + ar, ok := addr.RoundDown().ToRange(usermem.PageSize) + if !ok { + return syserror.EFAULT + } + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have a usable vma. Here and below, since we are only + // asking for a single page, there is no possibility of partial success, + // and any error is immediately fatal. + mm.mappingMu.RLock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false) + if err != nil { + mm.mappingMu.RUnlock() + return err + } + + // Ensure that we have a usable pma. + mm.activeMu.Lock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return err + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // Map the faulted page into the active AddressSpace. 
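	// [Editor's note: the final argument to mapASLocked is the precommit
	// flag (compare populateAndUnlock below); the fault path passes false,
	// since only this single faulted page needs to be mapped.]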
+ err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return err +} + +// MMap establishes a memory mapping. +func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) { + if opts.Length == 0 { + return 0, syserror.EINVAL + } + length, ok := usermem.Addr(opts.Length).RoundUp() + if !ok { + return 0, syserror.ENOMEM + } + opts.Length = uint64(length) + + if opts.Mappable != nil { + // Offset must be aligned. + if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) { + return 0, syserror.EINVAL + } + // Offset + length must not overflow. + if end := opts.Offset + opts.Length; end < opts.Offset { + return 0, syserror.ENOMEM + } + } else { + opts.Offset = 0 + if !opts.Private { + if opts.MappingIdentity != nil { + return 0, syserror.EINVAL + } + m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + if err != nil { + return 0, err + } + opts.MappingIdentity = m + opts.Mappable = m + } + } + + if opts.Addr.RoundDown() != opts.Addr { + // MAP_FIXED requires addr to be page-aligned; non-fixed mappings + // don't. + if opts.Fixed { + return 0, syserror.EINVAL + } + opts.Addr = opts.Addr.RoundDown() + } + + if !opts.MaxPerms.SupersetOf(opts.Perms) { + return 0, syserror.EACCES + } + if opts.Unmap && !opts.Fixed { + return 0, syserror.EINVAL + } + if opts.GrowsDown && opts.Mappable != nil { + return 0, syserror.EINVAL + } + + // Get the new vma. + mm.mappingMu.Lock() + vseg, ar, err := mm.createVMALocked(ctx, opts) + if err != nil { + mm.mappingMu.Unlock() + return 0, err + } + + switch { + case opts.Precommit: + // Get pmas and map with precommit as requested. + mm.populateAndUnlock(ctx, vseg, ar, true) + + case opts.Mappable == nil && length <= privateAllocUnit: + // NOTE: Get pmas and map eagerly in the hope + // that doing so will save on future page faults. We only do this for + // anonymous mappings, since otherwise the cost of + // memmap.Mappable.Translate is unknown; and only for small mappings, + // to avoid needing to allocate large amounts of memory that we may + // subsequently need to checkpoint. + mm.populateAndUnlock(ctx, vseg, ar, false) + + default: + mm.mappingMu.Unlock() + } + + return ar.Start, nil +} + +// Preconditions: mm.mappingMu must be locked for writing. +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux doesn't populate inaccessible pages. See + // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + // Even if we get a new pma, we can't actually map it if we don't have an + // AddressSpace. + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from + // mm/gup.c:mm_populate(). If it matters, we'll get it again when + // userspace actually tries to use the failing page. + mm.activeMu.Unlock() + return + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // As above, errors are silently ignored. + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + +// MapStack allocates the initial process stack. 
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { + // maxStackSize is the maximum supported process stack size in bytes. + // + // This limit exists because stack growing isn't implemented, so the entire + // process stack must be mapped up-front. + const maxStackSize = 128 << 20 + + stackSize := limits.FromContext(ctx).Get(limits.Stack) + r, ok := usermem.Addr(stackSize.Cur).RoundUp() + sz := uint64(r) + if !ok { + // RLIM_INFINITY rounds up to 0. + sz = linux.DefaultStackSoftLimit + } else if sz > maxStackSize { + ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) + sz = maxStackSize + } else if sz == 0 { + return usermem.AddrRange{}, syserror.ENOMEM + } + szaddr := usermem.Addr(sz) + ctx.Debugf("Allocating stack with size of %v bytes", sz) + + // Determine the stack's desired location. Unlike Linux, address + // randomization can't be disabled. + stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() + if stackEnd < szaddr { + return usermem.AddrRange{}, syserror.ENOMEM + } + stackStart := stackEnd - szaddr + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: sz, + Addr: stackStart, + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + GrowsDown: true, + Hint: "[stack]", + }) + return ar, err +} + +// MUnmap implements the semantics of Linux's munmap(2). +func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return syserror.EINVAL + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.EINVAL + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + mm.unmapLocked(ctx, ar) + return nil +} + +// MRemapOpts specifies options to MRemap. +type MRemapOpts struct { + // Move controls whether MRemap moves the remapped mapping to a new address. + Move MRemapMoveMode + + // NewAddr is the new address for the remapping. NewAddr is ignored unless + // Move is MMRemapMustMove. + NewAddr usermem.Addr +} + +// MRemapMoveMode controls MRemap's moving behavior. +type MRemapMoveMode int + +const ( + // MRemapNoMove prevents MRemap from moving the remapped mapping. + MRemapNoMove MRemapMoveMode = iota + + // MRemapMayMove allows MRemap to move the remapped mapping. + MRemapMayMove + + // MRemapMustMove requires MRemap to move the remapped mapping to + // MRemapOpts.NewAddr, replacing any existing mappings in the remapped + // range. + MRemapMustMove +) + +// MRemap implements the semantics of Linux's mremap(2). +func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) { + // "Note that old_address has to be page aligned." - mremap(2) + if oldAddr.RoundDown() != oldAddr { + return 0, syserror.EINVAL + } + + // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a + // valid size. However, new_size can't be 0 after rounding. 
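	// [Editor's note: concretely, an oldSize of 0 (or one whose round-up
	// overflows) is kept as 0, while a newSize of 0, or one whose round-up
	// overflows, fails with EINVAL just below.]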
+ oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp() + oldSize = uint64(oldSizeAddr) + newSizeAddr, ok := usermem.Addr(newSize).RoundUp() + if !ok || newSizeAddr == 0 { + return 0, syserror.EINVAL + } + newSize = uint64(newSizeAddr) + + oldEnd, ok := oldAddr.AddLength(oldSize) + if !ok { + return 0, syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // All cases require that a vma exists at oldAddr. + vseg := mm.vmas.FindSegment(oldAddr) + if !vseg.Ok() { + return 0, syserror.EFAULT + } + + if opts.Move != MRemapMustMove { + // Handle noops and in-place shrinking. These cases don't care if + // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all + // (aside from oldAddr). + if newSize <= oldSize { + if newSize < oldSize { + // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't + // either. + newEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd}) + } + return oldAddr, nil + } + + // Handle in-place growing. + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + // "Grow" the existing vma by creating a new mergeable one. + vma := vseg.ValuePtr() + var newOffset uint64 + if vma.mappable != nil { + newOffset = vseg.mappableRange().End + } + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: newSize - oldSize, + MappingIdentity: vma.id, + Mappable: vma.mappable, + Offset: newOffset, + Addr: oldEnd, + Fixed: true, + Perms: vma.realPerms, + MaxPerms: vma.maxPerms, + Private: vma.private, + GrowsDown: vma.growsDown, + Hint: vma.hint, + }) + if err == nil { + return oldAddr, nil + } + // In-place growth failed. In the MRemapMayMove case, fall through to + // moving below. + if opts.Move == MRemapNoMove { + return 0, err + } + } + + // Handle moving, which is the only remaining case. + + // Find a destination for the move. + var newAR usermem.AddrRange + switch opts.Move { + case MRemapMayMove: + newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) + if err != nil { + return 0, err + } + newAR, _ = newAddr.ToRange(newSize) + + case MRemapMustMove: + newAddr := opts.NewAddr + if newAddr.RoundDown() != newAddr { + return 0, syserror.EINVAL + } + var ok bool + newAR, ok = newAddr.ToRange(newSize) + if !ok { + return 0, syserror.EINVAL + } + if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { + return 0, syserror.EINVAL + } + + // Unmap any mappings at the destination. + mm.unmapLocked(ctx, newAR) + + // If the sizes specify shrinking, unmap everything between the new and + // old sizes at the source. + if newSize < oldSize { + oldNewEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd}) + oldEnd = oldNewEnd + } + + // unmapLocked may have invalidated vseg; look it up again. + vseg = mm.vmas.FindSegment(oldAddr) + } + + oldAR := usermem.AddrRange{oldAddr, oldEnd} + + // In the MRemapMustMove case, these checks happen after unmapping: + // mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize(). + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return 0, syserror.ENOMEM + } + + if vma := vseg.ValuePtr(); vma.mappable != nil { + // Check that offset+length does not overflow. 
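		// [Editor's note: the check below uses the standard unsigned-overflow
		// idiom: for uint64 values, a+b < a holds exactly when a+b wrapped
		// around.]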
+ if vma.off+uint64(newAR.Length()) < vma.off { + return 0, syserror.EINVAL + } + // Inform the Mappable, if any, of the copied mapping. + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil { + return 0, err + } + } + + // Remove the existing vma before inserting the new one to minimize + // iterator invalidation. We do this directly (instead of calling + // removeVMAsLocked) because: + // + // 1. We can't drop the reference on vma.id, which will be transferred to + // the new vma. + // + // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at + // oldAR, so calling RemoveMapping could cause us to miss an invalidation + // overlapping oldAR. + // + // Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of + // the vma. + vseg = mm.vmas.Isolate(vseg, oldAR) + vma := vseg.Value() + mm.vmas.Remove(vseg) + + // Insert the new vma, transferring the reference on vma.id. + mm.vmas.Add(newAR, vma) + + // Move pmas. This is technically optional for non-private pmas, which + // could just go through memmap.Mappable.Translate again, but it's required + // for private pmas. + mm.activeMu.Lock() + mm.movePMAsLocked(oldAR, newAR) + mm.activeMu.Unlock() + + // Now that pmas have been moved to newAR, we can notify vma.mappable that + // oldAR is no longer mapped. + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off) + } + + return newAR.Start, nil +} + +// MProtect implements the semantics of Linux's mprotect(2). +func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error { + if addr.RoundDown() != addr { + return syserror.EINVAL + } + if length == 0 { + return nil + } + rlength, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(rlength)) + if !ok { + return syserror.ENOMEM + } + effectivePerms := realPerms.Effective() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Non-growsDown mprotect requires that all of ar is mapped, and stops at + // the first non-empty gap. growsDown mprotect requires that the first vma + // be growsDown, but does not require it to extend all the way to ar.Start; + // vmas after the first must be contiguous but need not be growsDown, like + // the non-growsDown case. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + return syserror.ENOMEM + } + if growsDown { + if !vseg.ValuePtr().growsDown { + return syserror.EINVAL + } + if ar.End <= vseg.Start() { + return syserror.ENOMEM + } + ar.Start = vseg.Start() + } else { + if ar.Start < vseg.Start() { + return syserror.ENOMEM + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + mm.pmas.MergeRange(ar) + mm.pmas.MergeAdjacent(ar) + }() + pseg := mm.pmas.LowerBoundSegment(ar.Start) + var didUnmapAS bool + for { + // Check for permission validity before splitting vmas, for consistency + // with Linux. + if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { + return syserror.EACCES + } + vseg = mm.vmas.Isolate(vseg, ar) + + // Update vma permissions. + vma := vseg.ValuePtr() + vma.realPerms = realPerms + vma.effectivePerms = effectivePerms + + // Propagate vma permission changes to pmas. 
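		// [Editor's note: if the new permissions are weaker than what a pma
		// was previously mapped with, the loop below also unmaps ar from the
		// AddressSpace, so that later faults re-map it with the reduced
		// permissions.]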
+ for pseg.Ok() && pseg.Start() < vseg.End() { + if pseg.Range().Overlaps(vseg.Range()) { + pseg = mm.pmas.Isolate(pseg, vseg.Range()) + if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS { + // Unmap all of ar, not just vseg.Range(), to minimize host + // syscalls. + mm.unmapASLocked(ar) + didUnmapAS = true + } + pseg.ValuePtr().vmaEffectivePerms = effectivePerms + } + pseg = pseg.NextSegment() + } + + // Continue to the next vma. + if ar.End <= vseg.End() { + return nil + } + vseg, _ = vseg.NextNonEmpty() + if !vseg.Ok() { + return syserror.ENOMEM + } + } +} + +// BrkSetup sets mm's brk address to addr and its brk size to 0. +func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Unmap the existing brk. + if mm.brk.Length() != 0 { + mm.unmapLocked(ctx, mm.brk) + } + mm.brk = usermem.AddrRange{addr, addr} +} + +// Brk implements the semantics of Linux's brk(2), except that it returns an +// error on failure. +func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + if addr < mm.brk.Start { + return mm.brk.End, syserror.EINVAL + } + + // TODO: This enforces RLIMIT_DATA, but is slightly more + // permissive than the usual data limit. In particular, this only + // limits the size of the heap; a true RLIMIT_DATA limits the size of + // heap + data + bss. The segment sizes need to be plumbed from the + // loader package to fully enforce RLIMIT_DATA. + if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + return mm.brk.End, syserror.ENOMEM + } + + oldbrkpg, _ := mm.brk.End.RoundUp() + newbrkpg, ok := addr.RoundUp() + if !ok { + return mm.brk.End, syserror.EFAULT + } + + switch { + case newbrkpg < oldbrkpg: + mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + + case oldbrkpg < newbrkpg: + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: uint64(newbrkpg - oldbrkpg), + Addr: oldbrkpg, + Fixed: true, + // Compare Linux's + // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + Hint: "[heap]", + }) + if err != nil { + return mm.brk.End, err + } + } + + mm.brk.End = addr + return addr, nil +} + +// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). +func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + + // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() + // is analogous to our mm.invalidateLocked(ar, true, true). We inline this + // here, with the special case that we synchronously decommit + // uniquely-owned (non-copy-on-write) pages for private anonymous vma, + // which is the common case for MADV_DONTNEED. Invalidating these pmas, and + // allowing them to be reallocated when touched again, increases pma + // fragmentation, which may significantly reduce performance for + // non-vectored I/O implementations. Also, decommitting synchronously + // ensures that Decommit immediately reduces host memory usage. 
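	// [Editor's note, summarizing the two paths below: pages backed by
	// private, uniquely-owned pmas of anonymous vmas are decommitted in
	// place via mem.Decommit; all other pmas (and any decommit failure)
	// fall through to invalidation, which drops the pma and releases its
	// file reference.]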
+ var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) + mem := mm.p.Memory() + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) + } + } + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. + } + } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() + } + + // "If there are some parts of the specified address space that are not + // mapped, the Linux version of madvise() ignores them and applies the call + // to the rest (but returns ENOMEM from the system call, as it should)." - + // madvise(2) + if mm.vmas.SpanRange(ar) != ar.Length() { + return syserror.ENOMEM + } + return nil +} + +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.ENOMEM + } + + mm.mappingMu.RLock() + // Can't defer mm.mappingMu.RUnlock(); see below. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + var unmapped bool + lastEnd := ar.Start + for { + if !vseg.Ok() { + mm.mappingMu.RUnlock() + unmapped = true + break + } + if lastEnd < vseg.Start() { + unmapped = true + } + lastEnd = vseg.End() + vma := vseg.ValuePtr() + // It's only possible to have dirtied the Mappable through a shared + // mapping. Don't check if the mapping is writable, because mprotect + // may have changed this, and also because Linux doesn't. + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + // We can't call memmap.MappingIdentity.Msync while holding + // mm.mappingMu since it may take fs locks that precede it in the + // lock order. + id.IncRef() + mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) + mm.mappingMu.RUnlock() + err := id.Msync(ctx, mr) + id.DecRef() + if err != nil { + return err + } + if lastEnd >= ar.End { + break + } + mm.mappingMu.RLock() + vseg = mm.vmas.LowerBoundSegment(lastEnd) + } else { + if lastEnd >= ar.End { + mm.mappingMu.RUnlock() + break + } + vseg = vseg.NextSegment() + } + } + + if unmapped { + return syserror.ENOMEM + } + return nil +} + +// VirtualMemorySize returns the combined length in bytes of all mappings in +// mm. +func (mm *MemoryManager) VirtualMemorySize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.usageAS) +} + +// VirtualMemorySizeRange returns the combined length in bytes of all mappings +// in ar in mm. 
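// [Editor's note: the result counts only the mapped bytes within ar
// (mm.vmas.SpanRange(ar)), so it can be smaller than ar.Length() when parts
// of ar are unmapped.]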
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.vmas.SpanRange(ar)) +} + +// ResidentSetSize returns the value advertised as mm's RSS in bytes. +func (mm *MemoryManager) ResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.curRSS) +} + +// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. +func (mm *MemoryManager) MaxResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.maxRSS) +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go new file mode 100644 index 000000000..b6af48cb7 --- /dev/null +++ b/pkg/sentry/mm/vma.go @@ -0,0 +1,476 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Preconditions: mm.mappingMu must be locked for writing. opts must be valid +// as defined by the checks in MMap. +func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { + if opts.MaxPerms != opts.MaxPerms.Effective() { + panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) + } + + // Find a useable range. + addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + }) + if err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + ar, _ := addr.ToRange(opts.Length) + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS + opts.Length + if opts.Unmap { + newUsageAS -= uint64(mm.vmas.SpanRange(ar)) + } + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM + } + + // Remove overwritten mappings. This ordering is consistent with Linux: + // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), + // file->f_op->mmap(). + var vgap vmaGapIterator + if opts.Unmap { + vgap = mm.unmapLocked(ctx, ar) + } else { + vgap = mm.vmas.FindGap(ar.Start) + } + + // Inform the Mappable, if any, of the new mapping. + if opts.Mappable != nil { + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset); err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + } + + // Take a reference on opts.MappingIdentity before inserting the vma since + // vma merging can drop the reference. + if opts.MappingIdentity != nil { + opts.MappingIdentity.IncRef() + } + + // Finally insert the vma. 
+ vseg := mm.vmas.Insert(vgap, ar, vma{ + mappable: opts.Mappable, + off: opts.Offset, + realPerms: opts.Perms, + effectivePerms: opts.Perms.Effective(), + maxPerms: opts.MaxPerms, + private: opts.Private, + growsDown: opts.GrowsDown, + id: opts.MappingIdentity, + hint: opts.Hint, + }) + mm.usageAS += opts.Length + + return vseg, ar, nil +} + +type findAvailableOpts struct { + // Addr is a suggested address. Addr must be page-aligned. + Addr usermem.Addr + + // Fixed is true if only the suggested address is acceptable. + Fixed bool + + // Unmap is true if existing vmas and guard pages may exist in the returned + // range. + Unmap bool +} + +// findAvailableLocked finds an allocatable range. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + // Does the provided suggestion work? + if ar, ok := opts.Addr.ToRange(length); ok { + if mm.applicationAddrRange().IsSupersetOf(ar) { + if opts.Unmap { + return ar.Start, nil + } + // Check for the presence of an existing vma or guard page. + if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { + return ar.Start, nil + } + } + } + + // Fixed mappings accept only the requested address. + if opts.Fixed { + return 0, syserror.ENOMEM + } + + // Prefer hugepage alignment if a hugepage or more is requested. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + if mm.layout.DefaultDirection == arch.MmapBottomUp { + return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) + } + return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) +} + +func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { + return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift up to match the alignment? + if offset := uint64(gr.Start) % alignment; offset != 0 { + if uint64(gr.Length()) >= length+alignment-offset { + // Yes, we're aligned. + return gr.Start + usermem.Addr(alignment-offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return gr.Start, nil + } + } + return 0, syserror.ENOMEM +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift down to match the alignment? + start := gr.End - usermem.Addr(length) + if offset := uint64(start) % alignment; offset != 0 { + if gr.Start <= start-usermem.Addr(offset) { + // Yes, we're aligned. + return start - usermem.Addr(offset), nil + } + } + + // Either aligned perfectly, or can't align it. 
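			// [Editor's worked example for the branch above, with assumed
			// values: length = alignment = 0x200000 (one huge page) and
			// gr.End = 0x2345000 give start = 0x2145000 and offset = 0x145000,
			// so the aligned candidate is start - offset = 0x2000000, which is
			// returned if gr.Start <= 0x2000000.]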
+			return start, nil
+		}
+	}
+	return 0, syserror.ENOMEM
+}
+
+// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
+// access of type (at, ignorePermissions). It returns:
+//
+// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last vma containing an address in ar. If
+// vmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if vmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked. ar.Length() != 0.
+func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	// Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
+	// !vbegin.Ok().
+	vbegin, vgap := mm.vmas.Find(ar.Start)
+	if !vbegin.Ok() {
+		vbegin = vgap.NextSegment()
+		// vseg.Ok() is checked before entering the following loop.
+	} else {
+		vgap = vbegin.PrevGap()
+	}
+
+	addr := ar.Start
+	vseg := vbegin
+	for vseg.Ok() {
+		// Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
+		vma := vseg.ValuePtr()
+		if addr < vseg.Start() {
+			// TODO: Implement vma.growsDown here.
+			return vbegin, vgap, syserror.EFAULT
+		}
+
+		perms := vma.effectivePerms
+		if ignorePermissions {
+			perms = vma.maxPerms
+		}
+		if !perms.SupersetOf(at) {
+			return vbegin, vgap, syserror.EPERM
+		}
+
+		addr = vseg.End()
+		vgap = vseg.NextGap()
+		if addr >= ar.End {
+			return vbegin, vgap, nil
+		}
+		vseg = vgap.NextSegment()
+	}
+
+	// Ran out of vmas before ar.End.
+	return vbegin, vgap, syserror.EFAULT
+}
+
+// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
+// support access of type (at, ignorePermissions). It returns the subset of
+// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked.
+//
+// Postconditions: ars is not mutated.
+func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) {
+	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+		ar := arsit.Head()
+		if ar.Length() == 0 {
+			continue
+		}
+		if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
+			return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
+		}
+	}
+	return ars, nil
+}
+
+// vma extension will not shrink the number of unmapped bytes between the start
+// of a growsDown vma and the end of its predecessor non-growsDown vma below
+// guardBytes.
+//
+// guardBytes is equivalent to Linux's stack_guard_gap after upstream
+// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
+const guardBytes = 256 * usermem.PageSize
+
+// unmapLocked unmaps all addresses in ar and returns the resulting gap in
+// mm.vmas.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // AddressSpace mappings and pmas must be invalidated before + // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). + mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) + return mm.removeVMAsLocked(ctx, ar) +} + +// removeVMAsLocked removes vmas for addresses in ar and returns the resulting +// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients +// must do so before calling removeVMAsLocked. +// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + vseg, vgap := mm.vmas.Find(ar.Start) + if vgap.Ok() { + vseg = vgap.NextSegment() + } + for vseg.Ok() && vseg.Start() < ar.End { + vseg = mm.vmas.Isolate(vseg, ar) + vmaAR := vseg.Range() + vma := vseg.ValuePtr() + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off) + } + if vma.id != nil { + vma.id.DecRef() + } + mm.usageAS -= uint64(vmaAR.Length()) + vgap = mm.vmas.Remove(vseg) + vseg = vgap.NextSegment() + } + return vgap +} + +// vmaSetFunctions implements segment.Functions for vmaSet. +type vmaSetFunctions struct{} + +func (vmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (vmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (vmaSetFunctions) ClearValue(vma *vma) { + vma.mappable = nil + vma.id = nil + vma.hint = "" +} + +func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) { + if vma1.mappable != vma2.mappable || + (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || + vma1.realPerms != vma2.realPerms || + vma1.maxPerms != vma2.maxPerms || + vma1.private != vma2.private || + vma1.growsDown != vma2.growsDown || + vma1.id != vma2.id || + vma1.hint != vma2.hint { + return vma{}, false + } + + if vma2.id != nil { + vma2.id.DecRef() + } + return vma1, true +} + +func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) { + v2 := v + if v2.mappable != nil { + v2.off += uint64(split - ar.Start) + } + if v2.id != nil { + v2.id.IncRef() + } + return v, v2 +} + +// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). +func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("Mappable offset is meaningless for anonymous vma") + } + if !vseg.Range().Contains(addr) { + panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return vma.off + uint64(addr-vstart) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +func (vseg vmaIterator) mappableRange() memmap.MappableRange { + return vseg.mappableRangeOf(vseg.Range()) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. 
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !mr.WellFormed() || mr.Length() <= 0 { + panic(fmt.Sprintf("invalid mr: %v", mr)) + } + if !vseg.mappableRange().IsSupersetOf(mr) { + panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)} +} + +// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by +// scanning linearly forward from vseg. +// +// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if addr < vseg.Start() { + panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) + } + } + for vseg.Ok() && addr >= vseg.End() { + vseg = vseg.NextSegment() + } + return vseg +} + +// availableRange returns the subset of vgap.Range() in which new vmas may be +// created without MMapOpts.Unmap == true. +func (vgap vmaGapIterator) availableRange() usermem.AddrRange { + ar := vgap.Range() + next := vgap.NextSegment() + if !next.Ok() || !next.ValuePtr().growsDown { + return ar + } + // Exclude guard pages. + if ar.Length() < guardBytes { + return usermem.AddrRange{ar.Start, ar.Start} + } + ar.End -= guardBytes + return ar +} |
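The guard-gap rule applied by availableRange above is easiest to see with concrete numbers. Below is a small, self-contained Go sketch (an editor's illustration, not part of the change above; it uses plain uint64 values instead of the usermem types, and the function name availableEnd is invented for the example) of the arithmetic: a gap that precedes a growsDown vma loses guardBytes from its end, and a gap smaller than guardBytes is treated as empty.

package main

import "fmt"

const (
	pageSize   = 4096
	guardBytes = 256 * pageSize // mirrors the guardBytes constant in vma.go above
)

// availableEnd models vmaGapIterator.availableRange for a gap [start, end):
// if the vma after the gap grows down, the usable end is pulled back by
// guardBytes, and a gap too small to hold the guard area is reported empty.
func availableEnd(start, end uint64, nextGrowsDown bool) (uint64, uint64) {
	if !nextGrowsDown {
		return start, end
	}
	if end-start < guardBytes {
		return start, start // empty range
	}
	return start, end - guardBytes
}

func main() {
	// A 2 MiB gap just below a stack-like (growsDown) vma keeps 1 MiB usable.
	s, e := availableEnd(0x7f0000000000, 0x7f0000200000, true)
	fmt.Printf("usable: %#x-%#x (%d bytes)\n", s, e, e-s)
}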