diff options
Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r-- | pkg/sentry/mm/BUILD | 142 | ||||
-rw-r--r-- | pkg/sentry/mm/README.md | 280 | ||||
-rw-r--r-- | pkg/sentry/mm/address_space.go | 236 | ||||
-rw-r--r-- | pkg/sentry/mm/aio_context.go | 429 | ||||
-rw-r--r-- | pkg/sentry/mm/aio_context_state.go | 20 | ||||
-rw-r--r-- | pkg/sentry/mm/debug.go | 98 | ||||
-rw-r--r-- | pkg/sentry/mm/io.go | 639 | ||||
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 283 | ||||
-rw-r--r-- | pkg/sentry/mm/metadata.go | 183 | ||||
-rw-r--r-- | pkg/sentry/mm/mm.go | 478 | ||||
-rw-r--r-- | pkg/sentry/mm/mm_test.go | 230 | ||||
-rw-r--r-- | pkg/sentry/mm/pma.go | 1036 | ||||
-rw-r--r-- | pkg/sentry/mm/procfs.go | 329 | ||||
-rw-r--r-- | pkg/sentry/mm/save_restore.go | 57 | ||||
-rw-r--r-- | pkg/sentry/mm/shm.go | 66 | ||||
-rw-r--r-- | pkg/sentry/mm/special_mappable.go | 157 | ||||
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 1286 | ||||
-rw-r--r-- | pkg/sentry/mm/vma.go | 568 |
18 files changed, 6517 insertions, 0 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD new file mode 100644 index 000000000..a036ce53c --- /dev/null +++ b/pkg/sentry/mm/BUILD @@ -0,0 +1,142 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "file_refcount_set", + out = "file_refcount_set.go", + imports = { + "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + }, + package = "mm", + prefix = "fileRefcount", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "int32", + "Functions": "fileRefcountSetFunctions", + }, +) + +go_template_instance( + name = "vma_set", + out = "vma_set.go", + consts = { + "minDegree": "8", + "trackGaps": "1", + }, + imports = { + "usermem": "gvisor.dev/gvisor/pkg/usermem", + }, + package = "mm", + prefix = "vma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "vma", + "Functions": "vmaSetFunctions", + }, +) + +go_template_instance( + name = "pma_set", + out = "pma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.dev/gvisor/pkg/usermem", + }, + package = "mm", + prefix = "pma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "pma", + "Functions": "pmaSetFunctions", + }, +) + +go_template_instance( + name = "io_list", + out = "io_list.go", + package = "mm", + prefix = "io", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*ioResult", + "Linker": "*ioResult", + }, +) + +go_library( + name = "mm", + srcs = [ + "address_space.go", + "aio_context.go", + "aio_context_state.go", + "debug.go", + "file_refcount_set.go", + "io.go", + "io_list.go", + "lifecycle.go", + "metadata.go", + "mm.go", + "pma.go", + "pma_set.go", + "procfs.go", + "save_restore.go", + "shm.go", + 
"special_mappable.go", + "syscalls.go", + "vma.go", + "vma_set.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/safecopy", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fsbridge", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/shm", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip/buffer", + "//pkg/usermem", + ], +) + +go_test( + name = "mm_test", + size = "small", + srcs = ["mm_test.go"], + library = ":mm", + deps = [ + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/contexttest", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md new file mode 100644 index 000000000..f4d43d927 --- /dev/null +++ b/pkg/sentry/mm/README.md @@ -0,0 +1,280 @@ +This package provides an emulation of Linux semantics for application virtual +memory mappings. + +For completeness, this document also describes aspects of the memory management +subsystem defined outside this package. + +# Background + +We begin by describing semantics for virtual memory in Linux. + +A virtual address space is defined as a collection of mappings from virtual +addresses to physical memory. However, userspace applications do not configure +mappings to physical memory directly. 
Instead, applications configure memory +mappings from virtual addresses to offsets into a file using the `mmap` system +call.[^mmap-anon] For example, a call to: + + mmap( + /* addr = */ 0x400000, + /* length = */ 0x1000, + PROT_READ | PROT_WRITE, + MAP_SHARED, + /* fd = */ 3, + /* offset = */ 0); + +creates a mapping of length 0x1000 bytes, starting at virtual address (VA) +0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within +the Linux kernel, virtual memory mappings are represented by *virtual memory +areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the +virtual memory subsystem after the `mmap` call may be depicted as: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + +Establishing a virtual memory area does not necessarily establish a mapping to a +physical address, because Linux has not necessarily provisioned physical memory +to store the file's contents. Thus, if the application attempts to read the +contents of VA 0x400000, it may incur a *page fault*, a CPU exception that +forces the kernel to create such a mapping to service the read. + +For a file, doing so consists of several logical phases: + +1. The kernel allocates physical memory to store the contents of the required + part of the file, and copies file contents to the allocated memory. + Supposing that the kernel chooses the physical memory at physical address + (PA) 0x2fb000, the resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + + (In Linux the state of the mapping from file offset to physical memory is + stored in `struct address_space`, but to avoid confusion with other notions + of address space we will refer to this system as filemap, named after Linux + kernel source file `mm/filemap.c`.) + +2. 
The kernel stores the effective mapping from virtual to physical address in + a *page table entry* (PTE) in the application's *page tables*, which are + used by the CPU's virtual memory hardware to perform address translation. + The resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 + + The PTE is required for the application to actually use the contents of the + mapped file as virtual memory. However, the PTE is derived from the VMA and + filemap state, both of which are independently mutable, such that mutations + to either will affect the PTE. For example: + + - The application may remove the VMA using the `munmap` system call. This + breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently + the mapping from VA:0x400000 to PA:0x2fb000. However, it does not + necessarily break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a + future mapping of the same file offset may reuse this physical memory. + + - The application may invalidate the file's contents by passing a length + of 0 to the `ftruncate` system call. This breaks the mapping from + /tmp/foo:0x0 to PA:0x2fb000, and consequently the mapping from + VA:0x400000 to PA:0x2fb000. However, it does not break the mapping from + VA:0x400000 to /tmp/foo:0x0, so future changes to the file's contents + may again be made visible at VA:0x400000 after another page fault + results in the allocation of a new physical address. + + Note that, in order to correctly break the mapping from VA:0x400000 to + PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* + from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. + +[^mmap-anon]: Memory mappings to non-files are discussed in later sections. 
+ +## Private Mappings + +The preceding example considered VMAs created using the `MAP_SHARED` flag, which +means that PTEs derived from the mapping should always use physical memory that +represents the current state of the mapped file.[^mmap-dev-zero] Applications +can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*. +Private mappings are *copy-on-write*. + +Suppose that the application instead created a private mapping in the previous +example. In Linux, the state of the system after a read page fault would be: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only) + +Now suppose the application attempts to write to VA:0x400000. For a shared +mapping, the write would be propagated to PA:0x2fb000, and the kernel would be +responsible for ensuring that the write is later propagated to the mapped file. +For a private mapping, the write incurs another page fault since the PTE is +marked read-only. In response, the kernel allocates physical memory to store the +mapping's *private copy* of the file's contents, copies file contents to the +allocated memory, and changes the PTE to map to the private copy. Supposing that +the kernel chooses the physical memory at physical address (PA) 0x5ea000, the +resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x5ea000 + +Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist, +but is now irrelevant to this mapping. + +[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`. + +## Anonymous Mappings + +Instead of passing a file to the `mmap` system call, applications can instead +request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag. +Semantically, an anonymous mapping is essentially a mapping to an ephemeral file +initially filled with zero bytes. 
Practically speaking, this is how shared +anonymous mappings are implemented, but private anonymous mappings do not result +in the creation of an ephemeral file; since there would be no way to modify the +contents of the underlying file through a private mapping, all private anonymous +mappings use a single shared page filled with zero bytes until copy-on-write +occurs. + +# Virtual Memory in the Sentry + +The sentry implements application virtual memory atop a host kernel, introducing +an additional level of indirection to the above. + +Consider the same scenario as in the previous section. Since the sentry handles +application system calls, the effect of an application `mmap` system call is to +create a VMA in the sentry (as opposed to the host kernel): + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + +When the application first incurs a page fault on this address, the host kernel +delivers information about the page fault to the sentry in a platform-dependent +manner, and the sentry handles the fault: + +1. The sentry allocates memory to store the contents of the required part of + the file, and copies file contents to the allocated memory. However, since + the sentry is implemented atop a host kernel, it does not configure mappings + to physical memory directly. Instead, mappable "memory" in the sentry is + represented by a host file descriptor and offset, since (as noted in + "Background") this is the memory mapping primitive provided by the host + kernel. In general, memory is allocated from a temporary host file using the + `pgalloc` package. Supposing that the sentry allocates offset 0x3000 from + host file "memory-file", the resulting state is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + +2. 
The sentry stores the effective mapping from virtual address to host file in + a host VMA by invoking the `mmap` system call: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + +3. The sentry returns control to the application, which immediately incurs the + page fault again.[^mmap-populate] However, since a host VMA now exists for + the faulting virtual address, the host kernel now handles the page fault as + described in "Background": + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 + +Thus, from an implementation standpoint, host VMAs serve the same purpose in the +sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is +independently mutable, and the desired state of host VMAs is derived from that +state. + +[^mmap-populate]: The sentry could force the host kernel to establish PTEs when + it creates the host VMA by passing the `MAP_POPULATE` flag to + the `mmap` system call, but usually does not. This is because, + to reduce the number of page faults that require handling by + the sentry and (correspondingly) the number of host `mmap` + system calls, the sentry usually creates host VMAs that are + much larger than the single faulting page. + +## Private Mappings + +The sentry implements private mappings consistently with Linux. 
Before +copy-on-write, the private mapping example given in the Background results in: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only) + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only) + +When the application attempts to write to this address, the host kernel delivers +information about the resulting page fault to the sentry. Analogous to Linux, +the sentry allocates memory to store the mapping's private copy of the file's +contents, copies file contents to the allocated memory, and changes the host VMA +to map to the private copy. Supposing that the sentry chooses the offset 0x4000 +in host file `memory-file` to store the private copy, the state of the system +after copy-on-write is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000 + Host filemap: host:memory-file:0x4000 -> PA:0x5ea000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000 + +However, this highlights an important difference between Linux and the sentry. +In Linux, page tables are concrete (architecture-dependent) data structures +owned by the kernel. Conversely, the sentry has the ability to create and +destroy host VMAs using host system calls, but it does not have direct access to +their state. Thus, as written, if the application invokes the `munmap` system +call to remove the sentry VMA, it is non-trivial for the sentry to determine +that it should deallocate `host:memory-file:0x4000`. This implies that the +sentry must retain information about the host VMAs that it has created. 
+ +## Anonymous Mappings + +The sentry implements anonymous mappings consistently with Linux, except that +there is no shared zero page. + +# Implementation Constructs + +In Linux: + +- A virtual address space is represented by `struct mm_struct`. + +- VMAs are represented by `struct vm_area_struct`, stored in `struct + mm_struct::mmap`. + +- Mappings from file offsets to physical memory are stored in `struct + address_space`. + +- Reverse mappings from file offsets to virtual mappings are stored in `struct + address_space::i_mmap`. + +- Physical memory pages are represented by a pointer to `struct page` or an + index called a *page frame number* (PFN), represented by `pfn_t`. + +- PTEs are represented by architecture-dependent type `pte_t`, stored in a + table hierarchy rooted at `struct mm_struct::pgd`. + +In the sentry: + +- A virtual address space is represented by type [`mm.MemoryManager`][mm]. + +- Sentry VMAs are represented by type [`mm.vma`][mm], stored in + `mm.MemoryManager.vmas`. + +- Mappings from sentry file offsets to host file offsets are abstracted + through interface method [`memmap.Mappable.Translate`][memmap]. + +- Reverse mappings from sentry file offsets to virtual mappings are abstracted + through interface methods + [`memmap.Mappable.AddMapping` and `memmap.Mappable.RemoveMapping`][memmap]. + +- Host files that may be mapped into host VMAs are represented by type + [`platform.File`][platform]. + +- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform + mapping area"), stored in `mm.MemoryManager.pmas`. + +- Creation and destruction of host VMAs is abstracted through interface + methods + [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. 
+ +[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go +[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go +[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go +[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go new file mode 100644 index 000000000..5c667117c --- /dev/null +++ b/pkg/sentry/mm/address_space.go @@ -0,0 +1,236 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/usermem" +) + +// AddressSpace returns the platform.AddressSpace bound to mm. +// +// Preconditions: The caller must have called mm.Activate(). +func (mm *MemoryManager) AddressSpace() platform.AddressSpace { + if atomic.LoadInt32(&mm.active) == 0 { + panic("trying to use inactive address space?") + } + return mm.as +} + +// Activate ensures this MemoryManager has a platform.AddressSpace. +// +// The caller must not hold any locks when calling Activate. +// +// When this MemoryManager is no longer needed by a task, it should call +// Deactivate to release the reference. 
+func (mm *MemoryManager) Activate(ctx context.Context) error { + // Fast path: the MemoryManager already has an active + // platform.AddressSpace, and we just need to indicate that we need it too. + for { + active := atomic.LoadInt32(&mm.active) + if active == 0 { + // Fall back to the slow path. + break + } + if atomic.CompareAndSwapInt32(&mm.active, active, active+1) { + return nil + } + } + + for { + // Slow path: may need to synchronize with other goroutines changing + // mm.active to or from zero. + mm.activeMu.Lock() + // Inline Unlock instead of using a defer for performance since this + // method is commonly in the hot-path. + + // Check if we raced with another goroutine performing activation. + if atomic.LoadInt32(&mm.active) > 0 { + // This can't race; Deactivate can't decrease mm.active from 1 to 0 + // without holding activeMu. + atomic.AddInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Do we have a context? If so, then we never unmapped it. This can + // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). + if mm.as != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Get a new address space. We must force unmapping by passing nil to + // NewAddressSpace if requested. (As in the nil interface object, not a + // typed nil.) + mappingsID := (interface{})(mm) + if mm.unmapAllOnActivate { + mappingsID = nil + } + as, c, err := mm.p.NewAddressSpace(mappingsID) + if err != nil { + mm.activeMu.Unlock() + return err + } + if as == nil { + // AddressSpace is unavailable, we must wait. + // + // activeMu must not be held while waiting, as the user of the address + // space we are waiting on may attempt to take activeMu. + mm.activeMu.Unlock() + + sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation + if sleep { + // Mark this task sleeping while waiting for the address space to + // prevent the watchdog from reporting it as a stuck task. 
+ ctx.UninterruptibleSleepStart(false) + } + <-c + if sleep { + ctx.UninterruptibleSleepFinish(false) + } + continue + } + + // Okay, we could restore all mappings at this point. + // But forget that. Let's just let them fault in. + mm.as = as + + // Unmapping is done, if necessary. + mm.unmapAllOnActivate = false + + // Now that mm.as has been assigned, we can set mm.active to a non-zero value + // to enable the fast path. + atomic.StoreInt32(&mm.active, 1) + + mm.activeMu.Unlock() + return nil + } +} + +// Deactivate releases a reference to the MemoryManager. +func (mm *MemoryManager) Deactivate() { + // Fast path: this is not the last goroutine to deactivate the + // MemoryManager. + for { + active := atomic.LoadInt32(&mm.active) + if active == 1 { + // Fall back to the slow path. + break + } + if atomic.CompareAndSwapInt32(&mm.active, active, active-1) { + return + } + } + + mm.activeMu.Lock() + // Same as Activate. + + // Still active? + if atomic.AddInt32(&mm.active, -1) > 0 { + mm.activeMu.Unlock() + return + } + + // Can we hold on to the address space? + if !mm.p.CooperativelySchedulesAddressSpace() { + mm.activeMu.Unlock() + return + } + + // Release the address space. + mm.as.Release() + + // Lost it. + mm.as = nil + mm.activeMu.Unlock() +} + +// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings +// for all addresses in ar should be precommitted. +// +// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { + // By default, map entire pmas at a time, under the assumption that there + // is no cost to mapping more of a pma than necessary. + mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)} + if precommit { + // When explicitly precommitting, only map ar, since overmapping may + // incur unexpected resource usage. 
+ mapAR = ar + } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { + // Limit the range we map to ar, aligned to mapUnit. + mapMask := usermem.Addr(mapUnit - 1) + mapAR.Start = ar.Start &^ mapMask + // If rounding ar.End up overflows, just keep the existing mapAR.End. + if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { + mapAR.End = end + } + } + if checkInvariants { + if !mapAR.IsSupersetOf(ar) { + panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) + } + } + + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + pmaAR := pseg.Range() + pmaMapAR := pmaAR.Intersect(mapAR) + perms := pma.effectivePerms + if pma.needCOW { + perms.Write = false + } + if perms.Any() { // MapFile precondition + if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + return err + } + } + pseg = pseg.NextSegment() + } + return nil +} + +// unmapASLocked removes all AddressSpace mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) { + if mm.as == nil { + // No AddressSpace? Force all mappings to be unmapped on the next + // Activate. + mm.unmapAllOnActivate = true + return + } + + // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be + // passed ranges that include addresses that can't be mapped by the + // application. + ar = ar.Intersect(mm.applicationAddrRange()) + + // Note that this AddressSpace may or may not be active. If the + // platform does not require cooperative sharing of AddressSpaces, they + // are retained between Deactivate/Activate calls. Despite not being + // active, it is still valid to perform operations on these address + // spaces. 
+ mm.as.Unmap(ar.Start, uint64(ar.Length())) +} diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..379148903 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,429 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// aioManager creates and manages asynchronous I/O contexts. +// +// +stateify savable +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // contexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. 
+func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + requestReady: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for +// pending requests to complete. Returns the destroyed AIOContext so it can +// be drained. +// +// Nil is returned if the context does not exist. +func (a *aioManager) destroyAIOContext(id uint64) *AIOContext { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return nil + } + delete(a.contexts, id) + ctx.destroy() + return ctx +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +// +// +stateify savable +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +// +// +stateify savable +type AIOContext struct { + // requestReady is the notification channel used for all requests. + requestReady chan struct{} `state:"nosave"` + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. + outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. 
+func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + ctx.checkForDone() +} + +// Preconditions: ctx.mu must be held by caller. +func (ctx *AIOContext) checkForDone() { + if ctx.dead && ctx.outstanding == 0 { + close(ctx.requestReady) + ctx.requestReady = nil + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. +func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + if ctx.outstanding == 0 { + panic("AIOContext outstanding is going negative") + } + ctx.outstanding-- + ctx.results.Remove(e) + ctx.checkForDone() + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The requestReady channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.requestReady <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. Returns nil if the context is destroyed and there are no more +// outstanding requests. 
+func (ctx *AIOContext) WaitChannel() chan struct{} { + ctx.mu.Lock() + defer ctx.mu.Unlock() + return ctx.requestReady +} + +// Dead returns true if the context has been destroyed. +func (ctx *AIOContext) Dead() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + return ctx.dead +} + +// CancelPendingRequest forgets about a request that hasn't yet completed. +func (ctx *AIOContext) CancelPendingRequest() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + if ctx.outstanding == 0 { + panic("AIOContext outstanding is going negative") + } + ctx.outstanding-- + ctx.checkForDone() +} + +// Drain drops all completed requests. Pending requests remain untouched. +func (ctx *AIOContext) Drain() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + if ctx.outstanding == 0 { + return + } + size := uint32(ctx.results.Len()) + if ctx.outstanding < size { + panic("AIOContext outstanding is going negative") + } + ctx.outstanding -= size + ctx.results.Reset() + ctx.checkForDone() +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +// +// +stateify savable +type aioMappable struct { + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { + fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + m := aioMappable{mfp: mfp, fr: fr} + m.EnableLeakCheck("mm.aioMappable") + return &m, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.mfp.MemoryFile().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. 
+func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. + am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. 
+func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.mfp.MemoryFile(), + Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.mfp) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in + // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC, + // user mode should not write to this page. + Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. It returns the +// destroyed context. nil if the context does not exist. 
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return nil + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go new file mode 100644 index 000000000..3dabac1af --- /dev/null +++ b/pkg/sentry/mm/aio_context_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +// afterLoad is invoked by stateify. +func (a *AIOContext) afterLoad() { + a.requestReady = make(chan struct{}, 1) +} diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go new file mode 100644 index 000000000..c273c982e --- /dev/null +++ b/pkg/sentry/mm/debug.go @@ -0,0 +1,98 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/context" +) + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the mm package. This is normally disabled since MM is a + // significant hot path in general, and some such checks (notably + // memmap.CheckTranslateResult) are very expensive. + checkInvariants = false + + // If logIOErrors is true, log I/O errors that originate from MM before + // converting them to EFAULT. + logIOErrors = false +) + +// String implements fmt.Stringer.String. +func (mm *MemoryManager) String() string { + return mm.DebugString(context.Background()) +} + +// DebugString returns a string containing information about mm for debugging. 
+func (mm *MemoryManager) DebugString(ctx context.Context) string { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.debugStringLocked(ctx) +} + +// Preconditions: mm.mappingMu and mm.activeMu must be locked. +func (mm *MemoryManager) debugStringLocked(ctx context.Context) string { + var b bytes.Buffer + b.WriteString("VMAs:\n") + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) + } + b.WriteString("PMAs:\n") + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + b.Write(pseg.debugStringEntryLocked()) + } + return string(b.Bytes()) +} + +// Preconditions: mm.activeMu must be locked. +func (pseg pmaIterator) debugStringEntryLocked() []byte { + var b bytes.Buffer + + fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) + + pma := pseg.ValuePtr() + if pma.effectivePerms.Read { + b.WriteByte('r') + } else { + b.WriteByte('-') + } + if pma.effectivePerms.Write { + if pma.needCOW { + b.WriteByte('c') + } else { + b.WriteByte('w') + } + } else { + b.WriteByte('-') + } + if pma.effectivePerms.Execute { + b.WriteByte('x') + } else { + b.WriteByte('-') + } + if pma.private { + b.WriteByte('p') + } else { + b.WriteByte('s') + } + + fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + return b.Bytes() +} diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go new file mode 100644 index 000000000..fa776f9c6 --- /dev/null +++ b/pkg/sentry/mm/io.go @@ -0,0 +1,639 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// There are two supported ways to copy data to/from application virtual +// memory: +// +// 1. Internally-mapped copying: Determine the platform.File that backs the +// copied-to/from virtual address, obtain a mapping of its pages, and read or +// write to the mapping. +// +// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is +// true, AddressSpace permissions are applicable, and an AddressSpace is +// available, copy directly through the AddressSpace, handling faults as +// needed. +// +// (Given that internally-mapped copying requires that backing memory is always +// implemented using a host file descriptor, we could also preadv/pwritev to it +// instead. But this would incur a host syscall for each use of the mapped +// page, whereas mmap is a one-time cost.) +// +// The fixed overhead of internally-mapped copying is expected to be higher +// than that of AddressSpace copying since the former always needs to translate +// addresses, whereas the latter only needs to do so when faults occur. +// However, the throughput of internally-mapped copying is expected to be +// somewhat higher than that of AddressSpace copying due to the high cost of +// page faults and because implementations of the latter usually rely on +// safecopy, which doesn't use AVX registers. 
So we prefer to use AddressSpace
// copying (when available) for smaller copies, and switch to internally-mapped
// copying once a size threshold is exceeded.
const (
	// copyMapMinBytes is the size threshold for switching to internally-mapped
	// copying in CopyOut, CopyIn, and ZeroOut.
	copyMapMinBytes = 32 << 10 // 32 KB

	// rwMapMinBytes is the size threshold for switching to internally-mapped
	// copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
	// since AddressSpace copying in this case requires additional buffering;
	// see CopyOutFrom for details.
	rwMapMinBytes = 512
)

// CheckIORange is similar to usermem.Addr.ToRange, but applies bounds checks
// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
//
// Preconditions: length >= 0.
func (mm *MemoryManager) CheckIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) {
	ar, inRange := addr.ToRange(uint64(length))
	if !inRange {
		return ar, false
	}
	// Note that access_ok() constrains end even if length == 0.
	return ar, ar.End <= mm.layout.MaxAddr
}

// checkIOVec reports whether every range in ars passes the bounds checks of
// CheckIORange (Linux's arch/x86/include/asm/uaccess.h:access_ok()).
func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool {
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		head := ars.Head()
		_, valid := mm.CheckIORange(head.Start, int64(head.Length()))
		if !valid {
			return false
		}
	}
	return true
}

// asioEnabled reports whether an I/O operation with the given options may be
// performed directly through the AddressSpace.
func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
	if !mm.haveASIO {
		return false
	}
	if opts.IgnorePermissions {
		return false
	}
	return opts.AddressSpaceActive
}

// translateIOError converts errors to EFAULT, as is usually reported for all
// I/O errors originating from MM in Linux. A nil error stays nil.
func translateIOError(ctx context.Context, err error) error {
	if err != nil {
		if logIOErrors {
			ctx.Debugf("MM I/O error: %v", err)
		}
		return syserror.EFAULT
	}
	return nil
}

// CopyOut implements usermem.IO.CopyOut.
+func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.CheckIORange(addr, int64(len(src))) + if !ok { + return 0, syserror.EFAULT + } + + if len(src) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { + return mm.asCopyOut(ctx, addr, src) + } + + // Go through internal mappings. + n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(src))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.CheckIORange(addr, int64(len(dst))) + if !ok { + return 0, syserror.EFAULT + } + + if len(dst) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { + return mm.asCopyIn(ctx, addr, dst) + } + + // Go through internal mappings. 
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(dst))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + ar, ok := mm.CheckIORange(addr, toZero) + if !ok { + return 0, syserror.EFAULT + } + + if toZero == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && toZero < copyMapMinBytes { + return mm.asZeroOut(ctx, addr, toZero) + } + + // Go through internal mappings. 
+ return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := safemem.ZeroSeq(dsts) + return n, translateIOError(ctx, err) + }) +} + +func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) { + var done int64 + for { + n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done)) + done += int64(n) + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(toZero)) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + // We have to introduce a buffered copy, instead of just passing a + // safemem.BlockSeq representing addresses in the AddressSpace to src. + // This is because usermem.IO.CopyOutFrom() guarantees that it calls + // src.ReadToBlocks() at most once, which is incompatible with handling + // faults between calls. In the future, this is probably best resolved + // by introducing a CopyOutFrom variant or option that allows it to + // call src.ReadToBlocks() any number of times. + // + // This issue applies to CopyInTo as well. 
+ buf := make([]byte, int(ars.NumBytes())) + bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) + var done int64 + for done < int64(bufN) { + ar := ars.Head() + cplen := int64(ar.Length()) + if cplen > int64(bufN)-done { + cplen = int64(bufN) - done + } + n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) + done += int64(n) + if err != nil { + return done, err + } + ars = ars.Tail() + } + // Do not convert errors returned by src to EFAULT. + return done, bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks) +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + buf := make([]byte, int(ars.NumBytes())) + var done int + var bufErr error + for !ars.IsEmpty() { + ar := ars.Head() + var n int + n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) + done += n + if bufErr != nil { + break + } + ars = ars.Tail() + } + n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) + if err != nil { + return int64(n), err + } + // Do not convert errors returned by dst to EFAULT. + return int64(n), bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks) +} + +// SwapUint32 implements usermem.IO.SwapUint32. 
+func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + old, err := mm.as.SwapUint32(addr, new) + if err == nil { + return old, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var old uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + old, err = safemem.SwapUint32(im, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return old, err +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + prev, err := mm.as.CompareAndSwapUint32(addr, old, new) + if err == nil { + return prev, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. 
+ var prev uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + prev, err = safemem.CompareAndSwapUint32(im, old, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return prev, err +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + val, err := mm.as.LoadUint32(addr) + if err == nil { + return val, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var val uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + val, err = safemem.LoadUint32(im) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return val, err +} + +// handleASIOFault handles a page fault at address addr for an AddressSpaceIO +// operation spanning ioar. +// +// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). 
func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
	// Try to map all remaining pages in the I/O operation. This RoundUp can't
	// overflow because otherwise it would have been caught by CheckIORange.
	end, _ := ioar.End.RoundUp()
	ar := usermem.AddrRange{addr.RoundDown(), end}

	// Don't bother trying existingPMAsLocked; in most cases, if we did have
	// existing pmas, we wouldn't have faulted.

	// Ensure that we have usable vmas. Here and below, only return early if we
	// can't map the first (faulting) page; failure to map later pages are
	// silently ignored. This maximizes partial success.
	mm.mappingMu.RLock()
	vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			// Not even the faulting page has a usable vma.
			mm.mappingMu.RUnlock()
			return translateIOError(ctx, err)
		}
		// Shrink the range to the part that does have usable vmas.
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	// activeMu is taken before mappingMu is dropped, so the two critical
	// sections overlap.
	mm.activeMu.Lock()
	pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			// Not even the faulting page has a usable pma.
			mm.activeMu.Unlock()
			return translateIOError(ctx, err)
		}
		// Shrink the range to the part that does have usable pmas.
		ar.End = pendaddr
	}

	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
	// anymore.
	mm.activeMu.DowngradeLock()

	// Map the (possibly shortened) range into the AddressSpace.
	err = mm.mapASLocked(pseg, ar, false)
	mm.activeMu.RUnlock()
	return translateIOError(ctx, err)
}

// withInternalMappings ensures that pmas exist for all addresses in ar,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subrange of ar for which this property holds.
//
// withInternalMappings takes a function returning uint64 since many safemem
// functions have this property, but returns an int64 since this is usually
// more useful for usermem.IO methods.
//
// Preconditions: 0 < ar.Length() <= math.MaxInt64.
func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// If pmas are already available, we can do IO without touching mm.vmas or
	// mm.mappingMu.
	mm.activeMu.RLock()
	if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
		n, err := f(mm.internalMappingsLocked(pseg, ar))
		mm.activeMu.RUnlock()
		// Do not convert errors returned by f to EFAULT.
		return int64(n), err
	}
	mm.activeMu.RUnlock()

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			// No usable vma even at the start of the range.
			mm.mappingMu.RUnlock()
			return 0, translateIOError(ctx, verr)
		}
		// Continue with the prefix of ar that has usable vmas.
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	// activeMu is taken before mappingMu is dropped, so the two critical
	// sections overlap.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			// No usable pma even at the start of the range.
			mm.activeMu.Unlock()
			return 0, translateIOError(ctx, perr)
		}
		// Continue with the prefix of ar that has usable pmas.
		ar.End = pendaddr
	}
	imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar)
	// The write lock is held across getPMAInternalMappingsLocked and
	// downgraded to a read lock before f runs.
	mm.activeMu.DowngradeLock()
	if imendaddr := imend.Start(); imendaddr < ar.End {
		if imendaddr <= ar.Start {
			// No internal mapping even at the start of the range.
			mm.activeMu.RUnlock()
			return 0, translateIOError(ctx, imerr)
		}
		// Continue with the prefix of ar that has internal mappings.
		ar.End = imendaddr
	}

	// Do I/O.
	un, err := f(mm.internalMappingsLocked(pseg, ar))
	mm.activeMu.RUnlock()
	n := int64(un)

	// Return the first error in order of progress through ar.
	if err != nil {
		// Do not convert errors returned by f to EFAULT.
		return n, err
	}
	if imerr != nil {
		return n, translateIOError(ctx, imerr)
	}
	if perr != nil {
		return n, translateIOError(ctx, perr)
	}
	return n, translateIOError(ctx, verr)
}

// withVecInternalMappings ensures that pmas exist for all addresses in ars,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subset of ars for which this property holds.
//
// Preconditions: !ars.IsEmpty().
func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// withInternalMappings is faster than withVecInternalMappings because of
	// iterator plumbing (this isn't generally practical in the vector case due
	// to iterator invalidation between AddrRanges). Use it if possible.
	if ars.NumRanges() == 1 {
		return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
	}

	// If pmas are already available, we can do IO without touching mm.vmas or
	// mm.mappingMu.
	mm.activeMu.RLock()
	if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
		n, err := f(mm.vecInternalMappingsLocked(ars))
		mm.activeMu.RUnlock()
		// Do not convert errors returned by f to EFAULT.
		return int64(n), err
	}
	mm.activeMu.RUnlock()

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
	if vars.NumBytes() == 0 {
		// Nothing in ars has a usable vma.
		mm.mappingMu.RUnlock()
		return 0, translateIOError(ctx, verr)
	}

	// Ensure that we have usable pmas.
	// activeMu is taken before mappingMu is dropped, so the two critical
	// sections overlap.
	mm.activeMu.Lock()
	pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
	mm.mappingMu.RUnlock()
	if pars.NumBytes() == 0 {
		// Nothing in vars has a usable pma.
		mm.activeMu.Unlock()
		return 0, translateIOError(ctx, perr)
	}
	imars, imerr := mm.getVecPMAInternalMappingsLocked(pars)
	// The write lock is held across getVecPMAInternalMappingsLocked and
	// downgraded to a read lock before f runs.
	mm.activeMu.DowngradeLock()
	if imars.NumBytes() == 0 {
		// Nothing in pars has internal mappings.
		mm.activeMu.RUnlock()
		return 0, translateIOError(ctx, imerr)
	}

	// Do I/O.
	un, err := f(mm.vecInternalMappingsLocked(imars))
	mm.activeMu.RUnlock()
	n := int64(un)

	// Return the first error in order of progress through ars.
	if err != nil {
		// Do not convert errors from f to EFAULT.
		return n, err
	}
	if imerr != nil {
		return n, translateIOError(ctx, imerr)
	}
	if perr != nil {
		return n, translateIOError(ctx, perr)
	}
	return n, translateIOError(ctx, verr)
}

// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to
// truncate usermem.AddrRangeSeq when errors occur.
//
// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End.
func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq {
	ar := arsit.Head()
	if end <= ar.Start {
		// end is at or before the start of arsit's head: keep only the bytes
		// of ars that are not part of arsit.
		return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
	}
	// Otherwise also keep the part of arsit's head that lies before end.
	return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
}
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go new file mode 100644 index 000000000..aac56679b --- /dev/null +++ b/pkg/sentry/mm/lifecycle.go @@ -0,0 +1,283 @@
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/usermem" +) + +// NewMemoryManager returns a new MemoryManager with no mappings and 1 user. +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager { + return &MemoryManager{ + p: p, + mfp: mfp, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + dumpability: UserDumpable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + sleepForActivation: sleepForActivation, + } +} + +// SetMmapLayout initializes mm's layout from the given arch.Context. +// +// Preconditions: mm contains no mappings and is not used concurrently. +func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) { + layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) + if err != nil { + return arch.MmapLayout{}, err + } + mm.layout = layout + return layout, nil +} + +// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or +// clone() (without CLONE_VM). 
// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
//
// Fork holds mm.metadataMu and mm.mappingMu (for reading) for its whole
// duration, and later also locks mm2.activeMu followed by mm.activeMu. On
// success the returned MemoryManager holds its own reference on the
// executable (if any). If a Mappable rejects the new mapping, the vmas
// already inserted into the copy are removed and the error is returned.
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       1,
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
		// MLockNone, both of which are zero values. vma.mlockMode is reset
		// when copied below.
		captureInvalidations: true,
		argv:                 mm.argv,
		envv:                 mm.envv,
		// Copy the slice so that the two MemoryManagers do not share a
		// backing array.
		auxv: append(arch.Auxv(nil), mm.auxv...),
		// IncRef'd below, once we know that there isn't an error.
		executable:         mm.executable,
		dumpability:        mm.dumpability,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: mm.sleepForActivation,
		vdsoSigReturnAddr:  mm.vdsoSigReturnAddr,
	}

	// Copy vmas.
	dontforks := false
	dstvgap := mm2.vmas.FirstGap()
	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
		vma := srcvseg.Value() // makes a copy of the vma
		vmaAR := srcvseg.Range()

		if vma.dontfork {
			// This vma is not copied, so take its size back out of the
			// counters that were copied wholesale from mm above.
			length := uint64(vmaAR.Length())
			mm2.usageAS -= length
			if vma.isPrivateDataLocked() {
				mm2.dataAS -= length
			}
			dontforks = true
			continue
		}

		// Inform the Mappable, if any, of the new mapping.
		if vma.mappable != nil {
			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
				// Undo the vmas inserted into mm2 so far before failing.
				mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
				return nil, err
			}
		}
		if vma.id != nil {
			vma.id.IncRef()
		}
		// vma is a copy, so this only affects the entry inserted into mm2.
		vma.mlockMode = memmap.MLockNone
		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
		// We don't need to update mm2.usageAS since we copied it from mm
		// above.
	}

	// Copy pmas. We have to lock mm.activeMu for writing to make existing
	// private pmas copy-on-write. We also have to lock mm2.activeMu since
	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
	// only copy private pmas, since in the common case where fork(2) is
	// immediately followed by execve(2), copying non-private pmas that can be
	// regenerated by calling memmap.Mappable.Translate is a waste of time.
	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
	// mm/memory.c:copy_page_range().)
	//
	// The child's activeMu is taken before the parent's.
	mm2.activeMu.Lock()
	defer mm2.activeMu.Unlock()
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if dontforks {
		// NOTE(review): presumably this re-merges parent pmas that the
		// Isolate call below splits at vma boundaries - confirm against
		// pmaSet.Isolate/MergeRange.
		defer mm.pmas.MergeRange(mm.applicationAddrRange())
	}
	srcvseg := mm.vmas.FirstSegment()
	dstpgap := mm2.pmas.FirstGap()
	// unmapAR accumulates adjacent ranges that must be unmapped from the
	// parent's AddressSpace because they have just lost write permission.
	var unmapAR usermem.AddrRange
	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
		pma := srcpseg.ValuePtr()
		if !pma.private {
			continue
		}

		if dontforks {
			// Find the 'vma' that contains the starting address
			// associated with the 'pma' (there must be one).
			srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
			if checkInvariants {
				if !srcvseg.Ok() {
					panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
				}
				if srcpseg.Start() < srcvseg.Start() {
					panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
				}
			}

			srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
			if srcvseg.ValuePtr().dontfork {
				continue
			}
			// Isolate returned a new iterator; refresh the pma pointer.
			pma = srcpseg.ValuePtr()
		}

		if !pma.needCOW {
			pma.needCOW = true
			if pma.effectivePerms.Write {
				// We don't want to unmap the whole address space, even though
				// doing so would reduce calls to unmapASLocked(), because mm
				// will most likely continue to be used after the fork, so
				// unmapping pmas unnecessarily will result in extra page
				// faults. But we do want to merge consecutive AddrRanges
				// across pma boundaries.
				if unmapAR.End == srcpseg.Start() {
					unmapAR.End = srcpseg.End()
				} else {
					if unmapAR.Length() != 0 {
						mm.unmapASLocked(unmapAR)
					}
					unmapAR = srcpseg.Range()
				}
				pma.effectivePerms.Write = false
			}
			pma.maxPerms.Write = false
		}
		// Account for the child's share of the private memory, then insert a
		// copy of the (now copy-on-write) pma into mm2.
		fr := srcpseg.fileRange()
		mm2.incPrivateRef(fr)
		srcpseg.ValuePtr().file.IncRef(fr)
		addrRange := srcpseg.Range()
		mm2.addRSSLocked(addrRange)
		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
	}
	// Flush the last accumulated range, if any.
	if unmapAR.Length() != 0 {
		mm.unmapASLocked(unmapAR)
	}

	// Between when we call memmap.Mappable.AddMapping while copying vmas and
	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
	// ineffective because the pmas they invalidate haven't yet been copied,
	// possibly allowing mm2 to get invalidated translations:
	//
	// Invalidating Mappable            mm.Fork
	// ---------------------            -------
	//
	// mm2.Invalidate()
	//                                  mm.activeMu.Lock()
	// mm.Invalidate() /* blocks */
	//                                  mm2.activeMu.Lock()
	//                                  (mm copies invalidated pma to mm2)
	//
	// This would technically be both safe (since we only copy private pmas,
	// which will still hold a reference on their memory) and consistent with
	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
	// construction, causing calls to mm2.Invalidate() to be captured in
	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
	// here.
	mm2.captureInvalidations = false
	for _, invArgs := range mm2.capturedInvalidations {
		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
	}
	mm2.capturedInvalidations = nil

	// No error can occur past this point, so take the child's reference on
	// the executable now.
	if mm2.executable != nil {
		mm2.executable.IncRef()
	}
	return mm2, nil
}
+func (mm *MemoryManager) IncUsers() bool { + for { + users := atomic.LoadInt32(&mm.users) + if users == 0 { + return false + } + if atomic.CompareAndSwapInt32(&mm.users, users, users+1) { + return true + } + } +} + +// DecUsers decrements mm's user count. If the user count reaches 0, all +// mappings in mm are unmapped. +func (mm *MemoryManager) DecUsers(ctx context.Context) { + if users := atomic.AddInt32(&mm.users, -1); users > 0 { + return + } else if users < 0 { + panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) + } + + mm.aioManager.destroy() + + mm.metadataMu.Lock() + exe := mm.executable + mm.executable = nil + mm.metadataMu.Unlock() + if exe != nil { + exe.DecRef() + } + + mm.activeMu.Lock() + // Sanity check. + if atomic.LoadInt32(&mm.active) != 0 { + panic("active address space lost?") + } + // Make sure the AddressSpace is returned. + if mm.as != nil { + mm.as.Release() + mm.as = nil + } + mm.activeMu.Unlock() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // If mm is being dropped before mm.SetMmapLayout was called, + // mm.applicationAddrRange() will be empty. + if ar := mm.applicationAddrRange(); ar.Length() != 0 { + mm.unmapLocked(ctx, ar) + } +} diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go new file mode 100644 index 000000000..28e5057f7 --- /dev/null +++ b/pkg/sentry/mm/metadata.go @@ -0,0 +1,183 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Dumpability describes if and how core dumps should be created. +type Dumpability int + +const ( + // NotDumpable indicates that core dumps should never be created. + NotDumpable Dumpability = iota + + // UserDumpable indicates that core dumps should be created, owned by + // the current user. + UserDumpable + + // RootDumpable indicates that core dumps should be created, owned by + // root. + RootDumpable +) + +// Dumpability returns the dumpability. +func (mm *MemoryManager) Dumpability() Dumpability { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.dumpability +} + +// SetDumpability sets the dumpability. +func (mm *MemoryManager) SetDumpability(d Dumpability) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.dumpability = d +} + +// ArgvStart returns the start of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvEnd. +func (mm *MemoryManager) ArgvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.Start +} + +// SetArgvStart sets the start of the application argument vector. +func (mm *MemoryManager) SetArgvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.Start = a +} + +// ArgvEnd returns the end of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvStart. +func (mm *MemoryManager) ArgvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.End +} + +// SetArgvEnd sets the end of the application argument vector. +func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.End = a +} + +// EnvvStart returns the start of the application environment vector. 
+// +// There is no guarantee that this value is sensible w.r.t. EnvvEnd. +func (mm *MemoryManager) EnvvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.Start +} + +// SetEnvvStart sets the start of the application environment vector. +func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.Start = a +} + +// EnvvEnd returns the end of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvStart. +func (mm *MemoryManager) EnvvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.End +} + +// SetEnvvEnd sets the end of the application environment vector. +func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.End = a +} + +// Auxv returns the current map of auxiliary vectors. +func (mm *MemoryManager) Auxv() arch.Auxv { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return append(arch.Auxv(nil), mm.auxv...) +} + +// SetAuxv sets the entire map of auxiliary vectors. +func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.auxv = append(arch.Auxv(nil), auxv...) +} + +// Executable returns the executable, if available. +// +// An additional reference will be taken in the case of a non-nil executable, +// which must be released by the caller. +func (mm *MemoryManager) Executable() fsbridge.File { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + + if mm.executable == nil { + return nil + } + + mm.executable.IncRef() + return mm.executable +} + +// SetExecutable sets the executable. +// +// This takes a reference on d. +func (mm *MemoryManager) SetExecutable(file fsbridge.File) { + mm.metadataMu.Lock() + + // Grab a new reference. + file.IncRef() + + // Set the executable. 
+ orig := mm.executable + mm.executable = file + + mm.metadataMu.Unlock() + + // Release the old reference. + // + // Do this without holding the lock, since it may wind up doing some + // I/O to sync the dirent, etc. + if orig != nil { + orig.DecRef() + } +} + +// VDSOSigReturn returns the address of vdso_sigreturn. +func (mm *MemoryManager) VDSOSigReturn() uint64 { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.vdsoSigReturnAddr +} + +// SetVDSOSigReturn sets the address of vdso_sigreturn. +func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.vdsoSigReturnAddr = addr +} diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go new file mode 100644 index 000000000..6db7c3d40 --- /dev/null +++ b/pkg/sentry/mm/mm.go @@ -0,0 +1,478 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mm provides a memory management subsystem. See README.md for a +// detailed overview. 
// MemoryManager implements a virtual address space.
//
// Its fields fall into three groups, each guarded by its own mutex: mapping
// state (vmas and the size counters) by mappingMu, active state (pmas, RSS
// counters and the platform AddressSpace) by activeMu, and process metadata
// (argv, envv, auxv, executable, dumpability) by metadataMu.
//
// +stateify savable
type MemoryManager struct {
	// p and mfp are immutable.
	p   platform.Platform
	mfp pgalloc.MemoryFileProvider

	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
	// eliminating an indirect call in the hot I/O path, this makes
	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
	//
	// haveASIO is immutable.
	haveASIO bool `state:"nosave"`

	// layout is the memory layout.
	//
	// layout is set by the binary loader before the MemoryManager can be used.
	layout arch.MmapLayout

	// privateRefs stores reference counts for private memory (memory whose
	// ownership is shared by one or more pmas instead of being owned by a
	// memmap.Mappable).
	//
	// privateRefs is immutable.
	privateRefs *privateRefs

	// users is the number of dependencies on the mappings in the MemoryManager.
	// When users reaches zero, all mappings are unmapped.
	//
	// users is accessed using atomic memory operations.
	users int32

	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
	mappingMu sync.RWMutex `state:"nosave"`

	// vmas stores virtual memory areas. Since vmas are stored by value,
	// clients should usually use vmaIterator.ValuePtr() instead of
	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
	//
	// Invariants: vmas are always page-aligned.
	//
	// vmas is protected by mappingMu.
	vmas vmaSet

	// brk is the mm's brk, which is manipulated using the brk(2) system call.
	// The brk is initially set up by the loader which maps an executable
	// binary into the mm.
	//
	// brk is protected by mappingMu.
	brk usermem.AddrRange

	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
	//
	// usageAS is protected by mappingMu.
	usageAS uint64

	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
	// memmap.MLockNone.
	//
	// lockedAS is protected by mappingMu.
	lockedAS uint64

	// dataAS is the size of private data segments, like mm_struct->data_vm.
	// It counts the vmas that are private, writable, and not stack.
	//
	// dataAS is protected by mappingMu.
	dataAS uint64

	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
	// defMLockMode is greater.
	//
	// defMLockMode is protected by mappingMu.
	defMLockMode memmap.MLockMode

	// activeMu is loosely analogous to Linux's struct
	// mm_struct::page_table_lock.
	activeMu sync.RWMutex `state:"nosave"`

	// pmas stores platform mapping areas used to implement vmas. Since pmas
	// are stored by value, clients should usually use pmaIterator.ValuePtr()
	// instead of pmaIterator.Value() to get a pointer to the pma rather than
	// a copy.
	//
	// Inserting or removing segments from pmas should happen along with a
	// call to mm.insertRSS or mm.removeRSS.
	// NOTE(review): the only RSS helper visible nearby is mm.addRSSLocked
	// (called by Fork); confirm the current helper names.
	//
	// Invariants: pmas are always page-aligned. If a pma exists for a given
	// address, a vma must also exist for that address.
	//
	// pmas is protected by activeMu.
	pmas pmaSet

	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
	// reported as the MemoryManager's RSS.
	//
	// curRSS should be modified only via insertRSS and removeRSS, not
	// directly.
	//
	// curRSS is protected by activeMu.
	curRSS uint64

	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
	// It is tracked as the application adds and removes mappings to pmas.
	//
	// maxRSS should be modified only via insertRSS, not directly.
	//
	// maxRSS is protected by activeMu.
	maxRSS uint64

	// as is the platform.AddressSpace that pmas are mapped into. active is the
	// number of contexts that require as to be non-nil; if active == 0, as may
	// be nil.
	//
	// as is protected by activeMu. active is manipulated with atomic memory
	// operations; transitions to and from zero are additionally protected by
	// activeMu. (This is because such transitions may need to be atomic with
	// changes to as.)
	as     platform.AddressSpace `state:"nosave"`
	active int32                 `state:"zerovalue"`

	// unmapAllOnActivate indicates that the next Activate call should activate
	// an empty AddressSpace.
	//
	// This is used to ensure that an AddressSpace cached in
	// NewAddressSpace is not used after some change in the MemoryManager
	// or VMAs has made that AddressSpace stale.
	//
	// unmapAllOnActivate is protected by activeMu. It must only be set when
	// there is no active or cached AddressSpace. If as != nil, then
	// invalidations should be propagated immediately.
	unmapAllOnActivate bool `state:"nosave"`

	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
	// in capturedInvalidations rather than being applied immediately to pmas.
	// This is to avoid a race condition in MM.Fork(); see that function for
	// details.
	//
	// Both captureInvalidations and capturedInvalidations are protected by
	// activeMu. Neither needs to be saved since captureInvalidations is only
	// enabled during MM.Fork(), during which saving can't occur.
	captureInvalidations  bool             `state:"zerovalue"`
	capturedInvalidations []invalidateArgs `state:"nosave"`

	// metadataMu protects the metadata fields below that are documented as
	// such (argv, envv, auxv, executable, dumpability).
	metadataMu sync.Mutex `state:"nosave"`

	// argv is the application argv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
	// requirements apply to argv; we do not require that argv.WellFormed().
	//
	// argv is protected by metadataMu.
	argv usermem.AddrRange

	// envv is the application envv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
	// requirements apply to envv; we do not require that envv.WellFormed().
	//
	// envv is protected by metadataMu.
	envv usermem.AddrRange

	// auxv is the ELF's auxiliary vector.
	//
	// auxv is protected by metadataMu.
	auxv arch.Auxv

	// executable is the executable for this MemoryManager. If executable
	// is not nil, the MemoryManager holds a reference on it.
	// NOTE(review): this comment previously said the reference is "on the
	// Dirent"; the field type is fsbridge.File - confirm the intended wording.
	//
	// executable is protected by metadataMu.
	executable fsbridge.File

	// dumpability describes if and how this MemoryManager may be dumped to
	// userspace.
	//
	// dumpability is protected by metadataMu.
	dumpability Dumpability

	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
	// must be cloned when CLONE_VM is used.
	aioManager aioManager

	// sleepForActivation indicates whether the task should report to be sleeping
	// before trying to activate the address space. When set to true, delays in
	// activation are not reported as stuck tasks by the watchdog.
	sleepForActivation bool

	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
	vdsoSigReturnAddr uint64
}
+// +// +stateify savable +type vma struct { + // mappable is the virtual memory object mapped by this vma. If mappable is + // nil, the vma represents a private anonymous mapping. + mappable memmap.Mappable + + // off is the offset into mappable at which this vma begins. If mappable is + // nil, off is meaningless. + off uint64 + + // To speedup VMA save/restore, we group and save the following booleans + // as a single integer. + + // realPerms are the memory permissions on this vma, as defined by the + // application. + realPerms usermem.AccessType `state:".(int)"` + + // effectivePerms are the memory permissions on this vma which are + // actually used to control access. + // + // Invariant: effectivePerms == realPerms.Effective(). + effectivePerms usermem.AccessType `state:"manual"` + + // maxPerms limits the set of permissions that may ever apply to this + // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions + // is true (e.g. ptrace(PTRACE_POKEDATA)). + // + // Invariant: maxPerms == maxPerms.Effective(). + maxPerms usermem.AccessType `state:"manual"` + + // private is true if this is a MAP_PRIVATE mapping, such that writes to + // the mapping are propagated to a copy. + private bool `state:"manual"` + + // growsDown is true if the mapping may be automatically extended downward + // under certain conditions. If growsDown is true, mappable must be nil. + // + // There is currently no corresponding growsUp flag; in Linux, the only + // architectures that can have VM_GROWSUP mappings are ia64, parisc, and + // metag, none of which we currently support. + growsDown bool `state:"manual"` + + // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). + dontfork bool + + mlockMode memmap.MLockMode + + // numaPolicy is the NUMA policy for this vma set by mbind(). + numaPolicy linux.NumaPolicy + + // numaNodemask is the NUMA nodemask for this vma set by mbind(). 
+ numaNodemask uint64 + + // If id is not nil, it controls the lifecycle of mappable and provides vma + // metadata shown in /proc/[pid]/maps, and the vma holds a reference. + id memmap.MappingIdentity + + // If hint is non-empty, it is a description of the vma printed in + // /proc/[pid]/maps. hint takes priority over id.MappedName(). + hint string +} + +const ( + vmaRealPermsRead = 1 << iota + vmaRealPermsWrite + vmaRealPermsExecute + vmaEffectivePermsRead + vmaEffectivePermsWrite + vmaEffectivePermsExecute + vmaMaxPermsRead + vmaMaxPermsWrite + vmaMaxPermsExecute + vmaPrivate + vmaGrowsDown +) + +func (v *vma) saveRealPerms() int { + var b int + if v.realPerms.Read { + b |= vmaRealPermsRead + } + if v.realPerms.Write { + b |= vmaRealPermsWrite + } + if v.realPerms.Execute { + b |= vmaRealPermsExecute + } + if v.effectivePerms.Read { + b |= vmaEffectivePermsRead + } + if v.effectivePerms.Write { + b |= vmaEffectivePermsWrite + } + if v.effectivePerms.Execute { + b |= vmaEffectivePermsExecute + } + if v.maxPerms.Read { + b |= vmaMaxPermsRead + } + if v.maxPerms.Write { + b |= vmaMaxPermsWrite + } + if v.maxPerms.Execute { + b |= vmaMaxPermsExecute + } + if v.private { + b |= vmaPrivate + } + if v.growsDown { + b |= vmaGrowsDown + } + return b +} + +func (v *vma) loadRealPerms(b int) { + if b&vmaRealPermsRead > 0 { + v.realPerms.Read = true + } + if b&vmaRealPermsWrite > 0 { + v.realPerms.Write = true + } + if b&vmaRealPermsExecute > 0 { + v.realPerms.Execute = true + } + if b&vmaEffectivePermsRead > 0 { + v.effectivePerms.Read = true + } + if b&vmaEffectivePermsWrite > 0 { + v.effectivePerms.Write = true + } + if b&vmaEffectivePermsExecute > 0 { + v.effectivePerms.Execute = true + } + if b&vmaMaxPermsRead > 0 { + v.maxPerms.Read = true + } + if b&vmaMaxPermsWrite > 0 { + v.maxPerms.Write = true + } + if b&vmaMaxPermsExecute > 0 { + v.maxPerms.Execute = true + } + if b&vmaPrivate > 0 { + v.private = true + } + if b&vmaGrowsDown > 0 { + v.growsDown = true + } +} + 
// pma represents a platform mapping area.
//
// +stateify savable
type pma struct {
	// file is the file mapped by this pma. Only pmas for which file ==
	// MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to
	// the corresponding file range while they exist.
	// NOTE(review): the comment on off below says pmas do *not* hold
	// references on offsets in file; the two statements read as
	// contradictory - confirm which one is current.
	file platform.File `state:"nosave"`

	// off is the offset into file at which this pma begins.
	//
	// Note that pmas do *not* hold references on offsets in file! If private
	// is true, MemoryManager.privateRefs holds the reference instead. If
	// private is false, the corresponding memmap.Mappable holds the reference
	// instead (per memmap.Mappable.Translate requirement).
	off uint64

	// translatePerms is the permissions returned by memmap.Mappable.Translate.
	// If private is true, translatePerms is usermem.AnyAccess.
	translatePerms usermem.AccessType

	// effectivePerms is the permissions allowed for non-ignorePermissions
	// accesses. maxPerms is the permissions allowed for ignorePermissions
	// accesses. These are vma.effectivePerms and vma.maxPerms respectively,
	// masked by pma.translatePerms and with Write disallowed if pma.needCOW is
	// true.
	//
	// These are stored in the pma so that the IO implementation can avoid
	// iterating mm.vmas when pmas already exist.
	effectivePerms usermem.AccessType
	maxPerms       usermem.AccessType

	// needCOW is true if writes to the mapping must be propagated to a copy.
	needCOW bool

	// private is true if this pma represents private memory.
	//
	// If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma
	// holds a reference on the mapped memory that is tracked in privateRefs,
	// and calls to Invalidate for which
	// memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma.
	//
	// If private is false, this pma caches a translation from the
	// corresponding vma's memmap.Mappable.Translate.
	private bool

	// If internalMappings is not empty, it is the cached return value of
	// file.MapInternal for the platform.FileRange mapped by this pma.
	internalMappings safemem.BlockSeq `state:"nosave"`
}

// privateRefs holds the reference counts for private memory, guarded by mu.
//
// +stateify savable
type privateRefs struct {
	mu sync.Mutex `state:"nosave"`

	// refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of
	// pmas (or, equivalently, MemoryManagers) that share ownership of the
	// memory at that offset.
	refs fileRefcountSet
}

// invalidateArgs records the arguments of one captured invalidation: the
// address range and the options it was requested with.
type invalidateArgs struct {
	ar   usermem.AddrRange
	opts memmap.InvalidateOpts
}

// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet.
type fileRefcountSetFunctions struct{}

// MinKey returns the smallest key, 0.
func (fileRefcountSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey returns the largest key, the maximum uint64 value.
func (fileRefcountSetFunctions) MaxKey() uint64 {
	return ^uint64(0)
}

// ClearValue does nothing.
func (fileRefcountSetFunctions) ClearValue(_ *int32) {
}

// Merge reports that two segments can be merged only when their reference
// counts are equal; the returned count is rc1.
func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) {
	return rc1, rc1 == rc2
}

// Split gives both halves of a split segment the same reference count rc.
func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) {
	return rc, rc
}
+ +package mm + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func testMemoryManager(ctx context.Context) *MemoryManager { + p := platform.FromContext(ctx) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + mm := NewMemoryManager(p, mfp, false) + mm.layout = arch.MmapLayout{ + MinAddr: p.MinUserAddress(), + MaxAddr: p.MaxUserAddress(), + BottomUpBase: p.MinUserAddress(), + TopDownBase: p.MaxUserAddress(), + } + return mm +} + +func (mm *MemoryManager) realUsageAS() uint64 { + return uint64(mm.vmas.Span()) +} + +func TestUsageASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 2 * usermem.PageSize, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + realUsage := mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realUsage = mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } +} + +func (mm *MemoryManager) realDataAS() uint64 { + var sz uint64 + for seg := mm.vmas.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + vma := seg.Value() + if vma.isPrivateDataLocked() { + sz += uint64(seg.Range().Length()) + } + } + return sz +} + +func TestDataASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 3 * usermem.PageSize, + Private: true, + 
Perms: usermem.Write, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + if mm.dataAS == 0 { + t.Fatalf("dataAS is 0, wanted not 0") + } + realDataAS := mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MProtect(addr+usermem.PageSize, usermem.PageSize, usermem.Read, false) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MRemap(ctx, addr+2*usermem.PageSize, usermem.PageSize, 2*usermem.PageSize, MRemapOpts{ + Move: MRemapMayMove, + }) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } +} + +func TestBrkDataLimitUpdates(t *testing.T) { + limitSet := limits.NewLimitSet() + limitSet.Set(limits.Data, limits.Limit{}, true /* privileged */) // zero RLIMIT_DATA + + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + // Try to extend the brk by one page and expect doing so to fail. + oldBrk, _ := mm.Brk(ctx, 0) + if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk { + t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x", oldBrk, newBrk) + } +} + +// TestIOAfterUnmap ensures that IO fails after unmap. 
func TestIOAfterUnmap(t *testing.T) {
	ctx := contexttest.Context(t)
	mm := testMemoryManager(ctx)
	defer mm.DecUsers(ctx)

	// Map a single private, readable page to perform IO against.
	addr, err := mm.MMap(ctx, memmap.MMapOpts{
		Length:   usermem.PageSize,
		Private:  true,
		Perms:    usermem.Read,
		MaxPerms: usermem.AnyAccess,
	})
	if err != nil {
		t.Fatalf("MMap got err %v want nil", err)
	}

	// IO works before munmap.
	b := make([]byte, 1)
	n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
	if err != nil {
		t.Errorf("CopyIn got err %v want nil", err)
	}
	if n != 1 {
		t.Errorf("CopyIn got %d want 1", n)
	}

	err = mm.MUnmap(ctx, addr, usermem.PageSize)
	if err != nil {
		t.Fatalf("MUnmap got err %v want nil", err)
	}

	// After munmap, the same CopyIn must fail with EFAULT and copy nothing.
	n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{})
	if err != syserror.EFAULT {
		t.Errorf("CopyIn got err %v want EFAULT", err)
	}
	if n != 0 {
		t.Errorf("CopyIn got %d want 0", n)
	}
}

// TestIOAfterMProtect tests IO interaction with mprotect permissions: once a
// page is made read-only, CopyOut must fail unless IgnorePermissions is set.
func TestIOAfterMProtect(t *testing.T) {
	ctx := contexttest.Context(t)
	mm := testMemoryManager(ctx)
	defer mm.DecUsers(ctx)

	// Map a single private, read-write page; MaxPerms stays AnyAccess so
	// that permission-ignoring writes remain possible after mprotect.
	addr, err := mm.MMap(ctx, memmap.MMapOpts{
		Length:   usermem.PageSize,
		Private:  true,
		Perms:    usermem.ReadWrite,
		MaxPerms: usermem.AnyAccess,
	})
	if err != nil {
		t.Fatalf("MMap got err %v want nil", err)
	}

	// Writing works before mprotect.
	b := make([]byte, 1)
	n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
	if err != nil {
		t.Errorf("CopyOut got err %v want nil", err)
	}
	if n != 1 {
		t.Errorf("CopyOut got %d want 1", n)
	}

	// Drop write permission on the page.
	err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false)
	if err != nil {
		t.Errorf("MProtect got err %v want nil", err)
	}

	// Without IgnorePermissions, CopyOut should no longer succeed.
	n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{})
	if err != syserror.EFAULT {
		t.Errorf("CopyOut got err %v want EFAULT", err)
	}
	if n != 0 {
		t.Errorf("CopyOut got %d want 0", n)
	}

	// With IgnorePermissions, CopyOut should succeed despite mprotect.
	n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{
		IgnorePermissions: true,
	})
	if err != nil {
		t.Errorf("CopyOut got err %v want nil", err)
	}
	if n != 1 {
		t.Errorf("CopyOut got %d want 1", n)
	}
}
diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go
new file mode 100644
index 000000000..62e4c20af
--- /dev/null
+++ b/pkg/sentry/mm/pma.go
@@ -0,0 +1,1036 @@
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/safecopy"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions: mm.activeMu must be locked. ar.Length() != 0.
+func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + first := mm.pmas.FindSegment(ar.Start) + pseg := first + for pseg.Ok() { + pma := pseg.ValuePtr() + perms := pma.effectivePerms + if ignorePermissions { + perms = pma.maxPerms + } + if !perms.SupersetOf(at) { + return pmaIterator{} + } + if needInternalMappings && pma.internalMappings.IsEmpty() { + return pmaIterator{} + } + + if ar.End <= pseg.End() { + return first + } + pseg, _ = pseg.NextNonEmpty() + } + + // Ran out of pmas before reaching ar.End. + return pmaIterator{} +} + +// existingVecPMAsLocked returns true if pmas exist for all addresses in ars, +// and support access of type (at, ignorePermissions). +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) bool { + for ; !ars.IsEmpty(); ars = ars.Tail() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() { + return false + } + } + return true +} + +// getPMAsLocked ensures that pmas exist for all addresses in ar, and support +// access of type at. It returns: +// +// - An iterator to the pma containing ar.Start. If no pma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). 
vmas must exist +// for all addresses in ar, and support accesses of type at (i.e. permission +// checks must have been performed against vmas). +func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, perr + } + // getPMAsInternalLocked may not have returned pstart due to iterator + // invalidation. + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + if perr != nil { + return pstart, pend, perr + } + return pstart, pend, alignerr +} + +// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and +// support access of type at. It returns the subset of ars for which pmas +// exist. If this is not equal to ars, it returns a non-nil error explaining +// why. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vmas must exist for all addresses in ars, and support accesses of +// type at (i.e. permission checks must have been performed against vmas). 
+func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if checkInvariants { + if !ar.WellFormed() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at) + if perr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + if alignerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr + } + } + + return ars, nil +} + +// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following +// exceptions: +// +// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that +// is, the returned iterator may be terminal, even if a pma that contains +// ar.Start exists). Returning this iterator on a best-effort basis allows +// callers that require it to use it when it's cheaply available, while also +// avoiding the overhead of retrieving it when it's not. +// +// - getPMAsInternalLocked additionally requires that ar is page-aligned. +// +// getPMAsInternalLocked is an implementation helper for getPMAsLocked and +// getVecPMAsLocked; other clients should call one of those instead. 
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	mf := mm.mfp.MemoryFile()
	// Limit the range we allocate to ar, aligned to privateAllocUnit.
	maskAR := privateAligned(ar)
	// didUnmapAS records whether unmapASLocked(maskAR) has already been
	// called, so that it is called at most once per invocation.
	didUnmapAS := false
	// The range in which we iterate vmas and pmas is still limited to ar, to
	// ensure that we don't allocate or COW-break a pma we don't need.
	pseg, pgap := mm.pmas.Find(ar.Start)
	// pstart is the best-effort first return value; it is reset to a
	// terminal iterator whenever the pma set is mutated.
	pstart := pseg
	for {
		// Get pmas for this vma.
		vsegAR := vseg.Range().Intersect(ar)
		vma := vseg.ValuePtr()
	pmaLoop:
		for {
			switch {
			case pgap.Ok() && pgap.Start() < vsegAR.End:
				// Need a pma here.
				optAR := vseg.Range().Intersect(pgap.Range())
				if checkInvariants {
					if optAR.Length() <= 0 {
						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
					}
				}
				if vma.mappable == nil {
					// Private anonymous mappings get pmas by allocating.
					allocAR := optAR.Intersect(maskAR)
					fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous)
					if err != nil {
						return pstart, pgap, err
					}
					if checkInvariants {
						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
						}
					}
					mm.addRSSLocked(allocAR)
					mm.incPrivateRef(fr)
					mf.IncRef(fr)
					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
						file:           mf,
						off:            fr.Start,
						translatePerms: usermem.AnyAccess,
						effectivePerms: vma.effectivePerms,
						maxPerms:       vma.maxPerms,
						// Since we just allocated this memory and have the
						// only reference, the new pma does not need
						// copy-on-write.
						private: true,
					}).NextNonEmpty()
					pstart = pmaIterator{} // iterators invalidated
				} else {
					// Other mappings get pmas by translating.
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := at
					if vma.private {
						// This pma will be copy-on-write; don't require write
						// permission, but do require read permission to
						// facilitate the copy.
						//
						// If at.Write is true, we will need to break
						// copy-on-write immediately, which occurs after
						// translation below.
						perms.Read = true
						perms.Write = false
					}
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Install a pma for each translation.
					if len(ts) == 0 {
						return pstart, pgap, err
					}
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							// Pmas of private vmas backed by a mappable are
							// installed non-writable and marked needCOW.
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						mm.addRSSLocked(newpmaAR)
						t.File.IncRef(t.FileRange())
						// This is valid because memmap.Mappable.Translate is
						// required to return Translations in increasing
						// Translation.Source order.
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
						return pstart, pgap, err
					}
					// Rewind pseg to the first pma inserted and continue the
					// loop to check if we need to break copy-on-write.
					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
					continue
				}

			case pseg.Ok() && pseg.Start() < vsegAR.End:
				oldpma := pseg.ValuePtr()
				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
					// Break copy-on-write by copying.
					if checkInvariants {
						if !oldpma.maxPerms.Read {
							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
						}
					}
					// The majority of copy-on-write breaks on executable pages
					// come from:
					//
					// - The ELF loader, which must zero out bytes on the last
					// page of each segment after the end of the segment.
					//
					// - gdb's use of ptrace to insert breakpoints.
					//
					// Neither of these cases has enough spatial locality to
					// benefit from copying nearby pages, so if the vma is
					// executable, only copy the pages required.
					var copyAR usermem.AddrRange
					if vseg.ValuePtr().effectivePerms.Execute {
						copyAR = pseg.Range().Intersect(ar)
					} else {
						copyAR = pseg.Range().Intersect(maskAR)
					}
					// Get internal mappings from the pma to copy from.
					if err := pseg.getInternalMappingsLocked(); err != nil {
						return pstart, pseg.PrevGap(), err
					}
					// Copy contents.
					fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
					if _, ok := err.(safecopy.BusError); ok {
						// If we got SIGBUS during the copy, deliver SIGBUS to
						// userspace (instead of SIGSEGV) if we're breaking
						// copy-on-write due to application page fault.
						err = &memmap.BusError{err}
					}
					if fr.Length() == 0 {
						return pstart, pseg.PrevGap(), err
					}
					// Unmap all of maskAR, not just copyAR, to minimize host
					// syscalls. AddressSpace mappings must be removed before
					// mm.decPrivateRef().
					if !didUnmapAS {
						mm.unmapASLocked(maskAR)
						didUnmapAS = true
					}
					// Replace the pma with a copy in the part of the address
					// range where copying was successful. This doesn't change
					// RSS.
					copyAR.End = copyAR.Start + usermem.Addr(fr.Length())
					if copyAR != pseg.Range() {
						pseg = mm.pmas.Isolate(pseg, copyAR)
						pstart = pmaIterator{} // iterators invalidated
					}
					// Re-fetch the value pointer: Isolate may have returned a
					// different segment.
					oldpma = pseg.ValuePtr()
					if oldpma.private {
						mm.decPrivateRef(pseg.fileRange())
					}
					oldpma.file.DecRef(pseg.fileRange())
					mm.incPrivateRef(fr)
					mf.IncRef(fr)
					oldpma.file = mf
					oldpma.off = fr.Start
					oldpma.translatePerms = usermem.AnyAccess
					oldpma.effectivePerms = vma.effectivePerms
					oldpma.maxPerms = vma.maxPerms
					oldpma.needCOW = false
					oldpma.private = true
					oldpma.internalMappings = safemem.BlockSeq{}
					// Try to merge the pma with its neighbors.
					if prev := pseg.PrevSegment(); prev.Ok() {
						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					if next := pseg.NextSegment(); next.Ok() {
						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					// The error returned by AllocateAndFill is only
					// significant if it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pseg.NextGap(), err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					pseg, pgap = pseg.NextNonEmpty()
				} else if !oldpma.translatePerms.SupersetOf(at) {
					// Get new pmas (with sufficient permissions) by calling
					// memmap.Mappable.Translate again.
					if checkInvariants {
						if oldpma.private {
							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
						}
					}
					// Allow the entire pma to be replaced.
					optAR := pseg.Range()
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := oldpma.translatePerms.Union(at)
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Remove the part of the existing pma covered by new
					// Translations, then insert new pmas. This doesn't change
					// RSS. Note that we don't need to call unmapASLocked: any
					// existing AddressSpace mappings are still valid (though
					// less permissive than the new pmas indicate) until
					// Invalidate is called, and will be replaced by future
					// calls to mapASLocked.
					if len(ts) == 0 {
						return pstart, pseg.PrevGap(), err
					}
					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
					transAR := vseg.addrRangeOf(transMR)
					pseg = mm.pmas.Isolate(pseg, transAR)
					pseg.ValuePtr().file.DecRef(pseg.fileRange())
					pgap = mm.pmas.Remove(pseg)
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						t.File.IncRef(t.FileRange())
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pgap, err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					if pgap.Range().Length() == 0 {
						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
					} else {
						pseg = pmaIterator{}
					}
				} else {
					// We have a usable pma; continue.
					pseg, pgap = pseg.NextNonEmpty()
				}

			default:
				// Neither a gap nor a pma begins before vsegAR.End: this vma
				// is fully handled.
				break pmaLoop
			}
		}
		// Go to the next vma.
		if ar.End <= vseg.End() {
			if pgap.Ok() {
				return pstart, pgap, nil
			}
			return pstart, pseg.PrevGap(), nil
		}
		vseg = vseg.NextSegment()
	}
}

const (
	// When memory is allocated for a private pma, align the allocated address
	// range to a privateAllocUnit boundary when possible. Larger values of
	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
	// to be mapped, but may result in larger amounts of wasted memory in the
	// presence of fragmentation. privateAllocUnit must be a power-of-2
	// multiple of usermem.PageSize.
	privateAllocUnit = usermem.HugePageSize

	// privateAllocMask has the address bits below a privateAllocUnit
	// boundary set.
	privateAllocMask = privateAllocUnit - 1
)

// privateAligned returns ar expanded outward to privateAllocUnit boundaries:
// the start is rounded down and the end is rounded up. If rounding the end up
// wraps around (yields a value below ar.End), ar.End is kept unchanged.
func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
	aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End}
	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
		aligned.End = end
	}
	if checkInvariants {
		if !aligned.IsSupersetOf(ar) {
			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
		}
	}
	return aligned
}

// isPMACopyOnWriteLocked returns true if the contents of the pma represented
// by pseg must be copied to a new private pma to be written to.
//
// If the pma is a copy-on-write private pma, and holds the only reference on
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be
// locked. mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
	pma := pseg.ValuePtr()
	if !pma.needCOW {
		return false
	}
	if !pma.private {
		return true
	}
	// If we have the only reference on private memory to be copied, just take
	// ownership of it instead of copying. If we do hold the only reference,
	// additional references can only be taken by mm.Fork(), which is excluded
	// by mm.activeMu, so this isn't racy.
	mm.privateRefs.mu.Lock()
	defer mm.privateRefs.mu.Unlock()
	fr := pseg.fileRange()
	// This check relies on mm.privateRefs.refs being kept fully merged.
	rseg := mm.privateRefs.refs.FindSegment(fr.Start)
	if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
		pma.needCOW = false
		// pma.private => pma.translatePerms == usermem.AnyAccess
		vma := vseg.ValuePtr()
		pma.effectivePerms = vma.effectivePerms
		pma.maxPerms = vma.maxPerms
		return false
	}
	return true
}

// Invalidate implements memmap.MappingSpace.Invalidate.
//
// If mm.captureInvalidations is set, the request is appended to
// mm.capturedInvalidations instead of being applied.
func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if mm.captureInvalidations {
		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
		return
	}
	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
}

// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar. Private pmas are removed only if invalidatePrivate is
// true; non-private pmas are removed only if invalidateShared is true.
//
// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
// must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	for pseg.Ok() && pseg.Start() < ar.End {
		pma := pseg.ValuePtr()
		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
			// Trim the segment to ar so that only the invalidated part is
			// removed.
			pseg = mm.pmas.Isolate(pseg, ar)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// mm.decPrivateRef().
				mm.unmapASLocked(ar)
				didUnmapAS = true
			}
			if pma.private {
				mm.decPrivateRef(pseg.fileRange())
			}
			mm.removeRSSLocked(pseg.Range())
			pma.file.DecRef(pseg.fileRange())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		} else {
			pseg = pseg.NextSegment()
		}
	}
}

// Pin returns the platform.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
// PinnedRanges and a non-nil error.
//
// Pin does not prevent mapped ranges from changing, making it unsuitable for
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
// Preconditions: ar.Length() != 0. ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			// No usable vma covers any part of ar.
			mm.mappingMu.RUnlock()
			return nil, verr
		}
		// Continue with the prefix of ar that does have usable vmas.
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			// No pma covers any part of ar.
			mm.activeMu.Unlock()
			return nil, perr
		}
		// Continue with the prefix of ar that does have pmas.
		ar.End = pendaddr
	}

	// Gather pmas.
	var prs []PinnedRange
	for pseg.Ok() && pseg.Start() < ar.End {
		psar := pseg.Range().Intersect(ar)
		f := pseg.ValuePtr().file
		fr := pseg.fileRangeOf(psar)
		// This reference is the one released by Unpin.
		f.IncRef(fr)
		prs = append(prs, PinnedRange{
			Source: psar,
			File:   f,
			Offset: fr.Start,
		})
		pseg = pseg.NextSegment()
	}
	mm.activeMu.Unlock()

	// Return the first error in order of progress through ar.
	if perr != nil {
		return prs, perr
	}
	return prs, verr
}

// PinnedRange describes one contiguous range of addresses and the file
// offsets backing it. PinnedRanges are returned by MemoryManager.Pin.
type PinnedRange struct {
	// Source is the corresponding range of addresses.
	Source usermem.AddrRange

	// File is the mapped file.
	File platform.File

	// Offset is the offset into File at which this PinnedRange begins.
	Offset uint64
}

// FileRange returns the platform.File offsets mapped by pr.
func (pr PinnedRange) FileRange() platform.FileRange {
	return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}

// Unpin releases the reference held by prs.
func Unpin(prs []PinnedRange) {
	for i := range prs {
		prs[i].File.DecRef(prs[i].FileRange())
	}
}

// movePMAsLocked moves all pmas in oldAR to newAR.
//
// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR).
// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { + if checkInvariants { + if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) + } + if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + panic(fmt.Sprintf("invalid newAR: %v", newAR)) + } + if oldAR.Length() > newAR.Length() { + panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR)) + } + if oldAR.Overlaps(newAR) { + panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) + } + // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. + } + + type movedPMA struct { + oldAR usermem.AddrRange + pma pma + } + var movedPMAs []movedPMA + pseg := mm.pmas.LowerBoundSegment(oldAR.Start) + for pseg.Ok() && pseg.Start() < oldAR.End { + pseg = mm.pmas.Isolate(pseg, oldAR) + movedPMAs = append(movedPMAs, movedPMA{ + oldAR: pseg.Range(), + pma: pseg.Value(), + }) + pseg = mm.pmas.Remove(pseg).NextSegment() + // No RSS change is needed since we're re-inserting the same pmas + // below. + } + + off := newAR.Start - oldAR.Start + pgap := mm.pmas.FindGap(newAR.Start) + for i := range movedPMAs { + mpma := &movedPMAs[i] + pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} + pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() + } + + mm.unmapASLocked(oldAR) +} + +// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have +// cached internal mappings. It returns: +// +// - An iterator to the gap after the last pma with internal mappings +// containing an address in ar. If internal mappings exist for no addresses in +// ar, the iterator is to a gap that begins before ar.Start. +// +// - An error that is non-nil if internal mappings exist for only a subset of +// ar. +// +// Preconditions: mm.activeMu must be locked for writing. +// pseg.Range().Contains(ar.Start). 
pmas must exist for all addresses in ar. +// ar.Length() != 0. +// +// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + for { + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), err + } + if ar.End <= pseg.End() { + return pseg.NextGap(), nil + } + pseg, _ = pseg.NextNonEmpty() + } +} + +// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars +// have cached internal mappings. It returns the subset of ars for which +// internal mappings exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.activeMu must be locked for writing. pmas must exist for +// all addresses in ar. +// +// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err + } + } + return ars, nil +} + +// internalMappingsLocked returns internal mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ar. ar.Length() != 0. +// pseg.Range().Contains(ar.Start). 
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + if ar.End <= pseg.End() { + // Since only one pma is involved, we can use pma.internalMappings + // directly, avoiding a slice allocation. + offset := uint64(ar.Start - pseg.Start()) + return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) + } + + var ims []safemem.Block + for { + pr := pseg.Range().Intersect(ar) + for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + if ar.End <= pseg.End() { + break + } + pseg = pseg.NextSegment() + } + return safemem.BlockSeqFromSlice(ims) +} + +// vecInternalMappingsLocked returns internal mappings for addresses in ars. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ars. +func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { + var ims []safemem.Block + for ; !ars.IsEmpty(); ars = ars.Tail() { + ar := ars.Head() + if ar.Length() == 0 { + continue + } + for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + } + return safemem.BlockSeqFromSlice(ims) +} + +// incPrivateRef acquires a reference on private pages in fr. 
+func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { + mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + refSet := &mm.privateRefs.refs + seg, gap := refSet.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refSet.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty() + default: + refSet.MergeAdjacent(fr) + return + } + } +} + +// decPrivateRef releases a reference on private pages in fr. +func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { + var freed []platform.FileRange + + mm.privateRefs.mu.Lock() + refSet := &mm.privateRefs.refs + seg := refSet.LowerBoundSegment(fr.Start) + for seg.Ok() && seg.Start() < fr.End { + seg = refSet.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + freed = append(freed, seg.Range()) + seg = refSet.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refSet.MergeAdjacent(fr) + mm.privateRefs.mu.Unlock() + + mf := mm.mfp.MemoryFile() + for _, fr := range freed { + mf.DecRef(fr) + } +} + +// addRSSLocked updates the current and maximum resident set size of a +// MemoryManager to reflect the insertion of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) { + mm.curRSS += uint64(ar.Length()) + if mm.curRSS > mm.maxRSS { + mm.maxRSS = mm.curRSS + } +} + +// removeRSSLocked updates the current resident set size of a MemoryManager to +// reflect the removal of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) { + mm.curRSS -= uint64(ar.Length()) +} + +// pmaSetFunctions implements segment.Functions for pmaSet. 
+type pmaSetFunctions struct{} + +func (pmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (pmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (pmaSetFunctions) ClearValue(pma *pma) { + pma.file = nil + pma.internalMappings = safemem.BlockSeq{} +} + +func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) { + if pma1.file != pma2.file || + pma1.off+uint64(ar1.Length()) != pma2.off || + pma1.translatePerms != pma2.translatePerms || + pma1.effectivePerms != pma2.effectivePerms || + pma1.maxPerms != pma2.maxPerms || + pma1.needCOW != pma2.needCOW || + pma1.private != pma2.private { + return pma{}, false + } + + // Discard internal mappings instead of trying to merge them, since merging + // them requires an allocation and getting them again from the + // platform.File might not. + pma1.internalMappings = safemem.BlockSeq{} + return pma1, true +} + +func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) { + newlen1 := uint64(split - ar.Start) + p2 := p + p2.off += newlen1 + if !p.internalMappings.IsEmpty() { + p.internalMappings = p.internalMappings.TakeFirst64(newlen1) + p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) + } + return p, p2 +} + +// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do +// so by scanning linearly backward from pgap. +// +// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). 
+func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { + if checkInvariants { + if !pgap.Ok() { + panic("terminal pma iterator") + } + if addr > pgap.Start() { + panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) + } + } + // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, + // which is the case if findOrSeekPrevUpperBoundPMA is called to find the + // start of a range containing only a single PMA. + if pseg := pgap.PrevSegment(); pseg.Start() <= addr { + return pseg + } + return mm.pmas.UpperBoundSegment(addr) +} + +// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is +// non-empty. +// +// Preconditions: mm.activeMu must be locked for writing. +func (pseg pmaIterator) getInternalMappingsLocked() error { + pma := pseg.ValuePtr() + if pma.internalMappings.IsEmpty() { + // This must use maxPerms (instead of perms) because some permission + // constraints are only visible to vmas; for example, mappings of + // read-only files have vma.maxPerms.Write unset, but this may not be + // visible to the memmap.Mappable. + perms := pma.maxPerms + // We will never execute application code through an internal mapping. + perms.Execute = false + ims, err := pma.file.MapInternal(pseg.fileRange(), perms) + if err != nil { + return err + } + pma.internalMappings = ims + } + return nil +} + +func (pseg pmaIterator) fileRange() platform.FileRange { + return pseg.fileRangeOf(pseg.Range()) +} + +// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. 
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { + if checkInvariants { + if !pseg.Ok() { + panic("terminal pma iterator") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) + } + } + + pma := pseg.ValuePtr() + pstart := pseg.Start() + return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} +} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go new file mode 100644 index 000000000..6efe5102b --- /dev/null +++ b/pkg/sentry/mm/procfs.go @@ -0,0 +1,329 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // devMinorBits is the number of minor bits in a device number. 
Linux: + // include/linux/kdev_t.h:MINORBITS + devMinorBits = 20 + + vsyscallEnd = usermem.Addr(0xffffffffff601000) + vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" + vsyscallSmapsEntry = vsyscallMapsEntry + + "Size: 4 kB\n" + + "Rss: 0 kB\n" + + "Pss: 0 kB\n" + + "Shared_Clean: 0 kB\n" + + "Shared_Dirty: 0 kB\n" + + "Private_Clean: 0 kB\n" + + "Private_Dirty: 0 kB\n" + + "Referenced: 0 kB\n" + + "Anonymous: 0 kB\n" + + "AnonHugePages: 0 kB\n" + + "Shared_Hugetlb: 0 kB\n" + + "Private_Hugetlb: 0 kB\n" + + "Swap: 0 kB\n" + + "SwapPss: 0 kB\n" + + "KernelPageSize: 4 kB\n" + + "MMUPageSize: 4 kB\n" + + "Locked: 0 kB\n" + + "VmFlags: rd ex \n" +) + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (mm *MemoryManager) NeedsUpdate(generation int64) bool { + return true +} + +// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var start usermem.Addr + + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + mm.appendVMAMapsEntryLocked(ctx, vseg, buf) + } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if start != vsyscallEnd { + buf.WriteString(vsyscallMapsEntry) + } +} + +// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to +// implement /proc/[pid]/maps. 
+func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaMapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if start != vsyscallEnd { + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallMapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by +// vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + return b.Bytes() +} + +// Preconditions: mm.mappingMu must be locked. 
+func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { + vma := vseg.ValuePtr() + private := "p" + if !vma.private { + private = "s" + } + + var dev, ino uint64 + if vma.id != nil { + dev = vma.id.DeviceID() + ino = vma.id.InodeID() + } + devMajor := uint32(dev >> devMinorBits) + devMinor := uint32(dev & ((1 << devMinorBits) - 1)) + + // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => + // stack_guard_page_start(). + lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", + vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) + + // Figure out our filename or hint. + var s string + if vma.hint != "" { + s = vma.hint + } else if vma.id != nil { + // FIXME(jamieliu): We are holding mm.mappingMu here, which is + // consistent with Linux's holding mmap_sem in + // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). + // However, it's not clear that fs.File.MappedName() is actually + // consistent with this lock order. + s = vma.id.MappedName(ctx) + } + if s != "" { + // Per linux, we pad until the 74th character. + if pad := 73 - lineLen; pad > 0 { + b.WriteString(strings.Repeat(" ", pad)) + } + b.WriteString(s) + } + b.WriteString("\n") +} + +// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var start usermem.Addr + + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf) + } + + // We always emulate vsyscall, so advertise it here. See + // ReadMapsSeqFileData for additional commentary. + if start != vsyscallEnd { + buf.WriteString(vsyscallSmapsEntry) + } +} + +// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to +// implement /proc/[pid]/smaps. 
+func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaSmapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. See + // ReadMapsSeqFileData for additional commentary. + if start != vsyscallEnd { + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallSmapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated +// by vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b) + return b.Bytes() +} + +func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { + mm.appendVMAMapsEntryLocked(ctx, vseg, b) + vma := vseg.ValuePtr() + + // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of + // requiring it to be locked as a precondition, to reduce the latency + // impact of reading /proc/[pid]/smaps on concurrent performance-sensitive + // operations requiring activeMu for writing like faults. 
+ mm.activeMu.RLock() + var rss uint64 + var anon uint64 + vsegAR := vseg.Range() + for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { + psegAR := pseg.Range().Intersect(vsegAR) + size := uint64(psegAR.Length()) + rss += size + if pseg.ValuePtr().private { + anon += size + } + } + mm.activeMu.RUnlock() + + fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024) + fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024) + // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma + // is only mapped by that pma. This avoids having to query memmap.Mappables + // for reference count information on each page. As a corollary, all pages + // are accounted as "private" whether or not the vma is private; compare + // Linux's fs/proc/task_mmu.c:smaps_account(). + fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024) + fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0) + fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0) + // Pretend that all pages are dirty if the vma is writable, and clean otherwise. + clean := rss + if vma.effectivePerms.Write { + clean = 0 + } + fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024) + fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) + // Pretend that all pages are "referenced" (recently touched). + fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024) + fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024) + // Hugepages (hugetlb and THP) are not implemented. + fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0) + fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0) + fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0) + // Swap is not implemented. 
+ fmt.Fprintf(b, "Swap: %8d kB\n", 0) + fmt.Fprintf(b, "SwapPss: %8d kB\n", 0) + fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) + locked := rss + if vma.mlockMode == memmap.MLockNone { + locked = 0 + } + fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024) + + b.WriteString("VmFlags: ") + if vma.realPerms.Read { + b.WriteString("rd ") + } + if vma.realPerms.Write { + b.WriteString("wr ") + } + if vma.realPerms.Execute { + b.WriteString("ex ") + } + if vma.canWriteMappableLocked() { // VM_SHARED + b.WriteString("sh ") + } + if vma.maxPerms.Read { + b.WriteString("mr ") + } + if vma.maxPerms.Write { + b.WriteString("mw ") + } + if vma.maxPerms.Execute { + b.WriteString("me ") + } + if !vma.private { // VM_MAYSHARE + b.WriteString("ms ") + } + if vma.growsDown { + b.WriteString("gd ") + } + if vma.mlockMode != memmap.MLockNone { // VM_LOCKED + b.WriteString("lo ") + } + if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT + b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() + } + if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT + b.WriteString("ac ") + } + b.WriteString("\n") +} diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go new file mode 100644 index 000000000..f56215d9a --- /dev/null +++ b/pkg/sentry/mm/save_restore.go @@ -0,0 +1,57 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/context" +) + +// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all +// Mappables mapped by mm. +func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vma := vseg.ValuePtr(); vma.mappable != nil { + if err := vma.mappable.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +// beforeSave is invoked by stateify. +func (mm *MemoryManager) beforeSave() { + mf := mm.mfp.MemoryFile() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + if pma := pseg.ValuePtr(); pma.file != mf { + // InvalidateUnsavable should have caused all such pmas to be + // invalidated. + panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm)) + } + } +} + +// afterLoad is invoked by stateify. +func (mm *MemoryManager) afterLoad() { + mm.haveASIO = mm.p.SupportsAddressSpaceIO() + mf := mm.mfp.MemoryFile() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + pseg.ValuePtr().file = mf + } +} diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go new file mode 100644 index 000000000..6432731d4 --- /dev/null +++ b/pkg/sentry/mm/shm.go @@ -0,0 +1,66 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/shm" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// DetachShm unmaps a sysv shared memory segment. +func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error { + if addr != addr.RoundDown() { + // "... shmaddr is not aligned on a page boundary." - man shmdt(2) + return syserror.EINVAL + } + + var detached *shm.Shm + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // Find and remove the first vma containing an address >= addr that maps a + // segment originally attached at addr. + vseg := mm.vmas.LowerBoundSegment(addr) + for vseg.Ok() { + vma := vseg.ValuePtr() + if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { + detached = shm + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + break + } else { + vseg = vseg.NextSegment() + } + } + + if detached == nil { + // There is no shared memory segment attached at addr. + return syserror.EINVAL + } + + // Remove all vmas that could have been created by the same attach. 
+ end := addr + usermem.Addr(detached.EffectiveSize()) + for vseg.Ok() && vseg.End() <= end { + vma := vseg.ValuePtr() + if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + } else { + vseg = vseg.NextSegment() + } + } + + return nil +} diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go new file mode 100644 index 000000000..9ad52082d --- /dev/null +++ b/pkg/sentry/mm/special_mappable.go @@ -0,0 +1,157 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with +// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except +// that SpecialMappable takes ownership of the memory that it represents +// (_install_special_mapping() does not.) 
+// +// +stateify savable +type SpecialMappable struct { + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + fr platform.FileRange + name string +} + +// NewSpecialMappable returns a SpecialMappable that owns fr, which represents +// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The +// SpecialMappable will use the given name in /proc/[pid]/maps. +// +// Preconditions: fr.Length() != 0. +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { + m := SpecialMappable{mfp: mfp, fr: fr, name: name} + m.EnableLeakCheck("mm.SpecialMappable") + return &m +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *SpecialMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.mfp.MemoryFile().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *SpecialMappable) MappedName(ctx context.Context) string { + return m.name +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *SpecialMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *SpecialMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: vm_file is NULL, causing msync to skip it entirely. + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) error { + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. 
+func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.mfp.MemoryFile(), + Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { + // Since data is stored in pgalloc.MemoryFile, the contents of which are + // preserved across save/restore, we don't need to do anything. + return nil +} + +// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores +// the SpecialMappable's contents. +func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { + return m.mfp +} + +// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that +// store the SpecialMappable's contents. +func (m *SpecialMappable) FileRange() platform.FileRange { + return m.fr +} + +// Length returns the length of the SpecialMappable. +func (m *SpecialMappable) Length() uint64 { + return m.fr.Length() +} + +// NewSharedAnonMappable returns a SpecialMappable that implements the +// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. +// +// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux +// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should +// do the same to get non-zero device and inode IDs. 
+func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { + if length == 0 { + return nil, syserror.EINVAL + } + alignedLen, ok := usermem.Addr(length).RoundUp() + if !ok { + return nil, syserror.EINVAL + } + fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) + if err != nil { + return nil, err + } + return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go new file mode 100644 index 000000000..3f496aa9f --- /dev/null +++ b/pkg/sentry/mm/syscalls.go @@ -0,0 +1,1286 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + mrand "math/rand" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// HandleUserFault handles an application page fault. sp is the faulting +// application thread's stack pointer. +// +// Preconditions: mm.as != nil. 
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error { + ar, ok := addr.RoundDown().ToRange(usermem.PageSize) + if !ok { + return syserror.EFAULT + } + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have a usable vma. Here and below, since we are only + // asking for a single page, there is no possibility of partial success, + // and any error is immediately fatal. + mm.mappingMu.RLock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false) + if err != nil { + mm.mappingMu.RUnlock() + return err + } + + // Ensure that we have a usable pma. + mm.activeMu.Lock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return err + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // Map the faulted page into the active AddressSpace. + err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return err +} + +// MMap establishes a memory mapping. +func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) { + if opts.Length == 0 { + return 0, syserror.EINVAL + } + length, ok := usermem.Addr(opts.Length).RoundUp() + if !ok { + return 0, syserror.ENOMEM + } + opts.Length = uint64(length) + + if opts.Mappable != nil { + // Offset must be aligned. + if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) { + return 0, syserror.EINVAL + } + // Offset + length must not overflow. 
+ if end := opts.Offset + opts.Length; end < opts.Offset { + return 0, syserror.ENOMEM + } + } else { + opts.Offset = 0 + if !opts.Private { + if opts.MappingIdentity != nil { + return 0, syserror.EINVAL + } + m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if err != nil { + return 0, err + } + defer m.DecRef() + opts.MappingIdentity = m + opts.Mappable = m + } + } + + if opts.Addr.RoundDown() != opts.Addr { + // MAP_FIXED requires addr to be page-aligned; non-fixed mappings + // don't. + if opts.Fixed { + return 0, syserror.EINVAL + } + opts.Addr = opts.Addr.RoundDown() + } + + if !opts.MaxPerms.SupersetOf(opts.Perms) { + return 0, syserror.EACCES + } + if opts.Unmap && !opts.Fixed { + return 0, syserror.EINVAL + } + if opts.GrowsDown && opts.Mappable != nil { + return 0, syserror.EINVAL + } + + // Get the new vma. + mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } + vseg, ar, err := mm.createVMALocked(ctx, opts) + if err != nil { + mm.mappingMu.Unlock() + return 0, err + } + + // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. + switch { + case opts.Precommit || opts.MLockMode == memmap.MLockEager: + // Get pmas and map with precommit as requested. + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + + case opts.Mappable == nil && length <= privateAllocUnit: + // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope + // that doing so will save on future page faults. 
We only do this for + // anonymous mappings, since otherwise the cost of + // memmap.Mappable.Translate is unknown; and only for small mappings, + // to avoid needing to allocate large amounts of memory that we may + // subsequently need to checkpoint. + mm.populateVMAAndUnlock(ctx, vseg, ar, false) + + default: + mm.mappingMu.Unlock() + } + + return ar.Start, nil +} + +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. +// +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux doesn't populate inaccessible pages. See + // mm/gup.c:populate_vma_page_range. + return + } + + mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. + + // Even if we get new pmas, we can't actually map them if we don't have an + // AddressSpace. + if mm.as == nil { + mm.activeMu.Unlock() + return + } + + // Ensure that we have usable pmas. + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess) + if err != nil { + // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from + // mm/gup.c:mm_populate(). If it matters, we'll get it again when + // userspace actually tries to use the failing page. + mm.activeMu.Unlock() + return + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // As above, errors are silently ignored. + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. 
//
// Preconditions: mm.mappingMu must be locked for writing.
// vseg.Range().IsSupersetOf(ar).
//
// Postconditions: mm.mappingMu will be unlocked.
func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
	// See populateVMA above for commentary.
	if !vseg.ValuePtr().effectivePerms.Any() {
		// Nothing is populated, but the postcondition still requires that
		// mappingMu be released.
		mm.mappingMu.Unlock()
		return
	}

	mm.activeMu.Lock()

	if mm.as == nil {
		mm.activeMu.Unlock()
		mm.mappingMu.Unlock()
		return
	}

	// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
	// isn't needed at all for mapASLocked.
	mm.mappingMu.DowngradeLock()
	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
	mm.mappingMu.RUnlock()
	if err != nil {
		// The error is dropped; only activeMu is still held at this point.
		mm.activeMu.Unlock()
		return
	}

	mm.activeMu.DowngradeLock()
	mm.mapASLocked(pseg, ar, precommit)
	mm.activeMu.RUnlock()
}

// MapStack allocates the initial process stack.
//
// The stack size is taken from the RLIMIT_STACK soft limit (rounded up to a
// page), and the created range is returned. ENOMEM is returned if the
// resulting size is 0 or if the stack does not fit below the randomized stack
// end.
func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
	// maxStackSize is the maximum supported process stack size in bytes.
	//
	// This limit exists because stack growing isn't implemented, so the entire
	// process stack must be mapped up-front.
	const maxStackSize = 128 << 20

	stackSize := limits.FromContext(ctx).Get(limits.Stack)
	r, ok := usermem.Addr(stackSize.Cur).RoundUp()
	sz := uint64(r)
	if !ok {
		// RLIM_INFINITY rounds up to 0.
		sz = linux.DefaultStackSoftLimit
	} else if sz > maxStackSize {
		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
		sz = maxStackSize
	} else if sz == 0 {
		return usermem.AddrRange{}, syserror.ENOMEM
	}
	szaddr := usermem.Addr(sz)
	ctx.Debugf("Allocating stack with size of %v bytes", sz)

	// Determine the stack's desired location. Unlike Linux, address
	// randomization can't be disabled.
	stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
	if stackEnd < szaddr {
		// The subtraction below would underflow.
		return usermem.AddrRange{}, syserror.ENOMEM
	}
	stackStart := stackEnd - szaddr
	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
		Length:    sz,
		Addr:      stackStart,
		Perms:     usermem.ReadWrite,
		MaxPerms:  usermem.AnyAccess,
		Private:   true,
		GrowsDown: true,
		MLockMode: mm.defMLockMode,
		Hint:      "[stack]",
	})
	return ar, err
}

// MUnmap implements the semantics of Linux's munmap(2).
//
// EINVAL is returned if addr is not page-aligned, if length is 0, or if the
// page-rounded range overflows. Otherwise the range is unmapped and nil is
// returned.
func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
	if addr != addr.RoundDown() {
		return syserror.EINVAL
	}
	if length == 0 {
		return syserror.EINVAL
	}
	la, ok := usermem.Addr(length).RoundUp()
	if !ok {
		return syserror.EINVAL
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return syserror.EINVAL
	}

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	mm.unmapLocked(ctx, ar)
	return nil
}

// MRemapOpts specifies options to MRemap.
type MRemapOpts struct {
	// Move controls whether MRemap moves the remapped mapping to a new address.
	Move MRemapMoveMode

	// NewAddr is the new address for the remapping. NewAddr is ignored unless
	// Move is MRemapMustMove.
	NewAddr usermem.Addr
}

// MRemapMoveMode controls MRemap's moving behavior.
type MRemapMoveMode int

const (
	// MRemapNoMove prevents MRemap from moving the remapped mapping.
	MRemapNoMove MRemapMoveMode = iota

	// MRemapMayMove allows MRemap to move the remapped mapping.
	MRemapMayMove

	// MRemapMustMove requires MRemap to move the remapped mapping to
	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
	// range.
	MRemapMustMove
)

// MRemap implements the semantics of Linux's mremap(2).
func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
	// On success the returned address is either oldAddr (no-op, in-place
	// shrink, in-place growth) or the start of the new range (copy or move).

	// "Note that old_address has to be page aligned." - mremap(2)
	if oldAddr.RoundDown() != oldAddr {
		return 0, syserror.EINVAL
	}

	// Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
	// valid size. However, new_size can't be 0 after rounding.
	oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
	oldSize = uint64(oldSizeAddr)
	newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
	if !ok || newSizeAddr == 0 {
		return 0, syserror.EINVAL
	}
	newSize = uint64(newSizeAddr)

	oldEnd, ok := oldAddr.AddLength(oldSize)
	if !ok {
		return 0, syserror.EINVAL
	}

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()

	// All cases require that a vma exists at oldAddr.
	vseg := mm.vmas.FindSegment(oldAddr)
	if !vseg.Ok() {
		return 0, syserror.EFAULT
	}

	// Behavior matrix:
	//
	// Move     | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize
	// ---------+-------------+-------------------+-------------------+------------------
	//   NoMove | ENOMEM [1]  | Grow in-place     | No-op             | Shrink in-place
	//  MayMove | Copy [1]    | Grow in-place or  | No-op             | Shrink in-place
	//          |             | move              |                   |
	// MustMove | Copy        | Move and grow     | Move              | Shrink and move
	//
	// [1] In-place growth is impossible because the vma at oldAddr already
	// occupies at least part of the destination. Thus the NoMove case always
	// fails and the MayMove case always falls back to copying.

	if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
		// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
		// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
		// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
		// !CAP_IPC_LOCK.
		mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
			if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
				return 0, syserror.EAGAIN
			}
		}
	}

	if opts.Move != MRemapMustMove {
		// Handle no-ops and in-place shrinking. These cases don't care if
		// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
		// (aside from oldAddr).
		if newSize <= oldSize {
			if newSize < oldSize {
				// If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
				// either.
				newEnd := oldAddr + usermem.Addr(newSize)
				mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
			}
			return oldAddr, nil
		}

		// Handle in-place growing.

		// Check that oldEnd maps to the same vma as oldAddr.
		if vseg.End() < oldEnd {
			return 0, syserror.EFAULT
		}
		// "Grow" the existing vma by creating a new mergeable one.
		vma := vseg.ValuePtr()
		var newOffset uint64
		if vma.mappable != nil {
			newOffset = vseg.mappableRange().End
		}
		// NOTE: vseg is shadowed from here to the end of this block; the
		// outer vseg is untouched if growth fails.
		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
			Length:          newSize - oldSize,
			MappingIdentity: vma.id,
			Mappable:        vma.mappable,
			Offset:          newOffset,
			Addr:            oldEnd,
			Fixed:           true,
			Perms:           vma.realPerms,
			MaxPerms:        vma.maxPerms,
			Private:         vma.private,
			GrowsDown:       vma.growsDown,
			MLockMode:       vma.mlockMode,
			Hint:            vma.hint,
		})
		if err == nil {
			if vma.mlockMode == memmap.MLockEager {
				mm.populateVMA(ctx, vseg, ar, true)
			}
			return oldAddr, nil
		}
		// In-place growth failed. In the MRemapMayMove case, fall through to
		// copying/moving below.
		if opts.Move == MRemapNoMove {
			return 0, err
		}
	}

	// Find a location for the new mapping.
	var newAR usermem.AddrRange
	switch opts.Move {
	case MRemapMayMove:
		newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
		if err != nil {
			return 0, err
		}
		newAR, _ = newAddr.ToRange(newSize)

	case MRemapMustMove:
		newAddr := opts.NewAddr
		if newAddr.RoundDown() != newAddr {
			return 0, syserror.EINVAL
		}
		var ok bool
		newAR, ok = newAddr.ToRange(newSize)
		if !ok {
			return 0, syserror.EINVAL
		}
		// The source and destination ranges may not overlap.
		if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
			return 0, syserror.EINVAL
		}

		// Check that the new region is valid.
		_, err := mm.findAvailableLocked(newSize, findAvailableOpts{
			Addr:  newAddr,
			Fixed: true,
			Unmap: true,
		})
		if err != nil {
			return 0, err
		}

		// Unmap any mappings at the destination.
		mm.unmapLocked(ctx, newAR)

		// If the sizes specify shrinking, unmap everything between the new and
		// old sizes at the source. Unmapping before the following checks is
		// correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(),
		// vma_to_resize().
		if newSize < oldSize {
			oldNewEnd := oldAddr + usermem.Addr(newSize)
			mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
			oldEnd = oldNewEnd
		}

		// unmapLocked may have invalidated vseg; look it up again.
		vseg = mm.vmas.FindSegment(oldAddr)
	}

	oldAR := usermem.AddrRange{oldAddr, oldEnd}

	// Check that oldEnd maps to the same vma as oldAddr.
	if vseg.End() < oldEnd {
		return 0, syserror.EFAULT
	}

	// Check against RLIMIT_AS.
	newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
	if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
		return 0, syserror.ENOMEM
	}

	if vma := vseg.ValuePtr(); vma.mappable != nil {
		// Check that offset+length does not overflow.
		if vma.off+uint64(newAR.Length()) < vma.off {
			return 0, syserror.EINVAL
		}
		// Inform the Mappable, if any, of the new mapping.
		if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil {
			return 0, err
		}
	}

	if oldSize == 0 {
		// Handle copying.
		//
		// We can't use createVMALocked because it calls Mappable.AddMapping,
		// whereas we've already called Mappable.CopyMapping (which is
		// consistent with Linux). Call vseg.Value() (rather than
		// vseg.ValuePtr()) to make a copy of the vma.
		vma := vseg.Value()
		if vma.mappable != nil {
			vma.off = vseg.mappableOffsetAt(oldAR.Start)
		}
		// The copy holds its own reference on the mapping identity.
		if vma.id != nil {
			vma.id.IncRef()
		}
		vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
		mm.usageAS += uint64(newAR.Length())
		if vma.isPrivateDataLocked() {
			mm.dataAS += uint64(newAR.Length())
		}
		if vma.mlockMode != memmap.MLockNone {
			mm.lockedAS += uint64(newAR.Length())
			if vma.mlockMode == memmap.MLockEager {
				mm.populateVMA(ctx, vseg, newAR, true)
			}
		}
		return newAR.Start, nil
	}

	// Handle moving.
	//
	// Remove the existing vma before inserting the new one to minimize
	// iterator invalidation. We do this directly (instead of calling
	// removeVMAsLocked) because:
	//
	// 1. We can't drop the reference on vma.id, which will be transferred to
	// the new vma.
	//
	// 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
	// oldAR, so calling RemoveMapping could cause us to miss an invalidation
	// overlapping oldAR.
	//
	// Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the
	// vma.
	vseg = mm.vmas.Isolate(vseg, oldAR)
	vma := vseg.Value()
	mm.vmas.Remove(vseg)
	vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
	// Adjust the accounting by the difference between the old and new sizes.
	mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
	if vma.isPrivateDataLocked() {
		mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length())
	}
	if vma.mlockMode != memmap.MLockNone {
		mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
	}

	// Move pmas. This is technically optional for non-private pmas, which
	// could just go through memmap.Mappable.Translate again, but it's required
	// for private pmas.
	mm.activeMu.Lock()
	mm.movePMAsLocked(oldAR, newAR)
	mm.activeMu.Unlock()

	// Now that pmas have been moved to newAR, we can notify vma.mappable that
	// oldAR is no longer mapped.
	if vma.mappable != nil {
		vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked())
	}

	if vma.mlockMode == memmap.MLockEager {
		mm.populateVMA(ctx, vseg, newAR, true)
	}

	return newAR.Start, nil
}

// MProtect implements the semantics of Linux's mprotect(2).
//
// addr must be page-aligned (EINVAL otherwise). A length of 0 is a no-op;
// otherwise length is rounded up to a whole number of pages, and ENOMEM is
// returned if the rounded range overflows. EACCES is returned when a vma's
// maxPerms does not include the effective form of realPerms. There is no
// rollback: vmas processed before a failing one keep their new permissions.
func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
	if addr.RoundDown() != addr {
		return syserror.EINVAL
	}
	if length == 0 {
		return nil
	}
	rlength, ok := usermem.Addr(length).RoundUp()
	if !ok {
		return syserror.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(rlength))
	if !ok {
		return syserror.ENOMEM
	}
	effectivePerms := realPerms.Effective()

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// Non-growsDown mprotect requires that all of ar is mapped, and stops at
	// the first non-empty gap. growsDown mprotect requires that the first vma
	// be growsDown, but does not require it to extend all the way to ar.Start;
	// vmas after the first must be contiguous but need not be growsDown, like
	// the non-growsDown case.
	vseg := mm.vmas.LowerBoundSegment(ar.Start)
	if !vseg.Ok() {
		return syserror.ENOMEM
	}
	if growsDown {
		if !vseg.ValuePtr().growsDown {
			return syserror.EINVAL
		}
		if ar.End <= vseg.Start() {
			return syserror.ENOMEM
		}
		ar.Start = vseg.Start()
	} else {
		if ar.Start < vseg.Start() {
			return syserror.ENOMEM
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	// Re-merge vmas and pmas split by Isolate below; this runs on every
	// return path from here on, including errors.
	defer func() {
		mm.vmas.MergeRange(ar)
		mm.vmas.MergeAdjacent(ar)
		mm.pmas.MergeRange(ar)
		mm.pmas.MergeAdjacent(ar)
	}()
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	var didUnmapAS bool
	for {
		// Check for permission validity before splitting vmas, for consistency
		// with Linux.
		if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
			return syserror.EACCES
		}
		vseg = mm.vmas.Isolate(vseg, ar)

		// Update vma permissions.
		vma := vseg.ValuePtr()
		vmaLength := vseg.Range().Length()
		// isPrivateDataLocked is evaluated both before and after the
		// permission change, so dataAS is removed under the old permissions
		// and re-added under the new ones.
		if vma.isPrivateDataLocked() {
			mm.dataAS -= uint64(vmaLength)
		}

		vma.realPerms = realPerms
		vma.effectivePerms = effectivePerms
		if vma.isPrivateDataLocked() {
			mm.dataAS += uint64(vmaLength)
		}

		// Propagate vma permission changes to pmas.
		for pseg.Ok() && pseg.Start() < vseg.End() {
			if pseg.Range().Overlaps(vseg.Range()) {
				pseg = mm.pmas.Isolate(pseg, vseg.Range())
				pma := pseg.ValuePtr()
				if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS {
					// Unmap all of ar, not just vseg.Range(), to minimize host
					// syscalls.
					mm.unmapASLocked(ar)
					didUnmapAS = true
				}
				pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms)
				if pma.needCOW {
					pma.effectivePerms.Write = false
				}
			}
			pseg = pseg.NextSegment()
		}

		// Continue to the next vma.
		if ar.End <= vseg.End() {
			return nil
		}
		vseg, _ = vseg.NextNonEmpty()
		if !vseg.Ok() {
			return syserror.ENOMEM
		}
	}
}

// BrkSetup sets mm's brk address to addr and its brk size to 0.
func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// Unmap the existing brk.
	if mm.brk.Length() != 0 {
		mm.unmapLocked(ctx, mm.brk)
	}
	mm.brk = usermem.AddrRange{addr, addr}
}

// Brk implements the semantics of Linux's brk(2), except that it returns an
// error on failure.
//
// On success the returned address is addr. On failure the returned address is
// the current, unchanged end of the brk region.
func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
	mm.mappingMu.Lock()
	// Can't defer mm.mappingMu.Unlock(); see below.

	if addr < mm.brk.Start {
		addr = mm.brk.End
		mm.mappingMu.Unlock()
		return addr, syserror.EINVAL
	}

	// TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is
	// slightly more permissive than the usual data limit. In particular,
	// this only limits the size of the heap; a true RLIMIT_DATA limits the
	// size of heap + data + bss. The segment sizes need to be plumbed from
	// the loader package to fully enforce RLIMIT_DATA.
	if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
		addr = mm.brk.End
		mm.mappingMu.Unlock()
		return addr, syserror.ENOMEM
	}

	// Mappings are page-granular, so compare the page-rounded old and new
	// ends to decide whether anything must be mapped or unmapped.
	oldbrkpg, _ := mm.brk.End.RoundUp()
	newbrkpg, ok := addr.RoundUp()
	if !ok {
		addr = mm.brk.End
		mm.mappingMu.Unlock()
		return addr, syserror.EFAULT
	}

	switch {
	case oldbrkpg < newbrkpg:
		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
			Length: uint64(newbrkpg - oldbrkpg),
			Addr:   oldbrkpg,
			Fixed:  true,
			// Compare Linux's
			// arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
			Perms:    usermem.ReadWrite,
			MaxPerms: usermem.AnyAccess,
			Private:  true,
			// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
			// mm->def_flags.
			MLockMode: mm.defMLockMode,
			Hint:      "[heap]",
		})
		if err != nil {
			addr = mm.brk.End
			mm.mappingMu.Unlock()
			return addr, err
		}
		mm.brk.End = addr
		// populateVMAAndUnlock releases mappingMu itself, which is why the
		// unlock can't be deferred.
		if mm.defMLockMode == memmap.MLockEager {
			mm.populateVMAAndUnlock(ctx, vseg, ar, true)
		} else {
			mm.mappingMu.Unlock()
		}

	case newbrkpg < oldbrkpg:
		mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
		fallthrough

	default:
		mm.brk.End = addr
		mm.mappingMu.Unlock()
	}

	return addr, nil
}

// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
// depending on mode.
//
// The affected range starts at addr rounded down to a page boundary; EINVAL is
// returned if that range overflows. ENOMEM is returned if the range is not
// fully covered by vmas; vmas visited before the gap keep the new mode.
func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
	// Linux allows this to overflow.
	la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
	ar, ok := addr.RoundDown().ToRange(uint64(la))
	if !ok {
		return syserror.EINVAL
	}

	mm.mappingMu.Lock()
	// Can't defer mm.mappingMu.Unlock(); see below.

	if mode != memmap.MLockNone {
		// Check against RLIMIT_MEMLOCK.
		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
			if mlockLimit == 0 {
				mm.mappingMu.Unlock()
				return syserror.EPERM
			}
			// Bytes in ar that are already locked are not counted twice.
			if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
				mm.mappingMu.Unlock()
				return syserror.ENOMEM
			}
		}
	}

	// Check this after RLIMIT_MEMLOCK for consistency with Linux.
	if ar.Length() == 0 {
		mm.mappingMu.Unlock()
		return nil
	}

	// Apply the new mlock mode to vmas.
	var unmapped bool
	vseg := mm.vmas.FindSegment(ar.Start)
	for {
		if !vseg.Ok() {
			unmapped = true
			break
		}
		vseg = mm.vmas.Isolate(vseg, ar)
		vma := vseg.ValuePtr()
		prevMode := vma.mlockMode
		vma.mlockMode = mode
		// lockedAS only changes on a transition between unlocked and locked.
		if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
			mm.lockedAS += uint64(vseg.Range().Length())
		} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
			mm.lockedAS -= uint64(vseg.Range().Length())
		}
		if ar.End <= vseg.End() {
			break
		}
		vseg, _ = vseg.NextNonEmpty()
	}
	mm.vmas.MergeRange(ar)
	mm.vmas.MergeAdjacent(ar)
	if unmapped {
		mm.mappingMu.Unlock()
		return syserror.ENOMEM
	}

	if mode == memmap.MLockEager {
		// Ensure that we have usable pmas. Since we didn't return ENOMEM
		// above, ar must be fully covered by vmas, so we can just use
		// NextSegment below.
		mm.activeMu.Lock()
		mm.mappingMu.DowngradeLock()
		for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
			if !vseg.ValuePtr().effectivePerms.Any() {
				// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
				// case, which is converted to ENOMEM by mlock.
				mm.activeMu.Unlock()
				mm.mappingMu.RUnlock()
				return syserror.ENOMEM
			}
			_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess)
			if err != nil {
				mm.activeMu.Unlock()
				mm.mappingMu.RUnlock()
				// Linux: mm/mlock.c:__mlock_posix_error_return()
				if err == syserror.EFAULT {
					return syserror.ENOMEM
				}
				if err == syserror.ENOMEM {
					return syserror.EAGAIN
				}
				return err
			}
		}

		// Map pmas into the active AddressSpace, if we have one.
		mm.mappingMu.RUnlock()
		if mm.as != nil {
			mm.activeMu.DowngradeLock()
			err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
			mm.activeMu.RUnlock()
			if err != nil {
				return err
			}
		} else {
			mm.activeMu.Unlock()
		}
	} else {
		mm.mappingMu.Unlock()
	}

	return nil
}

// MLockAllOpts holds options to MLockAll.
type MLockAllOpts struct {
	// If Current is true, change the memory-locking behavior of all mappings
	// to Mode. If Future is true, upgrade the memory-locking behavior of all
	// future mappings to Mode. At least one of Current or Future must be true.
	Current bool
	Future  bool
	Mode    memmap.MLockMode
}

// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
// depending on opts.
//
// EINVAL is returned if neither opts.Current nor opts.Future is set. Errors
// encountered while eagerly populating pages are ignored.
func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
	if !opts.Current && !opts.Future {
		return syserror.EINVAL
	}

	mm.mappingMu.Lock()
	// Can't defer mm.mappingMu.Unlock(); see below.

	if opts.Current {
		if opts.Mode != memmap.MLockNone {
			// Check against RLIMIT_MEMLOCK.
			if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
				mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
				if mlockLimit == 0 {
					mm.mappingMu.Unlock()
					return syserror.EPERM
				}
				if uint64(mm.vmas.Span()) > mlockLimit {
					mm.mappingMu.Unlock()
					return syserror.ENOMEM
				}
			}
		}
		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
			vma := vseg.ValuePtr()
			prevMode := vma.mlockMode
			vma.mlockMode = opts.Mode
			// lockedAS only changes on a transition between unlocked and
			// locked.
			if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
				mm.lockedAS += uint64(vseg.Range().Length())
			} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
				mm.lockedAS -= uint64(vseg.Range().Length())
			}
		}
	}

	if opts.Future {
		mm.defMLockMode = opts.Mode
	}

	if opts.Current && opts.Mode == memmap.MLockEager {
		// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
		// ignores the return value of __mm_populate(), so all errors below are
		// ignored.
		//
		// Try to get usable pmas.
		mm.activeMu.Lock()
		mm.mappingMu.DowngradeLock()
		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
			if vseg.ValuePtr().effectivePerms.Any() {
				mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess)
			}
		}

		// Map all pmas into the active AddressSpace, if we have one.
		mm.mappingMu.RUnlock()
		if mm.as != nil {
			mm.activeMu.DowngradeLock()
			mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
			mm.activeMu.RUnlock()
		} else {
			mm.activeMu.Unlock()
		}
	} else {
		mm.mappingMu.Unlock()
	}
	return nil
}

// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg := mm.vmas.FindSegment(addr) + if !vseg.Ok() { + return 0, 0, syserror.EFAULT + } + vma := vseg.ValuePtr() + return vma.numaPolicy, vma.numaNodemask, nil +} + +// SetNumaPolicy implements the semantics of Linux's mbind(). +func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { + if !addr.IsPageAligned() { + return syserror.EINVAL + } + // Linux allows this to overflow. + la, _ := usermem.Addr(length).RoundUp() + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + if ar.Length() == 0 { + return nil + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + }() + vseg := mm.vmas.LowerBoundSegment(ar.Start) + lastEnd := ar.Start + for { + if !vseg.Ok() || lastEnd < vseg.Start() { + // "EFAULT: ... there was an unmapped hole in the specified memory + // range specified [sic] by addr and len." - mbind(2) + return syserror.EFAULT + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + vma.numaPolicy = policy + vma.numaNodemask = nodemask + lastEnd = vseg.End() + if ar.End <= lastEnd { + return nil + } + vseg, _ = vseg.NextNonEmpty() + } +} + +// SetDontFork implements the semantics of madvise MADV_DONTFORK. 
func (mm *MemoryManager) SetDontFork(addr usermem.Addr, length uint64, dontfork bool) error {
	ar, ok := addr.ToRange(length)
	if !ok {
		return syserror.EINVAL
	}

	mm.mappingMu.Lock()
	defer mm.mappingMu.Unlock()
	// Re-merge vmas split by Isolate below.
	defer func() {
		mm.vmas.MergeRange(ar)
		mm.vmas.MergeAdjacent(ar)
	}()

	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
		vseg = mm.vmas.Isolate(vseg, ar)
		vma := vseg.ValuePtr()
		vma.dontfork = dontfork
	}

	// The flag is applied to every vma in ar even when ar has holes; the
	// holes are only reported afterwards.
	if mm.vmas.SpanRange(ar) != ar.Length() {
		return syserror.ENOMEM
	}
	return nil
}

// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
//
// EINVAL is returned if the range overflows or if a vma in the range has a
// memory-locking mode other than MLockNone. ENOMEM is returned if parts of the
// range are unmapped, after the mapped parts have been processed.
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
	ar, ok := addr.ToRange(length)
	if !ok {
		return syserror.EINVAL
	}

	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()

	// Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
	// is analogous to our mm.invalidateLocked(ar, true, true). We inline this
	// here, with the special case that we synchronously decommit
	// uniquely-owned (non-copy-on-write) pages for private anonymous vma,
	// which is the common case for MADV_DONTNEED. Invalidating these pmas, and
	// allowing them to be reallocated when touched again, increases pma
	// fragmentation, which may significantly reduce performance for
	// non-vectored I/O implementations. Also, decommitting synchronously
	// ensures that Decommit immediately reduces host memory usage.
	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	mf := mm.mfp.MemoryFile()
	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
		vma := vseg.ValuePtr()
		if vma.mlockMode != memmap.MLockNone {
			return syserror.EINVAL
		}
		vsegAR := vseg.Range().Intersect(ar)
		// pseg should already correspond to either this vma or a later one,
		// since there can't be a pma without a corresponding vma.
		if checkInvariants {
			if pseg.Ok() && pseg.End() <= vsegAR.Start {
				panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
			}
		}
		for pseg.Ok() && pseg.Start() < vsegAR.End {
			pma := pseg.ValuePtr()
			if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) {
				psegAR := pseg.Range().Intersect(ar)
				if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
					// Fast path: decommit the backing range and leave the
					// pma in place.
					if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
						pseg = pseg.NextSegment()
						continue
					}
					// If an error occurs, fall through to the general
					// invalidation case below.
				}
			}
			// General case: drop the pma entirely.
			pseg = mm.pmas.Isolate(pseg, vsegAR)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// mm.decPrivateRef().
				mm.unmapASLocked(ar)
				didUnmapAS = true
			}
			if pma.private {
				mm.decPrivateRef(pseg.fileRange())
			}
			pma.file.DecRef(pseg.fileRange())
			mm.removeRSSLocked(pseg.Range())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		}
	}

	// "If there are some parts of the specified address space that are not
	// mapped, the Linux version of madvise() ignores them and applies the call
	// to the rest (but returns ENOMEM from the system call, as it should)." -
	// madvise(2)
	if mm.vmas.SpanRange(ar) != ar.Length() {
		return syserror.ENOMEM
	}
	return nil
}

// MSyncOpts holds options to MSync.
type MSyncOpts struct {
	// Sync has the semantics of MS_SYNC.
	Sync bool

	// Invalidate has the semantics of MS_INVALIDATE.
	Invalidate bool
}

// MSync implements the semantics of Linux's msync().
//
// addr must be page-aligned (EINVAL otherwise). A length of 0 is a no-op.
// ENOMEM is returned if the page-rounded range overflows or contains unmapped
// addresses; in the latter case the mapped parts are still synced first. EBUSY
// is returned if opts.Invalidate is set and a vma in the range is
// memory-locked.
func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
	if addr != addr.RoundDown() {
		return syserror.EINVAL
	}
	if length == 0 {
		return nil
	}
	la, ok := usermem.Addr(length).RoundUp()
	if !ok {
		return syserror.ENOMEM
	}
	ar, ok := addr.ToRange(uint64(la))
	if !ok {
		return syserror.ENOMEM
	}

	mm.mappingMu.RLock()
	// Can't defer mm.mappingMu.RUnlock(); see below.
	vseg := mm.vmas.LowerBoundSegment(ar.Start)
	if !vseg.Ok() {
		mm.mappingMu.RUnlock()
		return syserror.ENOMEM
	}
	var unmapped bool
	lastEnd := ar.Start
	// Invariant: mm.mappingMu is read-locked at the top of every iteration and
	// released on every path that leaves the loop.
	for {
		if !vseg.Ok() {
			mm.mappingMu.RUnlock()
			unmapped = true
			break
		}
		if lastEnd < vseg.Start() {
			// There is a hole between the previous vma and this one; keep
			// going, but report it at the end.
			unmapped = true
		}
		lastEnd = vseg.End()
		vma := vseg.ValuePtr()
		if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
			mm.mappingMu.RUnlock()
			return syserror.EBUSY
		}
		// It's only possible to have dirtied the Mappable through a shared
		// mapping. Don't check if the mapping is writable, because mprotect
		// may have changed this, and also because Linux doesn't.
		if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
			// We can't call memmap.MappingIdentity.Msync while holding
			// mm.mappingMu since it may take fs locks that precede it in the
			// lock order.
			id.IncRef()
			mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
			mm.mappingMu.RUnlock()
			err := id.Msync(ctx, mr)
			id.DecRef()
			if err != nil {
				return err
			}
			if lastEnd >= ar.End {
				break
			}
			// The lock was dropped, so vseg may be stale; look up the next
			// vma by address instead of advancing the iterator.
			mm.mappingMu.RLock()
			vseg = mm.vmas.LowerBoundSegment(lastEnd)
		} else {
			if lastEnd >= ar.End {
				mm.mappingMu.RUnlock()
				break
			}
			vseg = vseg.NextSegment()
		}
	}

	if unmapped {
		return syserror.ENOMEM
	}
	return nil
}

// GetSharedFutexKey is used by kernel.Task.GetSharedKey.
+func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) { + ar, ok := addr.ToRange(4) // sizeof(int32). + if !ok { + return futex.Key{}, syserror.EFAULT + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false) + if err != nil { + return futex.Key{}, err + } + vma := vseg.ValuePtr() + + if vma.private { + return futex.Key{ + Kind: futex.KindSharedPrivate, + Offset: uint64(addr), + }, nil + } + + if vma.id != nil { + vma.id.IncRef() + } + return futex.Key{ + Kind: futex.KindSharedMappable, + Mappable: vma.mappable, + MappingIdentity: vma.id, + Offset: vseg.mappableOffsetAt(addr), + }, nil +} + +// VirtualMemorySize returns the combined length in bytes of all mappings in +// mm. +func (mm *MemoryManager) VirtualMemorySize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return mm.usageAS +} + +// VirtualMemorySizeRange returns the combined length in bytes of all mappings +// in ar in mm. +func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.vmas.SpanRange(ar)) +} + +// ResidentSetSize returns the value advertised as mm's RSS in bytes. +func (mm *MemoryManager) ResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.curRSS +} + +// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. +func (mm *MemoryManager) MaxResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.maxRSS +} + +// VirtualDataSize returns the size of private data segments in mm. 
+func (mm *MemoryManager) VirtualDataSize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return mm.dataAS +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go new file mode 100644 index 000000000..16d8207e9 --- /dev/null +++ b/pkg/sentry/mm/vma.go @@ -0,0 +1,568 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// createVMALocked creates a vma as described by opts and inserts it into +// mm.vmas. On success it returns the iterator produced by the insertion and +// the address range that was mapped. It fails with ENOMEM if the mapping would +// exceed RLIMIT_AS, and with EPERM or EAGAIN if opts.MLockMode requests locking +// that RLIMIT_MEMLOCK does not permit; errors from findAvailableLocked and from +// opts.Mappable.AddMapping are returned unchanged. +// +// Preconditions: mm.mappingMu must be locked for writing. opts must be valid +// as defined by the checks in MMap. +func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { + if opts.MaxPerms != opts.MaxPerms.Effective() { + panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) + } + + // Find a usable range. + addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + Map32Bit: opts.Map32Bit, + }) + if err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + ar, _ := addr.ToRange(opts.Length) + + // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS + opts.Length + if opts.Unmap { + newUsageAS -= uint64(mm.vmas.SpanRange(ar)) + } + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM + } + + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + + // Remove overwritten mappings. This ordering is consistent with Linux: + // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), + // file->f_op->mmap(). + var vgap vmaGapIterator + if opts.Unmap { + vgap = mm.unmapLocked(ctx, ar) + } else { + vgap = mm.vmas.FindGap(ar.Start) + } + + // Inform the Mappable, if any, of the new mapping. + if opts.Mappable != nil { + // The expression for writable is vma.canWriteMappableLocked(), but we + // don't yet have a vma. + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + } + + // Take a reference on opts.MappingIdentity before inserting the vma since + // vma merging can drop the reference. + if opts.MappingIdentity != nil { + opts.MappingIdentity.IncRef() + } + + // Finally insert the vma.
+ v := vma{ + mappable: opts.Mappable, + off: opts.Offset, + realPerms: opts.Perms, + effectivePerms: opts.Perms.Effective(), + maxPerms: opts.MaxPerms, + private: opts.Private, + growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, + numaPolicy: linux.MPOL_DEFAULT, + id: opts.MappingIdentity, + hint: opts.Hint, + } + + vseg := mm.vmas.Insert(vgap, ar, v) + mm.usageAS += opts.Length + if v.isPrivateDataLocked() { + mm.dataAS += opts.Length + } + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } + + return vseg, ar, nil +} + +type findAvailableOpts struct { + // These fields are equivalent to those in memmap.MMapOpts, except that: + // + // - Addr must be page-aligned. + // + // - Unmap allows existing guard pages in the returned range. + + Addr usermem.Addr + Fixed bool + Unmap bool + Map32Bit bool +} + +// map32Start/End are the bounds to which MAP_32BIT mappings are constrained, +// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. +const ( + map32Start = 0x40000000 + map32End = 0x80000000 +) + +// findAvailableLocked finds an allocatable range of the given length and +// returns its start address. opts.Addr is returned if the range starting there +// is usable; otherwise a Fixed request fails with ENOMEM and any other request +// falls back to a search of the address space. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + if opts.Fixed { + opts.Map32Bit = false + } + allowedAR := mm.applicationAddrRange() + if opts.Map32Bit { + allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End}) + } + + // Does the provided suggestion work? + if ar, ok := opts.Addr.ToRange(length); ok { + if allowedAR.IsSupersetOf(ar) { + if opts.Unmap { + return ar.Start, nil + } + // Check for the presence of an existing vma or guard page. + if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { + return ar.Start, nil + } + } + } + + // Fixed mappings accept only the requested address. + if opts.Fixed { + return 0, syserror.ENOMEM + } + + // Prefer hugepage alignment if a hugepage or more is requested.
+ alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + if opts.Map32Bit { + return mm.findLowestAvailableLocked(length, alignment, allowedAR) + } + if mm.layout.DefaultDirection == arch.MmapBottomUp { + return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) + } + return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) +} + +func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { + return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} +} + +// findLowestAvailableLocked walks the gaps starting at +// mm.vmas.LowerBoundGap(bounds.Start) and returns the start of the first +// available range within bounds that can hold length bytes, or ENOMEM if there +// is none. The result is rounded up to alignment when the gap has room for +// that; otherwise it is returned unaligned. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift up to match the alignment? + if offset := uint64(gr.Start) % alignment; offset != 0 { + if uint64(gr.Length()) >= length+alignment-offset { + // Yes, the range still fits after rounding up. + return gr.Start + usermem.Addr(alignment-offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return gr.Start, nil + } + } + return 0, syserror.ENOMEM +} + +// findHighestAvailableLocked is the counterpart of findLowestAvailableLocked: +// it walks the gaps backward from mm.vmas.UpperBoundGap(bounds.End), places the +// range at the end of the first available range within bounds that can hold +// length bytes, and rounds the start down to alignment when the gap has room +// for that. It returns ENOMEM if no gap fits. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift down to match the alignment?
+ start := gr.End - usermem.Addr(length) + if offset := uint64(start) % alignment; offset != 0 { + if gr.Start <= start-usermem.Addr(offset) { + // Yes, the range still fits after rounding down. + return start - usermem.Addr(offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return start, nil + } + } + return 0, syserror.ENOMEM +} + +// mlockedBytesRangeLocked returns the number of bytes in ar that belong to +// vmas whose mlockMode is not memmap.MLockNone. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + +// getVMAsLocked ensures that vmas exist for all addresses in ar, and support +// access of type (at, ignorePermissions). It returns: +// +// - An iterator to the vma containing ar.Start. If no vma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last vma containing an address in ar. If +// vmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if vmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. ar.Length() != 0. +func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if + // !vbegin.Ok(). + vbegin, vgap := mm.vmas.Find(ar.Start) + if !vbegin.Ok() { + vbegin = vgap.NextSegment() + // vseg.Ok() is checked before entering the following loop.
+ } else { + vgap = vbegin.PrevGap() + } + + addr := ar.Start + vseg := vbegin + for vseg.Ok() { + // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). + vma := vseg.ValuePtr() + if addr < vseg.Start() { + // TODO(jamieliu): Implement vma.growsDown here. + return vbegin, vgap, syserror.EFAULT + } + + perms := vma.effectivePerms + if ignorePermissions { + perms = vma.maxPerms + } + if !perms.SupersetOf(at) { + return vbegin, vgap, syserror.EPERM + } + + addr = vseg.End() + vgap = vseg.NextGap() + if addr >= ar.End { + return vbegin, vgap, nil + } + vseg = vgap.NextSegment() + } + + // Ran out of vmas before ar.End. + return vbegin, vgap, syserror.EFAULT +} + +// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and +// support access of type (at, ignorePermissions). It returns the subset of +// ars for which vmas exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. +// +// Postconditions: ars is not mutated. +func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil { + return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err + } + } + return ars, nil +} + +// vma extension will not shrink the number of unmapped bytes between the start +// of a growsDown vma and the end of its predecessor non-growsDown vma below +// guardBytes. +// +// guardBytes is equivalent to Linux's stack_guard_gap after upstream +// 1be7107fbe18 "mm: larger stack guard gap, between vmas". +const guardBytes = 256 * usermem.PageSize + +// unmapLocked unmaps all addresses in ar and returns the resulting gap in +// mm.vmas.
+// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. +// ar must be page-aligned. +func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // AddressSpace mappings and pmas must be invalidated before + // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). + mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) + return mm.removeVMAsLocked(ctx, ar) +} + +// removeVMAsLocked removes vmas for addresses in ar and returns the resulting +// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients +// must do so before calling removeVMAsLocked. +// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + vseg, vgap := mm.vmas.Find(ar.Start) + if vgap.Ok() { + vseg = vgap.NextSegment() + } + for vseg.Ok() && vseg.Start() < ar.End { + vseg = mm.vmas.Isolate(vseg, ar) + vmaAR := vseg.Range() + vma := vseg.ValuePtr() + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) + } + if vma.id != nil { + vma.id.DecRef() + } + mm.usageAS -= uint64(vmaAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS -= uint64(vmaAR.Length()) + } + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS -= uint64(vmaAR.Length()) + } + vgap = mm.vmas.Remove(vseg) + vseg = vgap.NextSegment() + } + return vgap +} + +// canWriteMappableLocked returns true if it is possible for vma.mappable to be +// written to via this vma, i.e. 
if it is possible that +// vma.mappable.Translate(at.Write=true) may be called as a result of this vma. +// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as +// PTRACE_POKEDATA. +// +// canWriteMappableLocked is equivalent to Linux's VM_SHARED. +// +// Preconditions: mm.mappingMu must be locked. +func (vma *vma) canWriteMappableLocked() bool { + return !vma.private && vma.maxPerms.Write +} + +// isPrivateDataLocked identify the data segments - private, writable, not stack +// +// Preconditions: mm.mappingMu must be locked. +func (vma *vma) isPrivateDataLocked() bool { + return vma.realPerms.Write && vma.private && !vma.growsDown +} + +// vmaSetFunctions implements segment.Functions for vmaSet. +type vmaSetFunctions struct{} + +func (vmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (vmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (vmaSetFunctions) ClearValue(vma *vma) { + vma.mappable = nil + vma.id = nil + vma.hint = "" +} + +func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) { + if vma1.mappable != vma2.mappable || + (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || + vma1.realPerms != vma2.realPerms || + vma1.maxPerms != vma2.maxPerms || + vma1.private != vma2.private || + vma1.growsDown != vma2.growsDown || + vma1.mlockMode != vma2.mlockMode || + vma1.numaPolicy != vma2.numaPolicy || + vma1.numaNodemask != vma2.numaNodemask || + vma1.dontfork != vma2.dontfork || + vma1.id != vma2.id || + vma1.hint != vma2.hint { + return vma{}, false + } + + if vma2.id != nil { + vma2.id.DecRef() + } + return vma1, true +} + +func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) { + v2 := v + if v2.mappable != nil { + v2.off += uint64(split - ar.Start) + } + if v2.id != nil { + v2.id.IncRef() + } + return v, v2 +} + +// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). 
+func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("Mappable offset is meaningless for anonymous vma") + } + if !vseg.Range().Contains(addr) { + panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return vma.off + uint64(addr-vstart) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +func (vseg vmaIterator) mappableRange() memmap.MappableRange { + return vseg.mappableRangeOf(vseg.Range()) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. +func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. 
+func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !mr.WellFormed() || mr.Length() <= 0 { + panic(fmt.Sprintf("invalid mr: %v", mr)) + } + if !vseg.mappableRange().IsSupersetOf(mr) { + panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)} +} + +// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by +// scanning linearly forward from vseg. +// +// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if addr < vseg.Start() { + panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) + } + } + for vseg.Ok() && addr >= vseg.End() { + vseg = vseg.NextSegment() + } + return vseg +} + +// availableRange returns the subset of vgap.Range() in which new vmas may be +// created without MMapOpts.Unmap == true. +func (vgap vmaGapIterator) availableRange() usermem.AddrRange { + ar := vgap.Range() + next := vgap.NextSegment() + if !next.Ok() || !next.ValuePtr().growsDown { + return ar + } + // Exclude guard pages. + if ar.Length() < guardBytes { + return usermem.AddrRange{ar.Start, ar.Start} + } + ar.End -= guardBytes + return ar +} |