summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/mm
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r--pkg/sentry/mm/BUILD3
-rw-r--r--pkg/sentry/mm/address_space.go52
-rw-r--r--pkg/sentry/mm/aio_context.go101
-rw-r--r--pkg/sentry/mm/aio_context_state.go2
-rw-r--r--pkg/sentry/mm/lifecycle.go38
-rw-r--r--pkg/sentry/mm/metadata.go24
-rw-r--r--pkg/sentry/mm/mm.go15
-rw-r--r--pkg/sentry/mm/mm_test.go2
-rw-r--r--pkg/sentry/mm/procfs.go4
-rw-r--r--pkg/sentry/mm/syscalls.go4
-rw-r--r--pkg/sentry/mm/vma.go4
11 files changed, 172 insertions, 77 deletions
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index e5729ced5..a036ce53c 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -25,6 +25,7 @@ go_template_instance(
out = "vma_set.go",
consts = {
"minDegree": "8",
+ "trackGaps": "1",
},
imports = {
"usermem": "gvisor.dev/gvisor/pkg/usermem",
@@ -105,8 +106,8 @@ go_library(
"//pkg/safecopy",
"//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/fs",
"//pkg/sentry/fs/proc/seqfile",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/futex",
"//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index e58a63deb..5c667117c 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -18,7 +18,7 @@ import (
"fmt"
"sync/atomic"
- "gvisor.dev/gvisor/pkg/atomicbitops"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -39,11 +39,18 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
//
// When this MemoryManager is no longer needed by a task, it should call
// Deactivate to release the reference.
-func (mm *MemoryManager) Activate() error {
+func (mm *MemoryManager) Activate(ctx context.Context) error {
// Fast path: the MemoryManager already has an active
// platform.AddressSpace, and we just need to indicate that we need it too.
- if atomicbitops.IncUnlessZeroInt32(&mm.active) {
- return nil
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 0 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
+ return nil
+ }
}
for {
@@ -85,16 +92,20 @@ func (mm *MemoryManager) Activate() error {
if as == nil {
// AddressSpace is unavailable, we must wait.
//
- // activeMu must not be held while waiting, as the user
- // of the address space we are waiting on may attempt
- // to take activeMu.
- //
- // Don't call UninterruptibleSleepStart to register the
- // wait to allow the watchdog stuck task to trigger in
- // case a process is starved waiting for the address
- // space.
+ // activeMu must not be held while waiting, as the user of the address
+ // space we are waiting on may attempt to take activeMu.
mm.activeMu.Unlock()
+
+ sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation
+ if sleep {
+ // Mark this task sleeping while waiting for the address space to
+ // prevent the watchdog from reporting it as a stuck task.
+ ctx.UninterruptibleSleepStart(false)
+ }
<-c
+ if sleep {
+ ctx.UninterruptibleSleepFinish(false)
+ }
continue
}
@@ -118,8 +129,15 @@ func (mm *MemoryManager) Activate() error {
func (mm *MemoryManager) Deactivate() {
// Fast path: this is not the last goroutine to deactivate the
// MemoryManager.
- if atomicbitops.DecUnlessOneInt32(&mm.active) {
- return
+ for {
+ active := atomic.LoadInt32(&mm.active)
+ if active == 1 {
+ // Fall back to the slow path.
+ break
+ }
+ if atomic.CompareAndSwapInt32(&mm.active, active, active-1) {
+ return
+ }
}
mm.activeMu.Lock()
@@ -183,8 +201,10 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
if pma.needCOW {
perms.Write = false
}
- if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
- return err
+ if perms.Any() { // MapFile precondition
+ if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
+ return err
+ }
}
pseg = pseg.NextSegment()
}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index cb29d94b0..379148903 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -59,25 +59,27 @@ func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
}
a.contexts[id] = &AIOContext{
- done: make(chan struct{}, 1),
+ requestReady: make(chan struct{}, 1),
maxOutstanding: events,
}
return true
}
-// destroyAIOContext destroys an asynchronous I/O context.
+// destroyAIOContext destroys an asynchronous I/O context. It doesn't wait for
+// for pending requests to complete. Returns the destroyed AIOContext so it can
+// be drained.
//
-// False is returned if the context does not exist.
-func (a *aioManager) destroyAIOContext(id uint64) bool {
+// Nil is returned if the context does not exist.
+func (a *aioManager) destroyAIOContext(id uint64) *AIOContext {
a.mu.Lock()
defer a.mu.Unlock()
ctx, ok := a.contexts[id]
if !ok {
- return false
+ return nil
}
delete(a.contexts, id)
ctx.destroy()
- return true
+ return ctx
}
// lookupAIOContext looks up the given context.
@@ -102,8 +104,8 @@ type ioResult struct {
//
// +stateify savable
type AIOContext struct {
- // done is the notification channel used for all requests.
- done chan struct{} `state:"nosave"`
+ // requestReady is the notification channel used for all requests.
+ requestReady chan struct{} `state:"nosave"`
// mu protects below.
mu sync.Mutex `state:"nosave"`
@@ -129,8 +131,14 @@ func (ctx *AIOContext) destroy() {
ctx.mu.Lock()
defer ctx.mu.Unlock()
ctx.dead = true
- if ctx.outstanding == 0 {
- close(ctx.done)
+ ctx.checkForDone()
+}
+
+// Preconditions: ctx.mu must be held by caller.
+func (ctx *AIOContext) checkForDone() {
+ if ctx.dead && ctx.outstanding == 0 {
+ close(ctx.requestReady)
+ ctx.requestReady = nil
}
}
@@ -154,11 +162,12 @@ func (ctx *AIOContext) PopRequest() (interface{}, bool) {
// Is there anything ready?
if e := ctx.results.Front(); e != nil {
- ctx.results.Remove(e)
- ctx.outstanding--
- if ctx.outstanding == 0 && ctx.dead {
- close(ctx.done)
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
}
+ ctx.outstanding--
+ ctx.results.Remove(e)
+ ctx.checkForDone()
return e.data, true
}
return nil, false
@@ -172,26 +181,58 @@ func (ctx *AIOContext) FinishRequest(data interface{}) {
// Push to the list and notify opportunistically. The channel notify
// here is guaranteed to be safe because outstanding must be non-zero.
- // The done channel is only closed when outstanding reaches zero.
+ // The requestReady channel is only closed when outstanding reaches zero.
ctx.results.PushBack(&ioResult{data: data})
select {
- case ctx.done <- struct{}{}:
+ case ctx.requestReady <- struct{}{}:
default:
}
}
// WaitChannel returns a channel that is notified when an AIO request is
-// completed.
-//
-// The boolean return value indicates whether or not the context is active.
-func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) {
+// completed. Returns nil if the context is destroyed and there are no more
+// outstanding requests.
+func (ctx *AIOContext) WaitChannel() chan struct{} {
ctx.mu.Lock()
defer ctx.mu.Unlock()
- if ctx.outstanding == 0 && ctx.dead {
- return nil, false
+ return ctx.requestReady
+}
+
+// Dead returns true if the context has been destroyed.
+func (ctx *AIOContext) Dead() bool {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+ return ctx.dead
+}
+
+// CancelPendingRequest forgets about a request that hasn't yet completed.
+func (ctx *AIOContext) CancelPendingRequest() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ panic("AIOContext outstanding is going negative")
}
- return ctx.done, true
+ ctx.outstanding--
+ ctx.checkForDone()
+}
+
+// Drain drops all completed requests. Pending requests remain untouched.
+func (ctx *AIOContext) Drain() {
+ ctx.mu.Lock()
+ defer ctx.mu.Unlock()
+
+ if ctx.outstanding == 0 {
+ return
+ }
+ size := uint32(ctx.results.Len())
+ if ctx.outstanding < size {
+ panic("AIOContext outstanding is going negative")
+ }
+ ctx.outstanding -= size
+ ctx.results.Reset()
+ ctx.checkForDone()
}
// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
@@ -332,9 +373,9 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
Length: aioRingBufferSize,
MappingIdentity: m,
Mappable: m,
- // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ |
- // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this
- // mapping read-only?
+ // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
+ // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
+ // user mode should not write to this page.
Perms: usermem.Read,
MaxPerms: usermem.Read,
})
@@ -349,11 +390,11 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint
return id, nil
}
-// DestroyAIOContext destroys an asynchronous I/O context. It returns false if
-// the context does not exist.
-func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool {
+// DestroyAIOContext destroys an asynchronous I/O context. It returns the
+// destroyed context. nil if the context does not exist.
+func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
if _, ok := mm.LookupAIOContext(ctx, id); !ok {
- return false
+ return nil
}
// Only unmaps after it assured that the address is a valid aio context to
diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go
index c37fc9f7b..3dabac1af 100644
--- a/pkg/sentry/mm/aio_context_state.go
+++ b/pkg/sentry/mm/aio_context_state.go
@@ -16,5 +16,5 @@ package mm
// afterLoad is invoked by stateify.
func (a *AIOContext) afterLoad() {
- a.done = make(chan struct{}, 1)
+ a.requestReady = make(chan struct{}, 1)
}
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 47b8fbf43..aac56679b 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -18,7 +18,6 @@ import (
"fmt"
"sync/atomic"
- "gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -29,16 +28,17 @@ import (
)
// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
-func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
return &MemoryManager{
- p: p,
- mfp: mfp,
- haveASIO: p.SupportsAddressSpaceIO(),
- privateRefs: &privateRefs{},
- users: 1,
- auxv: arch.Auxv{},
- dumpability: UserDumpable,
- aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ p: p,
+ mfp: mfp,
+ haveASIO: p.SupportsAddressSpaceIO(),
+ privateRefs: &privateRefs{},
+ users: 1,
+ auxv: arch.Auxv{},
+ dumpability: UserDumpable,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: sleepForActivation,
}
}
@@ -80,9 +80,11 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
envv: mm.envv,
auxv: append(arch.Auxv(nil), mm.auxv...),
// IncRef'd below, once we know that there isn't an error.
- executable: mm.executable,
- dumpability: mm.dumpability,
- aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ executable: mm.executable,
+ dumpability: mm.dumpability,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ sleepForActivation: mm.sleepForActivation,
+ vdsoSigReturnAddr: mm.vdsoSigReturnAddr,
}
// Copy vmas.
@@ -229,7 +231,15 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
// IncUsers increments mm's user count and returns true. If the user count is
// already 0, IncUsers does nothing and returns false.
func (mm *MemoryManager) IncUsers() bool {
- return atomicbitops.IncUnlessZeroInt32(&mm.users)
+ for {
+ users := atomic.LoadInt32(&mm.users)
+ if users == 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt32(&mm.users, users, users+1) {
+ return true
+ }
+ }
}
// DecUsers decrements mm's user count. If the user count reaches 0, all
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index f550acae0..28e5057f7 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -16,7 +16,7 @@ package mm
import (
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
//
// An additional reference will be taken in the case of a non-nil executable,
// which must be released by the caller.
-func (mm *MemoryManager) Executable() *fs.Dirent {
+func (mm *MemoryManager) Executable() fsbridge.File {
mm.metadataMu.Lock()
defer mm.metadataMu.Unlock()
@@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent {
// SetExecutable sets the executable.
//
// This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
mm.metadataMu.Lock()
// Grab a new reference.
- d.IncRef()
+ file.IncRef()
// Set the executable.
orig := mm.executable
- mm.executable = d
+ mm.executable = file
mm.metadataMu.Unlock()
@@ -167,3 +167,17 @@ func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
orig.DecRef()
}
}
+
+// VDSOSigReturn returns the address of vdso_sigreturn.
+func (mm *MemoryManager) VDSOSigReturn() uint64 {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.vdsoSigReturnAddr
+}
+
+// SetVDSOSigReturn sets the address of vdso_sigreturn.
+func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.vdsoSigReturnAddr = addr
+}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 09e582dd3..6db7c3d40 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,9 +35,10 @@
package mm
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -215,7 +216,7 @@ type MemoryManager struct {
// is not nil, it holds a reference on the Dirent.
//
// executable is protected by metadataMu.
- executable *fs.Dirent
+ executable fsbridge.File
// dumpability describes if and how this MemoryManager may be dumped to
// userspace.
@@ -226,6 +227,14 @@ type MemoryManager struct {
// aioManager keeps track of AIOContexts used for async IOs. AIOManager
// must be cloned when CLONE_VM is used.
aioManager aioManager
+
+ // sleepForActivation indicates whether the task should report to be sleeping
+ // before trying to activate the address space. When set to true, delays in
+ // activation are not reported as stuck tasks by the watchdog.
+ sleepForActivation bool
+
+ // vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
+ vdsoSigReturnAddr uint64
}
// vma represents a virtual memory area.
@@ -278,7 +287,7 @@ type vma struct {
mlockMode memmap.MLockMode
// numaPolicy is the NUMA policy for this vma set by mbind().
- numaPolicy int32
+ numaPolicy linux.NumaPolicy
// numaNodemask is the NUMA nodemask for this vma set by mbind().
numaNodemask uint64
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index edacca741..fdc308542 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -31,7 +31,7 @@ import (
func testMemoryManager(ctx context.Context) *MemoryManager {
p := platform.FromContext(ctx)
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
- mm := NewMemoryManager(p, mfp)
+ mm := NewMemoryManager(p, mfp, false)
mm.layout = arch.MmapLayout{
MinAddr: p.MinUserAddress(),
MaxAddr: p.MaxUserAddress(),
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 1ab92f046..6efe5102b 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -148,7 +148,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
// stack_guard_page_start().
- fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+ lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
// Figure out our filename or hint.
@@ -165,7 +165,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
}
if s != "" {
// Per linux, we pad until the 74th character.
- if pad := 73 - b.Len(); pad > 0 {
+ if pad := 73 - lineLen; pad > 0 {
b.WriteString(strings.Repeat(" ", pad))
}
b.WriteString(s)
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index c5dfa5972..3f496aa9f 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -974,7 +974,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
}
// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
-func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (linux.NumaPolicy, uint64, error) {
mm.mappingMu.RLock()
defer mm.mappingMu.RUnlock()
vseg := mm.vmas.FindSegment(addr)
@@ -986,7 +986,7 @@ func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
}
// SetNumaPolicy implements the semantics of Linux's mbind().
-func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error {
if !addr.IsPageAligned() {
return syserror.EINVAL
}
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 9a14e69e6..16d8207e9 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -195,7 +195,7 @@ func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
- for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() {
+ for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
// Can we shift up to match the alignment?
if offset := uint64(gr.Start) % alignment; offset != 0 {
@@ -214,7 +214,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
- for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() {
+ for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
// Can we shift down to match the alignment?
start := gr.End - usermem.Addr(length)