From 0b2135072d3a6b418f87f166b58dcf877f7c2fba Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Thu, 20 Jun 2019 12:54:40 -0700 Subject: Implement madvise(MADV_DONTFORK) PiperOrigin-RevId: 254253777 --- pkg/sentry/mm/lifecycle.go | 37 +++++++++++++++++++++++++++++++++++ pkg/sentry/mm/mm.go | 3 +++ pkg/sentry/mm/syscalls.go | 26 ++++++++++++++++++++++++ pkg/sentry/mm/vma.go | 1 + pkg/sentry/syscalls/linux/sys_mmap.go | 6 +++++- 5 files changed, 72 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 06e4372ff..4e9ca1de6 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -86,10 +86,22 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { } // Copy vmas. + dontforks := false dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() + + if vma.dontfork { + length := uint64(vmaAR.Length()) + mm2.usageAS -= length + if vma.isPrivateDataLocked() { + mm2.dataAS -= length + } + dontforks = true + continue + } + // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { @@ -118,6 +130,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { defer mm2.activeMu.Unlock() mm.activeMu.Lock() defer mm.activeMu.Unlock() + if dontforks { + defer mm.pmas.MergeRange(mm.applicationAddrRange()) + } + srcvseg := mm.vmas.FirstSegment() dstpgap := mm2.pmas.FirstGap() var unmapAR usermem.AddrRange for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { @@ -125,6 +141,27 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if !pma.private { continue } + + if dontforks { + // Find the 'vma' that contains the starting address + // associated with the 'pma' (there must be one). + srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start()) + if checkInvariants { + if !srcvseg.Ok() { + panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range())) + } + if srcpseg.Start() < srcvseg.Start() { + panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range())) + } + } + + srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range()) + if srcvseg.ValuePtr().dontfork { + continue + } + pma = srcpseg.ValuePtr() + } + if !pma.needCOW { pma.needCOW = true if pma.effectivePerms.Write { diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 2ec2ad99b..7bb96b159 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -274,6 +274,9 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` + // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). + dontfork bool + mlockMode memmap.MLockMode // numaPolicy is the NUMA policy for this vma set by mbind(). diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 9aa39e31d..c2466c988 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1026,6 +1026,32 @@ func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy } } +// SetDontFork implements the semantics of madvise MADV_DONTFORK. +func (mm *MemoryManager) SetDontFork(addr usermem.Addr, length uint64, dontfork bool) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + }() + + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + vma.dontfork = dontfork + } + + if mm.vmas.SpanRange(ar) != ar.Length() { + return syserror.ENOMEM + } + return nil +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 9f846cdb8..074e2b141 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -439,6 +439,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.mlockMode != vma2.mlockMode || vma1.numaPolicy != vma2.numaPolicy || vma1.numaNodemask != vma2.numaNodemask || + vma1.dontfork != vma2.dontfork || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8a45dceeb..d831833bc 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -180,6 +180,10 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca switch adv { case linux.MADV_DONTNEED: return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_DOFORK: + return 0, nil, t.MemoryManager().SetDontFork(addr, length, false) + case linux.MADV_DONTFORK: + return 0, nil, t.MemoryManager().SetDontFork(addr, length, true) case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: fallthrough case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: @@ -191,7 +195,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: // Do nothing, we totally ignore the suggestions above. return 0, nil, nil - case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK: + case linux.MADV_REMOVE: // These "suggestions" have application-visible side effects, so we // have to indicate that we don't support them. return 0, nil, syserror.ENOSYS -- cgit v1.2.3