summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
authorNeel Natu <neelnatu@google.com>2019-06-20 12:54:40 -0700
committergVisor bot <gvisor-bot@google.com>2019-06-20 12:56:00 -0700
commit0b2135072d3a6b418f87f166b58dcf877f7c2fba (patch)
tree40c316afa59e5786fac24b7e8be940ae645ffb16 /pkg
parentb46ec3704b60bebdd63a597c62f3f471ee0d9be9 (diff)
Implement madvise(MADV_DONTFORK)
PiperOrigin-RevId: 254253777
Diffstat (limited to 'pkg')
-rw-r--r--pkg/sentry/mm/lifecycle.go37
-rw-r--r--pkg/sentry/mm/mm.go3
-rw-r--r--pkg/sentry/mm/syscalls.go26
-rw-r--r--pkg/sentry/mm/vma.go1
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go6
5 files changed, 72 insertions, 1 deletions
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 06e4372ff..4e9ca1de6 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -86,10 +86,22 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
}
// Copy vmas.
+ dontforks := false
dstvgap := mm2.vmas.FirstGap()
for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
vma := srcvseg.Value() // makes a copy of the vma
vmaAR := srcvseg.Range()
+
+ if vma.dontfork {
+ length := uint64(vmaAR.Length())
+ mm2.usageAS -= length
+ if vma.isPrivateDataLocked() {
+ mm2.dataAS -= length
+ }
+ dontforks = true
+ continue
+ }
+
// Inform the Mappable, if any, of the new mapping.
if vma.mappable != nil {
if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
@@ -118,6 +130,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
defer mm2.activeMu.Unlock()
mm.activeMu.Lock()
defer mm.activeMu.Unlock()
+ if dontforks {
+ defer mm.pmas.MergeRange(mm.applicationAddrRange())
+ }
+ srcvseg := mm.vmas.FirstSegment()
dstpgap := mm2.pmas.FirstGap()
var unmapAR usermem.AddrRange
for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
@@ -125,6 +141,27 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
if !pma.private {
continue
}
+
+ if dontforks {
+ // Find the 'vma' that contains the starting address
+ // associated with the 'pma' (there must be one).
+ srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start())
+ if checkInvariants {
+ if !srcvseg.Ok() {
+ panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range()))
+ }
+ if srcpseg.Start() < srcvseg.Start() {
+ panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range()))
+ }
+ }
+
+ srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range())
+ if srcvseg.ValuePtr().dontfork {
+ continue
+ }
+ pma = srcpseg.ValuePtr()
+ }
+
if !pma.needCOW {
pma.needCOW = true
if pma.effectivePerms.Write {
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 2ec2ad99b..7bb96b159 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -274,6 +274,9 @@ type vma struct {
// metag, none of which we currently support.
growsDown bool `state:"manual"`
+ // dontfork is the MADV_DONTFORK setting for this vma configured by madvise().
+ dontfork bool
+
mlockMode memmap.MLockMode
// numaPolicy is the NUMA policy for this vma set by mbind().
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 9aa39e31d..c2466c988 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -1026,6 +1026,32 @@ func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy
}
}
+// SetDontFork implements the semantics of madvise MADV_DONTFORK.
+func (mm *MemoryManager) SetDontFork(addr usermem.Addr, length uint64, dontfork bool) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.dontfork = dontfork
+ }
+
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 9f846cdb8..074e2b141 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -439,6 +439,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
vma1.mlockMode != vma2.mlockMode ||
vma1.numaPolicy != vma2.numaPolicy ||
vma1.numaNodemask != vma2.numaNodemask ||
+ vma1.dontfork != vma2.dontfork ||
vma1.id != vma2.id ||
vma1.hint != vma2.hint {
return vma{}, false
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 8a45dceeb..d831833bc 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -180,6 +180,10 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
switch adv {
case linux.MADV_DONTNEED:
return 0, nil, t.MemoryManager().Decommit(addr, length)
+ case linux.MADV_DOFORK:
+ return 0, nil, t.MemoryManager().SetDontFork(addr, length, false)
+ case linux.MADV_DONTFORK:
+ return 0, nil, t.MemoryManager().SetDontFork(addr, length, true)
case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE:
fallthrough
case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE:
@@ -191,7 +195,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED:
// Do nothing, we totally ignore the suggestions above.
return 0, nil, nil
- case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK:
+ case linux.MADV_REMOVE:
// These "suggestions" have application-visible side effects, so we
// have to indicate that we don't support them.
return 0, nil, syserror.ENOSYS