diff options
-rw-r--r-- | pkg/sentry/fs/proc/task.go | 43 | ||||
-rw-r--r-- | pkg/sentry/mm/BUILD | 2 | ||||
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 2 | ||||
-rw-r--r-- | pkg/sentry/mm/mm.go | 4 | ||||
-rw-r--r-- | pkg/sentry/mm/proc_pid_maps.go | 121 | ||||
-rw-r--r-- | pkg/sentry/mm/procfs.go | 289 | ||||
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 4 | ||||
-rw-r--r-- | pkg/sentry/mm/vma.go | 17 | ||||
-rw-r--r-- | test/syscalls/linux/BUILD | 22 | ||||
-rw-r--r-- | test/syscalls/linux/proc_pid_smaps.cc | 467 |
10 files changed, 840 insertions, 131 deletions
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9f13ff91c..91bda8a95 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -82,6 +82,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), + "smaps": newSmaps(t, msrc), "stat": newTaskStat(t, msrc, showSubtasks, pidns), "statm": newStatm(t, msrc), "status": newStatus(t, msrc, pidns), @@ -316,7 +317,47 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if mm := md.mm(); mm != nil { - return mm.ReadSeqFileData(ctx, h) + return mm.ReadMapsSeqFileData(ctx, h) + } + return []seqfile.SeqData{}, 0 +} + +// smapsData implements seqfile.SeqSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + t *kernel.Task +} + +func newSmaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) +} + +func (sd *smapsData) mm() *mm.MemoryManager { + var tmm *mm.MemoryManager + sd.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + // No additional reference is taken on mm here. This is safe + // because MemoryManager.destroy is required to leave the + // MemoryManager in a state where it's still usable as a SeqSource. + tmm = mm + } + }) + return tmm +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (sd *smapsData) NeedsUpdate(generation int64) bool { + if mm := sd.mm(); mm != nil { + return mm.NeedsUpdate(generation) + } + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (sd *smapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if mm := sd.mm(); mm != nil { + return mm.ReadSmapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 } diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..0997ec0a7 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -87,7 +87,7 @@ go_library( "mm.go", "pma.go", "pma_set.go", - "proc_pid_maps.go", + "procfs.go", "save_restore.go", "shm.go", "special_mappable.go", diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1ee8ae74e 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -86,7 +86,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { - if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.isMappableAsWritable()); err != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) return nil, err } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..2154e7918 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -363,10 +363,6 @@ func (v *vma) loadRealPerms(b int) { } } -func (v *vma) isMappableAsWritable() bool { - return !v.private && v.maxPerms.Write -} - // pma represents a platform mapping area. // // +stateify savable diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go deleted file mode 100644 index 247ee45ef..000000000 --- a/pkg/sentry/mm/proc_pid_maps.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package mm - -import ( - "bytes" - "fmt" - "strings" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -const ( - // devMinorBits is the number of minor bits in a device number. Linux: - // include/linux/kdev_t.h:MINORBITS - devMinorBits = 20 -) - -// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. -func (mm *MemoryManager) NeedsUpdate(generation int64) bool { - return true -} - -// ReadSeqFileData is called by fs/proc.mapsData.ReadSeqFileData. -func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - mm.mappingMu.RLock() - defer mm.mappingMu.RUnlock() - var data []seqfile.SeqData - var start usermem.Addr - if handle != nil { - start = *handle.(*usermem.Addr) - } - for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". - vmaAddr := vseg.End() - data = append(data, seqfile.SeqData{ - Buf: mm.vmaMapsEntryLocked(ctx, vseg), - Handle: &vmaAddr, - }) - } - - // We always emulate vsyscall, so advertise it here. Everything about a - // vsyscall region is static, so just hard code the maps entry since we - // don't have a real vma backing it. The vsyscall region is at the end of - // the virtual address space so nothing should be mapped after it (if - // something is really mapped in the tiny ~10 MiB segment afterwards, we'll - // get the sorting on the maps file wrong at worst; but that's not possible - // on any current platform). - // - // Artifically adjust the seqfile handle so we only output vsyscall entry once. - if vsyscallEnd := usermem.Addr(0xffffffffff601000); start != vsyscallEnd { - data = append(data, seqfile.SeqData{ - Buf: []byte("ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"), - Handle: &vsyscallEnd, - }) - } - return data, 1 -} - -// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by -// vseg, including the trailing newline. -// -// Preconditions: mm.mappingMu must be locked. -func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { - vma := vseg.ValuePtr() - private := "p" - if !vma.private { - private = "s" - } - - var dev, ino uint64 - if vma.id != nil { - dev = vma.id.DeviceID() - ino = vma.id.InodeID() - } - devMajor := uint32(dev >> devMinorBits) - devMinor := uint32(dev & ((1 << devMinorBits) - 1)) - - var b bytes.Buffer - // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => - // stack_guard_page_start(). - fmt.Fprintf(&b, "%08x-%08x %s%s %08x %02x:%02x %d ", - vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) - - // Figure out our filename or hint. - var s string - if vma.hint != "" { - s = vma.hint - } else if vma.id != nil { - // FIXME: We are holding mm.mappingMu here, which is - // consistent with Linux's holding mmap_sem in - // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). - // However, it's not clear that fs.File.MappedName() is actually - // consistent with this lock order. - s = vma.id.MappedName(ctx) - } - if s != "" { - // Per linux, we pad until the 74th character. - if pad := 73 - b.Len(); pad > 0 { - b.WriteString(strings.Repeat(" ", pad)) - } - b.WriteString(s) - } - b.WriteString("\n") - return b.Bytes() -} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go new file mode 100644 index 000000000..0c4b8895d --- /dev/null +++ b/pkg/sentry/mm/procfs.go @@ -0,0 +1,289 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // devMinorBits is the number of minor bits in a device number. Linux: + // include/linux/kdev_t.h:MINORBITS + devMinorBits = 20 + + vsyscallEnd = usermem.Addr(0xffffffffff601000) + vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" + vsyscallSmapsEntry = vsyscallMapsEntry + + "Size: 4 kB\n" + + "Rss: 0 kB\n" + + "Pss: 0 kB\n" + + "Shared_Clean: 0 kB\n" + + "Shared_Dirty: 0 kB\n" + + "Private_Clean: 0 kB\n" + + "Private_Dirty: 0 kB\n" + + "Referenced: 0 kB\n" + + "Anonymous: 0 kB\n" + + "AnonHugePages: 0 kB\n" + + "Shared_Hugetlb: 0 kB\n" + + "Private_Hugetlb: 0 kB\n" + + "Swap: 0 kB\n" + + "SwapPss: 0 kB\n" + + "KernelPageSize: 4 kB\n" + + "MMUPageSize: 4 kB\n" + + "Locked: 0 kB\n" + + "VmFlags: rd ex \n" +) + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (mm *MemoryManager) NeedsUpdate(generation int64) bool { + return true +} + +// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaMapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if start != vsyscallEnd { + // FIXME: Can't get a pointer to constant vsyscallEnd. + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallMapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by +// vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + return b.Bytes() +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { + vma := vseg.ValuePtr() + private := "p" + if !vma.private { + private = "s" + } + + var dev, ino uint64 + if vma.id != nil { + dev = vma.id.DeviceID() + ino = vma.id.InodeID() + } + devMajor := uint32(dev >> devMinorBits) + devMinor := uint32(dev & ((1 << devMinorBits) - 1)) + + // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => + // stack_guard_page_start(). + fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", + vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) + + // Figure out our filename or hint. + var s string + if vma.hint != "" { + s = vma.hint + } else if vma.id != nil { + // FIXME: We are holding mm.mappingMu here, which is + // consistent with Linux's holding mmap_sem in + // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). + // However, it's not clear that fs.File.MappedName() is actually + // consistent with this lock order. + s = vma.id.MappedName(ctx) + } + if s != "" { + // Per linux, we pad until the 74th character. + if pad := 73 - b.Len(); pad > 0 { + b.WriteString(strings.Repeat(" ", pad)) + } + b.WriteString(s) + } + b.WriteString("\n") +} + +// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to +// implement /proc/[pid]/smaps. +func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaSmapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. See + // ReadMapsSeqFileData for additional commentary. + if start != vsyscallEnd { + // FIXME: Can't get a pointer to constant vsyscallEnd. + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallSmapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated +// by vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + vma := vseg.ValuePtr() + + // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of + // requiring it to be locked as a precondition, to reduce the latency + // impact of reading /proc/[pid]/smaps on concurrent performance-sensitive + // operations requiring activeMu for writing like faults. + mm.activeMu.RLock() + var rss uint64 + var anon uint64 + vsegAR := vseg.Range() + for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { + psegAR := pseg.Range().Intersect(vsegAR) + size := uint64(psegAR.Length()) + rss += size + if pseg.ValuePtr().private { + anon += size + } + } + mm.activeMu.RUnlock() + + fmt.Fprintf(&b, "Size: %8d kB\n", vseg.Range().Length()/1024) + fmt.Fprintf(&b, "Rss: %8d kB\n", rss/1024) + // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma + // is only mapped by that pma. This avoids having to query memmap.Mappables + // for reference count information on each page. As a corollary, all pages + // are accounted as "private" whether or not the vma is private; compare + // Linux's fs/proc/task_mmu.c:smaps_account(). + fmt.Fprintf(&b, "Pss: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Shared_Clean: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Dirty: %8d kB\n", 0) + // Pretend that all pages are dirty if the vma is writable, and clean otherwise. + clean := rss + if vma.effectivePerms.Write { + clean = 0 + } + fmt.Fprintf(&b, "Private_Clean: %8d kB\n", clean/1024) + fmt.Fprintf(&b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) + // Pretend that all pages are "referenced" (recently touched). + fmt.Fprintf(&b, "Referenced: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Anonymous: %8d kB\n", anon/1024) + // Hugepages (hugetlb and THP) are not implemented. + fmt.Fprintf(&b, "AnonHugePages: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0) + fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0) + // Swap is not implemented. + fmt.Fprintf(&b, "Swap: %8d kB\n", 0) + fmt.Fprintf(&b, "SwapPss: %8d kB\n", 0) + fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(&b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) + locked := rss + if vma.mlockMode == memmap.MLockNone { + locked = 0 + } + fmt.Fprintf(&b, "Locked: %8d kB\n", locked/1024) + + b.WriteString("VmFlags: ") + if vma.realPerms.Read { + b.WriteString("rd ") + } + if vma.realPerms.Write { + b.WriteString("wr ") + } + if vma.realPerms.Execute { + b.WriteString("ex ") + } + if vma.canWriteMappableLocked() { // VM_SHARED + b.WriteString("sh ") + } + if vma.maxPerms.Read { + b.WriteString("mr ") + } + if vma.maxPerms.Write { + b.WriteString("mw ") + } + if vma.maxPerms.Execute { + b.WriteString("me ") + } + if !vma.private { // VM_MAYSHARE + b.WriteString("ms ") + } + if vma.growsDown { + b.WriteString("gd ") + } + if vma.mlockMode != memmap.MLockNone { // VM_LOCKED + b.WriteString("lo ") + } + if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT + b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() + } + if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT + b.WriteString("ac ") + } + b.WriteString("\n") + + return b.Bytes() +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..fd6929e08 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -507,7 +507,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return 0, syserror.EINVAL } // Inform the Mappable, if any, of the new mapping. - if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.isMappableAsWritable()); err != nil { + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { return 0, err } } @@ -571,7 +571,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // Now that pmas have been moved to newAR, we can notify vma.mappable that // oldAR is no longer mapped. if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked()) } if vma.mlockMode == memmap.MLockEager { diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..e9c9a80ea 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -84,6 +84,8 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp // Inform the Mappable, if any, of the new mapping. if opts.Mappable != nil { + // The expression for writable is vma.canWriteMappableLocked(), but we + // don't yet have a vma. if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, usermem.AddrRange{}, err } @@ -366,7 +368,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.isMappableAsWritable()) + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { vma.id.DecRef() @@ -381,6 +383,19 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa return vgap } +// canWriteMappableLocked returns true if it is possible for vma.mappable to be +// written to via this vma, i.e. if it is possible that +// vma.mappable.Translate(at.Write=true) may be called as a result of this vma. +// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as +// PTRACE_POKEDATA. +// +// canWriteMappableLocked is equivalent to Linux's VM_SHARED. +// +// Preconditions: mm.mappingMu must be locked. +func (vma *vma) canWriteMappableLocked() bool { + return !vma.private && vma.maxPerms.Write +} + // vmaSetFunctions implements segment.Functions for vmaSet. type vmaSetFunctions struct{} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f0e61e083..028c686a8 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1414,6 +1414,28 @@ cc_binary( ) cc_binary( + name = "proc_pid_smaps_test", + testonly = 1, + srcs = ["proc_pid_smaps.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:proc_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( name = "pselect_test", testonly = 1, srcs = ["pselect.cc"], diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc new file mode 100644 index 000000000..4aefc1b41 --- /dev/null +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -0,0 +1,467 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stddef.h> +#include <stdint.h> + +#include <algorithm> +#include <string> +#include <utility> +#include <vector> + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/memory_util.h" +#include "test/util/posix_error.h" +#include "test/util/proc_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +using ::testing::Contains; +using ::testing::ElementsAreArray; +using ::testing::IsSupersetOf; +using ::testing::Not; +using ::testing::Optional; + +namespace gvisor { +namespace testing { + +namespace { + +struct ProcPidSmapsEntry { + ProcMapsEntry maps_entry; + + // These fields should always exist, as they were included in e070ad49f311 + // "[PATCH] add /proc/pid/smaps". + size_t size_kb; + size_t rss_kb; + size_t shared_clean_kb; + size_t shared_dirty_kb; + size_t private_clean_kb; + size_t private_dirty_kb; + + // These fields were added later and may not be present. + absl::optional<size_t> pss_kb; + absl::optional<size_t> referenced_kb; + absl::optional<size_t> anonymous_kb; + absl::optional<size_t> anon_huge_pages_kb; + absl::optional<size_t> shared_hugetlb_kb; + absl::optional<size_t> private_hugetlb_kb; + absl::optional<size_t> swap_kb; + absl::optional<size_t> swap_pss_kb; + absl::optional<size_t> kernel_page_size_kb; + absl::optional<size_t> mmu_page_size_kb; + absl::optional<size_t> locked_kb; + + // Caution: "Note that there is no guarantee that every flag and associated + // mnemonic will be present in all further kernel releases. Things get + // changed, the flags may be vanished or the reverse -- new added." - Linux + // Documentation/filesystems/proc.txt, on VmFlags. Avoid checking for any + // flags that are not extremely well-established. + absl::optional<std::vector<std::string>> vm_flags; +}; + +// Given the value part of a /proc/[pid]/smaps field containing a value in kB +// (for example, " 4 kB", returns the value in kB (in this example, 4). +PosixErrorOr<size_t> SmapsValueKb(absl::string_view value) { + // TODO: let us use RE2 or <regex> + std::pair<absl::string_view, absl::string_view> parts = + absl::StrSplit(value, ' ', absl::SkipEmpty()); + if (parts.second != "kB") { + return PosixError(EINVAL, + absl::StrCat("invalid smaps field value: ", value)); + } + ASSIGN_OR_RETURN_ERRNO(auto val_kb, Atoi<size_t>(parts.first)); + return val_kb; +} + +PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps( + absl::string_view contents) { + std::vector<ProcPidSmapsEntry> entries; + absl::optional<ProcPidSmapsEntry> entry; + bool have_size_kb = false; + bool have_rss_kb = false; + bool have_shared_clean_kb = false; + bool have_shared_dirty_kb = false; + bool have_private_clean_kb = false; + bool have_private_dirty_kb = false; + + auto const finish_entry = [&] { + if (entry) { + if (!have_size_kb) { + return PosixError(EINVAL, "smaps entry is missing Size"); + } + if (!have_rss_kb) { + return PosixError(EINVAL, "smaps entry is missing Rss"); + } + if (!have_shared_clean_kb) { + return PosixError(EINVAL, "smaps entry is missing Shared_Clean"); + } + if (!have_shared_dirty_kb) { + return PosixError(EINVAL, "smaps entry is missing Shared_Dirty"); + } + if (!have_private_clean_kb) { + return PosixError(EINVAL, "smaps entry is missing Private_Clean"); + } + if (!have_private_dirty_kb) { + return PosixError(EINVAL, "smaps entry is missing Private_Dirty"); + } + // std::move(entry.value()) instead of std::move(entry).value(), because + // otherwise tools may report a "use-after-move" warning, which is + // spurious because entry.emplace() below resets entry to a new + // ProcPidSmapsEntry. + entries.emplace_back(std::move(entry.value())); + } + entry.emplace(); + have_size_kb = false; + have_rss_kb = false; + have_shared_clean_kb = false; + have_shared_dirty_kb = false; + have_private_clean_kb = false; + have_private_dirty_kb = false; + return NoError(); + }; + + // Holds key/value pairs from smaps field lines. Declared here so it can be + // captured by reference by the following lambdas. + std::vector<absl::string_view> key_value; + + auto const on_required_field_kb = [&](size_t* field, bool* have_field) { + if (*have_field) { + return PosixError( + EINVAL, + absl::StrFormat("smaps entry has duplicate %s line", key_value[0])); + } + ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1])); + *have_field = true; + return NoError(); + }; + + auto const on_optional_field_kb = [&](absl::optional<size_t>* field) { + if (*field) { + return PosixError( + EINVAL, + absl::StrFormat("smaps entry has duplicate %s line", key_value[0])); + } + ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1])); + return NoError(); + }; + + absl::flat_hash_set<std::string> unknown_fields; + auto const on_unknown_field = [&] { + absl::string_view key = key_value[0]; + // Don't mention unknown fields more than once. + if (unknown_fields.count(key)) { + return; + } + unknown_fields.insert(std::string(key)); + LOG(INFO) << "skipping unknown smaps field " << key; + }; + + auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty()); + for (absl::string_view l : lines) { + // Is this line a valid /proc/[pid]/maps entry? + auto maybe_maps_entry = ParseProcMapsLine(l); + if (maybe_maps_entry.ok()) { + // This marks the beginning of a new /proc/[pid]/smaps entry. + RETURN_IF_ERRNO(finish_entry()); + entry->maps_entry = std::move(maybe_maps_entry).ValueOrDie(); + continue; + } + // Otherwise it's a field in an existing /proc/[pid]/smaps entry of the form + // "key:value" (where value in practice will be preceded by a variable + // amount of whitespace). + if (!entry) { + LOG(WARNING) << "smaps line not considered a maps line: " + << maybe_maps_entry.error_message(); + return PosixError( + EINVAL, + absl::StrCat("smaps field line without preceding maps line: ", l)); + } + key_value = absl::StrSplit(l, absl::MaxSplits(':', 1)); + if (key_value.size() != 2) { + return PosixError(EINVAL, absl::StrCat("invalid smaps field line: ", l)); + } + absl::string_view const key = key_value[0]; + if (key == "Size") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->size_kb, &have_size_kb)); + } else if (key == "Rss") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->rss_kb, &have_rss_kb)); + } else if (key == "Shared_Clean") { + RETURN_IF_ERRNO( + on_required_field_kb(&entry->shared_clean_kb, &have_shared_clean_kb)); + } else if (key == "Shared_Dirty") { + RETURN_IF_ERRNO( + on_required_field_kb(&entry->shared_dirty_kb, &have_shared_dirty_kb)); + } else if (key == "Private_Clean") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->private_clean_kb, + &have_private_clean_kb)); + } else if (key == "Private_Dirty") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->private_dirty_kb, + &have_private_dirty_kb)); + } else if (key == "Pss") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->pss_kb)); + } else if (key == "Referenced") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->referenced_kb)); + } else if (key == "Anonymous") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->anonymous_kb)); + } else if (key == "AnonHugePages") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->anon_huge_pages_kb)); + } else if (key == "Shared_Hugetlb") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->shared_hugetlb_kb)); + } else if (key == "Private_Hugetlb") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->private_hugetlb_kb)); + } else if (key == "Swap") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_kb)); + } else if (key == "SwapPss") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_pss_kb)); + } else if (key == "KernelPageSize") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->kernel_page_size_kb)); + } else if (key == "MMUPageSize") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->mmu_page_size_kb)); + } else if (key == "Locked") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->locked_kb)); + } else if (key == "VmFlags") { + if (entry->vm_flags) { + return PosixError(EINVAL, "duplicate VmFlags line"); + } + entry->vm_flags = absl::StrSplit(key_value[1], ' ', absl::SkipEmpty()); + } else { + on_unknown_field(); + } + } + RETURN_IF_ERRNO(finish_entry()); + return entries; +}; + +TEST(ParseProcPidSmapsTest, Correctness) { + auto entries = ASSERT_NO_ERRNO_AND_VALUE( + ParseProcPidSmaps("0-10000 rw-s 00000000 00:00 0 " + " /dev/zero (deleted)\n" + "Size: 0 kB\n" + "Rss: 1 kB\n" + "Pss: 2 kB\n" + "Shared_Clean: 3 kB\n" + "Shared_Dirty: 4 kB\n" + "Private_Clean: 5 kB\n" + "Private_Dirty: 6 kB\n" + "Referenced: 7 kB\n" + "Anonymous: 8 kB\n" + "AnonHugePages: 9 kB\n" + "Shared_Hugetlb: 10 kB\n" + "Private_Hugetlb: 11 kB\n" + "Swap: 12 kB\n" + "SwapPss: 13 kB\n" + "KernelPageSize: 14 kB\n" + "MMUPageSize: 15 kB\n" + "Locked: 16 kB\n" + "FutureUnknownKey: 17 kB\n" + "VmFlags: rd wr sh mr mw me ms lo ?? sd \n")); + ASSERT_EQ(entries.size(), 1); + auto& entry = entries[0]; + EXPECT_EQ(entry.maps_entry.filename, "/dev/zero (deleted)"); + EXPECT_EQ(entry.size_kb, 0); + EXPECT_EQ(entry.rss_kb, 1); + EXPECT_THAT(entry.pss_kb, Optional(2)); + EXPECT_EQ(entry.shared_clean_kb, 3); + EXPECT_EQ(entry.shared_dirty_kb, 4); + EXPECT_EQ(entry.private_clean_kb, 5); + EXPECT_EQ(entry.private_dirty_kb, 6); + EXPECT_THAT(entry.referenced_kb, Optional(7)); + EXPECT_THAT(entry.anonymous_kb, Optional(8)); + EXPECT_THAT(entry.anon_huge_pages_kb, Optional(9)); + EXPECT_THAT(entry.shared_hugetlb_kb, Optional(10)); + EXPECT_THAT(entry.private_hugetlb_kb, Optional(11)); + EXPECT_THAT(entry.swap_kb, Optional(12)); + EXPECT_THAT(entry.swap_pss_kb, Optional(13)); + EXPECT_THAT(entry.kernel_page_size_kb, Optional(14)); + EXPECT_THAT(entry.mmu_page_size_kb, Optional(15)); + EXPECT_THAT(entry.locked_kb, Optional(16)); + EXPECT_THAT(entry.vm_flags, + Optional(ElementsAreArray({"rd", "wr", "sh", "mr", "mw", "me", + "ms", "lo", "??", "sd"}))); +} + +// Returns the unique entry in entries containing the given address. +PosixErrorOr<ProcPidSmapsEntry> FindUniqueSmapsEntry( + std::vector<ProcPidSmapsEntry> const& entries, uintptr_t addr) { + auto const pred = [&](ProcPidSmapsEntry const& entry) { + return entry.maps_entry.start <= addr && addr < entry.maps_entry.end; + }; + auto const it = std::find_if(entries.begin(), entries.end(), pred); + if (it == entries.end()) { + return PosixError(EINVAL, + absl::StrFormat("no entry contains address %#x", addr)); + } + auto const it2 = std::find_if(it + 1, entries.end(), pred); + if (it2 != entries.end()) { + return PosixError( + EINVAL, + absl::StrFormat("overlapping entries [%#x-%#x) and [%#x-%#x) both " + "contain address %#x", + it->maps_entry.start, it->maps_entry.end, + it2->maps_entry.start, it2->maps_entry.end, addr)); + } + return *it; +} + +PosixErrorOr<std::vector<ProcPidSmapsEntry>> ReadProcSelfSmaps() { + ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents("/proc/self/smaps")); + return ParseProcPidSmaps(contents); +} + +TEST(ProcPidSmapsTest, SharedAnon) { + // Map with MAP_POPULATE so we get some RSS. + Mapping const m = ASSERT_NO_ERRNO_AND_VALUE(MmapAnon( + 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE)); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + auto const entry = + ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr())); + + EXPECT_EQ(entry.size_kb, m.len() / 1024); + // It's possible that populated pages have been swapped out, so RSS might be + // less than size. + EXPECT_LE(entry.rss_kb, entry.size_kb); + + if (entry.pss_kb) { + // PSS should be exactly equal to RSS since no other address spaces should + // be sharing our new mapping. + EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb); + } + + // "Shared" and "private" in smaps refers to whether or not *physical pages* + // are shared; thus all pages in our MAP_SHARED mapping should nevertheless + // be private. + EXPECT_EQ(entry.shared_clean_kb, 0); + EXPECT_EQ(entry.shared_dirty_kb, 0); + EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb) + << "Private_Clean = " << entry.private_clean_kb + << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB"; + + // Shared anonymous mappings are implemented as a shmem file, so their pages + // are not PageAnon. + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), 0); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), + IsSupersetOf({"rd", "wr", "sh", "mr", "mw", "me", "ms"})); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + } +} + +TEST(ProcPidSmapsTest, PrivateAnon) { + // Map with MAP_POPULATE so we get some RSS. + Mapping const m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_POPULATE)); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + auto const entry = + ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr())); + + // It's possible that our mapping was merged with another vma, so the smaps + // entry might be bigger than our original mapping. + EXPECT_GE(entry.size_kb, m.len() / 1024); + EXPECT_LE(entry.rss_kb, entry.size_kb); + if (entry.pss_kb) { + EXPECT_LE(entry.pss_kb.value(), entry.rss_kb); + } + + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), entry.rss_kb); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"wr", "mr", "mw", "me"})); + // We passed PROT_WRITE to mmap. On at least x86, the mapping is in + // practice readable because there is no way to configure the MMU to make + // pages writable but not readable. However, VmFlags should reflect the + // flags set on the VMA, so "rd" (VM_READ) should not appear in VmFlags. + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("rd"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ms"))); + } +} + +TEST(ProcPidSmapsTest, SharedReadOnlyFile) { + size_t const kFileSize = kPageSize; + + auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(truncate(temp_file.path().c_str(), kFileSize), SyscallSucceeds()); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY)); + + auto const m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kFileSize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd.get(), 0)); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + auto const entry = + ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr())); + + // Most of the same logic as the SharedAnon case applies. + EXPECT_EQ(entry.size_kb, kFileSize / 1024); + EXPECT_LE(entry.rss_kb, entry.size_kb); + if (entry.pss_kb) { + EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb); + } + EXPECT_EQ(entry.shared_clean_kb, 0); + EXPECT_EQ(entry.shared_dirty_kb, 0); + EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb) + << "Private_Clean = " << entry.private_clean_kb + << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB"; + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), 0); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"rd", "mr", "me", "ms"})); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("wr"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + // Because the mapped file was opened O_RDONLY, the VMA is !VM_MAYWRITE and + // also !VM_SHARED. + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("mw"))); + } +} + +// Tests that gVisor's /proc/[pid]/smaps provides all of the fields we expect it +// to, which as of this writing is all fields provided by Linux 4.4. +TEST(ProcPidSmapsTest, GvisorFields) { + SKIP_IF(!IsRunningOnGvisor()); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + for (auto const& entry : entries) { + EXPECT_TRUE(entry.pss_kb); + EXPECT_TRUE(entry.referenced_kb); + EXPECT_TRUE(entry.anonymous_kb); + EXPECT_TRUE(entry.anon_huge_pages_kb); + EXPECT_TRUE(entry.shared_hugetlb_kb); + EXPECT_TRUE(entry.private_hugetlb_kb); + EXPECT_TRUE(entry.swap_kb); + EXPECT_TRUE(entry.swap_pss_kb); + EXPECT_THAT(entry.kernel_page_size_kb, Optional(kPageSize / 1024)); + EXPECT_THAT(entry.mmu_page_size_kb, Optional(kPageSize / 1024)); + EXPECT_TRUE(entry.locked_kb); + EXPECT_TRUE(entry.vm_flags); + } +} + +} // namespace + +} // namespace testing +} // namespace gvisor |