From c8857f72696c1097a427b75f4340969e20cc0e95 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 21 May 2019 17:04:58 -0700 Subject: Fix inconsistencies in ELF anonymous mappings * A segment with filesz == 0, memsz > 0 should be an anonymous only mapping. We were failing to load such an ELF. * Anonymous pages are always mapped RW, regardless of the segment protections. PiperOrigin-RevId: 249355239 Change-Id: I251e5c0ce8848cf8420c3aadf337b0d77b1ad991 --- pkg/sentry/loader/elf.go | 118 +++++++++++++++++--------------- test/syscalls/linux/exec_binary.cc | 135 +++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 56 deletions(-) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 97e32c8ba..900236531 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -223,13 +223,8 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { - // Alignment of vaddr and offset must match. We'll need to map on the - // page boundary. + // We must make a page-aligned mapping. adjust := usermem.Addr(phdr.Vaddr).PageOffset() - if adjust != usermem.Addr(phdr.Off).PageOffset() { - ctx.Infof("Alignment of vaddr %#x != off %#x", phdr.Vaddr, phdr.Off) - return syserror.ENOEXEC - } addr, ok := offset.AddLength(phdr.Vaddr) if !ok { @@ -239,17 +234,11 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. } addr -= usermem.Addr(adjust) - fileOffset := phdr.Off - adjust fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) return syserror.ENOEXEC } - memSize := phdr.Memsz + adjust - if memSize < phdr.Memsz { - ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) - return syserror.ENOEXEC - } ms, ok := usermem.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) @@ -257,51 +246,64 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. } mapSize := uint64(ms) - prot := progFlagsAsPerms(phdr.Flags) - mopts := memmap.MMapOpts{ - Length: mapSize, - Offset: fileOffset, - Addr: addr, - Fixed: true, - // Linux will happily allow conflicting segments to map over - // one another. - Unmap: true, - Private: true, - Perms: prot, - MaxPerms: usermem.AnyAccess, - } - defer func() { - if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() - } - }() - if err := f.ConfigureMMap(ctx, &mopts); err != nil { - ctx.Infof("File is not memory-mappable: %v", err) - return err - } - if _, err := m.MMap(ctx, mopts); err != nil { - ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) - return err - } - - // We need to clear the end of the last page that exceeds fileSize so - // we don't map part of the file beyond fileSize. - // - // Note that Linux *does not* clear the portion of the first page - // before phdr.Off. - if mapSize > fileSize { - zeroAddr, ok := addr.AddLength(fileSize) - if !ok { - panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize)) + if mapSize > 0 { + // This must result in a page-aligned offset. i.e., the original + // phdr.Off must have the same alignment as phdr.Vaddr. If that is not + // true, MMap will reject the mapping. 
+ fileOffset := phdr.Off - adjust
+
+ prot := progFlagsAsPerms(phdr.Flags)
+ mopts := memmap.MMapOpts{
+ Length: mapSize,
+ Offset: fileOffset,
+ Addr: addr,
+ Fixed: true,
+ // Linux will happily allow conflicting segments to map over
+ // one another.
+ Unmap: true,
+ Private: true,
+ Perms: prot,
+ MaxPerms: usermem.AnyAccess,
 }
- zeroSize := int64(mapSize - fileSize)
- if zeroSize < 0 {
- panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize)))
+ defer func() {
+ if mopts.MappingIdentity != nil {
+ mopts.MappingIdentity.DecRef()
+ }
+ }()
+ if err := f.ConfigureMMap(ctx, &mopts); err != nil {
+ ctx.Infof("File is not memory-mappable: %v", err)
+ return err
 }
- if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil {
- ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err)
+ if _, err := m.MMap(ctx, mopts); err != nil {
+ ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err)
 return err
 }
+
+ // We need to clear the end of the last page that exceeds fileSize so
+ // we don't map part of the file beyond fileSize.
+ //
+ // Note that Linux *does not* clear the portion of the first page
+ // before phdr.Off.
+ if mapSize > fileSize {
+ zeroAddr, ok := addr.AddLength(fileSize)
+ if !ok {
+ panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize))
+ }
+ zeroSize := int64(mapSize - fileSize)
+ if zeroSize < 0 {
+ panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize)))
+ }
+ if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil {
+ ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err)
+ return err
+ }
+ }
+ }
+
+ memSize := phdr.Memsz + adjust
+ if memSize < phdr.Memsz {
+ ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust)
+ return syserror.ENOEXEC
 }
 
 // Allocate more anonymous pages if necessary.
@@ -321,9 +323,13 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.
 Addr: anonAddr,
 // Fixed without Unmap will fail the mmap if something is
 // already at addr.
- Fixed: true,
- Private: true,
- Perms: progFlagsAsPerms(phdr.Flags),
+ Fixed: true,
+ Private: true,
+ // N.B. Linux uses vm_brk to map these pages, ignoring
+ // the segment protections, instead always mapping RW.
+ // These pages are not included in the final brk
+ // region.
+ Perms: usermem.ReadWrite,
 MaxPerms: usermem.AnyAccess,
 }); err != nil {
 ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err)
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index bdd6eb10b..91b55015c 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -401,6 +401,141 @@ TEST(ElfTest, DataSegment) {
 })));
 }
 
+// Additional pages beyond filesz are always RW.
+//
+// N.B. Linux uses set_brk -> vm_brk to map additional pages beyond filesz
+// (even though start_brk itself will always be beyond memsz). As a result,
+// the segment permissions don't apply; the mapping is always RW.
+TEST(ElfTest, ExtraMemPages) {
+ ElfBinary<64> elf = StandardElf();
+
+ // Create a standard ELF, but extend it to 1.5 pages. The second page will be
+ // the beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ // RWX segment. The extra anon page will be RW anyway.
+ //
+ // N.B. 
Linux uses clear_user to clear the end of the file-mapped page, which
+ // respects the mapping protections. Thus if we map this RO with memsz >
+ // (unaligned) filesz, then execve will fail with EFAULT. See padzero(elf_bss)
+ // in fs/binfmt_elf.c:load_elf_binary.
+ //
+ // N.N.B.B. The above only applies to the last segment. For earlier segments,
+ // the clear_user error is ignored.
+ phdr.p_flags = PF_R | PF_W | PF_X;
+ phdr.p_offset = kPageSize;
+ phdr.p_vaddr = 0x41000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child,
+ ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ // data + bss page from file.
+ {0x41000, 0x42000, true, true, true, true, kPageSize, 0, 0, 0,
+ file.path().c_str()},
+ // extra page from anon.
+ {0x42000, 0x43000, true, true, false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// An aligned segment with filesz == 0, memsz > 0 is anon-only.
+TEST(ElfTest, AnonOnlySegment) {
+ ElfBinary<64> elf = StandardElf();
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ // RO segment. The extra anon page will be RW anyway.
+ phdr.p_flags = PF_R;
+ phdr.p_offset = 0;
+ phdr.p_vaddr = 0x41000;
+ phdr.p_filesz = 0;
+ phdr.p_memsz = kPageSize - 0xe8;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ // UpdateOffsets adjusts p_vaddr and p_offset by the header size, but we need
+ // a page-aligned p_vaddr to get a truly anon-only page.
+ elf.phdrs[2].p_vaddr = 0x41000;
+ // N.B. p_offset is now unaligned, but Linux doesn't care since this is
+ // anon-only.
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child,
+ ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ // anon page.
+ {0x41000, 0x42000, true, true, false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// p_offset must have the same alignment as p_vaddr.
+TEST(ElfTest, UnalignedOffset) {
+ ElfBinary<64> elf = StandardElf();
+
+ // Unaligned offset.
+ elf.phdrs[1].p_offset += 1;
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+
+ // execve(2) should fail with EINVAL, but behavior varies between Linux and
+ // gVisor.
+ //
+ // On Linux, the new mm is committed before attempting to map into it. By the
+ // time we hit EINVAL in the segment mmap, the old mm is gone. Linux returns
+ // to an empty mm, which immediately segfaults.
+ //
+ // OTOH, gVisor maps into the new mm before committing it. Thus when it hits
+ // failure, the caller is still intact to receive the error. 
+ if (IsRunningOnGvisor()) { + ASSERT_EQ(execve_errno, EINVAL); + } else { + ASSERT_EQ(execve_errno, 0); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status; + } +} + // Linux will allow PT_LOAD segments to overlap. TEST(ElfTest, DirectlyOverlappingSegments) { // NOTE(b/37289926): see PIEOutOfOrderSegments. -- cgit v1.2.3
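A note on the arithmetic above: mapSegment's sizing logic can be read in isolation. The following is a minimal stand-alone sketch in Go, assuming 4KiB pages and replacing the usermem.Addr helpers (PageOffset, RoundUp, AddLength) with plain uint64 math; it omits the overflow checks the real code performs, and the example values are hypothetical, chosen to mirror the ExtraMemPages and AnonOnlySegment tests.

package main

import "fmt"

const pageSize = 0x1000

// roundUp rounds x up to the next page boundary.
func roundUp(x uint64) uint64 { return (x + pageSize - 1) &^ (pageSize - 1) }

// segmentSizes mirrors mapSegment's arithmetic: mapSize is the file-backed
// portion of the segment, anonSize the anonymous tail. mapSize == 0 means
// the file mmap is skipped entirely and the segment is anonymous-only.
func segmentSizes(vaddr, filesz, memsz uint64) (mapSize, anonSize uint64) {
	adjust := vaddr % pageSize // phdr.Vaddr's offset into its page
	mapSize = roundUp(filesz + adjust)
	memSize := memsz + adjust
	if memSize > mapSize {
		anonSize = roundUp(memSize - mapSize)
	}
	return mapSize, anonSize
}

func main() {
	// ExtraMemPages-style segment: half a page of file data with bss
	// extending into a second page: one file page plus one RW anon page.
	m, a := segmentSizes(0x41000, 0x800, 0x1800)
	fmt.Printf("file-backed %#x, anon %#x\n", m, a) // 0x1000, 0x1000

	// AnonOnlySegment-style segment: filesz == 0, memsz > 0. mapSize is 0,
	// so the segment is anonymous-only, the case this change fixes.
	m, a = segmentSizes(0x41000, 0, 0xf18)
	fmt.Printf("file-backed %#x, anon %#x\n", m, a) // 0x0, 0x1000
}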
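To see which of these paths the PT_LOAD segments of an existing binary would take, they can be classified by p_filesz and p_memsz. The sketch below is a hypothetical inspection helper, not part of this change; it uses only the debug/elf package from the Go standard library.

package main

import (
	"debug/elf"
	"fmt"
	"log"
	"os"
)

func main() {
	if len(os.Args) != 2 {
		log.Fatalf("usage: %s <elf-binary>", os.Args[0])
	}
	f, err := elf.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	for _, p := range f.Progs {
		if p.Type != elf.PT_LOAD {
			continue
		}
		switch {
		case p.Filesz == 0 && p.Memsz > 0:
			// Previously rejected by the sentry loader; now loaded as an
			// anonymous-only mapping. p_offset alignment is irrelevant
			// here since no file pages are mapped.
			fmt.Printf("vaddr %#x: anonymous-only, %#x bytes, mapped RW\n",
				p.Vaddr, p.Memsz)
		case p.Memsz > p.Filesz:
			fmt.Printf("vaddr %#x: %#x file-backed bytes + RW anonymous tail\n",
				p.Vaddr, p.Filesz)
		default:
			fmt.Printf("vaddr %#x: fully file-backed, %#x bytes\n",
				p.Vaddr, p.Filesz)
		}
	}
}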