summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/loader
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/loader')
-rw-r--r--pkg/sentry/loader/BUILD59
-rw-r--r--pkg/sentry/loader/elf.go637
-rw-r--r--pkg/sentry/loader/interpreter.go105
-rw-r--r--pkg/sentry/loader/loader.go277
-rw-r--r--pkg/sentry/loader/vdso.go382
-rw-r--r--pkg/sentry/loader/vdso_state.go47
6 files changed, 1507 insertions, 0 deletions
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
new file mode 100644
index 000000000..917ec8cc8
--- /dev/null
+++ b/pkg/sentry/loader/BUILD
@@ -0,0 +1,59 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_embed_data", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_embed_data(
+ name = "vdso_bin",
+ src = "//vdso:vdso.so",
+ package = "loader",
+ var = "vdsoBin",
+)
+
+go_stateify(
+ name = "loader_state",
+ srcs = [
+ "vdso.go",
+ "vdso_state.go",
+ ],
+ out = "loader_state.go",
+ package = "loader",
+)
+
+go_library(
+ name = "loader",
+ srcs = [
+ "elf.go",
+ "interpreter.go",
+ "loader.go",
+ "loader_state.go",
+ "vdso.go",
+ "vdso_state.go",
+ ":vdso_bin",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/loader",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi",
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/cpuid",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/uniqueid",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
new file mode 100644
index 000000000..d23dc1096
--- /dev/null
+++ b/pkg/sentry/loader/elf.go
@@ -0,0 +1,637 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "bytes"
+ "debug/elf"
+ "fmt"
+ "io"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ // elfMagic identifies an ELF file.
+ elfMagic = "\x7fELF"
+
+ // maxTotalPhdrSize is the maximum combined size of all program
+ // headers. Linux limits this to one page.
+ maxTotalPhdrSize = usermem.PageSize
+)
+
+var (
+ // header64Size is the size of elf.Header64.
+ header64Size = int(binary.Size(elf.Header64{}))
+
+ // Prog64Size is the size of elf.Prog64.
+ prog64Size = int(binary.Size(elf.Prog64{}))
+)
+
+func progFlagsAsPerms(f elf.ProgFlag) usermem.AccessType {
+ var p usermem.AccessType
+ if f&elf.PF_R == elf.PF_R {
+ p.Read = true
+ }
+ if f&elf.PF_W == elf.PF_W {
+ p.Write = true
+ }
+ if f&elf.PF_X == elf.PF_X {
+ p.Execute = true
+ }
+ return p
+}
+
+// elfInfo contains the metadata needed to load an ELF binary.
+type elfInfo struct {
+ // os is the target OS of the ELF.
+ os abi.OS
+
+ // arch is the target architecture of the ELF.
+ arch arch.Arch
+
+ // entry is the program entry point.
+ entry usermem.Addr
+
+ // phdrs are the program headers.
+ phdrs []elf.ProgHeader
+
+ // phdrSize is the size of a single program header in the ELF.
+ phdrSize int
+
+ // phdrOff is the offset of the program headers in the file.
+ phdrOff uint64
+
+ // sharedObject is true if the ELF represents a shared object.
+ sharedObject bool
+}
+
+// parseHeader parse the ELF header, verifying that this is a supported ELF
+// file and returning the ELF program headers.
+//
+// This is similar to elf.NewFile, except that it is more strict about what it
+// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
+//
+// ctx may be nil if f does not need it.
+func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+ // Check ident first; it will tell us the endianness of the rest of the
+ // structs.
+ var ident [elf.EI_NIDENT]byte
+ _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+ if err != nil {
+ log.Infof("Error reading ELF ident: %v", err)
+ // The entire ident array always exists.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+
+ // Only some callers pre-check the ELF magic.
+ if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) {
+ log.Infof("File is not an ELF")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // We only support 64-bit, little endian binaries
+ if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 {
+ log.Infof("Unsupported ELF class: %v", class)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB {
+ log.Infof("Unsupported ELF endianness: %v", endian)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ byteOrder := binary.LittleEndian
+
+ if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT {
+ log.Infof("Unsupported ELF version: %v", version)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ // EI_OSABI is ignored by Linux, which is the only OS supported.
+ os := abi.Linux
+
+ var hdr elf.Header64
+ hdrBuf := make([]byte, header64Size)
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+ if err != nil {
+ log.Infof("Error reading ELF header: %v", err)
+ // The entire header always exists.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+ binary.Unmarshal(hdrBuf, byteOrder, &hdr)
+
+ // We only support amd64.
+ if machine := elf.Machine(hdr.Machine); machine != elf.EM_X86_64 {
+ log.Infof("Unsupported ELF machine %d", machine)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ a := arch.AMD64
+
+ var sharedObject bool
+ elfType := elf.Type(hdr.Type)
+ switch elfType {
+ case elf.ET_EXEC:
+ sharedObject = false
+ case elf.ET_DYN:
+ sharedObject = true
+ default:
+ log.Infof("Unsupported ELF type %v", elfType)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ if int(hdr.Phentsize) != prog64Size {
+ log.Infof("Unsupported phdr size %d", hdr.Phentsize)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ totalPhdrSize := prog64Size * int(hdr.Phnum)
+ if totalPhdrSize < prog64Size {
+ log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum))
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if totalPhdrSize > maxTotalPhdrSize {
+ log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ phdrBuf := make([]byte, totalPhdrSize)
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+ if err != nil {
+ log.Infof("Error reading ELF phdrs: %v", err)
+ // If phdrs were specified, they should all exist.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+
+ phdrs := make([]elf.ProgHeader, hdr.Phnum)
+ for i := range phdrs {
+ var prog64 elf.Prog64
+ binary.Unmarshal(phdrBuf[:prog64Size], byteOrder, &prog64)
+ phdrBuf = phdrBuf[prog64Size:]
+ phdrs[i] = elf.ProgHeader{
+ Type: elf.ProgType(prog64.Type),
+ Flags: elf.ProgFlag(prog64.Flags),
+ Off: prog64.Off,
+ Vaddr: prog64.Vaddr,
+ Paddr: prog64.Paddr,
+ Filesz: prog64.Filesz,
+ Memsz: prog64.Memsz,
+ Align: prog64.Align,
+ }
+ }
+
+ return elfInfo{
+ os: os,
+ arch: a,
+ entry: usermem.Addr(hdr.Entry),
+ phdrs: phdrs,
+ phdrOff: hdr.Phoff,
+ phdrSize: prog64Size,
+ sharedObject: sharedObject,
+ }, nil
+}
+
+// mapSegment maps a phdr into the Task. offset is the offset to apply to
+// phdr.Vaddr.
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+ // Alignment of vaddr and offset must match. We'll need to map on the
+ // page boundary.
+ adjust := usermem.Addr(phdr.Vaddr).PageOffset()
+ if adjust != usermem.Addr(phdr.Off).PageOffset() {
+ ctx.Infof("Alignment of vaddr %#x != off %#x", phdr.Vaddr, phdr.Off)
+ return syserror.ENOEXEC
+ }
+
+ addr, ok := offset.AddLength(phdr.Vaddr)
+ if !ok {
+ // If offset != 0 we should have ensured this would fit.
+ ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset)
+ return syserror.ENOEXEC
+ }
+ addr -= usermem.Addr(adjust)
+
+ fileOffset := phdr.Off - adjust
+ fileSize := phdr.Filesz + adjust
+ if fileSize < phdr.Filesz {
+ ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust)
+ return syserror.ENOEXEC
+ }
+ memSize := phdr.Memsz + adjust
+ if memSize < phdr.Memsz {
+ ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust)
+ return syserror.ENOEXEC
+ }
+ ms, ok := usermem.Addr(fileSize).RoundUp()
+ if !ok {
+ ctx.Infof("fileSize %#x too large", fileSize)
+ return syserror.ENOEXEC
+ }
+ mapSize := uint64(ms)
+
+ prot := progFlagsAsPerms(phdr.Flags)
+ mopts := memmap.MMapOpts{
+ Length: mapSize,
+ Offset: fileOffset,
+ Addr: addr,
+ Fixed: true,
+ // Linux will happily allow conflicting segments to map over
+ // one another.
+ Unmap: true,
+ Private: true,
+ Perms: prot,
+ MaxPerms: usermem.AnyAccess,
+ }
+ if err := f.ConfigureMMap(ctx, &mopts); err != nil {
+ ctx.Infof("File is not memory-mappable: %v", err)
+ return err
+ }
+ if _, err := m.MMap(ctx, mopts); err != nil {
+ ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err)
+ return err
+ }
+
+ // We need to clear the end of the last page that exceeds fileSize so
+ // we don't map part of the file beyond fileSize.
+ //
+ // Note that Linux *does not* clear the portion of the first page
+ // before phdr.Off.
+ if mapSize > fileSize {
+ zeroAddr, ok := addr.AddLength(fileSize)
+ if !ok {
+ panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize))
+ }
+ zeroSize := int64(mapSize - fileSize)
+ if zeroSize < 0 {
+ panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize)))
+ }
+ if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil {
+ ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err)
+ return err
+ }
+ }
+
+ // Allocate more anonymous pages if necessary.
+ if mapSize < memSize {
+ anonAddr, ok := addr.AddLength(mapSize)
+ if !ok {
+ panic(fmt.Sprintf("anonymous memory doesn't fit in pre-sized range? %#x + %#x", addr, mapSize))
+ }
+ anonSize, ok := usermem.Addr(memSize - mapSize).RoundUp()
+ if !ok {
+ ctx.Infof("extra anon pages too large: %#x", memSize-mapSize)
+ return syserror.ENOEXEC
+ }
+
+ if _, err := m.MMap(ctx, memmap.MMapOpts{
+ Length: uint64(anonSize),
+ Addr: anonAddr,
+ // Fixed without Unmap will fail the mmap if something is
+ // already at addr.
+ Fixed: true,
+ Private: true,
+ Perms: progFlagsAsPerms(phdr.Flags),
+ MaxPerms: usermem.AnyAccess,
+ }); err != nil {
+ ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err)
+ return err
+ }
+ }
+
+ return nil
+}
+
+// loadedELF describes an ELF that has been successfully loaded.
+type loadedELF struct {
+ // os is the target OS of the ELF.
+ os abi.OS
+
+ // arch is the target architecture of the ELF.
+ arch arch.Arch
+
+ // entry is the entry point of the ELF.
+ entry usermem.Addr
+
+ // start is the end of the ELF.
+ start usermem.Addr
+
+ // end is the end of the ELF.
+ end usermem.Addr
+
+ // interpter is the path to the ELF interpreter.
+ interpreter string
+
+ // phdrAddr is the address of the ELF program headers.
+ phdrAddr usermem.Addr
+
+ // phdrSize is the size of a single program header in the ELF.
+ phdrSize int
+
+ // phdrNum is the number of program headers.
+ phdrNum int
+
+ // auxv contains a subset of ELF-specific auxiliary vector entries:
+ // * AT_PHDR
+ // * AT_PHENT
+ // * AT_PHNUM
+ // * AT_BASE
+ // * AT_ENTRY
+ auxv arch.Auxv
+}
+
+// loadParsedELF loads f into mm.
+//
+// info is the parsed elfInfo from the header.
+//
+// It does not load the ELF interpreter, or return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+ first := true
+ var start, end usermem.Addr
+ var interpreter string
+ for _, phdr := range info.phdrs {
+ switch phdr.Type {
+ case elf.PT_LOAD:
+ vaddr := usermem.Addr(phdr.Vaddr)
+ if first {
+ first = false
+ start = vaddr
+ }
+ if vaddr < end {
+ ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+ var ok bool
+ end, ok = vaddr.AddLength(phdr.Memsz)
+ if !ok {
+ ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ case elf.PT_INTERP:
+ if phdr.Filesz > syscall.PathMax {
+ ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ path := make([]byte, phdr.Filesz)
+ _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+ if err != nil {
+ ctx.Infof("Error reading PT_INTERP path: %v", err)
+ // If an interpreter was specified, it should exist.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ if path[len(path)-1] != 0 {
+ ctx.Infof("PT_INTERP path not NUL-terminated: %v", path)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ // Strip NUL-terminator from string.
+ interpreter = string(path[:len(path)-1])
+ }
+ }
+
+ // Shared objects don't have fixed load addresses. We need to pick a
+ // base address big enough to fit all segments, so we first create a
+ // mapping for the total size just to find a region that is big enough.
+ //
+ // It is safe to unmap it immediately with racing with another mapping
+ // because we are the only one in control of the MemoryManager.
+ //
+ // Note that the vaddr of the first PT_LOAD segment is ignored when
+ // choosing the load address (even if it is non-zero). The vaddr does
+ // become an offset from that load address.
+ var offset usermem.Addr
+ if info.sharedObject {
+ totalSize := end - start
+ totalSize, ok := totalSize.RoundUp()
+ if !ok {
+ ctx.Infof("ELF PT_LOAD segments too big")
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ var err error
+ offset, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: uint64(totalSize),
+ Addr: sharedLoadOffset,
+ Private: true,
+ })
+ if err != nil {
+ ctx.Infof("Error allocating address space for shared object: %v", err)
+ return loadedELF{}, err
+ }
+ if err := m.MUnmap(ctx, offset, uint64(totalSize)); err != nil {
+ panic(fmt.Sprintf("Failed to unmap base address: %v", err))
+ }
+
+ start, ok = start.AddLength(uint64(offset))
+ if !ok {
+ panic(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset))
+ }
+
+ end, ok = end.AddLength(uint64(offset))
+ if !ok {
+ panic(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset))
+ }
+
+ info.entry, ok = info.entry.AddLength(uint64(offset))
+ if !ok {
+ ctx.Infof("Entrypoint %#x + offset %#x overflows? Is the entrypoint within a segment?", info.entry, offset)
+ return loadedELF{}, err
+ }
+ }
+
+ // Map PT_LOAD segments.
+ for _, phdr := range info.phdrs {
+ switch phdr.Type {
+ case elf.PT_LOAD:
+ if phdr.Memsz == 0 {
+ // No need to load segments with size 0, but
+ // they exist in some binaries.
+ continue
+ }
+
+ if err := mapSegment(ctx, m, f, &phdr, offset); err != nil {
+ ctx.Infof("Failed to map PT_LOAD segment: %+v", phdr)
+ return loadedELF{}, err
+ }
+ }
+ }
+
+ // This assumes that the first segment contains the ELF headers. This
+ // may not be true in a malformed ELF, but Linux makes the same
+ // assumption.
+ phdrAddr, ok := start.AddLength(info.phdrOff)
+ if !ok {
+ ctx.Warningf("ELF start address %#x + phdr offset %#x overflows", start, info.phdrOff)
+ phdrAddr = 0
+ }
+
+ return loadedELF{
+ os: info.os,
+ arch: info.arch,
+ entry: info.entry,
+ start: start,
+ end: end,
+ interpreter: interpreter,
+ phdrAddr: phdrAddr,
+ phdrSize: info.phdrSize,
+ phdrNum: len(info.phdrs),
+ }, nil
+}
+
+// loadInitialELF loads f into mm.
+//
+// It creates an arch.Context for the ELF and prepares the mm for this arch.
+//
+// It does not load the ELF interpreter, or return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+// * f is the first ELF loaded into m
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ ctx.Infof("Failed to parse initial ELF: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ // Create the arch.Context now so we can prepare the mmap layout before
+ // mapping anything.
+ ac := arch.New(info.arch, fs)
+
+ l, err := m.SetMmapLayout(ac, limits.FromContext(ctx))
+ if err != nil {
+ ctx.Warningf("Failed to set mmap layout: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ // PIELoadAddress tries to move the ELF out of the way of the default
+ // mmap base to ensure that the initial brk has sufficient space to
+ // grow.
+ le, err := loadParsedELF(ctx, m, f, info, ac.PIELoadAddress(l))
+ return le, ac, err
+}
+
+// loadInterpreterELF loads f into mm.
+//
+// The interpreter must be for the same OS/Arch as the initial ELF.
+//
+// It does not return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ if err == syserror.ENOEXEC {
+ // Bad interpreter.
+ err = syserror.ELIBBAD
+ }
+ return loadedELF{}, err
+ }
+
+ if info.os != initial.os {
+ ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os)
+ return loadedELF{}, syserror.ELIBBAD
+ }
+ if info.arch != initial.arch {
+ ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch)
+ return loadedELF{}, syserror.ELIBBAD
+ }
+
+ // The interpreter is not given a load offset, as its location does not
+ // affect brk.
+ return loadParsedELF(ctx, m, f, info, 0)
+}
+
+// loadELF loads f into the Task address space.
+//
+// If loadELF returns ErrSwitchFile it should be called again with the returned
+// path and argv.
+//
+// Preconditions:
+// * f is an ELF file
+func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+ bin, ac, err := loadInitialELF(ctx, m, fs, f)
+ if err != nil {
+ ctx.Infof("Error loading binary: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ var interp loadedELF
+ if bin.interpreter != "" {
+ d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter)
+ if err != nil {
+ ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
+ return loadedELF{}, nil, err
+ }
+ defer i.DecRef()
+ // We don't need the Dirent.
+ d.DecRef()
+
+ interp, err = loadInterpreterELF(ctx, m, i, bin)
+ if err != nil {
+ ctx.Infof("Error loading interpreter: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ if interp.interpreter != "" {
+ // No recursive interpreters!
+ ctx.Infof("Interpreter requires an interpreter")
+ return loadedELF{}, nil, syserror.ENOEXEC
+ }
+ }
+
+ // ELF-specific auxv entries.
+ bin.auxv = arch.Auxv{
+ arch.AuxEntry{linux.AT_PHDR, bin.phdrAddr},
+ arch.AuxEntry{linux.AT_PHENT, usermem.Addr(bin.phdrSize)},
+ arch.AuxEntry{linux.AT_PHNUM, usermem.Addr(bin.phdrNum)},
+ arch.AuxEntry{linux.AT_ENTRY, bin.entry},
+ }
+ if bin.interpreter != "" {
+ bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, interp.start})
+
+ // Start in the interpreter.
+ // N.B. AT_ENTRY above contains the *original* entry point.
+ bin.entry = interp.entry
+ }
+
+ return bin, ac, nil
+}
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
new file mode 100644
index 000000000..b8ecbe92f
--- /dev/null
+++ b/pkg/sentry/loader/interpreter.go
@@ -0,0 +1,105 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "bytes"
+ "io"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ // interpreterScriptMagic identifies an interpreter script.
+ interpreterScriptMagic = "#!"
+
+ // interpMaxLineLength is the maximum length for the first line of an
+ // interpreter script.
+ //
+ // From execve(2): "A maximum line length of 127 characters is allowed
+ // for the first line in a #! executable shell script."
+ interpMaxLineLength = 127
+)
+
+// parseInterpreterScript returns the interpreter path and argv.
+func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv, envv []string) (newpath string, newargv []string, err error) {
+ line := make([]byte, interpMaxLineLength)
+ n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+ // Short read is OK.
+ if err != nil && err != io.ErrUnexpectedEOF {
+ if err == io.EOF {
+ err = syserror.ENOEXEC
+ }
+ return "", []string{}, err
+ }
+ line = line[:n]
+
+ if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) {
+ return "", []string{}, syserror.ENOEXEC
+ }
+ // Ignore #!.
+ line = line[2:]
+
+ // Ignore everything after newline.
+ // Linux silently truncates the remainder of the line if it exceeds
+ // interpMaxLineLength.
+ i := bytes.IndexByte(line, '\n')
+ if i > 0 {
+ line = line[:i]
+ }
+
+ // Skip any whitespace before the interpeter.
+ line = bytes.TrimLeft(line, " \t")
+
+ // Linux only looks for a space or tab delimiting the interpreter and
+ // arg.
+ //
+ // execve(2): "On Linux, the entire string following the interpreter
+ // name is passed as a single argument to the interpreter, and this
+ // string can include white space."
+ interp := line
+ var arg []byte
+ i = bytes.IndexAny(line, " \t")
+ if i >= 0 {
+ interp = line[:i]
+ if i+1 < len(line) {
+ arg = line[i+1:]
+ }
+ }
+
+ // Build the new argument list:
+ //
+ // 1. The interpreter.
+ newargv = append(newargv, string(interp))
+
+ // 2. The optional interpreter argument.
+ if len(arg) > 0 {
+ newargv = append(newargv, string(arg))
+ }
+
+ // 3. The original arguments. The original argv[0] is replaced with the
+ // full script filename.
+ if len(argv) > 0 {
+ argv[0] = filename
+ } else {
+ argv = []string{filename}
+ }
+ newargv = append(newargv, argv...)
+
+ return string(interp), newargv, nil
+}
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
new file mode 100644
index 000000000..94c281b72
--- /dev/null
+++ b/pkg/sentry/loader/loader.go
@@ -0,0 +1,277 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package loader loads a binary into a MemoryManager.
+package loader
+
+import (
+ "bytes"
+ "crypto/rand"
+ "io"
+ "path"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// readFull behaves like io.ReadFull for an *fs.File.
+func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ var total int64
+ for dst.NumBytes() > 0 {
+ n, err := f.Preadv(ctx, dst, offset+total)
+ total += n
+ if err == io.EOF && total != 0 {
+ return total, io.ErrUnexpectedEOF
+ } else if err != nil {
+ return total, err
+ }
+ dst = dst.DropFirst64(n)
+ }
+ return total, nil
+}
+
+// openPath opens name for loading.
+//
+// openPath returns the fs.Dirent and an *fs.File for name, which is not
+// installed in the Task FDMap. The caller takes ownership of both.
+//
+// name must be a readable, executable, regular file.
+func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) {
+ d, err := mm.FindInode(ctx, root, wd, name, maxTraversals)
+ if err != nil {
+ return nil, nil, err
+ }
+ defer d.DecRef()
+
+ perms := fs.PermMask{
+ // TODO: Linux requires only execute permission,
+ // not read. However, our backing filesystems may prevent us
+ // from reading the file without read permission.
+ //
+ // Additionally, a task with a non-readable executable has
+ // additional constraints on access via ptrace and procfs.
+ Read: true,
+ Execute: true,
+ }
+ if err := d.Inode.CheckPermission(ctx, perms); err != nil {
+ return nil, nil, err
+ }
+
+ // If they claim it's a directory, then make sure.
+ //
+ // N.B. we reject directories below, but we must first reject
+ // non-directories passed as directories.
+ if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
+ return nil, nil, syserror.ENOTDIR
+ }
+
+ // No exec-ing directories, pipes, etc!
+ if !fs.IsRegular(d.Inode.StableAttr) {
+ ctx.Infof("Error regularing %s: %v", name, d.Inode.StableAttr)
+ return nil, nil, syserror.EACCES
+ }
+
+ // Create a new file.
+ file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+ if err != nil {
+ return nil, nil, err
+ }
+
+ // We must be able to read at arbitrary offsets.
+ if !file.Flags().Pread {
+ file.DecRef()
+ ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags())
+ return nil, nil, syserror.EACCES
+ }
+
+ // Grab a reference for the caller.
+ d.IncRef()
+ return d, file, nil
+}
+
+// allocStack allocates and maps a stack in to any available part of the address space.
+func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch.Stack, error) {
+ ar, err := m.MapStack(ctx)
+ if err != nil {
+ return nil, err
+ }
+ return &arch.Stack{a, m, ar.End}, nil
+}
+
+const (
+ // maxLoaderAttempts is the maximum number of attempts to try to load
+ // an interpreter scripts, to prevent loops. 6 (inital + 5 changes) is
+ // what the Linux kernel allows (fs/exec.c:search_binary_handler).
+ maxLoaderAttempts = 6
+)
+
+// loadPath resolves filename to a binary and loads it.
+//
+// It returns:
+// * loadedELF, description of the loaded binary
+// * arch.Context matching the binary arch
+// * fs.Dirent of the binary file
+// * Possibly updated argv
+func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+ for i := 0; i < maxLoaderAttempts; i++ {
+ d, f, err := openPath(ctx, mounts, root, wd, maxTraversals, filename)
+ if err != nil {
+ ctx.Infof("Error opening %s: %v", filename, err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ defer f.DecRef()
+ // We will return d in the successful case, but defer a DecRef
+ // for intermediate loops and failure cases.
+ defer d.DecRef()
+
+ // Check the header. Is this an ELF or interpreter script?
+ var hdr [4]uint8
+ // N.B. We assume that reading from a regular file cannot block.
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(hdr[:]), 0)
+ // Allow unexpected EOF, as a valid executable could be only three
+ // bytes (e.g., #!a).
+ if err != nil && err != io.ErrUnexpectedEOF {
+ if err == io.EOF {
+ err = syserror.ENOEXEC
+ }
+ return loadedELF{}, nil, nil, nil, err
+ }
+
+ switch {
+ case bytes.Equal(hdr[:], []byte(elfMagic)):
+ loaded, ac, err := loadELF(ctx, m, mounts, root, wd, maxTraversals, fs, f)
+ if err != nil {
+ ctx.Infof("Error loading ELF: %v", err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ // An ELF is always terminal. Hold on to d.
+ d.IncRef()
+ return loaded, ac, d, argv, err
+ case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
+ newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv, envv)
+ if err != nil {
+ ctx.Infof("Error loading interpreter script: %v", err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ filename = newpath
+ argv = newargv
+ default:
+ ctx.Infof("Unknown magic: %v", hdr)
+ return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
+ }
+ }
+
+ return loadedELF{}, nil, nil, nil, syserror.ELOOP
+}
+
+// Load loads filename into a MemoryManager.
+//
+// If Load returns ErrSwitchFile it should be called again with the returned
+// path and argv.
+//
+// Preconditions:
+// * The Task MemoryManager is empty.
+// * Load is called on the Task goroutine.
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) {
+ // Load the binary itself.
+ loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv)
+ if err != nil {
+ ctx.Infof("Failed to load %s: %v", filename, err)
+ return 0, nil, "", err
+ }
+ defer d.DecRef()
+
+ // Load the VDSO.
+ vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded)
+ if err != nil {
+ ctx.Infof("Error loading VDSO: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Setup the heap. brk starts at the next page after the end of the
+ // binary. Userspace can assume that the remainer of the page after
+ // loaded.end is available for its use.
+ e, ok := loaded.end.RoundUp()
+ if !ok {
+ ctx.Warningf("brk overflows: %#x", loaded.end)
+ return 0, nil, "", syserror.ENOEXEC
+ }
+ m.BrkSetup(ctx, e)
+
+ // Allocate our stack.
+ stack, err := allocStack(ctx, m, ac)
+ if err != nil {
+ ctx.Infof("Failed to allocate stack: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Push the original filename to the stack, for AT_EXECFN.
+ execfn, err := stack.Push(filename)
+ if err != nil {
+ ctx.Infof("Failed to push exec filename: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Push 16 random bytes on the stack which AT_RANDOM will point to.
+ var b [16]byte
+ if _, err := rand.Read(b[:]); err != nil {
+ ctx.Infof("Failed to read random bytes: %v", err)
+ return 0, nil, "", err
+ }
+ random, err := stack.Push(b)
+ if err != nil {
+ ctx.Infof("Failed to push random bytes: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Add generic auxv entries
+ auxv := append(loaded.auxv, arch.Auxv{
+ arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC},
+ arch.AuxEntry{linux.AT_EXECFN, execfn},
+ arch.AuxEntry{linux.AT_RANDOM, random},
+ arch.AuxEntry{linux.AT_PAGESZ, usermem.PageSize},
+ arch.AuxEntry{linux.AT_SYSINFO_EHDR, vdsoAddr},
+ }...)
+ auxv = append(auxv, extraAuxv...)
+
+ sl, err := stack.Load(argv, envv, auxv)
+ if err != nil {
+ ctx.Infof("Failed to load stack: %v", err)
+ return 0, nil, "", err
+ }
+
+ m.SetArgvStart(sl.ArgvStart)
+ m.SetArgvEnd(sl.ArgvEnd)
+ m.SetEnvvStart(sl.EnvvStart)
+ m.SetEnvvEnd(sl.EnvvEnd)
+ m.SetAuxv(auxv)
+ m.SetExecutable(d)
+
+ ac.SetIP(uintptr(loaded.entry))
+ ac.SetStack(uintptr(stack.Bottom))
+
+ name := path.Base(filename)
+ if len(name) > linux.TASK_COMM_LEN-1 {
+ name = name[:linux.TASK_COMM_LEN-1]
+ }
+
+ return loaded.os, ac, name, nil
+}
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
new file mode 100644
index 000000000..ce4f6f5d9
--- /dev/null
+++ b/pkg/sentry/loader/vdso.go
@@ -0,0 +1,382 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "debug/elf"
+ "fmt"
+ "io"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// byteReaderFileOperations implements fs.FileOperations for reading
+// from a []byte source.
+type byteReader struct {
+ fsutil.NoopRelease
+ fsutil.PipeSeek
+ fsutil.NotDirReaddir
+ fsutil.NoFsync
+ fsutil.NoopFlush
+ fsutil.NoMMap
+ fsutil.NoIoctl
+ waiter.AlwaysReady
+ data []byte
+}
+
+type fileContext struct {
+ context.Context
+}
+
+func (f *fileContext) Value(key interface{}) interface{} {
+ switch key {
+ case uniqueid.CtxGlobalUniqueID:
+ return uint64(0)
+ default:
+ return f.Context.Value(key)
+ }
+}
+
+func newByteReaderFile(data []byte) *fs.File {
+ dirent := fs.NewTransientDirent(nil)
+ flags := fs.FileFlags{Read: true, Pread: true}
+ return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
+ data: data,
+ })
+}
+
+func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset >= int64(len(b.data)) {
+ return 0, io.EOF
+ }
+ n, err := dst.CopyOut(ctx, b.data[offset:])
+ return int64(n), err
+}
+
+func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ panic("Write not supported")
+}
+
+// validateVDSO checks that the VDSO can be loaded by loadVDSO.
+//
+// VDSOs are special (see below). Since we are going to map the VDSO directly
+// rather than using a normal loading process, we require that the PT_LOAD
+// segments have the same layout in the ELF as they expect to have in memory.
+//
+// Namely, this means that we must verify:
+// * PT_LOAD file offsets are equivalent to the memory offset from the first
+// segment.
+// * No extra zeroed space (memsz) is required.
+// * PT_LOAD segments are in order.
+// * No two PT_LOAD segments occupy parts of the same page.
+// * PT_LOAD segments don't extend beyond the end of the file.
+//
+// ctx may be nil if f does not need it.
+func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ log.Infof("Unable to parse VDSO header: %v", err)
+ return elfInfo{}, err
+ }
+
+ var first *elf.ProgHeader
+ var prev *elf.ProgHeader
+ var prevEnd usermem.Addr
+ for i, phdr := range info.phdrs {
+ if phdr.Type != elf.PT_LOAD {
+ continue
+ }
+
+ if first == nil {
+ first = &info.phdrs[i]
+ if phdr.Off != 0 {
+ log.Warningf("First PT_LOAD segment has non-zero file offset")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ }
+
+ memoryOffset := phdr.Vaddr - first.Vaddr
+ if memoryOffset != phdr.Off {
+ log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // memsz larger than filesz means that extra zeroed space should be
+ // provided at the end of the segment. Since we are mapping the ELF
+ // directly, we don't want to just overwrite part of the ELF with
+ // zeroes.
+ if phdr.Memsz != phdr.Filesz {
+ log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ start := usermem.Addr(memoryOffset)
+ end, ok := start.AddLength(phdr.Memsz)
+ if !ok {
+ log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if uint64(end) > size {
+ log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ if prev != nil {
+ if start < prevEnd {
+ log.Warningf("PT_LOAD segments out of order")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // We mprotect entire pages, so each segment must be in
+ // its own page.
+ prevEndPage := prevEnd.RoundDown()
+ startPage := start.RoundDown()
+ if prevEndPage >= startPage {
+ log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ }
+ prev = &info.phdrs[i]
+ prevEnd = end
+ }
+
+ return info, nil
+}
+
+// VDSO describes a VDSO.
+//
+// NOTE: to support multiple architectures or operating systems, this
+// would need to contain a VDSO for each.
+type VDSO struct {
+ // ParamPage is the VDSO parameter page. This page should be updated to
+ // inform the VDSO for timekeeping data.
+ ParamPage *mm.SpecialMappable
+
+ // vdso is the VDSO ELF itself.
+ vdso *mm.SpecialMappable
+
+ // os is the operating system targeted by the VDSO.
+ os abi.OS
+
+ // arch is the architecture targeted by the VDSO.
+ arch arch.Arch
+
+ // phdrs are the VDSO ELF phdrs.
+ phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
+}
+
+// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
+// param page for updating by the kernel.
+func PrepareVDSO(p platform.Platform) (*VDSO, error) {
+ vdsoFile := newByteReaderFile(vdsoBin)
+
+ // First make sure the VDSO is valid. vdsoFile does not use ctx, so a
+ // nil context can be passed.
+ info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
+ if err != nil {
+ return nil, err
+ }
+
+ // Then copy it into a VDSO mapping.
+ size, ok := usermem.Addr(len(vdsoBin)).RoundUp()
+ if !ok {
+ return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin))
+ }
+
+ vdso, err := p.Memory().Allocate(uint64(size), usage.System)
+ if err != nil {
+ return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err)
+ }
+
+ ims, err := p.Memory().MapInternal(vdso, usermem.ReadWrite)
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to map VDSO memory: %v", err)
+ }
+
+ _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin)))
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err)
+ }
+
+ // Finally, allocate a param page for this VDSO.
+ paramPage, err := p.Memory().Allocate(usermem.PageSize, usage.System)
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err)
+ }
+
+ return &VDSO{
+ ParamPage: mm.NewSpecialMappable("[vvar]", p, paramPage),
+ // TODO: Don't advertise the VDSO, as some applications may
+ // not be able to handle multiple [vdso] hints.
+ vdso: mm.NewSpecialMappable("", p, vdso),
+ phdrs: info.phdrs,
+ }, nil
+}
+
+// loadVDSO loads the VDSO into m.
+//
+// VDSOs are special.
+//
+// VDSOs are fully position independent. However, instead of loading a VDSO
+// like a normal ELF binary, mapping only the PT_LOAD segments, the Linux
+// kernel simply directly maps the entire file into process memory, with very
+// little real ELF parsing.
+//
+// NOTE: This means that userspace can, and unfortunately does,
+// depend on parts of the ELF that would normally not be mapped. To maintain
+// compatibility with such binaries, we load the VDSO much like Linux.
+//
+// loadVDSO takes a reference on the VDSO and parameter page FrameRegions.
+func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) {
+ if v == nil {
+ // Should be used only by tests.
+ ctx.Warningf("No VDSO provided, skipping VDSO mapping")
+ return 0, nil
+ }
+
+ if v.os != bin.os {
+ ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os)
+ return 0, syserror.ENOEXEC
+ }
+ if v.arch != bin.arch {
+ ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch)
+ return 0, syserror.ENOEXEC
+ }
+
+ // Reserve address space for the VDSO and its parameter page, which is
+ // mapped just before the VDSO.
+ mapSize := v.vdso.Length() + v.ParamPage.Length()
+ addr, err := m.MMap(ctx, memmap.MMapOpts{
+ Length: mapSize,
+ Private: true,
+ })
+ if err != nil {
+ ctx.Infof("Unable to reserve VDSO address space: %v", err)
+ return 0, err
+ }
+
+ // Now map the param page.
+ _, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: v.ParamPage.Length(),
+ MappingIdentity: v.ParamPage,
+ Mappable: v.ParamPage,
+ Addr: addr,
+ Fixed: true,
+ Unmap: true,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.Read,
+ })
+ if err != nil {
+ ctx.Infof("Unable to map VDSO param page: %v", err)
+ return 0, err
+ }
+
+ // Now map the VDSO itself.
+ vdsoAddr, ok := addr.AddLength(v.ParamPage.Length())
+ if !ok {
+ panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length()))
+ }
+ _, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: v.vdso.Length(),
+ MappingIdentity: v.vdso,
+ Mappable: v.vdso,
+ Addr: vdsoAddr,
+ Fixed: true,
+ Unmap: true,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ ctx.Infof("Unable to map VDSO: %v", err)
+ return 0, err
+ }
+
+ vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length())
+ if !ok {
+ panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length()))
+ }
+
+ // Set additional protections for the individual segments.
+ var first *elf.ProgHeader
+ for i, phdr := range v.phdrs {
+ if phdr.Type != elf.PT_LOAD {
+ continue
+ }
+
+ if first == nil {
+ first = &v.phdrs[i]
+ }
+
+ memoryOffset := phdr.Vaddr - first.Vaddr
+ segAddr, ok := vdsoAddr.AddLength(memoryOffset)
+ if !ok {
+ ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset)
+ return 0, syserror.ENOEXEC
+ }
+ segPage := segAddr.RoundDown()
+ segSize := usermem.Addr(phdr.Memsz)
+ segSize, ok = segSize.AddLength(segAddr.PageOffset())
+ if !ok {
+ ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset())
+ return 0, syserror.ENOEXEC
+ }
+ segSize, ok = segSize.RoundUp()
+ if !ok {
+ ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset())
+ return 0, syserror.ENOEXEC
+ }
+ segEnd, ok := segPage.AddLength(uint64(segSize))
+ if !ok {
+ ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize)
+ return 0, syserror.ENOEXEC
+ }
+ if segEnd > vdsoEnd {
+ ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd)
+ return 0, syserror.ENOEXEC
+ }
+
+ perms := progFlagsAsPerms(phdr.Flags)
+ if perms != usermem.Read {
+ if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil {
+ ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err)
+ return 0, syserror.ENOEXEC
+ }
+ }
+ }
+
+ return vdsoAddr, nil
+}
diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go
new file mode 100644
index 000000000..92004ad9e
--- /dev/null
+++ b/pkg/sentry/loader/vdso_state.go
@@ -0,0 +1,47 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "debug/elf"
+)
+
+type elfProgHeader struct {
+ Type elf.ProgType
+ Flags elf.ProgFlag
+ Off uint64
+ Vaddr uint64
+ Paddr uint64
+ Filesz uint64
+ Memsz uint64
+ Align uint64
+}
+
+// savePhdrs is invoked by stateify.
+func (v *VDSO) savePhdrs() []elfProgHeader {
+ s := make([]elfProgHeader, 0, len(v.phdrs))
+ for _, h := range v.phdrs {
+ s = append(s, elfProgHeader(h))
+ }
+ return s
+}
+
+// loadPhdrs is invoked by stateify.
+func (v *VDSO) loadPhdrs(s []elfProgHeader) {
+ v.phdrs = make([]elf.ProgHeader, 0, len(s))
+ for _, h := range s {
+ v.phdrs = append(v.phdrs, elf.ProgHeader(h))
+ }
+}