author    Googler <noreply@google.com>  2018-04-27 10:37:02 -0700
committer Adin Scannell <ascannell@google.com>  2018-04-28 01:44:26 -0400
commit    d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree      54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/loader
parent    f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/loader')
-rw-r--r--  pkg/sentry/loader/BUILD            |  59
-rw-r--r--  pkg/sentry/loader/elf.go           | 637
-rw-r--r--  pkg/sentry/loader/interpreter.go   | 105
-rw-r--r--  pkg/sentry/loader/loader.go        | 277
-rw-r--r--  pkg/sentry/loader/vdso.go          | 382
-rw-r--r--  pkg/sentry/loader/vdso_state.go    |  47
6 files changed, 1507 insertions, 0 deletions
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
new file mode 100644
index 000000000..917ec8cc8
--- /dev/null
+++ b/pkg/sentry/loader/BUILD
@@ -0,0 +1,59 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_embed_data", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_embed_data(
+ name = "vdso_bin",
+ src = "//vdso:vdso.so",
+ package = "loader",
+ var = "vdsoBin",
+)
+
+go_stateify(
+ name = "loader_state",
+ srcs = [
+ "vdso.go",
+ "vdso_state.go",
+ ],
+ out = "loader_state.go",
+ package = "loader",
+)
+
+go_library(
+ name = "loader",
+ srcs = [
+ "elf.go",
+ "interpreter.go",
+ "loader.go",
+ "loader_state.go",
+ "vdso.go",
+ "vdso_state.go",
+ ":vdso_bin",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/loader",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi",
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/cpuid",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/uniqueid",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
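The go_embed_data rule above compiles //vdso:vdso.so into a generated Go source file in package loader that declares the vdsoBin variable consumed by vdso.go. A minimal sketch of the assumed shape of that generated file (the exact output of go_embed_data differs in formatting, and the byte literal here is a placeholder):

package loader

// vdsoBin holds the raw bytes of //vdso:vdso.so; the data begins with
// the ELF magic.
var vdsoBin = []byte("\x7fELF...")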
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
new file mode 100644
index 000000000..d23dc1096
--- /dev/null
+++ b/pkg/sentry/loader/elf.go
@@ -0,0 +1,637 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "bytes"
+ "debug/elf"
+ "fmt"
+ "io"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ // elfMagic identifies an ELF file.
+ elfMagic = "\x7fELF"
+
+ // maxTotalPhdrSize is the maximum combined size of all program
+ // headers. Linux limits this to one page.
+ maxTotalPhdrSize = usermem.PageSize
+)
+
+var (
+ // header64Size is the size of elf.Header64.
+ header64Size = int(binary.Size(elf.Header64{}))
+
+ // prog64Size is the size of elf.Prog64.
+ prog64Size = int(binary.Size(elf.Prog64{}))
+)
+
+func progFlagsAsPerms(f elf.ProgFlag) usermem.AccessType {
+ var p usermem.AccessType
+ if f&elf.PF_R == elf.PF_R {
+ p.Read = true
+ }
+ if f&elf.PF_W == elf.PF_W {
+ p.Write = true
+ }
+ if f&elf.PF_X == elf.PF_X {
+ p.Execute = true
+ }
+ return p
+}
+
+// elfInfo contains the metadata needed to load an ELF binary.
+type elfInfo struct {
+ // os is the target OS of the ELF.
+ os abi.OS
+
+ // arch is the target architecture of the ELF.
+ arch arch.Arch
+
+ // entry is the program entry point.
+ entry usermem.Addr
+
+ // phdrs are the program headers.
+ phdrs []elf.ProgHeader
+
+ // phdrSize is the size of a single program header in the ELF.
+ phdrSize int
+
+ // phdrOff is the offset of the program headers in the file.
+ phdrOff uint64
+
+ // sharedObject is true if the ELF represents a shared object.
+ sharedObject bool
+}
+
+// parseHeader parses the ELF header, verifying that this is a supported ELF
+// file and returning the ELF program headers.
+//
+// This is similar to elf.NewFile, except that it is more strict about what it
+// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
+//
+// ctx may be nil if f does not need it.
+func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+ // Check ident first; it will tell us the endianness of the rest of the
+ // structs.
+ var ident [elf.EI_NIDENT]byte
+ _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+ if err != nil {
+ log.Infof("Error reading ELF ident: %v", err)
+ // The entire ident array always exists.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+
+ // Only some callers pre-check the ELF magic.
+ if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) {
+ log.Infof("File is not an ELF")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // We only support 64-bit, little-endian binaries.
+ if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 {
+ log.Infof("Unsupported ELF class: %v", class)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB {
+ log.Infof("Unsupported ELF endianness: %v", endian)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ byteOrder := binary.LittleEndian
+
+ if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT {
+ log.Infof("Unsupported ELF version: %v", version)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ // EI_OSABI is ignored by Linux, which is the only OS supported.
+ os := abi.Linux
+
+ var hdr elf.Header64
+ hdrBuf := make([]byte, header64Size)
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+ if err != nil {
+ log.Infof("Error reading ELF header: %v", err)
+ // The entire header always exists.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+ binary.Unmarshal(hdrBuf, byteOrder, &hdr)
+
+ // We only support amd64.
+ if machine := elf.Machine(hdr.Machine); machine != elf.EM_X86_64 {
+ log.Infof("Unsupported ELF machine %d", machine)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ a := arch.AMD64
+
+ var sharedObject bool
+ elfType := elf.Type(hdr.Type)
+ switch elfType {
+ case elf.ET_EXEC:
+ sharedObject = false
+ case elf.ET_DYN:
+ sharedObject = true
+ default:
+ log.Infof("Unsupported ELF type %v", elfType)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ if int(hdr.Phentsize) != prog64Size {
+ log.Infof("Unsupported phdr size %d", hdr.Phentsize)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ totalPhdrSize := prog64Size * int(hdr.Phnum)
+ if totalPhdrSize < prog64Size {
+ log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum))
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if totalPhdrSize > maxTotalPhdrSize {
+ log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ phdrBuf := make([]byte, totalPhdrSize)
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+ if err != nil {
+ log.Infof("Error reading ELF phdrs: %v", err)
+ // If phdrs were specified, they should all exist.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return elfInfo{}, err
+ }
+
+ phdrs := make([]elf.ProgHeader, hdr.Phnum)
+ for i := range phdrs {
+ var prog64 elf.Prog64
+ binary.Unmarshal(phdrBuf[:prog64Size], byteOrder, &prog64)
+ phdrBuf = phdrBuf[prog64Size:]
+ phdrs[i] = elf.ProgHeader{
+ Type: elf.ProgType(prog64.Type),
+ Flags: elf.ProgFlag(prog64.Flags),
+ Off: prog64.Off,
+ Vaddr: prog64.Vaddr,
+ Paddr: prog64.Paddr,
+ Filesz: prog64.Filesz,
+ Memsz: prog64.Memsz,
+ Align: prog64.Align,
+ }
+ }
+
+ return elfInfo{
+ os: os,
+ arch: a,
+ entry: usermem.Addr(hdr.Entry),
+ phdrs: phdrs,
+ phdrOff: hdr.Phoff,
+ phdrSize: prog64Size,
+ sharedObject: sharedObject,
+ }, nil
+}
+
+// mapSegment maps a phdr into the Task. offset is the offset to apply to
+// phdr.Vaddr.
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+ // Alignment of vaddr and offset must match. We'll need to map on the
+ // page boundary.
+ adjust := usermem.Addr(phdr.Vaddr).PageOffset()
+ if adjust != usermem.Addr(phdr.Off).PageOffset() {
+ ctx.Infof("Alignment of vaddr %#x != off %#x", phdr.Vaddr, phdr.Off)
+ return syserror.ENOEXEC
+ }
+
+ addr, ok := offset.AddLength(phdr.Vaddr)
+ if !ok {
+ // If offset != 0 we should have ensured this would fit.
+ ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset)
+ return syserror.ENOEXEC
+ }
+ addr -= usermem.Addr(adjust)
+
+ fileOffset := phdr.Off - adjust
+ fileSize := phdr.Filesz + adjust
+ if fileSize < phdr.Filesz {
+ ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust)
+ return syserror.ENOEXEC
+ }
+ memSize := phdr.Memsz + adjust
+ if memSize < phdr.Memsz {
+ ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust)
+ return syserror.ENOEXEC
+ }
+ ms, ok := usermem.Addr(fileSize).RoundUp()
+ if !ok {
+ ctx.Infof("fileSize %#x too large", fileSize)
+ return syserror.ENOEXEC
+ }
+ mapSize := uint64(ms)
+
+ prot := progFlagsAsPerms(phdr.Flags)
+ mopts := memmap.MMapOpts{
+ Length: mapSize,
+ Offset: fileOffset,
+ Addr: addr,
+ Fixed: true,
+ // Linux will happily allow conflicting segments to map over
+ // one another.
+ Unmap: true,
+ Private: true,
+ Perms: prot,
+ MaxPerms: usermem.AnyAccess,
+ }
+ if err := f.ConfigureMMap(ctx, &mopts); err != nil {
+ ctx.Infof("File is not memory-mappable: %v", err)
+ return err
+ }
+ if _, err := m.MMap(ctx, mopts); err != nil {
+ ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err)
+ return err
+ }
+
+ // We need to clear the end of the last page that exceeds fileSize so
+ // we don't map part of the file beyond fileSize.
+ //
+ // Note that Linux *does not* clear the portion of the first page
+ // before phdr.Off.
+ if mapSize > fileSize {
+ zeroAddr, ok := addr.AddLength(fileSize)
+ if !ok {
+ panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize))
+ }
+ zeroSize := int64(mapSize - fileSize)
+ if zeroSize < 0 {
+ panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize)))
+ }
+ if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil {
+ ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err)
+ return err
+ }
+ }
+
+ // Allocate more anonymous pages if necessary.
+ if mapSize < memSize {
+ anonAddr, ok := addr.AddLength(mapSize)
+ if !ok {
+ panic(fmt.Sprintf("anonymous memory doesn't fit in pre-sized range? %#x + %#x", addr, mapSize))
+ }
+ anonSize, ok := usermem.Addr(memSize - mapSize).RoundUp()
+ if !ok {
+ ctx.Infof("extra anon pages too large: %#x", memSize-mapSize)
+ return syserror.ENOEXEC
+ }
+
+ if _, err := m.MMap(ctx, memmap.MMapOpts{
+ Length: uint64(anonSize),
+ Addr: anonAddr,
+ // Fixed without Unmap will fail the mmap if something is
+ // already at addr.
+ Fixed: true,
+ Private: true,
+ Perms: progFlagsAsPerms(phdr.Flags),
+ MaxPerms: usermem.AnyAccess,
+ }); err != nil {
+ ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err)
+ return err
+ }
+ }
+
+ return nil
+}
+
+// loadedELF describes an ELF that has been successfully loaded.
+type loadedELF struct {
+ // os is the target OS of the ELF.
+ os abi.OS
+
+ // arch is the target architecture of the ELF.
+ arch arch.Arch
+
+ // entry is the entry point of the ELF.
+ entry usermem.Addr
+
+ // start is the start of the ELF.
+ start usermem.Addr
+
+ // end is the end of the ELF.
+ end usermem.Addr
+
+ // interpreter is the path to the ELF interpreter.
+ interpreter string
+
+ // phdrAddr is the address of the ELF program headers.
+ phdrAddr usermem.Addr
+
+ // phdrSize is the size of a single program header in the ELF.
+ phdrSize int
+
+ // phdrNum is the number of program headers.
+ phdrNum int
+
+ // auxv contains a subset of ELF-specific auxiliary vector entries:
+ // * AT_PHDR
+ // * AT_PHENT
+ // * AT_PHNUM
+ // * AT_BASE
+ // * AT_ENTRY
+ auxv arch.Auxv
+}
+
+// loadParsedELF loads f into mm.
+//
+// info is the parsed elfInfo from the header.
+//
+// It does not load the ELF interpreter, or return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+ first := true
+ var start, end usermem.Addr
+ var interpreter string
+ for _, phdr := range info.phdrs {
+ switch phdr.Type {
+ case elf.PT_LOAD:
+ vaddr := usermem.Addr(phdr.Vaddr)
+ if first {
+ first = false
+ start = vaddr
+ }
+ if vaddr < end {
+ ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+ var ok bool
+ end, ok = vaddr.AddLength(phdr.Memsz)
+ if !ok {
+ ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ case elf.PT_INTERP:
+ if phdr.Filesz > syscall.PathMax {
+ ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ path := make([]byte, phdr.Filesz)
+ _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+ if err != nil {
+ ctx.Infof("Error reading PT_INTERP path: %v", err)
+ // If an interpreter was specified, it should exist.
+ if err == io.EOF || err == io.ErrUnexpectedEOF {
+ err = syserror.ENOEXEC
+ }
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ if path[len(path)-1] != 0 {
+ ctx.Infof("PT_INTERP path not NUL-terminated: %v", path)
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ // Strip NUL-terminator from string.
+ interpreter = string(path[:len(path)-1])
+ }
+ }
+
+ // Shared objects don't have fixed load addresses. We need to pick a
+ // base address big enough to fit all segments, so we first create a
+ // mapping for the total size just to find a region that is big enough.
+ //
+ // It is safe to unmap it immediately without racing with another mapping
+ // because we are the only one in control of the MemoryManager.
+ //
+ // Note that the vaddr of the first PT_LOAD segment is ignored when
+ // choosing the load address (even if it is non-zero). The vaddr does
+ // become an offset from that load address.
+ var offset usermem.Addr
+ if info.sharedObject {
+ totalSize := end - start
+ totalSize, ok := totalSize.RoundUp()
+ if !ok {
+ ctx.Infof("ELF PT_LOAD segments too big")
+ return loadedELF{}, syserror.ENOEXEC
+ }
+
+ var err error
+ offset, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: uint64(totalSize),
+ Addr: sharedLoadOffset,
+ Private: true,
+ })
+ if err != nil {
+ ctx.Infof("Error allocating address space for shared object: %v", err)
+ return loadedELF{}, err
+ }
+ if err := m.MUnmap(ctx, offset, uint64(totalSize)); err != nil {
+ panic(fmt.Sprintf("Failed to unmap base address: %v", err))
+ }
+
+ start, ok = start.AddLength(uint64(offset))
+ if !ok {
+ panic(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset))
+ }
+
+ end, ok = end.AddLength(uint64(offset))
+ if !ok {
+ panic(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset))
+ }
+
+ info.entry, ok = info.entry.AddLength(uint64(offset))
+ if !ok {
+ ctx.Infof("Entrypoint %#x + offset %#x overflows? Is the entrypoint within a segment?", info.entry, offset)
+ return loadedELF{}, err
+ }
+ }
+
+ // Map PT_LOAD segments.
+ for _, phdr := range info.phdrs {
+ switch phdr.Type {
+ case elf.PT_LOAD:
+ if phdr.Memsz == 0 {
+ // No need to load segments with size 0, but
+ // they exist in some binaries.
+ continue
+ }
+
+ if err := mapSegment(ctx, m, f, &phdr, offset); err != nil {
+ ctx.Infof("Failed to map PT_LOAD segment: %+v", phdr)
+ return loadedELF{}, err
+ }
+ }
+ }
+
+ // This assumes that the first segment contains the ELF headers. This
+ // may not be true in a malformed ELF, but Linux makes the same
+ // assumption.
+ phdrAddr, ok := start.AddLength(info.phdrOff)
+ if !ok {
+ ctx.Warningf("ELF start address %#x + phdr offset %#x overflows", start, info.phdrOff)
+ phdrAddr = 0
+ }
+
+ return loadedELF{
+ os: info.os,
+ arch: info.arch,
+ entry: info.entry,
+ start: start,
+ end: end,
+ interpreter: interpreter,
+ phdrAddr: phdrAddr,
+ phdrSize: info.phdrSize,
+ phdrNum: len(info.phdrs),
+ }, nil
+}
+
+// loadInitialELF loads f into mm.
+//
+// It creates an arch.Context for the ELF and prepares the mm for this arch.
+//
+// It does not load the ELF interpreter, or return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+// * f is the first ELF loaded into m
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ ctx.Infof("Failed to parse initial ELF: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ // Create the arch.Context now so we can prepare the mmap layout before
+ // mapping anything.
+ ac := arch.New(info.arch, fs)
+
+ l, err := m.SetMmapLayout(ac, limits.FromContext(ctx))
+ if err != nil {
+ ctx.Warningf("Failed to set mmap layout: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ // PIELoadAddress tries to move the ELF out of the way of the default
+ // mmap base to ensure that the initial brk has sufficient space to
+ // grow.
+ le, err := loadParsedELF(ctx, m, f, info, ac.PIELoadAddress(l))
+ return le, ac, err
+}
+
+// loadInterpreterELF loads f into mm.
+//
+// The interpreter must be for the same OS/Arch as the initial ELF.
+//
+// It does not return any auxv entries.
+//
+// Preconditions:
+// * f is an ELF file
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ if err == syserror.ENOEXEC {
+ // Bad interpreter.
+ err = syserror.ELIBBAD
+ }
+ return loadedELF{}, err
+ }
+
+ if info.os != initial.os {
+ ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os)
+ return loadedELF{}, syserror.ELIBBAD
+ }
+ if info.arch != initial.arch {
+ ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch)
+ return loadedELF{}, syserror.ELIBBAD
+ }
+
+ // The interpreter is not given a load offset, as its location does not
+ // affect brk.
+ return loadParsedELF(ctx, m, f, info, 0)
+}
+
+// loadELF loads f into the Task address space.
+//
+// If loadELF returns ErrSwitchFile it should be called again with the returned
+// path and argv.
+//
+// Preconditions:
+// * f is an ELF file
+func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+ bin, ac, err := loadInitialELF(ctx, m, fs, f)
+ if err != nil {
+ ctx.Infof("Error loading binary: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ var interp loadedELF
+ if bin.interpreter != "" {
+ d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter)
+ if err != nil {
+ ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
+ return loadedELF{}, nil, err
+ }
+ defer i.DecRef()
+ // We don't need the Dirent.
+ d.DecRef()
+
+ interp, err = loadInterpreterELF(ctx, m, i, bin)
+ if err != nil {
+ ctx.Infof("Error loading interpreter: %v", err)
+ return loadedELF{}, nil, err
+ }
+
+ if interp.interpreter != "" {
+ // No recursive interpreters!
+ ctx.Infof("Interpreter requires an interpreter")
+ return loadedELF{}, nil, syserror.ENOEXEC
+ }
+ }
+
+ // ELF-specific auxv entries.
+ bin.auxv = arch.Auxv{
+ arch.AuxEntry{linux.AT_PHDR, bin.phdrAddr},
+ arch.AuxEntry{linux.AT_PHENT, usermem.Addr(bin.phdrSize)},
+ arch.AuxEntry{linux.AT_PHNUM, usermem.Addr(bin.phdrNum)},
+ arch.AuxEntry{linux.AT_ENTRY, bin.entry},
+ }
+ if bin.interpreter != "" {
+ bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, interp.start})
+
+ // Start in the interpreter.
+ // N.B. AT_ENTRY above contains the *original* entry point.
+ bin.entry = interp.entry
+ }
+
+ return bin, ac, nil
+}
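mapSegment above has to honor phdr.Vaddr even though mappings can only start on page boundaries, so it shifts the mapping back to the enclosing page boundary and grows the file and memory lengths by the same adjustment. A self-contained sketch of that arithmetic on hypothetical program-header values, using only the standard library (none of the gVisor types appear here):

package main

import "fmt"

const pageSize = 4096

func main() {
	// Hypothetical PT_LOAD header values.
	var (
		vaddr  uint64 = 0x400e10 // segment virtual address
		off    uint64 = 0x000e10 // segment file offset
		filesz uint64 = 0x2300   // bytes backed by the file
		memsz  uint64 = 0x5000   // bytes required in memory (rest zeroed)
	)

	// vaddr and off must share the same page offset, since the mapping
	// can only start on a page boundary.
	adjust := vaddr % pageSize
	if adjust != off%pageSize {
		panic("misaligned segment")
	}

	addr := vaddr - adjust      // mapping start (page-aligned)
	fileOffset := off - adjust  // file offset of the mapping
	fileSize := filesz + adjust // bytes of the file visible in the mapping
	mapSize := (fileSize + pageSize - 1) &^ (pageSize - 1) // rounded up to pages
	memSize := memsz + adjust

	fmt.Printf("map %#x bytes at %#x from file offset %#x\n", mapSize, addr, fileOffset)
	if mapSize > fileSize {
		// Tail of the last file-backed page is cleared.
		fmt.Printf("zero [%#x, %#x)\n", addr+fileSize, addr+mapSize)
	}
	if memSize > mapSize {
		// Remaining memsz is satisfied with anonymous pages.
		fmt.Printf("anonymous [%#x, %#x)\n", addr+mapSize, addr+((memSize+pageSize-1)&^(pageSize-1)))
	}
}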
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
new file mode 100644
index 000000000..b8ecbe92f
--- /dev/null
+++ b/pkg/sentry/loader/interpreter.go
@@ -0,0 +1,105 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "bytes"
+ "io"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ // interpreterScriptMagic identifies an interpreter script.
+ interpreterScriptMagic = "#!"
+
+ // interpMaxLineLength is the maximum length for the first line of an
+ // interpreter script.
+ //
+ // From execve(2): "A maximum line length of 127 characters is allowed
+ // for the first line in a #! executable shell script."
+ interpMaxLineLength = 127
+)
+
+// parseInterpreterScript returns the interpreter path and argv.
+func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv, envv []string) (newpath string, newargv []string, err error) {
+ line := make([]byte, interpMaxLineLength)
+ n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+ // Short read is OK.
+ if err != nil && err != io.ErrUnexpectedEOF {
+ if err == io.EOF {
+ err = syserror.ENOEXEC
+ }
+ return "", []string{}, err
+ }
+ line = line[:n]
+
+ if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) {
+ return "", []string{}, syserror.ENOEXEC
+ }
+ // Ignore #!.
+ line = line[2:]
+
+ // Ignore everything after newline.
+ // Linux silently truncates the remainder of the line if it exceeds
+ // interpMaxLineLength.
+ i := bytes.IndexByte(line, '\n')
+ if i > 0 {
+ line = line[:i]
+ }
+
+ // Skip any whitespace before the interpreter.
+ line = bytes.TrimLeft(line, " \t")
+
+ // Linux only looks for a space or tab delimiting the interpreter and
+ // arg.
+ //
+ // execve(2): "On Linux, the entire string following the interpreter
+ // name is passed as a single argument to the interpreter, and this
+ // string can include white space."
+ interp := line
+ var arg []byte
+ i = bytes.IndexAny(line, " \t")
+ if i >= 0 {
+ interp = line[:i]
+ if i+1 < len(line) {
+ arg = line[i+1:]
+ }
+ }
+
+ // Build the new argument list:
+ //
+ // 1. The interpreter.
+ newargv = append(newargv, string(interp))
+
+ // 2. The optional interpreter argument.
+ if len(arg) > 0 {
+ newargv = append(newargv, string(arg))
+ }
+
+ // 3. The original arguments. The original argv[0] is replaced with the
+ // full script filename.
+ if len(argv) > 0 {
+ argv[0] = filename
+ } else {
+ argv = []string{filename}
+ }
+ newargv = append(newargv, argv...)
+
+ return string(interp), newargv, nil
+}
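parseInterpreterScript above passes everything after the interpreter name as a single argument, and the original argv[0] is replaced by the script path. A standalone sketch of the same parsing rules applied to a sample shebang line, standard library only (the input values are made up):

package main

import (
	"bytes"
	"fmt"
)

func main() {
	line := []byte("#!/bin/sh -x --norc\n")
	filename := "./script.sh"
	argv := []string{"./script.sh", "arg1"}

	line = line[2:] // drop "#!"
	if i := bytes.IndexByte(line, '\n'); i >= 0 {
		line = line[:i] // ignore everything after the newline
	}
	line = bytes.TrimLeft(line, " \t")

	// Everything after the first space or tab is one single argument.
	interp, arg := line, []byte(nil)
	if i := bytes.IndexAny(line, " \t"); i >= 0 {
		interp, arg = line[:i], line[i+1:]
	}

	newargv := []string{string(interp)}
	if len(arg) > 0 {
		newargv = append(newargv, string(arg))
	}
	argv[0] = filename // original argv[0] becomes the script path
	newargv = append(newargv, argv...)

	fmt.Println(newargv) // [/bin/sh -x --norc ./script.sh arg1]
}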
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
new file mode 100644
index 000000000..94c281b72
--- /dev/null
+++ b/pkg/sentry/loader/loader.go
@@ -0,0 +1,277 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package loader loads a binary into a MemoryManager.
+package loader
+
+import (
+ "bytes"
+ "crypto/rand"
+ "io"
+ "path"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// readFull behaves like io.ReadFull for an *fs.File.
+func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ var total int64
+ for dst.NumBytes() > 0 {
+ n, err := f.Preadv(ctx, dst, offset+total)
+ total += n
+ if err == io.EOF && total != 0 {
+ return total, io.ErrUnexpectedEOF
+ } else if err != nil {
+ return total, err
+ }
+ dst = dst.DropFirst64(n)
+ }
+ return total, nil
+}
+
+// openPath opens name for loading.
+//
+// openPath returns the fs.Dirent and an *fs.File for name, which is not
+// installed in the Task FDMap. The caller takes ownership of both.
+//
+// name must be a readable, executable, regular file.
+func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) {
+ d, err := mm.FindInode(ctx, root, wd, name, maxTraversals)
+ if err != nil {
+ return nil, nil, err
+ }
+ defer d.DecRef()
+
+ perms := fs.PermMask{
+ // TODO: Linux requires only execute permission,
+ // not read. However, our backing filesystems may prevent us
+ // from reading the file without read permission.
+ //
+ // Additionally, a task with a non-readable executable has
+ // additional constraints on access via ptrace and procfs.
+ Read: true,
+ Execute: true,
+ }
+ if err := d.Inode.CheckPermission(ctx, perms); err != nil {
+ return nil, nil, err
+ }
+
+ // If they claim it's a directory, then make sure.
+ //
+ // N.B. we reject directories below, but we must first reject
+ // non-directories passed as directories.
+ if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
+ return nil, nil, syserror.ENOTDIR
+ }
+
+ // No exec-ing directories, pipes, etc!
+ if !fs.IsRegular(d.Inode.StableAttr) {
+ ctx.Infof("Error regularing %s: %v", name, d.Inode.StableAttr)
+ return nil, nil, syserror.EACCES
+ }
+
+ // Create a new file.
+ file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
+ if err != nil {
+ return nil, nil, err
+ }
+
+ // We must be able to read at arbitrary offsets.
+ if !file.Flags().Pread {
+ file.DecRef()
+ ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags())
+ return nil, nil, syserror.EACCES
+ }
+
+ // Grab a reference for the caller.
+ d.IncRef()
+ return d, file, nil
+}
+
+// allocStack allocates and maps a stack into any available part of the address space.
+func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch.Stack, error) {
+ ar, err := m.MapStack(ctx)
+ if err != nil {
+ return nil, err
+ }
+ return &arch.Stack{a, m, ar.End}, nil
+}
+
+const (
+ // maxLoaderAttempts is the maximum number of attempts to try to load
+ // an interpreter script, to prevent loops. 6 (initial + 5 changes) is
+ // what the Linux kernel allows (fs/exec.c:search_binary_handler).
+ maxLoaderAttempts = 6
+)
+
+// loadPath resolves filename to a binary and loads it.
+//
+// It returns:
+// * loadedELF, description of the loaded binary
+// * arch.Context matching the binary arch
+// * fs.Dirent of the binary file
+// * Possibly updated argv
+func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+ for i := 0; i < maxLoaderAttempts; i++ {
+ d, f, err := openPath(ctx, mounts, root, wd, maxTraversals, filename)
+ if err != nil {
+ ctx.Infof("Error opening %s: %v", filename, err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ defer f.DecRef()
+ // We will return d in the successful case, but defer a DecRef
+ // for intermediate loops and failure cases.
+ defer d.DecRef()
+
+ // Check the header. Is this an ELF or interpreter script?
+ var hdr [4]uint8
+ // N.B. We assume that reading from a regular file cannot block.
+ _, err = readFull(ctx, f, usermem.BytesIOSequence(hdr[:]), 0)
+ // Allow unexpected EOF, as a valid executable could be only three
+ // bytes (e.g., #!a).
+ if err != nil && err != io.ErrUnexpectedEOF {
+ if err == io.EOF {
+ err = syserror.ENOEXEC
+ }
+ return loadedELF{}, nil, nil, nil, err
+ }
+
+ switch {
+ case bytes.Equal(hdr[:], []byte(elfMagic)):
+ loaded, ac, err := loadELF(ctx, m, mounts, root, wd, maxTraversals, fs, f)
+ if err != nil {
+ ctx.Infof("Error loading ELF: %v", err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ // An ELF is always terminal. Hold on to d.
+ d.IncRef()
+ return loaded, ac, d, argv, err
+ case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
+ newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv, envv)
+ if err != nil {
+ ctx.Infof("Error loading interpreter script: %v", err)
+ return loadedELF{}, nil, nil, nil, err
+ }
+ filename = newpath
+ argv = newargv
+ default:
+ ctx.Infof("Unknown magic: %v", hdr)
+ return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
+ }
+ }
+
+ return loadedELF{}, nil, nil, nil, syserror.ELOOP
+}
+
+// Load loads filename into a MemoryManager.
+//
+// If Load returns ErrSwitchFile it should be called again with the returned
+// path and argv.
+//
+// Preconditions:
+// * The Task MemoryManager is empty.
+// * Load is called on the Task goroutine.
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) {
+ // Load the binary itself.
+ loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv)
+ if err != nil {
+ ctx.Infof("Failed to load %s: %v", filename, err)
+ return 0, nil, "", err
+ }
+ defer d.DecRef()
+
+ // Load the VDSO.
+ vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded)
+ if err != nil {
+ ctx.Infof("Error loading VDSO: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Set up the heap. brk starts at the next page after the end of the
+ // binary. Userspace can assume that the remainder of the page after
+ // loaded.end is available for its use.
+ e, ok := loaded.end.RoundUp()
+ if !ok {
+ ctx.Warningf("brk overflows: %#x", loaded.end)
+ return 0, nil, "", syserror.ENOEXEC
+ }
+ m.BrkSetup(ctx, e)
+
+ // Allocate our stack.
+ stack, err := allocStack(ctx, m, ac)
+ if err != nil {
+ ctx.Infof("Failed to allocate stack: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Push the original filename to the stack, for AT_EXECFN.
+ execfn, err := stack.Push(filename)
+ if err != nil {
+ ctx.Infof("Failed to push exec filename: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Push 16 random bytes on the stack which AT_RANDOM will point to.
+ var b [16]byte
+ if _, err := rand.Read(b[:]); err != nil {
+ ctx.Infof("Failed to read random bytes: %v", err)
+ return 0, nil, "", err
+ }
+ random, err := stack.Push(b)
+ if err != nil {
+ ctx.Infof("Failed to push random bytes: %v", err)
+ return 0, nil, "", err
+ }
+
+ // Add generic auxv entries
+ auxv := append(loaded.auxv, arch.Auxv{
+ arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC},
+ arch.AuxEntry{linux.AT_EXECFN, execfn},
+ arch.AuxEntry{linux.AT_RANDOM, random},
+ arch.AuxEntry{linux.AT_PAGESZ, usermem.PageSize},
+ arch.AuxEntry{linux.AT_SYSINFO_EHDR, vdsoAddr},
+ }...)
+ auxv = append(auxv, extraAuxv...)
+
+ sl, err := stack.Load(argv, envv, auxv)
+ if err != nil {
+ ctx.Infof("Failed to load stack: %v", err)
+ return 0, nil, "", err
+ }
+
+ m.SetArgvStart(sl.ArgvStart)
+ m.SetArgvEnd(sl.ArgvEnd)
+ m.SetEnvvStart(sl.EnvvStart)
+ m.SetEnvvEnd(sl.EnvvEnd)
+ m.SetAuxv(auxv)
+ m.SetExecutable(d)
+
+ ac.SetIP(uintptr(loaded.entry))
+ ac.SetStack(uintptr(stack.Bottom))
+
+ name := path.Base(filename)
+ if len(name) > linux.TASK_COMM_LEN-1 {
+ name = name[:linux.TASK_COMM_LEN-1]
+ }
+
+ return loaded.os, ac, name, nil
+}
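Load above assembles the ELF-specific auxv entries produced by loadELF followed by the generic entries it appends itself; AT_BASE appears only when the binary has a PT_INTERP interpreter, and AT_ENTRY always refers to the binary's own entry point even when execution starts in the interpreter. A sketch of the resulting vector with hypothetical addresses (the AT_* keys are the standard Linux values from asm/auxvec.h):

package main

import "fmt"

// Auxiliary vector keys from the Linux ABI.
const (
	AT_PHDR         = 3
	AT_PHENT        = 4
	AT_PHNUM        = 5
	AT_PAGESZ       = 6
	AT_BASE         = 7
	AT_ENTRY        = 9
	AT_CLKTCK       = 17
	AT_RANDOM       = 25
	AT_EXECFN       = 31
	AT_SYSINFO_EHDR = 33
)

type auxEntry struct {
	key uint64
	val uint64 // hypothetical values for illustration
}

func main() {
	auxv := []auxEntry{
		// ELF-specific entries built by loadELF.
		{AT_PHDR, 0x400040},       // address of the binary's program headers
		{AT_PHENT, 56},            // size of one program header
		{AT_PHNUM, 9},             // number of program headers
		{AT_ENTRY, 0x4003c0},      // the binary's own entry point
		{AT_BASE, 0x7f0000000000}, // interpreter load address (only with PT_INTERP)
		// Generic entries appended by Load.
		{AT_CLKTCK, 100},
		{AT_EXECFN, 0x7ffffffff000}, // pointer to the pushed filename
		{AT_RANDOM, 0x7fffffffeff0}, // pointer to 16 random bytes
		{AT_PAGESZ, 4096},
		{AT_SYSINFO_EHDR, 0x7f0000100000}, // VDSO address
	}
	for _, e := range auxv {
		fmt.Printf("%2d %#x\n", e.key, e.val)
	}
}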
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
new file mode 100644
index 000000000..ce4f6f5d9
--- /dev/null
+++ b/pkg/sentry/loader/vdso.go
@@ -0,0 +1,382 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "debug/elf"
+ "fmt"
+ "io"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// byteReader implements fs.FileOperations for reading
+// from a []byte source.
+type byteReader struct {
+ fsutil.NoopRelease
+ fsutil.PipeSeek
+ fsutil.NotDirReaddir
+ fsutil.NoFsync
+ fsutil.NoopFlush
+ fsutil.NoMMap
+ fsutil.NoIoctl
+ waiter.AlwaysReady
+ data []byte
+}
+
+type fileContext struct {
+ context.Context
+}
+
+func (f *fileContext) Value(key interface{}) interface{} {
+ switch key {
+ case uniqueid.CtxGlobalUniqueID:
+ return uint64(0)
+ default:
+ return f.Context.Value(key)
+ }
+}
+
+func newByteReaderFile(data []byte) *fs.File {
+ dirent := fs.NewTransientDirent(nil)
+ flags := fs.FileFlags{Read: true, Pread: true}
+ return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
+ data: data,
+ })
+}
+
+func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset >= int64(len(b.data)) {
+ return 0, io.EOF
+ }
+ n, err := dst.CopyOut(ctx, b.data[offset:])
+ return int64(n), err
+}
+
+func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ panic("Write not supported")
+}
+
+// validateVDSO checks that the VDSO can be loaded by loadVDSO.
+//
+// VDSOs are special (see below). Since we are going to map the VDSO directly
+// rather than using a normal loading process, we require that the PT_LOAD
+// segments have the same layout in the ELF as they expect to have in memory.
+//
+// Namely, this means that we must verify:
+// * PT_LOAD file offsets are equivalent to the memory offset from the first
+// segment.
+// * No extra zeroed space (memsz) is required.
+// * PT_LOAD segments are in order.
+// * No two PT_LOAD segments occupy parts of the same page.
+// * PT_LOAD segments don't extend beyond the end of the file.
+//
+// ctx may be nil if f does not need it.
+func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+ info, err := parseHeader(ctx, f)
+ if err != nil {
+ log.Infof("Unable to parse VDSO header: %v", err)
+ return elfInfo{}, err
+ }
+
+ var first *elf.ProgHeader
+ var prev *elf.ProgHeader
+ var prevEnd usermem.Addr
+ for i, phdr := range info.phdrs {
+ if phdr.Type != elf.PT_LOAD {
+ continue
+ }
+
+ if first == nil {
+ first = &info.phdrs[i]
+ if phdr.Off != 0 {
+ log.Warningf("First PT_LOAD segment has non-zero file offset")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ }
+
+ memoryOffset := phdr.Vaddr - first.Vaddr
+ if memoryOffset != phdr.Off {
+ log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // memsz larger than filesz means that extra zeroed space should be
+ // provided at the end of the segment. Since we are mapping the ELF
+ // directly, we don't want to just overwrite part of the ELF with
+ // zeroes.
+ if phdr.Memsz != phdr.Filesz {
+ log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ start := usermem.Addr(memoryOffset)
+ end, ok := start.AddLength(phdr.Memsz)
+ if !ok {
+ log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ if uint64(end) > size {
+ log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ if prev != nil {
+ if start < prevEnd {
+ log.Warningf("PT_LOAD segments out of order")
+ return elfInfo{}, syserror.ENOEXEC
+ }
+
+ // We mprotect entire pages, so each segment must be in
+ // its own page.
+ prevEndPage := prevEnd.RoundDown()
+ startPage := start.RoundDown()
+ if prevEndPage >= startPage {
+ log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage)
+ return elfInfo{}, syserror.ENOEXEC
+ }
+ }
+ prev = &info.phdrs[i]
+ prevEnd = end
+ }
+
+ return info, nil
+}
+
+// VDSO describes a VDSO.
+//
+// NOTE: to support multiple architectures or operating systems, this
+// would need to contain a VDSO for each.
+type VDSO struct {
+ // ParamPage is the VDSO parameter page. This page should be updated to
+ // inform the VDSO for timekeeping data.
+ ParamPage *mm.SpecialMappable
+
+ // vdso is the VDSO ELF itself.
+ vdso *mm.SpecialMappable
+
+ // os is the operating system targeted by the VDSO.
+ os abi.OS
+
+ // arch is the architecture targeted by the VDSO.
+ arch arch.Arch
+
+ // phdrs are the VDSO ELF phdrs.
+ phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
+}
+
+// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
+// param page for updating by the kernel.
+func PrepareVDSO(p platform.Platform) (*VDSO, error) {
+ vdsoFile := newByteReaderFile(vdsoBin)
+
+ // First make sure the VDSO is valid. vdsoFile does not use ctx, so a
+ // nil context can be passed.
+ info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
+ if err != nil {
+ return nil, err
+ }
+
+ // Then copy it into a VDSO mapping.
+ size, ok := usermem.Addr(len(vdsoBin)).RoundUp()
+ if !ok {
+ return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin))
+ }
+
+ vdso, err := p.Memory().Allocate(uint64(size), usage.System)
+ if err != nil {
+ return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err)
+ }
+
+ ims, err := p.Memory().MapInternal(vdso, usermem.ReadWrite)
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to map VDSO memory: %v", err)
+ }
+
+ _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin)))
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err)
+ }
+
+ // Finally, allocate a param page for this VDSO.
+ paramPage, err := p.Memory().Allocate(usermem.PageSize, usage.System)
+ if err != nil {
+ p.Memory().DecRef(vdso)
+ return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err)
+ }
+
+ return &VDSO{
+ ParamPage: mm.NewSpecialMappable("[vvar]", p, paramPage),
+ // TODO: Don't advertise the VDSO, as some applications may
+ // not be able to handle multiple [vdso] hints.
+ vdso: mm.NewSpecialMappable("", p, vdso),
+ phdrs: info.phdrs,
+ }, nil
+}
+
+// loadVDSO loads the VDSO into m.
+//
+// VDSOs are special.
+//
+// VDSOs are fully position independent. However, instead of loading a VDSO
+// like a normal ELF binary, mapping only the PT_LOAD segments, the Linux
+// kernel simply directly maps the entire file into process memory, with very
+// little real ELF parsing.
+//
+// NOTE: This means that userspace can, and unfortunately does,
+// depend on parts of the ELF that would normally not be mapped. To maintain
+// compatibility with such binaries, we load the VDSO much like Linux.
+//
+// loadVDSO takes a reference on the VDSO and parameter page FrameRegions.
+func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) {
+ if v == nil {
+ // Should be used only by tests.
+ ctx.Warningf("No VDSO provided, skipping VDSO mapping")
+ return 0, nil
+ }
+
+ if v.os != bin.os {
+ ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os)
+ return 0, syserror.ENOEXEC
+ }
+ if v.arch != bin.arch {
+ ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch)
+ return 0, syserror.ENOEXEC
+ }
+
+ // Reserve address space for the VDSO and its parameter page, which is
+ // mapped just before the VDSO.
+ mapSize := v.vdso.Length() + v.ParamPage.Length()
+ addr, err := m.MMap(ctx, memmap.MMapOpts{
+ Length: mapSize,
+ Private: true,
+ })
+ if err != nil {
+ ctx.Infof("Unable to reserve VDSO address space: %v", err)
+ return 0, err
+ }
+
+ // Now map the param page.
+ _, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: v.ParamPage.Length(),
+ MappingIdentity: v.ParamPage,
+ Mappable: v.ParamPage,
+ Addr: addr,
+ Fixed: true,
+ Unmap: true,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.Read,
+ })
+ if err != nil {
+ ctx.Infof("Unable to map VDSO param page: %v", err)
+ return 0, err
+ }
+
+ // Now map the VDSO itself.
+ vdsoAddr, ok := addr.AddLength(v.ParamPage.Length())
+ if !ok {
+ panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length()))
+ }
+ _, err = m.MMap(ctx, memmap.MMapOpts{
+ Length: v.vdso.Length(),
+ MappingIdentity: v.vdso,
+ Mappable: v.vdso,
+ Addr: vdsoAddr,
+ Fixed: true,
+ Unmap: true,
+ Private: true,
+ Perms: usermem.Read,
+ MaxPerms: usermem.AnyAccess,
+ })
+ if err != nil {
+ ctx.Infof("Unable to map VDSO: %v", err)
+ return 0, err
+ }
+
+ vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length())
+ if !ok {
+ panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length()))
+ }
+
+ // Set additional protections for the individual segments.
+ var first *elf.ProgHeader
+ for i, phdr := range v.phdrs {
+ if phdr.Type != elf.PT_LOAD {
+ continue
+ }
+
+ if first == nil {
+ first = &v.phdrs[i]
+ }
+
+ memoryOffset := phdr.Vaddr - first.Vaddr
+ segAddr, ok := vdsoAddr.AddLength(memoryOffset)
+ if !ok {
+ ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset)
+ return 0, syserror.ENOEXEC
+ }
+ segPage := segAddr.RoundDown()
+ segSize := usermem.Addr(phdr.Memsz)
+ segSize, ok = segSize.AddLength(segAddr.PageOffset())
+ if !ok {
+ ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset())
+ return 0, syserror.ENOEXEC
+ }
+ segSize, ok = segSize.RoundUp()
+ if !ok {
+ ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset())
+ return 0, syserror.ENOEXEC
+ }
+ segEnd, ok := segPage.AddLength(uint64(segSize))
+ if !ok {
+ ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize)
+ return 0, syserror.ENOEXEC
+ }
+ if segEnd > vdsoEnd {
+ ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd)
+ return 0, syserror.ENOEXEC
+ }
+
+ perms := progFlagsAsPerms(phdr.Flags)
+ if perms != usermem.Read {
+ if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil {
+ ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err)
+ return 0, syserror.ENOEXEC
+ }
+ }
+ }
+
+ return vdsoAddr, nil
+}
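loadVDSO above reserves one contiguous region so that the read-only parameter page lands immediately below the VDSO image, then tightens per-segment protections with mprotect. A sketch of the resulting layout, using hypothetical sizes and a made-up reservation address:

package main

import "fmt"

const pageSize = 4096

func main() {
	// Hypothetical sizes: one parameter page and a two-page VDSO image.
	paramPageLen := uint64(pageSize)
	vdsoLen := uint64(2 * pageSize)

	// A single reservation covers both, so they stay adjacent.
	addr := uint64(0x7f0000000000)  // address returned by the reservation mmap
	paramAddr := addr               // "[vvar]" parameter page, read-only
	vdsoAddr := addr + paramPageLen // "[vdso]" ELF image, mapped just above it
	vdsoEnd := vdsoAddr + vdsoLen

	fmt.Printf("[vvar] [%#x, %#x)\n", paramAddr, paramAddr+paramPageLen)
	fmt.Printf("[vdso] [%#x, %#x)\n", vdsoAddr, vdsoEnd)
	// PT_LOAD segments inside [vdsoAddr, vdsoEnd) whose flags are not
	// read-only are then re-protected page by page, mirroring
	// progFlagsAsPerms.
}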
diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go
new file mode 100644
index 000000000..92004ad9e
--- /dev/null
+++ b/pkg/sentry/loader/vdso_state.go
@@ -0,0 +1,47 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loader
+
+import (
+ "debug/elf"
+)
+
+type elfProgHeader struct {
+ Type elf.ProgType
+ Flags elf.ProgFlag
+ Off uint64
+ Vaddr uint64
+ Paddr uint64
+ Filesz uint64
+ Memsz uint64
+ Align uint64
+}
+
+// savePhdrs is invoked by stateify.
+func (v *VDSO) savePhdrs() []elfProgHeader {
+ s := make([]elfProgHeader, 0, len(v.phdrs))
+ for _, h := range v.phdrs {
+ s = append(s, elfProgHeader(h))
+ }
+ return s
+}
+
+// loadPhdrs is invoked by stateify.
+func (v *VDSO) loadPhdrs(s []elfProgHeader) {
+ v.phdrs = make([]elf.ProgHeader, 0, len(s))
+ for _, h := range s {
+ v.phdrs = append(v.phdrs, elf.ProgHeader(h))
+ }
+}