Diffstat (limited to 'pkg/sentry/arch/arch_amd64.go')
-rw-r--r--  pkg/sentry/arch/arch_amd64.go  302
1 file changed, 302 insertions, 0 deletions
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
new file mode 100644
index 000000000..23526fe8e
--- /dev/null
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -0,0 +1,302 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// These constants come directly from Linux.
+const (
+	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
+	// for a 64-bit process.
+	maxAddr64 usermem.Addr = (1 << 47) - usermem.PageSize
+
+	// maxStackRand64 is the maximum randomization to apply to the stack.
+	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
+	maxStackRand64 = 16 << 30 // 16 GB
+
+	// maxMmapRand64 is the maximum randomization to apply to the mmap
+	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
+	maxMmapRand64 = (1 << 28) * usermem.PageSize
+
+	// minGap64 is the minimum gap to leave at the top of the address space
+	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
+	minGap64 = (128 << 20) + maxStackRand64
+
+	// preferredPIELoadAddr is the standard Linux position-independent
+	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
+	//
+	// The Platform {Min,Max}UserAddress() may preclude loading at this
+	// address. See other preferredFoo comments below.
+	preferredPIELoadAddr usermem.Addr = maxAddr64 / 3 * 2
+)
+
+// These constants are selected as heuristics to help make the Platform's
+// potentially limited address space conform as closely to Linux as possible.
+const (
+	// Select a preferred minimum TopDownBase address.
+	//
+	// Some applications (TSAN and other *SANs) are very particular about
+	// the way the Linux mmap allocator lays out the address space.
+	//
+	// TSAN in particular expects top down allocations to be made in the
+	// range [0x7e8000000000, 0x800000000000).
+	//
+	// The minimum TopDownBase on Linux would be:
+	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
+	//
+	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
+	//
+	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
+	// allocations below TopDownBase.
+	//
+	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
+	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
+	//
+	// Of course, there is no hard minimum to allocation; an allocator can
+	// search all the way from TopDownBase to Min. However, TSAN declared
+	// their range "good enough".
+	//
+	// We would like to pick a TopDownBase such that it is unlikely that an
+	// allocator will select an address below TSAN's minimum. We achieve
+	// this by trying to leave a sizable gap below TopDownBase.
+	//
+	// This is all "preferred" because the layout min/max address may not
+	// allow us to select such a TopDownBase, in which case we have to fall
+	// back to a layout that TSAN may not be happy with.
+	preferredTopDownAllocMin usermem.Addr = 0x7e8000000000
+	preferredAllocationGap                = 128 << 30 // 128 GB
+	preferredTopDownBaseMin               = preferredTopDownAllocMin + preferredAllocationGap
+
+	// minMmapRand64 is the smallest we are willing to make the
+	// randomization to stay above preferredTopDownBaseMin.
+	minMmapRand64 = (1 << 26) * usermem.PageSize
+)
+
+// context64 represents an AMD64 context.
+type context64 struct {
+	State
+	sigFPState []x86FPState // fpstate to be restored on sigreturn.
+}
+
+// Arch implements Context.Arch.
+func (c *context64) Arch() Arch {
+	return AMD64
+}
+
+func (c *context64) copySigFPState() []x86FPState {
+	var sigfps []x86FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
+// Fork returns an exact copy of this context.
+func (c *context64) Fork() Context {
+	return &context64{
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
+	}
+}
+
+// Return returns the current syscall return value.
+func (c *context64) Return() uintptr {
+	return uintptr(c.Regs.Rax)
+}
+
+// SetReturn sets the syscall return value.
+func (c *context64) SetReturn(value uintptr) {
+	c.Regs.Rax = uint64(value)
+}
+
+// IP returns the current instruction pointer.
+func (c *context64) IP() uintptr {
+	return uintptr(c.Regs.Rip)
+}
+
+// SetIP sets the current instruction pointer.
+func (c *context64) SetIP(value uintptr) {
+	c.Regs.Rip = uint64(value)
+}
+
+// Stack returns the current stack pointer.
+func (c *context64) Stack() uintptr {
+	return uintptr(c.Regs.Rsp)
+}
+
+// SetStack sets the current stack pointer.
+func (c *context64) SetStack(value uintptr) {
+	c.Regs.Rsp = uint64(value)
+}
+
+// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
+func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+	c.Regs.R10 = uint64(value)
+}
+
+// Native returns the native type for the given val.
+func (c *context64) Native(val uintptr) interface{} {
+	v := uint64(val)
+	return &v
+}
+
+// Value returns the generic val for the given native type.
+func (c *context64) Value(val interface{}) uintptr {
+	return uintptr(*val.(*uint64))
+}
+
+// Width returns the byte width of this architecture.
+func (c *context64) Width() uint {
+	return 8
+}
+
+// FeatureSet returns the FeatureSet in use.
+func (c *context64) FeatureSet() *cpuid.FeatureSet {
+	return c.State.FeatureSet
+}
+
+// mmapRand returns a random adjustment for randomizing an mmap layout.
+func mmapRand(max uint64) usermem.Addr {
+	return usermem.Addr(rand.Int63n(int64(max))).RoundDown()
+}
+
+// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
+func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) {
+	min, ok := min.RoundUp()
+	if !ok {
+		return MmapLayout{}, syscall.EINVAL
+	}
+	if max > maxAddr64 {
+		max = maxAddr64
+	}
+	max = max.RoundDown()
+
+	if min > max {
+		return MmapLayout{}, syscall.EINVAL
+	}
+
+	stackSize := r.Get(limits.Stack)
+
+	// MAX_GAP in Linux.
+	maxGap := (max / 6) * 5
+	gap := usermem.Addr(stackSize.Cur)
+	if gap < minGap64 {
+		gap = minGap64
+	}
+	if gap > maxGap {
+		gap = maxGap
+	}
+	defaultDir := MmapTopDown
+	if stackSize.Cur == limits.Infinity {
+		defaultDir = MmapBottomUp
+	}
+
+	topDownMin := max - gap - maxMmapRand64
+	maxRand := usermem.Addr(maxMmapRand64)
+	if topDownMin < preferredTopDownBaseMin {
+		// Try to keep TopDownBase above preferredTopDownBaseMin by
+		// shrinking maxRand.
+		maxAdjust := maxRand - minMmapRand64
+		needAdjust := preferredTopDownBaseMin - topDownMin
+		if needAdjust <= maxAdjust {
+			maxRand -= needAdjust
+		}
+	}
+
+	rnd := mmapRand(uint64(maxRand))
+	l := MmapLayout{
+		MinAddr: min,
+		MaxAddr: max,
+		// TASK_UNMAPPED_BASE in Linux.
+		BottomUpBase:     (max/3 + rnd).RoundDown(),
+		TopDownBase:      (max - gap - rnd).RoundDown(),
+		DefaultDirection: defaultDir,
+		// We may have reduced the maximum randomization to keep
+		// TopDownBase above preferredTopDownBaseMin while maintaining
+		// our stack gap. Stack allocations must use that max
+		// randomization to avoid eating into the gap.
+		MaxStackRand: uint64(maxRand),
+	}
+
+	// Final sanity check on the layout.
+	if !l.Valid() {
+		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
+	}
+
+	return l, nil
+}
+
+// PIELoadAddress implements Context.PIELoadAddress.
+func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
+	base := preferredPIELoadAddr
+	max, ok := base.AddLength(maxMmapRand64)
+	if !ok {
+		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
+	}
+
+	if max > l.MaxAddr {
+		// preferredPIELoadAddr won't fit; fall back to the standard
+		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
+		//
+		// Don't bother trying to shrink the randomization for now.
+		base = l.TopDownBase / 3 * 2
+	}
+
+	return base + mmapRand(maxMmapRand64)
+}
+
+// userStructSize is the size in bytes of Linux's struct user on amd64.
+const userStructSize = 928
+
+// PtracePeekUser implements Context.PtracePeekUser.
+func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+	if addr&7 != 0 || addr >= userStructSize {
+		return nil, syscall.EIO
+	}
+	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
+	// u_debugreg, returning 0 or silently no-oping for other fields
+	// respectively.
+	if addr < uintptr(ptraceRegsSize) {
+		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
+	}
+	// TODO: debug registers
+	return c.Native(0), nil
+}
+
+// PtracePokeUser implements Context.PtracePokeUser.
+func (c *context64) PtracePokeUser(addr, data uintptr) error {
+	if addr&7 != 0 || addr >= userStructSize {
+		return syscall.EIO
+	}
+	if addr < uintptr(ptraceRegsSize) {
+		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+		usermem.ByteOrder.PutUint64(buf[addr:], uint64(data))
+		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
+		return err
+	}
+	// TODO: debug registers
+	return nil
+}
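
The arithmetic in the TSAN comment can be verified in isolation. A minimal standalone sketch, using plain uint64 stand-ins for the diff's usermem.Addr constants (and assuming usermem.PageSize is 4 KiB), reproduces the quoted minimum TopDownBase:

package main

import "fmt"

// Stand-ins for the constants defined in the diff; 4 KiB pages assumed.
const (
	pageSize       = 4096
	maxStackRand64 = 16 << 30             // 16 GB
	maxMmapRand64  = (1 << 28) * pageSize // 1 TB of page-granular randomization
	minGap64       = (128 << 20) + maxStackRand64
)

func main() {
	// 0x800000000000 - minGap64 - maxMmapRand64, per the comment.
	fmt.Printf("%#x\n", uint64(0x800000000000)-minGap64-maxMmapRand64)
	// Output: 0x7efbf8000000
}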
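
To see how NewMmapLayout behaves in the common case, the following sketch mirrors just its gap-clamping and randomization-shrinking steps with uint64 stand-ins, assuming the full 47-bit range and the default 8 MiB RLIMIT_STACK. It is not the real function, which operates on usermem.Addr and validates its inputs:

package main

import "fmt"

const (
	pageSize                = 4096
	maxAddr64               = (1 << 47) - pageSize
	maxStackRand64          = 16 << 30
	maxMmapRand64           = (1 << 28) * pageSize
	minGap64                = (128 << 20) + maxStackRand64
	minMmapRand64           = (1 << 26) * pageSize
	preferredTopDownBaseMin = 0x7e8000000000 + (128 << 30)
)

func main() {
	max := uint64(maxAddr64)
	stackCur := uint64(8 << 20) // default 8 MiB RLIMIT_STACK

	// Clamp the stack gap into [minGap64, maxGap], as in NewMmapLayout.
	maxGap := (max / 6) * 5
	gap := stackCur
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}

	// Shrink the randomization only if the worst case would drop
	// TopDownBase below preferredTopDownBaseMin.
	topDownMin := max - gap - maxMmapRand64
	maxRand := uint64(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		maxAdjust := maxRand - minMmapRand64
		needAdjust := uint64(preferredTopDownBaseMin) - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	fmt.Printf("gap=%#x maxRand=%#x worstTopDownBase=%#x\n",
		gap, maxRand, max-gap-maxRand)
	// gap=0x408000000 maxRand=0x10000000000 worstTopDownBase=0x7efbf7fff000
}

In this default case no shrinking is needed: even the worst-case TopDownBase stays above TSAN's 0x7e8000000000 minimum.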
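
preferredPIELoadAddr (2/3 of maxAddr64) evaluates to Linux's ELF_ET_DYN_BASE for a 47-bit TASK_SIZE; a one-liner with a uint64 stand-in shows the value:

package main

import "fmt"

const maxAddr64 = (1 << 47) - 4096 // assuming 4 KiB pages

func main() {
	fmt.Printf("%#x\n", uint64(maxAddr64)/3*2) // 0x555555554aaa
}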
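
The PtracePeekUser/PtracePokeUser offset validation can also be exercised on its own. In the sketch below, checkPeekUserAddr is a hypothetical helper (not part of the diff) mirroring the checks, assuming the amd64 sizes: struct user_regs_struct is 27 eight-byte registers (216 bytes), and struct user is 928 bytes as in the diff:

package main

import (
	"errors"
	"fmt"
)

const (
	userStructSize = 928    // sizeof(struct user) on amd64, from the diff
	ptraceRegsSize = 27 * 8 // assumed sizeof(struct user_regs_struct)
)

var errEIO = errors.New("EIO")

// checkPeekUserAddr mirrors the offset validation in PtracePeekUser:
// offsets must be 8-byte aligned and inside struct user; only offsets
// inside the register block actually read saved state.
func checkPeekUserAddr(addr uintptr) (inRegs bool, err error) {
	if addr&7 != 0 || addr >= userStructSize {
		return false, errEIO
	}
	return addr < ptraceRegsSize, nil
}

func main() {
	const ripOffset = 16 * 8 // offsetof(user_regs_struct, rip) on amd64
	fmt.Println(checkPeekUserAddr(ripOffset)) // true <nil>: reads saved regs
	fmt.Println(checkPeekUserAddr(3))         // false EIO: unaligned
	fmt.Println(checkPeekUserAddr(1024))      // false EIO: past struct user
}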