author     Googler <noreply@google.com>          2018-04-27 10:37:02 -0700
committer  Adin Scannell <ascannell@google.com>  2018-04-28 01:44:26 -0400
commit     d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree       54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/arch
parent     f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/arch')
-rw-r--r--  pkg/sentry/arch/BUILD               66
-rw-r--r--  pkg/sentry/arch/aligned.go          31
-rw-r--r--  pkg/sentry/arch/arch.go            351
-rw-r--r--  pkg/sentry/arch/arch_amd64.go      302
-rw-r--r--  pkg/sentry/arch/arch_amd64.s       135
-rw-r--r--  pkg/sentry/arch/arch_state_x86.go   97
-rw-r--r--  pkg/sentry/arch/arch_x86.go        613
-rw-r--r--  pkg/sentry/arch/auxv.go             28
-rw-r--r--  pkg/sentry/arch/registers.proto     55
-rw-r--r--  pkg/sentry/arch/signal_act.go       79
-rw-r--r--  pkg/sentry/arch/signal_amd64.go    476
-rw-r--r--  pkg/sentry/arch/signal_info.go      66
-rw-r--r--  pkg/sentry/arch/signal_stack.go     58
-rw-r--r--  pkg/sentry/arch/stack.go           246
-rw-r--r--  pkg/sentry/arch/syscalls_amd64.go   52
15 files changed, 2655 insertions, 0 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
new file mode 100644
index 000000000..a88f57ac7
--- /dev/null
+++ b/pkg/sentry/arch/BUILD
@@ -0,0 +1,66 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+    name = "arch_state",
+    srcs = [
+        "arch.go",
+        "arch_amd64.go",
+        "arch_state_x86.go",
+        "arch_x86.go",
+        "auxv.go",
+        "signal_amd64.go",
+    ],
+    out = "arch_state.go",
+    package = "arch",
+)
+
+go_library(
+    name = "arch",
+    srcs = [
+        "aligned.go",
+        "arch.go",
+        "arch_amd64.go",
+        "arch_amd64.s",
+        "arch_state.go",
+        "arch_state_x86.go",
+        "arch_x86.go",
+        "auxv.go",
+        "signal_act.go",
+        "signal_amd64.go",
+        "signal_info.go",
+        "signal_stack.go",
+        "stack.go",
+        "syscalls_amd64.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/arch",
+    visibility = ["//:sandbox"],
+    deps = [
+        ":registers_go_proto",
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/cpuid",
+        "//pkg/log",
+        "//pkg/sentry/context",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/usermem",
+        "//pkg/state",
+        "//pkg/syserror",
+    ],
+)
+
+proto_library(
+    name = "registers_proto",
+    srcs = ["registers.proto"],
+    visibility = ["//visibility:public"],
+)
+
+go_proto_library(
+    name = "registers_go_proto",
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto",
+    proto = ":registers_proto",
+    visibility = ["//visibility:public"],
+)
diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go
new file mode 100644
index 000000000..193232e27
--- /dev/null
+++ b/pkg/sentry/arch/aligned.go
@@ -0,0 +1,31 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"reflect"
+)
+
+// alignedBytes returns a slice of size bytes, aligned in memory to the given
+// alignment. This is used because we require certain structures to be aligned
+// in a specific way (for example, the X86 floating point data).
+func alignedBytes(size, alignment uint) []byte {
+	data := make([]byte, size+alignment-1)
+	offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment))
+	if offset == 0 {
+		return data[:size:size]
+	}
+	return data[alignment-offset:][:size:size]
+}
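alignedBytes over-allocates by alignment-1 bytes and re-slices from the first aligned offset; the three-index slice expression also pins capacity so callers cannot grow past the allocation. A standalone sketch of the same technique (a hypothetical demo program, not part of this change), checking the FXSAVE requirement of a 512-byte, 16-byte-aligned buffer:

```go
package main

import (
	"fmt"
	"reflect"
)

// alignedBytes mirrors aligned.go above: over-allocate, then re-slice from
// the first properly aligned byte.
func alignedBytes(size, alignment uint) []byte {
	data := make([]byte, size+alignment-1)
	offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment))
	if offset == 0 {
		return data[:size:size]
	}
	return data[alignment-offset:][:size:size]
}

func main() {
	buf := alignedBytes(512, 16)
	addr := reflect.ValueOf(buf).Index(0).Addr().Pointer()
	// Expect len=512, cap=512, and addr%16 == 0.
	fmt.Printf("len=%d cap=%d addr%%16=%d\n", len(buf), cap(buf), addr%16)
}
```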
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
new file mode 100644
index 000000000..021789e4b
--- /dev/null
+++ b/pkg/sentry/arch/arch.go
@@ -0,0 +1,351 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package arch provides abstractions around architecture-dependent details,
+// such as syscall calling conventions, native types, etc.
+package arch
+
+import (
+	"fmt"
+	"io"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Arch describes an architecture.
+type Arch int
+
+const (
+	// AMD64 is the x86-64 architecture.
+	AMD64 Arch = iota
+)
+
+// String implements fmt.Stringer.
+func (a Arch) String() string {
+	switch a {
+	case AMD64:
+		return "amd64"
+	default:
+		return fmt.Sprintf("Arch(%d)", a)
+	}
+}
+
+// FloatingPointData is a generic type, and will always be passed as a
+// pointer. We rely on the individual arch implementations to meet all the
+// necessary requirements. For example, on x86 the region must be 16-byte
+// aligned and 512 bytes in size.
+type FloatingPointData byte
+
+// Context provides architecture-dependent information for a specific thread.
+//
+// NOTE: Currently we use uintptr here to refer to a generic native
+// register value. While this will work for the foreseeable future, it isn't
+// strictly correct. We may want to create some abstraction that makes this
+// more clear or enables us to store values of arbitrary widths. This is
+// particularly true for RegisterMap().
+type Context interface {
+	// Arch returns the architecture for this Context.
+	Arch() Arch
+
+	// Native converts a generic type to a native value.
+	//
+	// Because the architecture is not specified here, we may be dealing
+	// with return values of varying sizes (for example ARCH_GETFS). This
+	// is a simple utility function to convert to the native size in these
+	// cases, and then we can CopyOut.
+	Native(val uintptr) interface{}
+
+	// Value converts a native type back to a generic value.
+	// Once a value has been converted to native via the above call -- it
+	// can be converted back here.
+	Value(val interface{}) uintptr
+
+	// Width returns the number of bytes for a native value.
+	Width() uint
+
+	// Fork creates a clone of the context.
+	Fork() Context
+
+	// SyscallNo returns the syscall number.
+	SyscallNo() uintptr
+
+	// SyscallArgs returns the syscall arguments in an array.
+	SyscallArgs() SyscallArguments
+
+	// Return returns the return value for a system call.
+	Return() uintptr
+
+	// SetReturn sets the return value for a system call.
+	SetReturn(value uintptr)
+
+	// RestartSyscall reverses over the current syscall instruction, such
+	// that when the application resumes execution the syscall will be
+	// re-attempted.
+	RestartSyscall()
+
+	// RestartSyscallWithRestartBlock reverses over the current syscall
+	// instruction and overwrites the current syscall number with that of
+	// restart_syscall(2). This causes the application to restart the
+	// current syscall with a custom function when execution resumes.
+	RestartSyscallWithRestartBlock()
+
+	// IP returns the current instruction pointer.
+	IP() uintptr
+
+	// SetIP sets the current instruction pointer.
+	SetIP(value uintptr)
+
+	// Stack returns the current stack pointer.
+	Stack() uintptr
+
+	// SetStack sets the current stack pointer.
+	SetStack(value uintptr)
+
+	// SetRSEQInterruptedIP sets the register that contains the old IP
+	// when a restartable sequence is interrupted.
+	SetRSEQInterruptedIP(value uintptr)
+
+	// StateData returns a pointer to underlying architecture state.
+	StateData() *State
+
+	// RegisterMap returns a map of all registers.
+	RegisterMap() (map[string]uintptr, error)
+
+	// NewSignalAct returns a new object that is equivalent to struct
+	// sigaction in the guest architecture.
+	NewSignalAct() NativeSignalAct
+
+	// NewSignalStack returns a new object that is equivalent to stack_t
+	// in the guest architecture.
+	NewSignalStack() NativeSignalStack
+
+	// SignalSetup modifies the context in preparation for handling the
+	// given signal.
+	//
+	// st is the stack where the signal handler frame should be
+	// constructed.
+	//
+	// act is the SignalAct that specifies how this signal is being
+	// handled.
+	//
+	// info is the SignalInfo of the signal being delivered.
+	//
+	// alt is the alternate signal stack (even if the alternate signal
+	// stack is not going to be used).
+	//
+	// sigset is the signal mask before entering the signal handler.
+	SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error
+
+	// SignalRestore restores context after returning from a signal
+	// handler.
+	//
+	// st is the current thread stack.
+	//
+	// rt is true if SignalRestore is being entered from rt_sigreturn and
+	// false if SignalRestore is being entered from sigreturn.
+	//
+	// SignalRestore returns the thread's new signal mask.
+	SignalRestore(st *Stack, rt bool) (linux.SignalSet, error)
+
+	// CPUIDEmulate emulates a CPUID instruction according to current
+	// register state.
+	CPUIDEmulate(l log.Logger)
+
+	// SingleStep returns true if single stepping is enabled.
+	SingleStep() bool
+
+	// SetSingleStep enables single stepping.
+	SetSingleStep()
+
+	// ClearSingleStep disables single stepping.
+	ClearSingleStep()
+
+	// FloatingPointData will be passed to underlying save routines.
+	FloatingPointData() *FloatingPointData
+
+	// NewMmapLayout returns a layout for a new MM, where MinAddr for the
+	// returned layout must be no lower than min, and MaxAddr for the
+	// returned layout must be no higher than max. Repeated calls to
+	// NewMmapLayout may return different layouts.
+	NewMmapLayout(min, max usermem.Addr, limits *limits.LimitSet) (MmapLayout, error)
+
+	// PIELoadAddress returns a preferred load address for a
+	// position-independent executable within l.
+	PIELoadAddress(l MmapLayout) usermem.Addr
+
+	// FeatureSet returns the FeatureSet in use in this context.
+	FeatureSet() *cpuid.FeatureSet
+
+	// Hack around our package dependencies being too broken to support
+	// the equivalent of arch_ptrace():
+
+	// PtracePeekUser implements ptrace(PTRACE_PEEKUSR).
+	PtracePeekUser(addr uintptr) (interface{}, error)
+
+	// PtracePokeUser implements ptrace(PTRACE_POKEUSR).
+	PtracePokeUser(addr, data uintptr) error
+
+	// PtraceGetRegs implements ptrace(PTRACE_GETREGS) by writing the
+	// general-purpose registers represented by this Context to dst and
+	// returning the number of bytes written.
+	PtraceGetRegs(dst io.Writer) (int, error)
+
+	// PtraceSetRegs implements ptrace(PTRACE_SETREGS) by reading
+	// general-purpose registers from src into this Context and returning
+	// the number of bytes read.
+	PtraceSetRegs(src io.Reader) (int, error)
+
+	// PtraceGetFPRegs implements ptrace(PTRACE_GETFPREGS) by writing the
+	// floating-point registers represented by this Context to dst and
+	// returning the number of bytes written.
+	PtraceGetFPRegs(dst io.Writer) (int, error)
+
+	// PtraceSetFPRegs implements ptrace(PTRACE_SETFPREGS) by reading
+	// floating-point registers from src into this Context and returning
+	// the number of bytes read.
+	PtraceSetFPRegs(src io.Reader) (int, error)
+
+	// PtraceGetRegSet implements ptrace(PTRACE_GETREGSET) by writing the
+	// register set given by architecture-defined value regset from this
+	// Context to dst and returning the number of bytes written, which
+	// must be less than or equal to maxlen.
+	PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error)
+
+	// PtraceSetRegSet implements ptrace(PTRACE_SETREGSET) by reading the
+	// register set given by architecture-defined value regset from src
+	// and returning the number of bytes read, which must be less than or
+	// equal to maxlen.
+	PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error)
+
+	// FullRestore returns 'true' if all CPU registers must be restored
+	// when switching to the untrusted application. Typically a task
+	// enters and leaves the kernel via a system call. Platform.Switch()
+	// may optimize for this by not saving/restoring all registers if
+	// allowed by the ABI. For example, the amd64 ABI specifies that
+	// syscall clobbers %rcx and %r11. If FullRestore returns true then
+	// these optimizations must be disabled and all registers restored.
+	FullRestore() bool
+}
+
+// MmapDirection is a search direction for mmaps.
+type MmapDirection int
+
+const (
+	// MmapBottomUp instructs mmap to prefer lower addresses.
+	MmapBottomUp MmapDirection = iota
+
+	// MmapTopDown instructs mmap to prefer higher addresses.
+	MmapTopDown
+)
+
+// MmapLayout defines the layout of the user address space for a particular
+// MemoryManager.
+//
+// Note that "highest address" below is always exclusive.
+type MmapLayout struct {
+	// MinAddr is the lowest mappable address.
+	MinAddr usermem.Addr
+
+	// MaxAddr is the highest mappable address.
+	MaxAddr usermem.Addr
+
+	// BottomUpBase is the lowest address that may be returned for a
+	// MmapBottomUp mmap.
+	BottomUpBase usermem.Addr
+
+	// TopDownBase is the highest address that may be returned for a
+	// MmapTopDown mmap.
+	TopDownBase usermem.Addr
+
+	// DefaultDirection is the direction for most non-fixed mmaps in this
+	// layout.
+	DefaultDirection MmapDirection
+
+	// MaxStackRand is the maximum randomization to apply to stack
+	// allocations to maintain a proper gap between the stack and
+	// TopDownBase.
+	MaxStackRand uint64
+}
+
+// Valid returns true if this layout is valid.
+func (m *MmapLayout) Valid() bool {
+	if m.MinAddr > m.MaxAddr {
+		return false
+	}
+	if m.BottomUpBase < m.MinAddr {
+		return false
+	}
+	if m.BottomUpBase > m.MaxAddr {
+		return false
+	}
+	if m.TopDownBase < m.MinAddr {
+		return false
+	}
+	if m.TopDownBase > m.MaxAddr {
+		return false
+	}
+	return true
+}
+
+// SyscallArgument is an argument supplied to a syscall implementation. The
+// methods used to access the arguments are named after the ***C type name***
+// and they convert to the closest Go type available. For example, Int()
+// refers to a 32-bit signed integer argument represented in Go as an int32.
+//
+// Using the accessor methods guarantees that the conversion between types is
+// correct, taking into account size and signedness (i.e., zero-extension vs
+// sign-extension).
+type SyscallArgument struct {
+	// Prefer to use accessor methods instead of 'Value' directly.
+	Value uintptr
+}
+
+// SyscallArguments represents the set of arguments passed to a syscall.
+type SyscallArguments [6]SyscallArgument
+
+// Pointer returns the usermem.Addr representation of a pointer argument.
+func (a SyscallArgument) Pointer() usermem.Addr {
+	return usermem.Addr(a.Value)
+}
+
+// Int returns the int32 representation of a 32-bit signed integer argument.
+func (a SyscallArgument) Int() int32 {
+	return int32(a.Value)
+}
+
+// Uint returns the uint32 representation of a 32-bit unsigned integer
+// argument.
+func (a SyscallArgument) Uint() uint32 {
+	return uint32(a.Value)
+}
+
+// Int64 returns the int64 representation of a 64-bit signed integer
+// argument.
+func (a SyscallArgument) Int64() int64 {
+	return int64(a.Value)
+}
+
+// Uint64 returns the uint64 representation of a 64-bit unsigned integer
+// argument.
+func (a SyscallArgument) Uint64() uint64 {
+	return uint64(a.Value)
+}
+
+// SizeT returns the uint representation of a size_t argument.
+func (a SyscallArgument) SizeT() uint {
+	return uint(a.Value)
+}
+
+// ModeT returns the uint representation of a mode_t argument.
+func (a SyscallArgument) ModeT() uint {
+	return uint(uint16(a.Value))
+}
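The typed accessors matter because the same raw register bits mean different things depending on signedness. A minimal sketch (hypothetical demo, not part of this change) of the zero- vs sign-extension distinction:

```go
package main

import "fmt"

type SyscallArgument struct{ Value uintptr }

// Int reinterprets the low 32 bits as signed; Uint as unsigned.
func (a SyscallArgument) Int() int32   { return int32(a.Value) }
func (a SyscallArgument) Uint() uint32 { return uint32(a.Value) }

func main() {
	// A register holding 0xffffffff is -1 as a 32-bit signed integer
	// and 4294967295 as a 32-bit unsigned integer.
	arg := SyscallArgument{Value: 0xffffffff}
	fmt.Println(arg.Int(), arg.Uint()) // -1 4294967295
}
```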
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
new file mode 100644
index 000000000..23526fe8e
--- /dev/null
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -0,0 +1,302 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// These constants come directly from Linux.
+const (
+	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in
+	// Linux for a 64-bit process.
+	maxAddr64 usermem.Addr = (1 << 47) - usermem.PageSize
+
+	// maxStackRand64 is the maximum randomization to apply to the stack.
+	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
+	maxStackRand64 = 16 << 30 // 16 GB
+
+	// maxMmapRand64 is the maximum randomization to apply to the mmap
+	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
+	maxMmapRand64 = (1 << 28) * usermem.PageSize
+
+	// minGap64 is the minimum gap to leave at the top of the address
+	// space for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP
+	// in Linux.
+	minGap64 = (128 << 20) + maxStackRand64
+
+	// preferredPIELoadAddr is the standard Linux position-independent
+	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
+	//
+	// The Platform {Min,Max}UserAddress() may preclude loading at this
+	// address. See other preferredFoo comments below.
+	preferredPIELoadAddr usermem.Addr = maxAddr64 / 3 * 2
+)
+
+// These constants are selected as heuristics to help make the Platform's
+// potentially limited address space conform as closely to Linux as possible.
+const (
+	// Select a preferred minimum TopDownBase address.
+	//
+	// Some applications (TSAN and other *SANs) are very particular about
+	// the way the Linux mmap allocator lays out the address space.
+	//
+	// TSAN in particular expects top down allocations to be made in the
+	// range [0x7e8000000000, 0x800000000000).
+	//
+	// The minimum TopDownBase on Linux would be:
+	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
+	//
+	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
+	//
+	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
+	// allocations below TopDownBase.
+	//
+	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
+	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
+	//
+	// Of course, there is no hard minimum to allocation; an allocator
+	// can search all the way from TopDownBase to Min. However, TSAN
+	// declared their range "good enough".
+	//
+	// We would like to pick a TopDownBase such that it is unlikely that
+	// an allocator will select an address below TSAN's minimum. We
+	// achieve this by trying to leave a sizable gap below TopDownBase.
+	//
+	// This is all "preferred" because the layout min/max address may not
+	// allow us to select such a TopDownBase, in which case we have to
+	// fall back to a layout that TSAN may not be happy with.
+	preferredTopDownAllocMin usermem.Addr = 0x7e8000000000
+	preferredAllocationGap                = 128 << 30 // 128 GB
+	preferredTopDownBaseMin               = preferredTopDownAllocMin + preferredAllocationGap
+
+	// minMmapRand64 is the smallest we are willing to make the
+	// randomization to stay above preferredTopDownBaseMin.
+	minMmapRand64 = (1 << 26) * usermem.PageSize
+)
+
+// context64 represents an AMD64 context.
+type context64 struct {
+	State
+	sigFPState []x86FPState // fpstate to be restored on sigreturn.
+}
+
+// Arch implements Context.Arch.
+func (c *context64) Arch() Arch {
+	return AMD64
+}
+
+func (c *context64) copySigFPState() []x86FPState {
+	var sigfps []x86FPState
+	for _, s := range c.sigFPState {
+		sigfps = append(sigfps, s.fork())
+	}
+	return sigfps
+}
+
+// Fork returns an exact copy of this context.
+func (c *context64) Fork() Context {
+	return &context64{
+		State:      c.State.Fork(),
+		sigFPState: c.copySigFPState(),
+	}
+}
+
+// Return returns the current syscall return value.
+func (c *context64) Return() uintptr {
+	return uintptr(c.Regs.Rax)
+}
+
+// SetReturn sets the syscall return value.
+func (c *context64) SetReturn(value uintptr) {
+	c.Regs.Rax = uint64(value)
+}
+
+// IP returns the current instruction pointer.
+func (c *context64) IP() uintptr {
+	return uintptr(c.Regs.Rip)
+}
+
+// SetIP sets the current instruction pointer.
+func (c *context64) SetIP(value uintptr) {
+	c.Regs.Rip = uint64(value)
+}
+
+// Stack returns the current stack pointer.
+func (c *context64) Stack() uintptr {
+	return uintptr(c.Regs.Rsp)
+}
+
+// SetStack sets the current stack pointer.
+func (c *context64) SetStack(value uintptr) {
+	c.Regs.Rsp = uint64(value)
+}
+
+// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
+func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+	c.Regs.R10 = uint64(value)
+}
+
+// Native returns the native type for the given val.
+func (c *context64) Native(val uintptr) interface{} {
+	v := uint64(val)
+	return &v
+}
+
+// Value returns the generic val for the given native type.
+func (c *context64) Value(val interface{}) uintptr {
+	return uintptr(*val.(*uint64))
+}
+
+// Width returns the byte width of this architecture.
+func (c *context64) Width() uint {
+	return 8
+}
+
+// FeatureSet returns the FeatureSet in use.
+func (c *context64) FeatureSet() *cpuid.FeatureSet {
+	return c.State.FeatureSet
+}
+
+// mmapRand returns a random adjustment for randomizing an mmap layout.
+func mmapRand(max uint64) usermem.Addr {
+	return usermem.Addr(rand.Int63n(int64(max))).RoundDown()
+}
+
+// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
+func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) {
+	min, ok := min.RoundUp()
+	if !ok {
+		return MmapLayout{}, syscall.EINVAL
+	}
+	if max > maxAddr64 {
+		max = maxAddr64
+	}
+	max = max.RoundDown()
+
+	if min > max {
+		return MmapLayout{}, syscall.EINVAL
+	}
+
+	stackSize := r.Get(limits.Stack)
+
+	// MAX_GAP in Linux.
+	maxGap := (max / 6) * 5
+	gap := usermem.Addr(stackSize.Cur)
+	if gap < minGap64 {
+		gap = minGap64
+	}
+	if gap > maxGap {
+		gap = maxGap
+	}
+	defaultDir := MmapTopDown
+	if stackSize.Cur == limits.Infinity {
+		defaultDir = MmapBottomUp
+	}
+
+	topDownMin := max - gap - maxMmapRand64
+	maxRand := usermem.Addr(maxMmapRand64)
+	if topDownMin < preferredTopDownBaseMin {
+		// Try to keep TopDownBase above preferredTopDownBaseMin by
+		// shrinking maxRand.
+		maxAdjust := maxRand - minMmapRand64
+		needAdjust := preferredTopDownBaseMin - topDownMin
+		if needAdjust <= maxAdjust {
+			maxRand -= needAdjust
+		}
+	}
+
+	rnd := mmapRand(uint64(maxRand))
+	l := MmapLayout{
+		MinAddr: min,
+		MaxAddr: max,
+		// TASK_UNMAPPED_BASE in Linux.
+		BottomUpBase:     (max/3 + rnd).RoundDown(),
+		TopDownBase:      (max - gap - rnd).RoundDown(),
+		DefaultDirection: defaultDir,
+		// We may have reduced the maximum randomization to keep
+		// TopDownBase above preferredTopDownBaseMin while maintaining
+		// our stack gap. Stack allocations must use that max
+		// randomization to avoid eating into the gap.
+		MaxStackRand: uint64(maxRand),
+	}
+
+	// Final sanity check on the layout.
+	if !l.Valid() {
+		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
+	}
+
+	return l, nil
+}
+
+// PIELoadAddress implements Context.PIELoadAddress.
+func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
+	base := preferredPIELoadAddr
+	max, ok := base.AddLength(maxMmapRand64)
+	if !ok {
+		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
+	}
+
+	if max > l.MaxAddr {
+		// preferredPIELoadAddr won't fit; fall back to the standard
+		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
+		//
+		// Don't bother trying to shrink the randomization for now.
+		base = l.TopDownBase / 3 * 2
+	}
+
+	return base + mmapRand(maxMmapRand64)
+}
+
+// userStructSize is the size in bytes of Linux's struct user on amd64.
+const userStructSize = 928
+
+// PtracePeekUser implements Context.PtracePeekUser.
+func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+	if addr&7 != 0 || addr >= userStructSize {
+		return nil, syscall.EIO
+	}
+	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
+	// u_debugreg, returning 0 or silently no-oping for other fields
+	// respectively.
+	if addr < uintptr(ptraceRegsSize) {
+		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+		return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
+	}
+	// TODO: debug registers
+	return c.Native(0), nil
+}
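The randomization clamp in NewMmapLayout prefers shrinking the random range over giving up the stack gap. A toy recomputation of that arithmetic with plain uint64s (a hypothetical stack limit of 512 GiB, chosen so the clamp actually fires; not part of this change):

```go
package main

import "fmt"

const (
	pageSize                = 4096
	maxAddr64               = uint64(1<<47) - pageSize // TASK_SIZE
	maxMmapRand64           = uint64(1<<28) * pageSize
	minMmapRand64           = uint64(1<<26) * pageSize
	preferredTopDownBaseMin = uint64(0x7e8000000000) + 128<<30
)

func main() {
	gap := uint64(512) << 30 // hypothetical RLIMIT_STACK of 512 GiB.
	maxRand := maxMmapRand64
	topDownMin := maxAddr64 - gap - maxRand
	if topDownMin < preferredTopDownBaseMin {
		// Shrink the randomization rather than the gap, as long as at
		// least minMmapRand64 of randomization remains.
		if need := preferredTopDownBaseMin - topDownMin; need <= maxRand-minMmapRand64 {
			maxRand -= need
		}
	}
	// Worst-case TopDownBase lands exactly on preferredTopDownBaseMin.
	fmt.Printf("worst-case TopDownBase %#x, maxRand %#x\n",
		maxAddr64-gap-maxRand, maxRand)
}
```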
+// PtracePokeUser implements Context.PtracePokeUser.
+func (c *context64) PtracePokeUser(addr, data uintptr) error {
+	if addr&7 != 0 || addr >= userStructSize {
+		return syscall.EIO
+	}
+	if addr < uintptr(ptraceRegsSize) {
+		buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
+		usermem.ByteOrder.PutUint64(buf[addr:], uint64(data))
+		_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
+		return err
+	}
+	// TODO: debug registers
+	return nil
+}
diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s
new file mode 100644
index 000000000..10d621b6d
--- /dev/null
+++ b/pkg/sentry/arch/arch_amd64.s
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// MXCSR_DEFAULT is the reset value of MXCSR (Intel SDM Vol. 2, Ch. 3.2
+// "LDMXCSR")
+#define MXCSR_DEFAULT 0x1f80
+
+// MXCSR_OFFSET is the offset in bytes of the MXCSR field from the start of
+// the FXSAVE/XSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE
+// Area")
+#define MXCSR_OFFSET 24
+
+// initX86FPState initializes floating point state.
+//
+// func initX86FPState(data *FloatingPointData, useXsave bool)
+//
+// We need to clear out and initialize an empty fp state area since the
+// sentry may have left sensitive information in the floating point
+// registers.
+//
+// Preconditions: data is zeroed
+TEXT ·initX86FPState(SB), $24-16
+	// Save MXCSR (callee-save)
+	STMXCSR mxcsr-8(SP)
+
+	// Save x87 CW (callee-save)
+	FSTCW cw-16(SP)
+
+	MOVQ fpState+0(FP), DI
+
+	// Do we use xsave?
+	MOVBQZX useXsave+8(FP), AX
+	TESTQ AX, AX
+	JZ no_xsave
+
+	// Use XRSTOR to clear all FP state to an initial state.
+	//
+	// The fpState XSAVE area is zeroed on function entry, meaning
+	// XSTATE_BV is zero.
+	//
+	// "If RFBM[i] = 1 and bit i is clear in the XSTATE_BV field in the
+	// XSAVE header, XRSTOR initializes state component i."
+	//
+	// Initialization is defined in SDM Vol 1, Chapter 13.3. It puts all
+	// the registers in a reasonable initial state, except MXCSR:
+	//
+	// "The MXCSR register is part of state component 1, SSE state (see
+	// Section 13.5.2). However, the standard form of XRSTOR loads the
+	// MXCSR register from memory whenever the RFBM[1] (SSE) or RFBM[2]
+	// (AVX) is set, regardless of the values of XSTATE_BV[1] and
+	// XSTATE_BV[2]."
+
+	// Set MXCSR to the default value.
+	MOVL $MXCSR_DEFAULT, MXCSR_OFFSET(DI)
+
+	// Initialize registers with XRSTOR.
+	MOVL $0xffffffff, AX
+	MOVL $0xffffffff, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI)
+
+	// Now that all the state has been reset, write it back out to the
+	// XSAVE area.
+	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27 // XSAVE64 0(DI)
+
+	JMP out
+
+no_xsave:
+	// Clear out existing X values.
+	PXOR X0, X0
+	MOVO X0, X1
+	MOVO X0, X2
+	MOVO X0, X3
+	MOVO X0, X4
+	MOVO X0, X5
+	MOVO X0, X6
+	MOVO X0, X7
+	MOVO X0, X8
+	MOVO X0, X9
+	MOVO X0, X10
+	MOVO X0, X11
+	MOVO X0, X12
+	MOVO X0, X13
+	MOVO X0, X14
+	MOVO X0, X15
+
+	// Zero out %rax and store into MMX registers. MMX registers are
+	// an alias of 8x64 bits of the 8x80 bits used for the original
+	// x87 registers. Storing zero into them will reset the FPU registers
+	// to bits [63:0] = 0, [79:64] = 1. But the contents aren't too
+	// important, just the fact that we have reset them to a known value.
+	XORQ AX, AX
+	MOVQ AX, M0
+	MOVQ AX, M1
+	MOVQ AX, M2
+	MOVQ AX, M3
+	MOVQ AX, M4
+	MOVQ AX, M5
+	MOVQ AX, M6
+	MOVQ AX, M7
+
+	// The Go assembler doesn't support FNINIT, so we use BYTE.
+	// This will:
+	// - Reset FPU control word to 0x037f
+	// - Clear FPU status word
+	// - Reset FPU tag word to 0xffff
+	// - Clear FPU data pointer
+	// - Clear FPU instruction pointer
+	BYTE $0xDB; BYTE $0xE3; // FNINIT
+
+	// Reset MXCSR.
+	MOVL $MXCSR_DEFAULT, tmpmxcsr-24(SP)
+	LDMXCSR tmpmxcsr-24(SP)
+
+	// Save the floating point state with fxsave.
+	FXSAVE64 0(DI)
+
+out:
+	// Restore MXCSR.
+	LDMXCSR mxcsr-8(SP)
+
+	// Restore x87 CW.
+	FLDCW cw-16(SP)
+
+	RET
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
new file mode 100644
index 000000000..cb38d098a
--- /dev/null
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -0,0 +1,97 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// warnOnce is used to warn about truncated state only once.
+var warnOnce sync.Once
+
+// afterLoad is invoked by stateify.
+func (s *State) afterLoad() {
+	old := s.x86FPState
+
+	// Recreate the slice. This is done to ensure that it is aligned
+	// appropriately in memory, and large enough to accommodate any new
+	// state that may be saved by the new CPU. Even if extraneous new
+	// state is saved, the state we care about is guaranteed to be a
+	// subset of new state. Later optimizations can use less space when
+	// using a smaller state component bitmap. Intel SDM section 13 has
+	// more info.
+	s.x86FPState = newX86FPState()
+
+	// x86FPState always contains all the FP state supported by the host.
+	// We may have come from a newer machine that supports additional
+	// state which we cannot restore.
+	//
+	// The x86 FP state areas are backwards compatible, so we can simply
+	// truncate the additional floating point state. Applications should
+	// not depend on the truncated state because it should relate only to
+	// features that were not exposed in the app FeatureSet.
+	if len(s.x86FPState) < len(old) {
+		warnOnce.Do(func() {
+			// This will occur on every instance of state, don't
+			// bother warning more than once.
+			log.Infof("dropping %d bytes of floating point state; the application should not depend on this state", len(old)-len(s.x86FPState))
+		})
+	}
+
+	// Copy to the new, aligned location.
+	copy(s.x86FPState, old)
+}
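afterLoad leans on Go's built-in copy(), which transfers min(len(dst), len(src)) bytes, to silently truncate FP state saved on a host with a larger XSAVE area. A minimal sketch of that behavior (hypothetical sizes; not part of this change):

```go
package main

import "fmt"

func main() {
	saved := make([]byte, 1088)   // e.g. state saved on a host with more XSAVE features.
	restored := make([]byte, 832) // this host's (smaller) XSAVE area.
	n := copy(restored, saved)    // copies only min(len(dst), len(src)) bytes.
	fmt.Printf("copied %d of %d bytes; %d bytes of extra state dropped\n",
		n, len(saved), len(saved)-n)
}
```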
+
+type syscallPtraceRegs struct {
+	R15      uint64
+	R14      uint64
+	R13      uint64
+	R12      uint64
+	Rbp      uint64
+	Rbx      uint64
+	R11      uint64
+	R10      uint64
+	R9       uint64
+	R8       uint64
+	Rax      uint64
+	Rcx      uint64
+	Rdx      uint64
+	Rsi      uint64
+	Rdi      uint64
+	Orig_rax uint64
+	Rip      uint64
+	Cs       uint64
+	Eflags   uint64
+	Rsp      uint64
+	Ss       uint64
+	Fs_base  uint64
+	Gs_base  uint64
+	Ds       uint64
+	Es       uint64
+	Fs       uint64
+	Gs       uint64
+}
+
+// saveRegs is invoked by stateify.
+func (s *State) saveRegs() syscallPtraceRegs {
+	return syscallPtraceRegs(s.Regs)
+}
+
+// loadRegs is invoked by stateify.
+func (s *State) loadRegs(r syscallPtraceRegs) {
+	s.Regs = syscall.PtraceRegs(r)
+}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
new file mode 100644
index 000000000..5cc4f8377
--- /dev/null
+++ b/pkg/sentry/arch/arch_x86.go
@@ -0,0 +1,613 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 i386
+
+package arch
+
+import (
+	"fmt"
+	"io"
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// System-related constants for x86.
+const (
+	// SyscallWidth is the width of syscall, sysenter, and int 80
+	// instructions.
+	SyscallWidth = 2
+)
+
+// EFLAGS register bits.
+const (
+	// eflagsCF is the mask for the carry flag.
+	eflagsCF = uint64(1) << 0
+	// eflagsPF is the mask for the parity flag.
+	eflagsPF = uint64(1) << 2
+	// eflagsAF is the mask for the auxiliary carry flag.
+	eflagsAF = uint64(1) << 4
+	// eflagsZF is the mask for the zero flag.
+	eflagsZF = uint64(1) << 6
+	// eflagsSF is the mask for the sign flag.
+	eflagsSF = uint64(1) << 7
+	// eflagsTF is the mask for the trap flag.
+	eflagsTF = uint64(1) << 8
+	// eflagsIF is the mask for the interrupt flag.
+	eflagsIF = uint64(1) << 9
+	// eflagsDF is the mask for the direction flag.
+	eflagsDF = uint64(1) << 10
+	// eflagsOF is the mask for the overflow flag.
+	eflagsOF = uint64(1) << 11
+	// eflagsIOPL is the mask for the I/O privilege level.
+	eflagsIOPL = uint64(3) << 12
+	// eflagsNT is the mask for the nested task bit.
+	eflagsNT = uint64(1) << 14
+	// eflagsRF is the mask for the resume flag.
+	eflagsRF = uint64(1) << 16
+	// eflagsVM is the mask for the virtual mode bit.
+	eflagsVM = uint64(1) << 17
+	// eflagsAC is the mask for the alignment check / access control bit.
+	eflagsAC = uint64(1) << 18
+	// eflagsVIF is the mask for the virtual interrupt flag.
+	eflagsVIF = uint64(1) << 19
+	// eflagsVIP is the mask for the virtual interrupt pending bit.
+	eflagsVIP = uint64(1) << 20
+	// eflagsID is the mask for the CPUID detection bit.
+	eflagsID = uint64(1) << 21
+
+	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
+	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous
+	// to Linux's FLAG_MASK.
+	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT
+
+	// eflagsRestorable is the mask for the set of EFLAGS that may be
+	// changed by SignalReturn. eflagsRestorable is analogous to Linux's
+	// FIX_EFLAGS.
+	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
+)
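eflagsPtraceMutable is later used in PtraceSetRegs as a masked merge: mutable bits come from the tracer, everything else keeps its kernel-maintained value. A small sketch of that idiom (toy mask and hypothetical register values, not the real constants' full set):

```go
package main

import "fmt"

const (
	eflagsCF = uint64(1) << 0
	eflagsZF = uint64(1) << 6
	eflagsTF = uint64(1) << 8
	eflagsIF = uint64(1) << 9
	// Toy mask: pretend only CF, ZF, and TF are ptrace-mutable.
	mutable = eflagsCF | eflagsZF | eflagsTF
)

func main() {
	current := eflagsIF | eflagsZF // kernel-maintained flags.
	fromTracer := eflagsCF | eflagsTF
	merged := (current &^ mutable) | (fromTracer & mutable)
	// IF is preserved, ZF is cleared, CF and TF are taken from the tracer.
	fmt.Printf("current=%#x tracer=%#x merged=%#x\n", current, fromTracer, merged)
}
```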
+
+// Segment selectors. See arch/x86/include/asm/segment.h.
+const (
+	userCS   = 0x33 // guest ring 3 code selector
+	user32CS = 0x23 // guest ring 3 32 bit code selector
+	userDS   = 0x2b // guest ring 3 data selector
+
+	_FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector
+	_GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector
+)
+
+var (
+	// TrapInstruction is the x86 trap instruction.
+	TrapInstruction = [1]byte{0xcc}
+
+	// CPUIDInstruction is the x86 CPUID instruction.
+	CPUIDInstruction = [2]byte{0xf, 0xa2}
+
+	// X86TrapFlag is an exported const for use by other packages.
+	X86TrapFlag uint64 = (1 << 8)
+)
+
+// x86FPState is x86 floating point state.
+type x86FPState []byte
+
+// initX86FPState (defined in asm files) sets up initial state.
+func initX86FPState(data *FloatingPointData, useXsave bool)
+
+func newX86FPStateSlice() []byte {
+	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
+	capacity := size
+	// Always use at least 4096 bytes.
+	if capacity < 4096 {
+		capacity = 4096
+	}
+	return alignedBytes(capacity, align)[:size]
+}
+
+// newX86FPState returns an initialized floating point state.
+//
+// The returned state is large enough to store all floating point state
+// supported by host, even if the app won't use much of it due to a
+// restricted FeatureSet. Since they may still be able to see state not
+// advertised by CPUID we must ensure it does not contain any sentry state.
+func newX86FPState() x86FPState {
+	f := x86FPState(newX86FPStateSlice())
+	initX86FPState(f.FloatingPointData(), cpuid.HostFeatureSet().UseXsave())
+	return f
+}
+
+// fork creates and returns an identical copy of the x86 floating point
+// state.
+func (f x86FPState) fork() x86FPState {
+	n := x86FPState(newX86FPStateSlice())
+	copy(n, f)
+	return n
+}
+
+// FloatingPointData returns the raw data pointer.
+func (f x86FPState) FloatingPointData() *FloatingPointData {
+	return (*FloatingPointData)(&f[0])
+}
+
+// NewFloatingPointData returns a new floating point data blob.
+//
+// This is primarily for use in tests.
+func NewFloatingPointData() *FloatingPointData {
+	return (*FloatingPointData)(&(newX86FPState()[0]))
+}
+
+// State contains the common architecture bits for X86 (the build tag of this
+// file ensures it's only built on x86).
+type State struct {
+	// The system registers.
+	Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+
+	// Our floating point state.
+	x86FPState `state:"wait"`
+
+	// FeatureSet is a pointer to the currently active feature set.
+	FeatureSet *cpuid.FeatureSet
+}
+
+// Proto returns a protobuf representation of the system registers in State.
+func (s State) Proto() *rpb.Registers {
+	regs := &rpb.AMD64Registers{
+		Rax:     s.Regs.Rax,
+		Rbx:     s.Regs.Rbx,
+		Rcx:     s.Regs.Rcx,
+		Rdx:     s.Regs.Rdx,
+		Rsi:     s.Regs.Rsi,
+		Rdi:     s.Regs.Rdi,
+		Rsp:     s.Regs.Rsp,
+		Rbp:     s.Regs.Rbp,
+		R8:      s.Regs.R8,
+		R9:      s.Regs.R9,
+		R10:     s.Regs.R10,
+		R11:     s.Regs.R11,
+		R12:     s.Regs.R12,
+		R13:     s.Regs.R13,
+		R14:     s.Regs.R14,
+		R15:     s.Regs.R15,
+		Rip:     s.Regs.Rip,
+		Rflags:  s.Regs.Eflags,
+		OrigRax: s.Regs.Orig_rax,
+		Cs:      s.Regs.Cs,
+		Ds:      s.Regs.Ds,
+		Es:      s.Regs.Es,
+		Fs:      s.Regs.Fs,
+		Gs:      s.Regs.Gs,
+		Ss:      s.Regs.Ss,
+		FsBase:  s.Regs.Fs_base,
+		GsBase:  s.Regs.Gs_base,
+	}
+	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
+}
+
+// Fork creates and returns an identical copy of the state.
+func (s *State) Fork() State {
+	return State{
+		Regs:       s.Regs,
+		x86FPState: s.x86FPState.fork(),
+		FeatureSet: s.FeatureSet,
+	}
+}
+
+// StateData implements Context.StateData.
+func (s *State) StateData() *State {
+	return s
+}
+
+// CPUIDEmulate emulates a cpuid instruction.
+func (s *State) CPUIDEmulate(l log.Logger) {
+	argax := uint32(s.Regs.Rax)
+	argcx := uint32(s.Regs.Rcx)
+	ax, bx, cx, dx := s.FeatureSet.EmulateID(argax, argcx)
+	s.Regs.Rax = uint64(ax)
+	s.Regs.Rbx = uint64(bx)
+	s.Regs.Rcx = uint64(cx)
+	s.Regs.Rdx = uint64(dx)
+	l.Debugf("CPUID(%x,%x): %x %x %x %x", argax, argcx, ax, bx, cx, dx)
+}
+
+// SingleStep implements Context.SingleStep.
+func (s *State) SingleStep() bool {
+	return s.Regs.Eflags&X86TrapFlag != 0
+}
+
+// SetSingleStep enables single stepping.
+func (s *State) SetSingleStep() {
+	// Set the trap flag.
+	s.Regs.Eflags |= X86TrapFlag
+}
+
+// ClearSingleStep disables single stepping.
+func (s *State) ClearSingleStep() {
+	// Clear the trap flag.
+	s.Regs.Eflags &= ^X86TrapFlag
+}
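Single stepping above is nothing more than toggling bit 8 of EFLAGS (the trap flag). A toy sketch of SetSingleStep/ClearSingleStep on a plain flags value (hypothetical demo, not part of this change):

```go
package main

import "fmt"

const x86TrapFlag = uint64(1) << 8

func main() {
	eflags := uint64(0x202) // a typical user EFLAGS value (IF set).
	eflags |= x86TrapFlag   // SetSingleStep.
	fmt.Printf("%#x stepping=%t\n", eflags, eflags&x86TrapFlag != 0)
	eflags &^= x86TrapFlag  // ClearSingleStep.
	fmt.Printf("%#x stepping=%t\n", eflags, eflags&x86TrapFlag != 0)
}
```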
+
+// RegisterMap returns a map of all registers.
+func (s *State) RegisterMap() (map[string]uintptr, error) {
+	return map[string]uintptr{
+		"R15":      uintptr(s.Regs.R15),
+		"R14":      uintptr(s.Regs.R14),
+		"R13":      uintptr(s.Regs.R13),
+		"R12":      uintptr(s.Regs.R12),
+		"Rbp":      uintptr(s.Regs.Rbp),
+		"Rbx":      uintptr(s.Regs.Rbx),
+		"R11":      uintptr(s.Regs.R11),
+		"R10":      uintptr(s.Regs.R10),
+		"R9":       uintptr(s.Regs.R9),
+		"R8":       uintptr(s.Regs.R8),
+		"Rax":      uintptr(s.Regs.Rax),
+		"Rcx":      uintptr(s.Regs.Rcx),
+		"Rdx":      uintptr(s.Regs.Rdx),
+		"Rsi":      uintptr(s.Regs.Rsi),
+		"Rdi":      uintptr(s.Regs.Rdi),
+		"Orig_rax": uintptr(s.Regs.Orig_rax),
+		"Rip":      uintptr(s.Regs.Rip),
+		"Cs":       uintptr(s.Regs.Cs),
+		"Eflags":   uintptr(s.Regs.Eflags),
+		"Rsp":      uintptr(s.Regs.Rsp),
+		"Ss":       uintptr(s.Regs.Ss),
+		"Fs_base":  uintptr(s.Regs.Fs_base),
+		"Gs_base":  uintptr(s.Regs.Gs_base),
+		"Ds":       uintptr(s.Regs.Ds),
+		"Es":       uintptr(s.Regs.Es),
+		"Fs":       uintptr(s.Regs.Fs),
+		"Gs":       uintptr(s.Regs.Gs),
+	}, nil
+}
+
+// PtraceGetRegs implements Context.PtraceGetRegs.
+func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
+	return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+}
+
+func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+	regs := s.Regs
+	// These may not be initialized.
+	if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 {
+		regs.Eflags = eflagsIF
+		regs.Cs = userCS
+		regs.Ss = userDS
+	}
+	// As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base
+	// addresses using reserved descriptors in the GDT instead of the
+	// MSRs, with selector values FS_TLS_SEL and GS_TLS_SEL respectively.
+	// These values are actually visible in struct user_regs_struct::fs/gs;
+	// arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize
+	// struct thread_struct::fsindex/gsindex.
+	//
+	// We always use fs == gs == 0 when fs_base/gs_base is in use, for
+	// simplicity.
+	//
+	// Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via
+	// arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is
+	// a 32-bit value and fsindex/gsindex indicates that this optimization
+	// is in use, as well as the reverse case of setting fs/gs to
+	// FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the
+	// same in PtraceSetRegs.)
+	//
+	// TODO: Remove this fixup since newer Linux doesn't have
+	// this behavior anymore.
+	if regs.Fs == 0 && regs.Fs_base <= 0xffffffff {
+		regs.Fs = _FS_TLS_SEL
+	}
+	if regs.Gs == 0 && regs.Gs_base <= 0xffffffff {
+		regs.Gs = _GS_TLS_SEL
+	}
+	return regs
+}
+
+var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+
+// PtraceSetRegs implements Context.PtraceSetRegs.
+func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
+	var regs syscall.PtraceRegs
+	buf := make([]byte, ptraceRegsSize)
+	if _, err := io.ReadFull(src, buf); err != nil {
+		return 0, err
+	}
+	binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+	// Truncate segment registers to 16 bits.
+	regs.Cs = uint64(uint16(regs.Cs))
+	regs.Ds = uint64(uint16(regs.Ds))
+	regs.Es = uint64(uint16(regs.Es))
+	regs.Fs = uint64(uint16(regs.Fs))
+	regs.Gs = uint64(uint16(regs.Gs))
+	regs.Ss = uint64(uint16(regs.Ss))
+	// In Linux this validation is via arch/x86/kernel/ptrace.c:putreg().
+	if !isUserSegmentSelector(regs.Cs) {
+		return 0, syscall.EIO
+	}
+	if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) {
+		return 0, syscall.EIO
+	}
+	if regs.Es != 0 && !isUserSegmentSelector(regs.Es) {
+		return 0, syscall.EIO
+	}
+	if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) {
+		return 0, syscall.EIO
+	}
+	if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) {
+		return 0, syscall.EIO
+	}
+	if !isUserSegmentSelector(regs.Ss) {
+		return 0, syscall.EIO
+	}
+	if regs.Fs_base >= uint64(maxAddr64) {
+		return 0, syscall.EIO
+	}
+	if regs.Gs_base >= uint64(maxAddr64) {
+		return 0, syscall.EIO
+	}
+	// CS and SS are validated, but changes to them are otherwise silently
+	// ignored on amd64.
+	regs.Cs = s.Regs.Cs
+	regs.Ss = s.Regs.Ss
+	// fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux.
+	if regs.Fs_base != s.Regs.Fs_base {
+		regs.Fs = 0
+	}
+	if regs.Gs_base != s.Regs.Gs_base {
+		regs.Gs = 0
+	}
+	// Ignore "stale" TLS segment selectors for FS and GS. See comment in
+	// ptraceGetRegs.
+	if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 {
+		regs.Fs = 0
+	}
+	if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 {
+		regs.Gs = 0
+	}
+	regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable)
+	s.Regs = regs
+	return ptraceRegsSize, nil
+}
+
+// isUserSegmentSelector returns true if the given segment selector specifies
+// a privilege level of 3 (USER_RPL).
+func isUserSegmentSelector(reg uint64) bool {
+	return reg&3 == 3
+}
+
+// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the
+// type manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86.
+// Equivalently, ptraceFPRegsSize is the size in bytes of the x86 FXSAVE
+// area.
+const ptraceFPRegsSize = 512
+
+// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
+func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) {
+	return dst.Write(s.x86FPState[:ptraceFPRegsSize])
+}
+
+// PtraceSetFPRegs implements Context.PtraceSetFPRegs.
+func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) {
+	var f [ptraceFPRegsSize]byte
+	n, err := io.ReadFull(src, f[:])
+	if err != nil {
+		return 0, err
+	}
+	// Force reserved bits in MXCSR to 0. This is consistent with Linux.
+	sanitizeMXCSR(x86FPState(f[:]))
+	// N.B. this only copies the beginning of the FP state, which
+	// corresponds to the FXSAVE area.
+	copy(s.x86FPState, f[:])
+	return n, nil
+}
+
+const (
+	// mxcsrOffset is the offset in bytes of the MXCSR field from the
+	// start of the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of
+	// an FXSAVE Area")
+	mxcsrOffset = 24
+
+	// mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field
+	// from the start of the FXSAVE area.
+	mxcsrMaskOffset = 28
+)
+
+var (
+	mxcsrMask     uint32
+	initMXCSRMask sync.Once
+)
+
+// sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0.
+// ("FXRSTOR generates a general-protection fault (#GP) in response to an
+// attempt to set any of the reserved bits of the MXCSR register." - Intel
+// SDM Vol. 1, Section 10.5.1.2 "SSE State")
+func sanitizeMXCSR(f x86FPState) {
+	mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:])
+	initMXCSRMask.Do(func() {
+		temp := x86FPState(alignedBytes(uint(ptraceFPRegsSize), 16))
+		initX86FPState(temp.FloatingPointData(), false /* useXsave */)
+		mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:])
+		if mxcsrMask == 0 {
+			// "If the value of the MXCSR_MASK field is 00000000H,
+			// then the MXCSR_MASK value is the default value of
+			// 0000FFBFH." - Intel SDM Vol. 1, Section 11.6.6
+			// "Guidelines for Writing to the MXCSR Register"
+			mxcsrMask = 0xffbf
+		}
+	})
+	mxcsr &= mxcsrMask
+	usermem.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr)
+}
+
+const (
+	// minXstateBytes is the minimum size in bytes of an x86 XSAVE area,
+	// equal to the size of the XSAVE legacy area (512 bytes) plus the
+	// size of the XSAVE header (64 bytes). Equivalently, minXstateBytes
+	// is GDB's X86_XSTATE_SSE_SIZE.
+	minXstateBytes = 512 + 64
+
+	// userXstateXCR0Offset is the offset in bytes of the
+	// USER_XSTATE_XCR0_WORD field in Linux's struct user_xstateregs,
+	// which is the type manipulated by ptrace(PTRACE_GET/SETREGSET,
+	// NT_X86_XSTATE). Equivalently, userXstateXCR0Offset is GDB's
+	// I386_LINUX_XSAVE_XCR0_OFFSET.
+	userXstateXCR0Offset = 464
+
+	// xstateBVOffset is the offset in bytes of the XSTATE_BV field in an
+	// x86 XSAVE area.
+	xstateBVOffset = 512
+
+	// xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts
+	// of the XSAVE header that we coerce to zero: "Bytes 15:8 of the
+	// XSAVE header is a state-component bitmap called XCOMP_BV. ...
+	// Bytes 63:16 of the XSAVE header are reserved." - Intel SDM Vol. 1,
+	// Section 13.4.2 "XSAVE Header". Linux ignores XCOMP_BV, but it's
+	// able to recover from XRSTOR #GP exceptions resulting from invalid
+	// values; we aren't. Linux also never uses the compacted format when
+	// doing XSAVE and doesn't even define the compaction extensions to
+	// XSAVE as a CPU feature, so for simplicity we assume no one is
+	// using them.
+	xsaveHeaderZeroedOffset = 512 + 8
+	xsaveHeaderZeroedBytes  = 64 - 8
+)
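sanitizeMXCSR masks a tracer-supplied MXCSR against MXCSR_MASK so FXRSTOR can't fault on reserved bits. A standalone sketch of the masking step (hypothetical input value, the documented default mask, little-endian byte order assumed as on x86):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	mxcsrOffset  = 24     // MXCSR's offset within the FXSAVE area.
	mxcsrDefault = 0x1f80 // reset value of MXCSR.
	mxcsrMask    = 0xffbf // default MXCSR_MASK when the FXSAVE area reports 0.
)

func main() {
	area := make([]byte, 512) // FXSAVE-sized legacy area.
	// A tracer writes an MXCSR value with a reserved high bit set.
	binary.LittleEndian.PutUint32(area[mxcsrOffset:], mxcsrDefault|0x00020000)
	mxcsr := binary.LittleEndian.Uint32(area[mxcsrOffset:])
	mxcsr &= mxcsrMask // clear reserved bits.
	binary.LittleEndian.PutUint32(area[mxcsrOffset:], mxcsr)
	fmt.Printf("sanitized MXCSR: %#x\n", mxcsr) // 0x1f80
}
```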
+
+func (s *State) ptraceGetXstateRegs(dst io.Writer, maxlen int) (int, error) {
+	// N.B. s.x86FPState may contain more state than the application
+	// expects. We only copy the subset that would be in their XSAVE
+	// area.
+	ess, _ := s.FeatureSet.ExtendedStateSize()
+	f := make([]byte, ess)
+	copy(f, s.x86FPState)
+	// "The XSAVE feature set does not use bytes 511:416; bytes 463:416
+	// are reserved." - Intel SDM Vol. 1, Section 13.4.1 "Legacy Region
+	// of an XSAVE Area". Linux uses the first 8 bytes of this area to
+	// store the OS XSTATE mask. GDB relies on this: see
+	// gdb/x86-linux-nat.c:x86_linux_read_description().
+	usermem.ByteOrder.PutUint64(f[userXstateXCR0Offset:], s.FeatureSet.ValidXCR0Mask())
+	if len(f) > maxlen {
+		f = f[:maxlen]
+	}
+	return dst.Write(f)
+}
+
+func (s *State) ptraceSetXstateRegs(src io.Reader, maxlen int) (int, error) {
+	// Allow users to pass an xstate register set smaller than ours (they
+	// can mask bits out of XSTATE_BV), as long as it's at least
+	// minXstateBytes. Also allow users to pass a register set larger
+	// than ours; anything after their ExtendedStateSize will be ignored.
+	// (I think Linux technically permits setting a register set smaller
+	// than minXstateBytes, but it has the same silent truncation behavior
+	// in kernel/ptrace.c:ptrace_regset().)
+	if maxlen < minXstateBytes {
+		return 0, syscall.EFAULT
+	}
+	ess, _ := s.FeatureSet.ExtendedStateSize()
+	if maxlen > int(ess) {
+		maxlen = int(ess)
+	}
+	f := make([]byte, maxlen)
+	if _, err := io.ReadFull(src, f); err != nil {
+		return 0, err
+	}
+	// Force reserved bits in MXCSR to 0. This is consistent with Linux.
+	sanitizeMXCSR(x86FPState(f))
+	// Users can't enable *more* XCR0 bits than what we, and the CPU,
+	// support.
+	xstateBV := usermem.ByteOrder.Uint64(f[xstateBVOffset:])
+	xstateBV &= s.FeatureSet.ValidXCR0Mask()
+	usermem.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV)
+	// Force XCOMP_BV and reserved bytes in the XSAVE header to 0.
+	reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes]
+	for i := range reserved {
+		reserved[i] = 0
+	}
+	return copy(s.x86FPState, f), nil
+}
+
+// Register sets defined in include/uapi/linux/elf.h.
+const (
+	_NT_PRSTATUS   = 1
+	_NT_PRFPREG    = 2
+	_NT_X86_XSTATE = 0x202
+)
+
+// PtraceGetRegSet implements Context.PtraceGetRegSet.
+func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
+	switch regset {
+	case _NT_PRSTATUS:
+		if maxlen < ptraceRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceGetRegs(dst)
+	case _NT_PRFPREG:
+		if maxlen < ptraceFPRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceGetFPRegs(dst)
+	case _NT_X86_XSTATE:
+		return s.ptraceGetXstateRegs(dst, maxlen)
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// PtraceSetRegSet implements Context.PtraceSetRegSet.
+func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
+	switch regset {
+	case _NT_PRSTATUS:
+		if maxlen < ptraceRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceSetRegs(src)
+	case _NT_PRFPREG:
+		if maxlen < ptraceFPRegsSize {
+			return 0, syserror.EFAULT
+		}
+		return s.PtraceSetFPRegs(src)
+	case _NT_X86_XSTATE:
+		return s.ptraceSetXstateRegs(src, maxlen)
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// FullRestore indicates whether a full restore is required.
+func (s *State) FullRestore() bool {
+	// A fast system call return is possible only if
+	//
+	// * RCX matches the instruction pointer.
+	// * R11 matches our flags value.
+	// * Usermode does not expect to set either the resume flag or the
+	//   virtual mode flags (unlikely.)
+	// * CS and SS are set to the standard selectors.
+	//
+	// That is, SYSRET results in the correct final state.
+	fastRestore := s.Regs.Rcx == s.Regs.Rip &&
+		s.Regs.Eflags == s.Regs.R11 &&
+		(s.Regs.Eflags&eflagsRF == 0) &&
+		(s.Regs.Eflags&eflagsVM == 0) &&
+		s.Regs.Cs == userCS &&
+		s.Regs.Ss == userDS
+	return !fastRestore
+}
+
+// New returns a new architecture context.
+func New(arch Arch, fs *cpuid.FeatureSet) Context {
+	switch arch {
+	case AMD64:
+		return &context64{
+			State{
+				x86FPState: newX86FPState(),
+				FeatureSet: fs,
+			},
+			[]x86FPState(nil),
+		}
+	}
+	panic(fmt.Sprintf("unknown architecture %v", arch))
+}
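The FullRestore predicate encodes when SYSRET alone would reconstruct the user-visible state. A sketch of the same check over hypothetical register values (not part of this change):

```go
package main

import "fmt"

const (
	eflagsRF = uint64(1) << 16
	eflagsVM = uint64(1) << 17
	userCS   = 0x33
	userDS   = 0x2b
)

type regs struct{ Rcx, Rip, Eflags, R11, Cs, Ss uint64 }

// fullRestore mirrors State.FullRestore: a fast SYSRET return is safe only
// when RCX/R11 already hold what SYSRET would load them from.
func fullRestore(r regs) bool {
	fast := r.Rcx == r.Rip &&
		r.Eflags == r.R11 &&
		r.Eflags&eflagsRF == 0 &&
		r.Eflags&eflagsVM == 0 &&
		r.Cs == userCS && r.Ss == userDS
	return !fast
}

func main() {
	r := regs{Rcx: 0x401000, Rip: 0x401000, Eflags: 0x202, R11: 0x202, Cs: userCS, Ss: userDS}
	fmt.Println(fullRestore(r)) // false: SYSRET suffices.
	r.Rip = 0x402000            // e.g. IP rewritten by a tracer.
	fmt.Println(fullRestore(r)) // true: restore all registers.
}
```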
diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go
new file mode 100644
index 000000000..70e0e35b7
--- /dev/null
+++ b/pkg/sentry/arch/auxv.go
@@ -0,0 +1,28 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// An AuxEntry represents an entry in an ELF auxiliary vector.
+type AuxEntry struct {
+	Key   uint64
+	Value usermem.Addr
+}
+
+// An Auxv represents an ELF auxiliary vector.
+type Auxv []AuxEntry
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto
new file mode 100644
index 000000000..437ff44ca
--- /dev/null
+++ b/pkg/sentry/arch/registers.proto
@@ -0,0 +1,55 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package gvisor;
+
+message AMD64Registers {
+  uint64 rax = 1;
+  uint64 rbx = 2;
+  uint64 rcx = 3;
+  uint64 rdx = 4;
+  uint64 rsi = 5;
+  uint64 rdi = 6;
+  uint64 rsp = 7;
+  uint64 rbp = 8;
+
+  uint64 r8 = 9;
+  uint64 r9 = 10;
+  uint64 r10 = 11;
+  uint64 r11 = 12;
+  uint64 r12 = 13;
+  uint64 r13 = 14;
+  uint64 r14 = 15;
+  uint64 r15 = 16;
+
+  uint64 rip = 17;
+  uint64 rflags = 18;
+  uint64 orig_rax = 19;
+  uint64 cs = 20;
+  uint64 ds = 21;
+  uint64 es = 22;
+  uint64 fs = 23;
+  uint64 gs = 24;
+  uint64 ss = 25;
+  uint64 fs_base = 26;
+  uint64 gs_base = 27;
+}
+
+message Registers {
+  oneof arch {
+    AMD64Registers amd64 = 1;
+  }
+}
diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go
new file mode 100644
index 000000000..36437b965
--- /dev/null
+++ b/pkg/sentry/arch/signal_act.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+// Special values for SignalAct.Handler.
+const (
+	// SignalActDefault is SIG_DFL and specifies that the default
+	// behavior for a signal should be taken.
+	SignalActDefault = 0
+
+	// SignalActIgnore is SIG_IGN and specifies that a signal should be
+	// ignored.
+	SignalActIgnore = 1
+)
+
+// Available signal flags.
+const (
+	SignalFlagNoCldStop    = 0x00000001
+	SignalFlagNoCldWait    = 0x00000002
+	SignalFlagSigInfo      = 0x00000004
+	SignalFlagRestorer     = 0x04000000
+	SignalFlagOnStack      = 0x08000000
+	SignalFlagRestart      = 0x10000000
+	SignalFlagInterrupt    = 0x20000000
+	SignalFlagNoDefer      = 0x40000000
+	SignalFlagResetHandler = 0x80000000
+)
+
+// IsSigInfo returns true iff this handler expects siginfo.
+func (s SignalAct) IsSigInfo() bool {
+	return s.Flags&SignalFlagSigInfo != 0
+}
+
+// IsNoDefer returns true iff this SignalAct has the NoDefer flag set.
+func (s SignalAct) IsNoDefer() bool {
+	return s.Flags&SignalFlagNoDefer != 0
+}
+
+// IsRestart returns true iff this SignalAct has the Restart flag set.
+func (s SignalAct) IsRestart() bool {
+	return s.Flags&SignalFlagRestart != 0
+}
+
+// IsResetHandler returns true iff this SignalAct has the ResetHandler flag
+// set.
+func (s SignalAct) IsResetHandler() bool {
+	return s.Flags&SignalFlagResetHandler != 0
+}
+
+// IsOnStack returns true iff this SignalAct has the OnStack flag set.
+func (s SignalAct) IsOnStack() bool {
+	return s.Flags&SignalFlagOnStack != 0
+}
+
+// HasRestorer returns true iff this SignalAct has the Restorer flag set.
+func (s SignalAct) HasRestorer() bool {
+	return s.Flags&SignalFlagRestorer != 0
+}
+
+// NativeSignalAct is a type that is equivalent to struct sigaction in the
+// guest architecture.
+type NativeSignalAct interface {
+	// SerializeFrom copies the data in the host SignalAct s into this
+	// object.
+	SerializeFrom(s *SignalAct)
+
+	// DeserializeTo copies the data in this object into the host
+	// SignalAct s.
+	DeserializeTo(s *SignalAct)
+}
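The Is*/Has* predicates above are plain bitmask tests against sigaction flags. A tiny sketch using the same flag values (hypothetical demo, not part of this change):

```go
package main

import "fmt"

const (
	SignalFlagSigInfo = 0x00000004
	SignalFlagRestart = 0x10000000
)

type SignalAct struct{ Flags uint64 }

func (s SignalAct) IsSigInfo() bool { return s.Flags&SignalFlagSigInfo != 0 }
func (s SignalAct) IsRestart() bool { return s.Flags&SignalFlagRestart != 0 }

func main() {
	// Equivalent to sigaction with SA_SIGINFO|SA_RESTART set.
	act := SignalAct{Flags: SignalFlagSigInfo | SignalFlagRestart}
	fmt.Println(act.IsSigInfo(), act.IsRestart()) // true true
}
```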
+ +// +build amd64 + +package arch + +import ( + "encoding/binary" + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// SignalAct represents the action that should be taken when a signal is +// delivered, and is equivalent to struct sigaction on 64-bit x86. +type SignalAct struct { + Handler uint64 + Flags uint64 + Restorer uint64 + Mask linux.SignalSet +} + +// SerializeFrom implements NativeSignalAct.SerializeFrom. +func (s *SignalAct) SerializeFrom(other *SignalAct) { + *s = *other +} + +// DeserializeTo implements NativeSignalAct.DeserializeTo. +func (s *SignalAct) DeserializeTo(other *SignalAct) { + *other = *s +} + +// SignalStack represents information about a user stack, and is equivalent to +// stack_t on 64-bit x86. +type SignalStack struct { + Addr uint64 + Flags uint32 + _ uint32 + Size uint64 +} + +// SerializeFrom implements NativeSignalStack.SerializeFrom. +func (s *SignalStack) SerializeFrom(other *SignalStack) { + *s = *other +} + +// DeserializeTo implements NativeSignalStack.DeserializeTo. +func (s *SignalStack) DeserializeTo(other *SignalStack) { + *other = *s +} + +// SignalInfo represents information about a signal being delivered, and is +// equivalent to struct siginfo on 64-bit x86. +type SignalInfo struct { + Signo int32 // Signal number + Errno int32 // Errno value + Code int32 // Signal code + _ uint32 + + // struct siginfo::_sifields is a union. In SignalInfo, fields in the union + // are accessed through methods. + // + // For reference, here is the definition of _sifields: (_sigfault._trapno, + // which does not exist on x86, omitted for clarity) + // + // union { + // int _pad[SI_PAD_SIZE]; + // + // /* kill() */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // } _kill; + // + // /* POSIX.1b timers */ + // struct { + // __kernel_timer_t _tid; /* timer id */ + // int _overrun; /* overrun count */ + // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; + // sigval_t _sigval; /* same as below */ + // int _sys_private; /* not to be passed to user */ + // } _timer; + // + // /* POSIX.1b signals */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // sigval_t _sigval; + // } _rt; + // + // /* SIGCHLD */ + // struct { + // __kernel_pid_t _pid; /* which child */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // int _status; /* exit code */ + // __ARCH_SI_CLOCK_T _utime; + // __ARCH_SI_CLOCK_T _stime; + // } _sigchld; + // + // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + // struct { + // void *_addr; /* faulting insn/memory ref. */ + // short _addr_lsb; /* LSB of the reported address */ + // } _sigfault; + // + // /* SIGPOLL */ + // struct { + // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + // int _fd; + // } _sigpoll; + // + // /* SIGSYS */ + // struct { + // void *_call_addr; /* calling user insn */ + // int _syscall; /* triggering system call number */ + // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + // } _sigsys; + // } _sifields; + // + // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 + // bytes. + Fields [128 - 16]byte +} + +// FixSignalCodeForUser fixes up si_code. +// +// The si_code we get from Linux may contain the kernel-specific code in the +// top 16 bits if it's positive (e.g., from ptrace). 
Linux's +// copy_siginfo_to_user does +// err |= __put_user((short)from->si_code, &to->si_code); +// to mask out those bits and we need to do the same. +func (s *SignalInfo) FixSignalCodeForUser() { + if s.Code > 0 { + s.Code &= 0x0000ffff + } +} + +// Pid returns the si_pid field. +func (s *SignalInfo) Pid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[0:4])) +} + +// SetPid mutates the si_pid field. +func (s *SignalInfo) SetPid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) +} + +// Uid returns the si_uid field. +func (s *SignalInfo) Uid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) +} + +// SetUid mutates the si_uid field. +func (s *SignalInfo) SetUid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) +} + +// Addr returns the si_addr field. +func (s *SignalInfo) Addr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetAddr sets the si_addr field. +func (s *SignalInfo) SetAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Status returns the si_status field. +func (s *SignalInfo) Status() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetStatus mutates the si_status field. +func (s *SignalInfo) SetStatus(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// CallAddr returns the si_call_addr field. +func (s *SignalInfo) CallAddr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetCallAddr mutates the si_call_addr field. +func (s *SignalInfo) SetCallAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Syscall returns the si_syscall field. +func (s *SignalInfo) Syscall() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetSyscall mutates the si_syscall field. +func (s *SignalInfo) SetSyscall(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// Arch returns the si_arch field. +func (s *SignalInfo) Arch() uint32 { + return usermem.ByteOrder.Uint32(s.Fields[12:16]) +} + +// SetArch mutates the si_arch field. +func (s *SignalInfo) SetArch(val uint32) { + usermem.ByteOrder.PutUint32(s.Fields[12:16], val) +} + +// SignalContext64 is equivalent to struct sigcontext, the type passed as the +// second argument to signal handlers set by signal(2). +type SignalContext64 struct { + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + Rdi uint64 + Rsi uint64 + Rbp uint64 + Rbx uint64 + Rdx uint64 + Rax uint64 + Rcx uint64 + Rsp uint64 + Rip uint64 + Eflags uint64 + Cs uint16 + Gs uint16 // always 0 on amd64. + Fs uint16 // always 0 on amd64. + Ss uint16 // only restored if _UC_STRICT_RESTORE_SS (unsupported). + Err uint64 + Trapno uint64 + Oldmask linux.SignalSet + Cr2 uint64 + // Pointer to a struct _fpstate. + Fpstate uint64 + Reserved [8]uint64 +} + +// Flags for UContext64.Flags. +const ( + _UC_FP_XSTATE = 1 + _UC_SIGCONTEXT_SS = 2 + _UC_STRICT_RESTORE_SS = 4 +) + +// UContext64 is equivalent to ucontext_t on 64-bit x86. +type UContext64 struct { + Flags uint64 + Link uint64 + Stack SignalStack + MContext SignalContext64 + Sigset linux.SignalSet +} + +// NewSignalAct implements Context.NewSignalAct. +func (c *context64) NewSignalAct() NativeSignalAct { + return &SignalAct{} +} + +// NewSignalStack implements Context.NewSignalStack. 
+func (c *context64) NewSignalStack() NativeSignalStack { + return &SignalStack{} +} + +// From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the +// size of the magic cookie at the end of the xsave frame. +// +// NOTE: Currently we don't actually populate the fpstate +// on the signal stack. +const _FP_XSTATE_MAGIC2_SIZE = 4 + +func (c *context64) fpuFrameSize() (size int, useXsave bool) { + size = len(c.x86FPState) + if size > 512 { + // Make room for the magic cookie at the end of the xsave frame. + size += _FP_XSTATE_MAGIC2_SIZE + useXsave = true + } + return size, useXsave +} + +// SignalSetup implements Context.SignalSetup. (Compare to Linux's +// arch/x86/kernel/signal.c:__setup_rt_frame().) +func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error { + sp := st.Bottom + + // "The 128-byte area beyond the location pointed to by %rsp is considered + // to be reserved and shall not be modified by signal or interrupt + // handlers. ... leaf functions may use this area for their entire stack + // frame, rather than adjusting the stack pointer in the prologue and + // epilogue." - AMD64 ABI + // + // (But this doesn't apply if we're starting at the top of the signal + // stack, in which case there is no following stack frame.) + if !(alt.IsEnabled() && sp == alt.Top()) { + sp -= 128 + } + + // Allocate space for floating point state on the stack. + // + // This isn't strictly necessary because we don't actually populate + // the fpstate. However we do store the floating point state of the + // interrupted thread inside the sentry. Simply accounting for this + // space on the user stack naturally caps the amount of memory the + // sentry will allocate for this purpose. + fpSize, _ := c.fpuFrameSize() + sp = (sp - usermem.Addr(fpSize)) & ^usermem.Addr(63) + + // Construct the UContext64 now since we need its size. + uc := &UContext64{ + // No _UC_FP_XSTATE: see Fpstate above. + // No _UC_STRICT_RESTORE_SS: we don't allow SS changes. + Flags: _UC_SIGCONTEXT_SS, + Stack: *alt, + MContext: SignalContext64{ + R8: c.Regs.R8, + R9: c.Regs.R9, + R10: c.Regs.R10, + R11: c.Regs.R11, + R12: c.Regs.R12, + R13: c.Regs.R13, + R14: c.Regs.R14, + R15: c.Regs.R15, + Rdi: c.Regs.Rdi, + Rsi: c.Regs.Rsi, + Rbp: c.Regs.Rbp, + Rbx: c.Regs.Rbx, + Rdx: c.Regs.Rdx, + Rax: c.Regs.Rax, + Rcx: c.Regs.Rcx, + Rsp: c.Regs.Rsp, + Rip: c.Regs.Rip, + Eflags: c.Regs.Eflags, + Cs: uint16(c.Regs.Cs), + Ss: uint16(c.Regs.Ss), + Oldmask: sigset, + }, + Sigset: sigset, + } + + // TODO: Set SignalContext64.Err, Trapno, and Cr2 based on + // the fault that caused the signal. For now, leave Err and Trapno + // unset and assume CR2 == info.Addr() for SIGSEGVs and SIGBUSes. + if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { + uc.MContext.Cr2 = info.Addr() + } + + // "... the value (%rsp+8) is always a multiple of 16 (...) when control is + // transferred to the function entry point." - AMD64 ABI + ucSize := binary.Size(uc) + if ucSize < 0 { + // This can only happen if we've screwed up the definition of + // UContext64. + panic("can't get size of UContext64") + } + // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. + frameSize := int(st.Arch.Width()) + ucSize + 128 + frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 + sp = frameBottom + usermem.Addr(frameSize) + st.Bottom = sp + + info.FixSignalCodeForUser() + + // Set up the stack frame. 
+ infoAddr, err := st.Push(info) + if err != nil { + return err + } + ucAddr, err := st.Push(uc) + if err != nil { + return err + } + if act.HasRestorer() { + // Push the restorer return address. + // Note that this doesn't need to be popped. + if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil { + return err + } + } else { + // amd64 requires a restorer. + return syscall.EFAULT + } + + // Set up registers. + c.Regs.Rip = act.Handler + c.Regs.Rsp = uint64(st.Bottom) + c.Regs.Rdi = uint64(info.Signo) + c.Regs.Rsi = uint64(infoAddr) + c.Regs.Rdx = uint64(ucAddr) + c.Regs.Rax = 0 + c.Regs.Ds = userDS + c.Regs.Es = userDS + c.Regs.Cs = userCS + c.Regs.Ss = userDS + + // Save the thread's floating point state. + c.sigFPState = append(c.sigFPState, c.x86FPState) + + // Signal handler gets a clean floating point state. + c.x86FPState = newX86FPState() + + return nil +} + +// SignalRestore implements Context.SignalRestore. (Compare to Linux's +// arch/x86/kernel/signal.c:sys_rt_sigreturn().) +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) { + // Copy the stack frame out of user memory. + var uc UContext64 + if _, err := st.Pop(&uc); err != nil { + return 0, err + } + var info SignalInfo + if _, err := st.Pop(&info); err != nil { + return 0, err + } + + // Restore registers. + c.Regs.R8 = uc.MContext.R8 + c.Regs.R9 = uc.MContext.R9 + c.Regs.R10 = uc.MContext.R10 + c.Regs.R11 = uc.MContext.R11 + c.Regs.R12 = uc.MContext.R12 + c.Regs.R13 = uc.MContext.R13 + c.Regs.R14 = uc.MContext.R14 + c.Regs.R15 = uc.MContext.R15 + c.Regs.Rdi = uc.MContext.Rdi + c.Regs.Rsi = uc.MContext.Rsi + c.Regs.Rbp = uc.MContext.Rbp + c.Regs.Rbx = uc.MContext.Rbx + c.Regs.Rdx = uc.MContext.Rdx + c.Regs.Rax = uc.MContext.Rax + c.Regs.Rcx = uc.MContext.Rcx + c.Regs.Rsp = uc.MContext.Rsp + c.Regs.Rip = uc.MContext.Rip + c.Regs.Eflags = (c.Regs.Eflags & ^eflagsRestorable) | (uc.MContext.Eflags & eflagsRestorable) + c.Regs.Cs = uint64(uc.MContext.Cs) | 3 + // N.B. _UC_STRICT_RESTORE_SS not supported. + c.Regs.Orig_rax = math.MaxUint64 + + // Restore floating point state. + l := len(c.sigFPState) + if l > 0 { + c.x86FPState = c.sigFPState[l-1] + // NOTE: State save requires that any slice + // elements in [len:cap] be the zero value. + c.sigFPState[l-1] = nil + c.sigFPState = c.sigFPState[0 : l-1] + } else { + // This might happen if sigreturn(2) calls are unbalanced with + // respect to signal handler entries. This is not expected so + // don't bother to do anything fancy with the floating point + // state. + log.Infof("sigreturn unable to restore application fpstate") + } + + return uc.Sigset, nil +} diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go new file mode 100644 index 000000000..ec004ae75 --- /dev/null +++ b/pkg/sentry/arch/signal_info.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +// Possible values for SignalInfo.Code.
These values originate from the Linux +// kernel's include/uapi/asm-generic/siginfo.h. +const ( + // SignalInfoUser (properly SI_USER) indicates that a signal was sent from + // a kill() or raise() syscall. + SignalInfoUser = 0 + + // SignalInfoKernel (properly SI_KERNEL) indicates that the signal was sent + // by the kernel. + SignalInfoKernel = 0x80 + + // SignalInfoTimer (properly SI_TIMER) indicates that the signal was sent + // by an expired timer. + SignalInfoTimer = -2 + + // SignalInfoTkill (properly SI_TKILL) indicates that the signal was sent + // from a tkill() or tgkill() syscall. + SignalInfoTkill = -6 + + // CLD_* codes are only meaningful for SIGCHLD. + + // CLD_EXITED indicates that a task exited. + CLD_EXITED = 1 + + // CLD_KILLED indicates that a task was killed by a signal. + CLD_KILLED = 2 + + // CLD_DUMPED indicates that a task was killed by a signal and then dumped + // core. + CLD_DUMPED = 3 + + // CLD_TRAPPED indicates that a task was stopped by ptrace. + CLD_TRAPPED = 4 + + // CLD_STOPPED indicates that a thread group completed a group stop. + CLD_STOPPED = 5 + + // CLD_CONTINUED indicates that a group-stopped thread group was continued. + CLD_CONTINUED = 6 + + // SYS_* codes are only meaningful for SIGSYS. + + // SYS_SECCOMP indicates that a signal originates from seccomp. + SYS_SECCOMP = 1 + + // TRAP_* codes are only meaningful for SIGTRAP. + + // TRAP_BRKPT indicates a breakpoint trap. + TRAP_BRKPT = 1 +) diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go new file mode 100644 index 000000000..7c6531d79 --- /dev/null +++ b/pkg/sentry/arch/signal_stack.go @@ -0,0 +1,58 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package arch + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // SignalStackFlagOnStack may be set on return from sigaltstack(2), + // in order to indicate that the thread is currently on the alternate stack. + SignalStackFlagOnStack = 1 + + // SignalStackFlagDisable is a flag to indicate the stack is disabled. + SignalStackFlagDisable = 2 +) + +// IsEnabled returns true iff this signal stack is marked as enabled. +func (s SignalStack) IsEnabled() bool { + return s.Flags&SignalStackFlagDisable == 0 +} + +// Top returns the stack's top address. +func (s SignalStack) Top() usermem.Addr { + return usermem.Addr(s.Addr + s.Size) +} + +// SetOnStack marks this signal stack as in use. (This is only called on copies +// sent to user applications, so there's no corresponding ClearOnStack.) +func (s *SignalStack) SetOnStack() { + s.Flags |= SignalStackFlagOnStack +} + +// NativeSignalStack is a type that is equivalent to stack_t in the guest +// architecture. +type NativeSignalStack interface { + // SerializeFrom copies the data in the host SignalStack s into this + // object. + SerializeFrom(s *SignalStack) + + // DeserializeTo copies the data in this object into the host SignalStack + // s.
+ DeserializeTo(s *SignalStack) +} diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go new file mode 100644 index 000000000..6c1b9be82 --- /dev/null +++ b/pkg/sentry/arch/stack.go @@ -0,0 +1,246 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "encoding/binary" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Stack is a simple wrapper around a usermem.IO and an address. +type Stack struct { + // Our arch info. + // We use this for automatic Native conversion of usermem.Addrs during + // Push() and Pop(). + Arch Context + + // The interface used to actually copy user memory. + IO usermem.IO + + // Our current stack bottom. + Bottom usermem.Addr +} + +// Push pushes the given values onto the stack. +// +// (This method supports Addrs and treats them as native types.) +func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { + for _, v := range vals { + + // We convert some types to well-known serializable quantities. + var norm interface{} + + // For array types, we will automatically add an appropriate + // terminal value. This is done simply to make the interface + // easier to use. + var term interface{} + + switch v.(type) { + case string: + norm = []byte(v.(string)) + term = byte(0) + case []int8, []uint8: + norm = v + term = byte(0) + case []int16, []uint16: + norm = v + term = uint16(0) + case []int32, []uint32: + norm = v + term = uint32(0) + case []int64, []uint64: + norm = v + term = uint64(0) + case []usermem.Addr: + // Special case: simply push recursively. + _, err := s.Push(s.Arch.Native(uintptr(0))) + if err != nil { + return 0, err + } + varr := v.([]usermem.Addr) + for i := len(varr) - 1; i >= 0; i-- { + _, err := s.Push(varr[i]) + if err != nil { + return 0, err + } + } + continue + case usermem.Addr: + norm = s.Arch.Native(uintptr(v.(usermem.Addr))) + default: + norm = v + } + + if term != nil { + _, err := s.Push(term) + if err != nil { + return 0, err + } + } + + c := binary.Size(norm) + if c < 0 { + return 0, fmt.Errorf("bad binary.Size for %T", v) + } + // TODO: Use a real context.Context. + n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) + if err != nil || c != n { + return 0, err + } + + s.Bottom -= usermem.Addr(n) + } + + return s.Bottom, nil +} + +// Pop pops the given values off the stack. +// +// (This method supports Addrs and treats them as native types.) +func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { + for _, v := range vals { + + vaddr, isVaddr := v.(*usermem.Addr) + + var n int + var err error + if isVaddr { + value := s.Arch.Native(uintptr(0)) + // TODO: Use a real context.Context. + n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) + *vaddr = usermem.Addr(s.Arch.Value(value)) + } else { + // TODO: Use a real context.Context.
+ n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) + } + if err != nil { + return 0, err + } + + s.Bottom += usermem.Addr(n) + } + + return s.Bottom, nil +} + +// Align aligns the stack bottom down to a multiple of the given alignment. +func (s *Stack) Align(offset int) { + if s.Bottom%usermem.Addr(offset) != 0 { + s.Bottom -= (s.Bottom % usermem.Addr(offset)) + } +} + +// StackLayout describes the location of the arguments and environment on the +// stack. +type StackLayout struct { + // ArgvStart is the beginning of the argument vector. + ArgvStart usermem.Addr + + // ArgvEnd is the end of the argument vector. + ArgvEnd usermem.Addr + + // EnvvStart is the beginning of the environment vector. + EnvvStart usermem.Addr + + // EnvvEnd is the end of the environment vector. + EnvvEnd usermem.Addr +} + +// Load pushes the given args, env, and aux vector onto the stack using the +// well-known format for a new executable. It returns the start and end +// of the argument and environment vectors. +func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) { + l := StackLayout{} + + // Make sure we start with a 16-byte alignment. + s.Align(16) + + // Push our strings. + l.ArgvEnd = s.Bottom + argAddrs := make([]usermem.Addr, len(args)) + for i := len(args) - 1; i >= 0; i-- { + addr, err := s.Push(args[i]) + if err != nil { + return StackLayout{}, err + } + argAddrs[i] = addr + } + l.ArgvStart = s.Bottom + + // Push our environment. + l.EnvvEnd = s.Bottom + envAddrs := make([]usermem.Addr, len(env)) + for i := len(env) - 1; i >= 0; i-- { + addr, err := s.Push(env[i]) + if err != nil { + return StackLayout{}, err + } + envAddrs[i] = addr + } + l.EnvvStart = s.Bottom + + // We need to align the arguments appropriately. + // + // We must finish on a 16-byte alignment, but we'll play it + // conservatively and finish on a 32-byte alignment. It would be nice + // to be able to call Align here, but unfortunately we need to align + // the stack with all the variable-sized arrays pushed. So we just + // need to do some calculations. + argvSize := s.Arch.Width() * uint(len(args)+1) + envvSize := s.Arch.Width() * uint(len(env)+1) + auxvSize := s.Arch.Width() * 2 * uint(len(aux)+1) + total := usermem.Addr(argvSize) + usermem.Addr(envvSize) + usermem.Addr(auxvSize) + usermem.Addr(s.Arch.Width()) + expectedBottom := s.Bottom - total + if expectedBottom%32 != 0 { + s.Bottom -= expectedBottom % 32 + } + + // Push our auxvec. + // NOTE: We need an extra zero here per spec. + // The Push function will automatically terminate + // strings and arrays with a single null value. + auxv := make([]usermem.Addr, 0, len(aux)) + for _, a := range aux { + auxv = append(auxv, usermem.Addr(a.Key), a.Value) + } + auxv = append(auxv, usermem.Addr(0)) + _, err := s.Push(auxv) + if err != nil { + return StackLayout{}, err + } + + // Push environment. + _, err = s.Push(envAddrs) + if err != nil { + return StackLayout{}, err + } + + // Push args. + _, err = s.Push(argAddrs) + if err != nil { + return StackLayout{}, err + } + + // Push arg count. + _, err = s.Push(usermem.Addr(len(args))) + if err != nil { + return StackLayout{}, err + } + + return l, nil +} diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go new file mode 100644 index 000000000..41d8ba0d1 --- /dev/null +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -0,0 +1,52 @@ +// Copyright 2018 Google Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package arch + +const restartSyscallNr = uintptr(219) + +// SyscallNo returns the syscall number according to the 64-bit convention. +func (c *context64) SyscallNo() uintptr { + return uintptr(c.Regs.Orig_rax) +} + +// SyscallArgs provides syscall arguments according to the 64-bit convention. +// +// Due to the way addresses are mapped for the sentry, this binary *must* be +// built in 64-bit mode, so we can just assume that the syscall numbers that +// come back match the expected host system call numbers. +func (c *context64) SyscallArgs() SyscallArguments { + return SyscallArguments{ + SyscallArgument{Value: uintptr(c.Regs.Rdi)}, + SyscallArgument{Value: uintptr(c.Regs.Rsi)}, + SyscallArgument{Value: uintptr(c.Regs.Rdx)}, + SyscallArgument{Value: uintptr(c.Regs.R10)}, + SyscallArgument{Value: uintptr(c.Regs.R8)}, + SyscallArgument{Value: uintptr(c.Regs.R9)}, + } +} + +// RestartSyscall implements Context.RestartSyscall. +func (c *context64) RestartSyscall() { + c.Regs.Rip -= SyscallWidth + c.Regs.Rax = c.Regs.Orig_rax +} + +// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock. +func (c *context64) RestartSyscallWithRestartBlock() { + c.Regs.Rip -= SyscallWidth + c.Regs.Rax = uint64(restartSyscallNr) +}
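For reference, SyscallArgs follows the standard Linux amd64 syscall convention: the syscall number travels in %rax (saved by the kernel as Orig_rax), the six arguments in %rdi, %rsi, %rdx, %r10, %r8, and %r9, and the return value in %rax. A minimal sketch of the same register-to-argument mapping over a stand-in register struct (the names below are hypothetical, not the sentry's types):

	package main

	import "fmt"

	// regs holds just the fields the convention uses.
	type regs struct {
		Rdi, Rsi, Rdx, R10, R8, R9, OrigRax uint64
	}

	// args extracts the six syscall arguments in ABI order.
	func args(r regs) [6]uint64 {
		return [6]uint64{r.Rdi, r.Rsi, r.Rdx, r.R10, r.R8, r.R9}
	}

	func main() {
		// write(1, buf, 5): syscall number 1 on Linux/amd64.
		r := regs{OrigRax: 1, Rdi: 1, Rsi: 0x7f0000000000, Rdx: 5}
		fmt.Println(r.OrigRax, args(r))
	}

The restart helpers work because the amd64 syscall instruction is two bytes (0x0f 0x05): rewinding Rip by SyscallWidth re-executes it, with Rax restored either to the original number (RestartSyscall) or to restart_syscall's number, 219 (RestartSyscallWithRestartBlock).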