author    | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000
commit    | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree      | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform/ptrace
parent    | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent    | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)

Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform/ptrace')
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace.go                        | 238
-rwxr-xr-x | pkg/sentry/platform/ptrace/ptrace_state_autogen.go          |   4
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_unsafe.go                 | 166
-rw-r--r-- | pkg/sentry/platform/ptrace/stub_amd64.s                     | 114
-rw-r--r-- | pkg/sentry/platform/ptrace/stub_unsafe.go                   |  98
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess.go                    | 610
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_amd64.go              | 104
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_linux.go              | 338
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 109
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_unsafe.go             |  33
10 files changed, 1814 insertions, 0 deletions
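
Before the file-by-file diff, one orienting note: the heart of this platform is an OS thread driving a stopped tracee through ptrace requests. The sketch below (illustrative only, not part of the commit; it uses only the Go standard library and elides most error handling) shows that primitive in isolation, loosely mirroring what thread.attach, thread.getRegs, and thread.detach do in subprocess.go and ptrace_unsafe.go:

package main

import (
	"fmt"
	"os/exec"
	"runtime"
	"syscall"
)

func main() {
	// ptrace requests must be issued from the tracing OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Launch a child traced from birth; it stops at execve, much as the
	// stub below SIGSTOPs itself to wait for its tracer.
	cmd := exec.Command("/bin/sleep", "60")
	cmd.SysProcAttr = &syscall.SysProcAttr{Ptrace: true}
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	defer cmd.Process.Kill()
	pid := cmd.Process.Pid

	// Wait for the stop, as thread.wait does with wait4.
	var status syscall.WaitStatus
	if _, err := syscall.Wait4(pid, &status, 0, nil); err != nil {
		panic(err)
	}

	// Inspect registers, as thread.getRegs does with PTRACE_GETREGS.
	var regs syscall.PtraceRegs
	if err := syscall.PtraceGetRegs(pid, &regs); err != nil {
		panic(err)
	}
	fmt.Printf("stopped=%v rip=%#x\n", status.Stopped(), regs.Rip)

	// Release the child, as thread.detach does with PTRACE_DETACH.
	if err := syscall.PtraceDetach(pid); err != nil {
		panic(err)
	}
}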
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
new file mode 100644
index 000000000..6a890dd81
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -0,0 +1,238 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ptrace provides a ptrace-based implementation of the platform
+// interface. This is useful primarily for development and testing, and
+// runs on stock kernels without special permissions.
+//
+// In a nutshell, it works as follows:
+//
+// The creation of a new address space creates a new child process with a
+// single thread, which is traced by a single goroutine.
+//
+// A context is just a collection of temporary variables. Calling Switch on a
+// context does the following:
+//
+// Locks the runtime thread.
+//
+// Looks up a traced subprocess thread for the current runtime thread. If
+// none exists, the dedicated goroutine is asked to create a new stopped
+// thread in the subprocess. This stopped subprocess thread is then traced
+// by the current thread and this information is stored for subsequent
+// switches.
+//
+// The context is then bound with information about the subprocess thread
+// so that the context may be appropriately interrupted via a signal.
+//
+// The requested operation is performed in the traced subprocess thread
+// (e.g. set registers, execute, return).
+//
+// Lock order:
+//
+//	subprocess.mu
+//	  context.mu
+package ptrace
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+var (
+	// stubStart is the link address for our stub, and determines the
+	// maximum user address. This is valid only after a call to stubInit.
+	//
+	// We attempt to link the stub here, and adjust downward as needed.
+	stubStart uintptr = 0x7fffffff0000
+
+	// stubEnd is the first byte past the end of the stub; as with
+	// stubStart, this is valid only after a call to stubInit.
+	stubEnd uintptr
+
+	// stubInitialized controls one-time stub initialization.
+	stubInitialized sync.Once
+)
+
+type context struct {
+	// signalInfo is the signal info, if and when a signal is received.
+	signalInfo arch.SignalInfo
+
+	// interrupt is the interrupt context.
+	interrupt interrupt.Forwarder
+
+	// mu protects the following fields.
+	mu sync.Mutex
+
+	// If lastFaultSP is non-nil, the last context switch was due to a fault
+	// received while executing lastFaultSP. Only context.Switch may set
+	// lastFaultSP to a non-nil value.
+	lastFaultSP *subprocess
+
+	// lastFaultAddr is the last faulting address; this is only meaningful if
+	// lastFaultSP is non-nil.
+	lastFaultAddr usermem.Addr
+
+	// lastFaultIP is the address of the last faulting instruction;
+	// this is also only meaningful if lastFaultSP is non-nil.
+ lastFaultIP usermem.Addr +} + +// Switch runs the provided context in the given address space. +func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { + s := as.(*subprocess) + isSyscall := s.switchToApp(c, ac) + + var ( + faultSP *subprocess + faultAddr usermem.Addr + faultIP usermem.Addr + ) + if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { + faultSP = s + faultAddr = usermem.Addr(c.signalInfo.Addr()) + faultIP = usermem.Addr(ac.IP()) + } + + // Update the context to reflect the outcome of this context switch. + c.mu.Lock() + lastFaultSP := c.lastFaultSP + lastFaultAddr := c.lastFaultAddr + lastFaultIP := c.lastFaultIP + // At this point, c may not yet be in s.contexts, so c.lastFaultSP won't be + // updated by s.Unmap(). This is fine; we only need to synchronize with + // calls to s.Unmap() that occur after the handling of this fault. + c.lastFaultSP = faultSP + c.lastFaultAddr = faultAddr + c.lastFaultIP = faultIP + c.mu.Unlock() + + // Update subprocesses to reflect the outcome of this context switch. + if lastFaultSP != faultSP { + if lastFaultSP != nil { + lastFaultSP.mu.Lock() + delete(lastFaultSP.contexts, c) + lastFaultSP.mu.Unlock() + } + if faultSP != nil { + faultSP.mu.Lock() + faultSP.contexts[c] = struct{}{} + faultSP.mu.Unlock() + } + } + + if isSyscall { + return nil, usermem.NoAccess, nil + } + + si := c.signalInfo + + if faultSP == nil { + // Non-fault signal. + return &si, usermem.NoAccess, platform.ErrContextSignal + } + + // Got a page fault. Ideally, we'd get real fault type here, but ptrace + // doesn't expose this information. Instead, we use a simple heuristic: + // + // It was an instruction fault iff the faulting addr == instruction + // pointer. + // + // It was a write fault if the fault is immediately repeated. + at := usermem.Read + if faultAddr == faultIP { + at.Execute = true + } + if lastFaultSP == faultSP && + lastFaultAddr == faultAddr && + lastFaultIP == faultIP { + at.Write = true + } + + // Unfortunately, we have to unilaterally return ErrContextSignalCPUID + // here, in case this fault was generated by a CPUID exception. There + // is no way to distinguish between CPUID-generated faults and regular + // page faults. + return &si, at, platform.ErrContextSignalCPUID +} + +// Interrupt interrupts the running guest application associated with this context. +func (c *context) Interrupt() { + c.interrupt.NotifyInterrupt() +} + +// PTrace represents a collection of ptrace subprocesses. +type PTrace struct { + platform.MMapMinAddr + platform.NoCPUPreemptionDetection +} + +// New returns a new ptrace-based implementation of the platform interface. +func New() (*PTrace, error) { + stubInitialized.Do(func() { + // Initialize the stub. + stubInit() + + // Create the master process for the global pool. This must be + // done before initializing any other processes. + master, err := newSubprocess(createStub) + if err != nil { + // Should never happen. + panic("unable to initialize ptrace master: " + err.Error()) + } + + // Set the master on the globalPool. + globalPool.master = master + }) + + return &PTrace{}, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. +func (*PTrace) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 
+func (*PTrace) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*PTrace) MapUnit() uint64 { + // The host kernel manages page tables and arbitrary-sized mappings + // have effectively the same cost. + return 0 +} + +// MaxUserAddress returns the first address that may not be used by user +// applications. +func (*PTrace) MaxUserAddress() usermem.Addr { + return usermem.Addr(stubStart) +} + +// NewAddressSpace returns a new subprocess. +func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { + as, err := newSubprocess(globalPool.master.createStub) + return as, nil, err +} + +// NewContext returns an interruptible context. +func (*PTrace) NewContext() platform.Context { + return &context{} +} diff --git a/pkg/sentry/platform/ptrace/ptrace_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go new file mode 100755 index 000000000..ac83a71e7 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package ptrace + diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go new file mode 100644 index 000000000..585f6c1fb --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -0,0 +1,166 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// GETREGSET/SETREGSET register set types. +// +// See include/uapi/linux/elf.h. +const ( + // _NT_PRFPREG is for x86 floating-point state without using xsave. + _NT_PRFPREG = 0x2 + + // _NT_X86_XSTATE is for x86 extended state using xsave. + _NT_X86_XSTATE = 0x202 +) + +// fpRegSet returns the GETREGSET/SETREGSET register set type to be used. +func fpRegSet(useXsave bool) uintptr { + if useXsave { + return _NT_X86_XSTATE + } + return _NT_PRFPREG +} + +// getRegs sets the regular register set. +func (t *thread) getRegs(regs *syscall.PtraceRegs) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETREGS, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(regs)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// setRegs sets the regular register set. +func (t *thread) setRegs(regs *syscall.PtraceRegs) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETREGS, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(regs)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// getFPRegs gets the floating-point data via the GETREGSET ptrace syscall. 
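+//
+// Note that PTRACE_GETREGSET takes a struct iovec: the kernel fills the
+// buffer and rewrites the iovec length to the number of bytes it actually
+// wrote, so fpLen should be the full size of the XSAVE area when useXsave
+// is set (see switchToApp, which derives it from the CPUID feature set).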
+func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { + iovec := syscall.Iovec{ + Base: (*byte)(fpState), + Len: fpLen, + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETREGSET, + uintptr(t.tid), + fpRegSet(useXsave), + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// setFPRegs sets the floating-point data via the SETREGSET ptrace syscall. +func (t *thread) setFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { + iovec := syscall.Iovec{ + Base: (*byte)(fpState), + Len: fpLen, + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETREGSET, + uintptr(t.tid), + fpRegSet(useXsave), + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// getSignalInfo retrieves information about the signal that caused the stop. +func (t *thread) getSignalInfo(si *arch.SignalInfo) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETSIGINFO, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(si)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// clone creates a new thread from this one. +// +// The returned thread will be stopped and available for any system thread to +// call attach on it. +// +// Precondition: the OS thread must be locked and own t. +func (t *thread) clone() (*thread, error) { + r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp() + if !ok { + return nil, syscall.EINVAL + } + rval, err := t.syscallIgnoreInterrupt( + &t.initRegs, + syscall.SYS_CLONE, + arch.SyscallArgument{Value: uintptr( + syscall.CLONE_FILES | + syscall.CLONE_FS | + syscall.CLONE_SIGHAND | + syscall.CLONE_THREAD | + syscall.CLONE_PTRACE | + syscall.CLONE_VM)}, + // The stack pointer is just made up, but we have it be + // something sensible so the kernel doesn't think we're + // up to no good. Which we are. + arch.SyscallArgument{Value: uintptr(r)}, + arch.SyscallArgument{}, + arch.SyscallArgument{}, + // We use these registers initially, but really they + // could be anything. We're going to stop immediately. + arch.SyscallArgument{Value: uintptr(unsafe.Pointer(&t.initRegs))}) + if err != nil { + return nil, err + } + + return &thread{ + tgid: t.tgid, + tid: int32(rval), + cpu: ^uint32(0), + }, nil +} diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s new file mode 100644 index 000000000..64c718d21 --- /dev/null +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -0,0 +1,114 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +#define SYS_GETPID 39 +#define SYS_EXIT 60 +#define SYS_KILL 62 +#define SYS_GETPPID 110 +#define SYS_PRCTL 157 + +#define SIGKILL 9 +#define SIGSTOP 19 + +#define PR_SET_PDEATHSIG 1 + +// stub bootstraps the child and sends itself SIGSTOP to wait for attach. +// +// R15 contains the expected PPID. 
R15 is used instead of a more typical DI +// since syscalls will clobber DI and createStub wants to pass a new PPID to +// grandchildren. +// +// This should not be used outside the context of a new ptrace child (as the +// function is otherwise a bunch of nonsense). +TEXT ·stub(SB),NOSPLIT,$0 +begin: + // N.B. This loop only executes in the context of a single-threaded + // fork child. + + MOVQ $SYS_PRCTL, AX + MOVQ $PR_SET_PDEATHSIG, DI + MOVQ $SIGKILL, SI + SYSCALL + + CMPQ AX, $0 + JNE error + + // If the parent already died before we called PR_SET_DEATHSIG then + // we'll have an unexpected PPID. + MOVQ $SYS_GETPPID, AX + SYSCALL + + CMPQ AX, $0 + JL error + + CMPQ AX, R15 + JNE parent_dead + + MOVQ $SYS_GETPID, AX + SYSCALL + + CMPQ AX, $0 + JL error + + // SIGSTOP to wait for attach. + // + // The SYSCALL instruction will be used for future syscall injection by + // thread.syscall. + MOVQ AX, DI + MOVQ $SYS_KILL, AX + MOVQ $SIGSTOP, SI + SYSCALL + + // The tracer may "detach" and/or allow code execution here in three cases: + // + // 1. New (traced) stub threads are explicitly detached by the + // goroutine in newSubprocess. However, they are detached while in + // group-stop, so they do not execute code here. + // + // 2. If a tracer thread exits, it implicitly detaches from the stub, + // potentially allowing code execution here. However, the Go runtime + // never exits individual threads, so this case never occurs. + // + // 3. subprocess.createStub clones a new stub process that is untraced, + // thus executing this code. We setup the PDEATHSIG before SIGSTOPing + // ourselves for attach by the tracer. + // + // R15 has been updated with the expected PPID. + JMP begin + +error: + // Exit with -errno. + MOVQ AX, DI + NEGQ DI + MOVQ $SYS_EXIT, AX + SYSCALL + HLT + +parent_dead: + MOVQ $SYS_EXIT, AX + MOVQ $1, DI + SYSCALL + HLT + +// stubCall calls the stub function at the given address with the given PPID. +// +// This is a distinct function because stub, above, may be mapped at any +// arbitrary location, and stub has a specific binary API (see above). +TEXT ·stubCall(SB),NOSPLIT,$0-16 + MOVQ addr+0(FP), AX + MOVQ pid+8(FP), R15 + JMP AX diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go new file mode 100644 index 000000000..54d5021a9 --- /dev/null +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -0,0 +1,98 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "reflect" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// stub is defined in arch-specific assembly. +func stub() + +// stubCall calls the stub at the given address with the given pid. +func stubCall(addr, pid uintptr) + +// unsafeSlice returns a slice for the given address and length. 
+func unsafeSlice(addr uintptr, length int) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = addr + sh.Len = length + sh.Cap = length + return +} + +// stubInit initializes the stub. +func stubInit() { + // Grab the existing stub. + stubBegin := reflect.ValueOf(stub).Pointer() + stubLen := int(safecopy.FindEndAddress(stubBegin) - stubBegin) + stubSlice := unsafeSlice(stubBegin, stubLen) + mapLen := uintptr(stubLen) + if offset := mapLen % usermem.PageSize; offset != 0 { + mapLen += usermem.PageSize - offset + } + + for stubStart > 0 { + // Map the target address for the stub. + // + // We don't use FIXED here because we don't want to unmap + // something that may have been there already. We just walk + // down the address space until we find a place where the stub + // can be placed. + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + stubStart, + mapLen, + syscall.PROT_WRITE|syscall.PROT_READ, + syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS, + 0 /* fd */, 0 /* offset */) + if addr != stubStart || errno != 0 { + if addr != 0 { + // Unmap the region we've mapped accidentally. + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, mapLen, 0) + } + + // Attempt to begin at a lower address. + stubStart -= uintptr(usermem.PageSize) + continue + } + + // Copy the stub to the address. + targetSlice := unsafeSlice(addr, stubLen) + copy(targetSlice, stubSlice) + + // Make the stub executable. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MPROTECT, + stubStart, + mapLen, + syscall.PROT_EXEC|syscall.PROT_READ); errno != 0 { + panic("mprotect failed: " + errno.Error()) + } + + // Set the end. + stubEnd = stubStart + mapLen + return + } + + // This will happen only if we exhaust the entire address + // space, and it will take a long, long time. + panic("failed to map stub") +} diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go new file mode 100644 index 000000000..83b43057f --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -0,0 +1,610 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "fmt" + "os" + "runtime" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// globalPool exists to solve two distinct problems: +// +// 1) Subprocesses can't always be killed properly (see Release). +// +// 2) Any seccomp filters that have been installed will apply to subprocesses +// created here. Therefore we use the intermediary (master), which is created +// on initialization of the platform. +var globalPool struct { + mu sync.Mutex + master *subprocess + available []*subprocess +} + +// thread is a traced thread; it is a thread identifier. +// +// This is a convenience type for defining ptrace operations. 
+type thread struct { + tgid int32 + tid int32 + cpu uint32 + + // initRegs are the initial registers for the first thread. + // + // These are used for the register set for system calls. + initRegs syscall.PtraceRegs +} + +// threadPool is a collection of threads. +type threadPool struct { + // mu protects below. + mu sync.Mutex + + // threads is the collection of threads. + // + // This map is indexed by system TID (the calling thread); which will + // be the tracer for the given *thread, and therefore capable of using + // relevant ptrace calls. + threads map[int32]*thread +} + +// lookupOrCreate looks up a given thread or creates one. +// +// newThread will generally be subprocess.newThread. +// +// Precondition: the runtime OS thread must be locked. +func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) *thread { + tp.mu.Lock() + t, ok := tp.threads[currentTID] + if !ok { + // Before creating a new thread, see if we can find a thread + // whose system tid has disappeared. + // + // TODO(b/77216482): Other parts of this package depend on + // threads never exiting. + for origTID, t := range tp.threads { + // Signal zero is an easy existence check. + if err := syscall.Tgkill(syscall.Getpid(), int(origTID), 0); err != nil { + // This thread has been abandoned; reuse it. + delete(tp.threads, origTID) + tp.threads[currentTID] = t + tp.mu.Unlock() + return t + } + } + + // Create a new thread. + t = newThread() + tp.threads[currentTID] = t + } + tp.mu.Unlock() + return t +} + +// subprocess is a collection of threads being traced. +type subprocess struct { + platform.NoAddressSpaceIO + + // requests is used to signal creation of new threads. + requests chan chan *thread + + // sysemuThreads are reserved for emulation. + sysemuThreads threadPool + + // syscallThreads are reserved for syscalls (except clone, which is + // handled in the dedicated goroutine corresponding to requests above). + syscallThreads threadPool + + // mu protects the following fields. + mu sync.Mutex + + // contexts is the set of contexts for which it's possible that + // context.lastFaultSP == this subprocess. + contexts map[*context]struct{} +} + +// newSubprocess returns a useable subprocess. +// +// This will either be a newly created subprocess, or one from the global pool. +// The create function will be called in the latter case, which is guaranteed +// to happen with the runtime thread locked. +func newSubprocess(create func() (*thread, error)) (*subprocess, error) { + // See Release. + globalPool.mu.Lock() + if len(globalPool.available) > 0 { + sp := globalPool.available[len(globalPool.available)-1] + globalPool.available = globalPool.available[:len(globalPool.available)-1] + globalPool.mu.Unlock() + return sp, nil + } + globalPool.mu.Unlock() + + // The following goroutine is responsible for creating the first traced + // thread, and responding to requests to make additional threads in the + // traced process. The process will be killed and reaped when the + // request channel is closed, which happens in Release below. + errChan := make(chan error) + requests := make(chan chan *thread) + go func() { // S/R-SAFE: Platform-related. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Initialize the first thread. + firstThread, err := create() + if err != nil { + errChan <- err + return + } + + // Ready to handle requests. + errChan <- nil + + // Wait for requests to create threads. 
+		for r := range requests {
+			t, err := firstThread.clone()
+			if err != nil {
+				// Should not happen: not recoverable.
+				panic(fmt.Sprintf("error cloning thread: %v", err))
+			}
+
+			// Since the new thread was created with
+			// clone(CLONE_PTRACE), it will begin execution with
+			// SIGSTOP pending and with this thread as its tracer.
+			// (Hopefully nobody tgkilled it with a signal <
+			// SIGSTOP before the SIGSTOP was delivered, in which
+			// case that signal would be delivered before SIGSTOP.)
+			if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+				panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
+			}
+
+			// Detach the thread.
+			t.detach()
+
+			// Return the thread.
+			r <- t
+		}
+
+		// Requests should never be closed.
+		panic("unreachable")
+	}()
+
+	// Wait until error or readiness.
+	if err := <-errChan; err != nil {
+		return nil, err
+	}
+
+	// Ready.
+	sp := &subprocess{
+		requests: requests,
+		sysemuThreads: threadPool{
+			threads: make(map[int32]*thread),
+		},
+		syscallThreads: threadPool{
+			threads: make(map[int32]*thread),
+		},
+		contexts: make(map[*context]struct{}),
+	}
+
+	sp.unmap()
+	return sp, nil
+}
+
+// unmap unmaps non-stub regions of the process.
+//
+// This will panic on failure (which should never happen).
+func (s *subprocess) unmap() {
+	s.Unmap(0, uint64(stubStart))
+	if maximumUserAddress != stubEnd {
+		s.Unmap(usermem.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
+	}
+}
+
+// Release kills the subprocess.
+//
+// Just kidding! We can't safely coordinate the detaching of all the
+// tracees (since the tracers are random runtime threads, and the process
+// won't exit until tracers have been notified).
+//
+// Therefore we simply unmap everything in the subprocess and return it to the
+// globalPool. This has the added benefit of reducing creation time for new
+// subprocesses.
+func (s *subprocess) Release() {
+	go func() { // S/R-SAFE: Platform.
+		s.unmap()
+		globalPool.mu.Lock()
+		globalPool.available = append(globalPool.available, s)
+		globalPool.mu.Unlock()
+	}()
+}
+
+// newThread creates a new traced thread.
+//
+// Precondition: the OS thread must be locked.
+func (s *subprocess) newThread() *thread {
+	// Ask the first thread to create a new one.
+	r := make(chan *thread)
+	s.requests <- r
+	t := <-r
+
+	// Attach the subprocess to this one.
+	t.attach()
+
+	// Return the new thread, which is now bound.
+	return t
+}
+
+// attach attaches to the thread.
+func (t *thread) attach() {
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 {
+		panic(fmt.Sprintf("unable to attach: %v", errno))
+	}
+
+	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
+	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
+	// newSubprocess), so we always expect to see signal-delivery-stop with
+	// SIGSTOP.
+	if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+		panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
+	}
+
+	// Initialize options.
+	t.init()
+
+	// Grab registers.
+	//
+	// Note that we adjust the current register RIP value to be just before
+	// the current system call executed. This depends on the definition of
+	// the stub itself.
+	if err := t.getRegs(&t.initRegs); err != nil {
+		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+	}
+	t.initRegs.Rip -= initRegsRipAdjustment
}
+
+// detach detaches from the thread.
+//
+// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
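+//
+// The full lifecycle, in sketch form (see newSubprocess and attach):
+//
+//	t, _ := firstThread.clone() // CLONE_PTRACE leaves SIGSTOP pending
+//	t.wait(stopped)             // observe the SIGSTOP
+//	t.detach()                  // park the thread in group-stop
+//	// ... later, on whichever OS thread will drive it:
+//	t.attach()                  // PTRACE_ATTACH, wait, PTRACE_SETOPTIONS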
+func (t *thread) detach() { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { + panic(fmt.Sprintf("can't detach new clone: %v", errno)) + } +} + +// waitOutcome is used for wait below. +type waitOutcome int + +const ( + // stopped indicates that the process was stopped. + stopped waitOutcome = iota + + // killed indicates that the process was killed. + killed +) + +// wait waits for a stop event. +// +// Precondition: outcome is a valid waitOutcome. +func (t *thread) wait(outcome waitOutcome) syscall.Signal { + var status syscall.WaitStatus + + for { + r, err := syscall.Wait4(int(t.tid), &status, syscall.WALL|syscall.WUNTRACED, nil) + if err == syscall.EINTR || err == syscall.EAGAIN { + // Wait was interrupted; wait again. + continue + } else if err != nil { + panic(fmt.Sprintf("ptrace wait failed: %v", err)) + } + if int(r) != int(t.tid) { + panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) + } + switch outcome { + case stopped: + if !status.Stopped() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + } + stopSig := status.StopSignal() + if stopSig == 0 { + continue // Spurious stop. + } + if stopSig == syscall.SIGTRAP { + // Re-encode the trap cause the way it's expected. + return stopSig | syscall.Signal(status.TrapCause()<<8) + } + // Not a trap signal. + return stopSig + case killed: + if !status.Exited() && !status.Signaled() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) + } + return syscall.Signal(status.ExitStatus()) + default: + // Should not happen. + panic(fmt.Sprintf("unknown outcome: %v", outcome)) + } + } +} + +// destroy kills the thread. +// +// Note that this should not be used in the general case; the death of threads +// will typically cause the death of the parent. This is a utility method for +// manually created threads. +func (t *thread) destroy() { + t.detach() + syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL)) + t.wait(killed) +} + +// init initializes trace options. +func (t *thread) init() { + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETOPTIONS, + uintptr(t.tid), + 0, + syscall.PTRACE_O_TRACESYSGOOD, + 0, 0) + if errno != 0 { + panic(fmt.Sprintf("ptrace set options failed: %v", errno)) + } +} + +// syscall executes a system call cycle in the traced context. +// +// This is _not_ for use by application system calls, rather it is for use when +// a system call must be injected into the remote context (e.g. mmap, munmap). +// Note that clones are handled separately. +func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { + // Set registers. + if err := t.setRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + + for { + // Execute the syscall instruction. + if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { + // Reached syscall-enter-stop. + break + } else { + // Some other signal caused a thread stop; ignore. + continue + } + } + + // Complete the actual system call. 
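+	// (The PTRACE_SYSCALL loop above left the thread in
+	// syscall-enter-stop; this second PTRACE_SYSCALL lets the kernel
+	// actually execute the call and stop again at syscall-exit-stop.)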
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens + // between syscall-enter-stop and syscall-exit-stop; it happens *after* + // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" + if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { + panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) + } + + // Grab registers. + if err := t.getRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + + return syscallReturnValue(regs) +} + +// syscallIgnoreInterrupt ignores interrupts on the system call thread and +// restarts the syscall if the kernel indicates that should happen. +func (t *thread) syscallIgnoreInterrupt( + initRegs *syscall.PtraceRegs, + sysno uintptr, + args ...arch.SyscallArgument) (uintptr, error) { + for { + regs := createSyscallRegs(initRegs, sysno, args...) + rval, err := t.syscall(®s) + switch err { + case ERESTARTSYS: + continue + case ERESTARTNOINTR: + continue + case ERESTARTNOHAND: + continue + default: + return rval, err + } + } +} + +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +func (t *thread) NotifyInterrupt() { + syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(platform.SignalInterrupt)) +} + +// switchToApp is called from the main SwitchToApp entrypoint. +// +// This function returns true on a system call, false on a signal. +func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { + // Lock the thread for ptrace operations. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Extract floating point state. + fpState := ac.FloatingPointData() + fpLen, _ := ac.FeatureSet().ExtendedStateSize() + useXsave := ac.FeatureSet().UseXsave() + + // Grab our thread from the pool. + currentTID := int32(procid.Current()) + t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread) + + // Reset necessary registers. + regs := &ac.StateData().Regs + t.resetSysemuRegs(regs) + + // Check for interrupts, and ensure that future interrupts will signal t. + if !c.interrupt.Enable(t) { + // Pending interrupt; simulate. + c.signalInfo = arch.SignalInfo{Signo: int32(platform.SignalInterrupt)} + return false + } + defer c.interrupt.Disable() + + // Ensure that the CPU set is bound appropriately; this makes the + // emulation below several times faster, presumably by avoiding + // interprocessor wakeups and by simplifying the schedule. + t.bind() + + // Set registers. + if err := t.setRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err)) + } + if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil { + panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err)) + } + + for { + // Start running until the next system call. + if isSingleStepping(regs) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_PTRACE, + syscall.PTRACE_SYSEMU_SINGLESTEP, + uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) + } + } else { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_PTRACE, + syscall.PTRACE_SYSEMU, + uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) + } + } + + // Wait for the syscall-enter stop. + sig := t.wait(stopped) + + // Refresh all registers. 
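+		// (Registers are refreshed on every stop, whether it is a
+		// syscall, a fault, or a stray signal: each leaves new state
+		// that must be observed before deciding how to proceed.)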
+ if err := t.getRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil { + panic(fmt.Sprintf("ptrace get fpregs failed: %v", err)) + } + + // Is it a system call? + if sig == (syscallEvent | syscall.SIGTRAP) { + // Ensure registers are sane. + updateSyscallRegs(regs) + return true + } else if sig == syscall.SIGSTOP { + // SIGSTOP was delivered to another thread in the same thread + // group, which initiated another group stop. Just ignore it. + continue + } + + // Grab signal information. + if err := t.getSignalInfo(&c.signalInfo); err != nil { + // Should never happen. + panic(fmt.Sprintf("ptrace get signal info failed: %v", err)) + } + + // We have a signal. We verify however, that the signal was + // either delivered from the kernel or from this process. We + // don't respect other signals. + if c.signalInfo.Code > 0 { + // The signal was generated by the kernel. We inspect + // the signal information, and may patch it in order to + // faciliate vsyscall emulation. See patchSignalInfo. + patchSignalInfo(regs, &c.signalInfo) + return false + } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) { + // The signal was generated by this process. That means + // that it was an interrupt or something else that we + // should bail for. Note that we ignore signals + // generated by other processes. + return false + } + } +} + +// syscall executes the given system call without handling interruptions. +func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { + // Grab a thread. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + currentTID := int32(procid.Current()) + t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) + + return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...) +} + +// MapFile implements platform.AddressSpace.MapFile. +func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + var flags int + if precommit { + flags |= syscall.MAP_POPULATE + } + _, err := s.syscall( + syscall.SYS_MMAP, + arch.SyscallArgument{Value: uintptr(addr)}, + arch.SyscallArgument{Value: uintptr(fr.Length())}, + arch.SyscallArgument{Value: uintptr(at.Prot())}, + arch.SyscallArgument{Value: uintptr(flags | syscall.MAP_SHARED | syscall.MAP_FIXED)}, + arch.SyscallArgument{Value: uintptr(f.FD())}, + arch.SyscallArgument{Value: uintptr(fr.Start)}) + return err +} + +// Unmap implements platform.AddressSpace.Unmap. +func (s *subprocess) Unmap(addr usermem.Addr, length uint64) { + ar, ok := addr.ToRange(length) + if !ok { + panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length)) + } + s.mu.Lock() + for c := range s.contexts { + c.mu.Lock() + if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) { + // Forget the last fault so that if c faults again, the fault isn't + // incorrectly reported as a write fault. If this is being called + // due to munmap() of the corresponding vma, handling of the second + // fault will fail anyway. + c.lastFaultSP = nil + delete(s.contexts, c) + } + c.mu.Unlock() + } + s.mu.Unlock() + _, err := s.syscall( + syscall.SYS_MUNMAP, + arch.SyscallArgument{Value: uintptr(addr)}, + arch.SyscallArgument{Value: uintptr(length)}) + if err != nil { + // We never expect this to happen. 
+ panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) + } +} diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go new file mode 100644 index 000000000..77a0e908f --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -0,0 +1,104 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ptrace + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // maximumUserAddress is the largest possible user address. + maximumUserAddress = 0x7ffffffff000 + + // initRegsRipAdjustment is the size of the syscall instruction. + initRegsRipAdjustment = 2 +) + +// Linux kernel errnos which "should never be seen by user programs", but will +// be revealed to ptrace syscall exit tracing. +// +// These constants are used in subprocess.go. +const ( + ERESTARTSYS = syscall.Errno(512) + ERESTARTNOINTR = syscall.Errno(513) + ERESTARTNOHAND = syscall.Errno(514) +) + +// resetSysemuRegs sets up emulation registers. +// +// This should be called prior to calling sysemu. +func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { + regs.Cs = t.initRegs.Cs + regs.Ss = t.initRegs.Ss + regs.Ds = t.initRegs.Ds + regs.Es = t.initRegs.Es + regs.Fs = t.initRegs.Fs + regs.Gs = t.initRegs.Gs +} + +// createSyscallRegs sets up syscall registers. +// +// This should be called to generate registers for a system call. +func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { + // Copy initial registers. + regs := *initRegs + + // Set our syscall number. + regs.Rax = uint64(sysno) + if len(args) >= 1 { + regs.Rdi = args[0].Uint64() + } + if len(args) >= 2 { + regs.Rsi = args[1].Uint64() + } + if len(args) >= 3 { + regs.Rdx = args[2].Uint64() + } + if len(args) >= 4 { + regs.R10 = args[3].Uint64() + } + if len(args) >= 5 { + regs.R8 = args[4].Uint64() + } + if len(args) >= 6 { + regs.R9 = args[5].Uint64() + } + + return regs +} + +// isSingleStepping determines if the registers indicate single-stepping. +func isSingleStepping(regs *syscall.PtraceRegs) bool { + return (regs.Eflags & arch.X86TrapFlag) != 0 +} + +// updateSyscallRegs updates registers after finishing sysemu. +func updateSyscallRegs(regs *syscall.PtraceRegs) { + // Ptrace puts -ENOSYS in rax on syscall-enter-stop. + regs.Rax = regs.Orig_rax +} + +// syscallReturnValue extracts a sensible return from registers. +func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { + rval := int64(regs.Rax) + if rval < 0 { + return 0, syscall.Errno(-rval) + } + return uintptr(rval), nil +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go new file mode 100644 index 000000000..2c07b4ac3 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -0,0 +1,338 @@ +// Copyright 2018 The gVisor Authors. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package ptrace
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+)
+
+const syscallEvent syscall.Signal = 0x80
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is
+// dynamic because kernels have backported the newer behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachedThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
+}
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+	if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+		signalInfo.Signo = int32(linux.SIGSEGV)
+
+		// Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+		// with the si_call_addr field pointing to the current RIP. This field
+		// aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+		// anything there. We do need to unwind emulation however, so we set the
+		// instruction pointer to the faulting value, and "unpop" the stack.
+		regs.Rip = signalInfo.Addr()
+		regs.Rsp -= 8
+	}
+}
+
+// createStub creates a fresh stub process.
+//
+// Precondition: the runtime OS thread must be locked.
func createStub() (*thread, error) {
+	// The exact interactions of ptrace and seccomp are complex, and
+	// changed in recent kernel versions. Before commit 93e35efb8de45, the
+	// seccomp check is done before the ptrace emulation check. This means
+	// that any calls not matching this list will trigger the seccomp
+	// default action instead of notifying ptrace.
+	//
+	// After commit 93e35efb8de45, the seccomp check is done after the
+	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
+	// will never run for emulation. Seccomp will only run for injected
+	// system calls, and thus we can use RET_KILL as our violation action.
+	var defaultAction linux.BPFAction
+	if probeSeccomp() {
+		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+		defaultAction = linux.SECCOMP_RET_KILL_THREAD
+	} else {
+		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+		defaultAction = linux.SECCOMP_RET_ALLOW
+	}
+
+	// When creating the new child process, we specify SIGKILL as the
+	// signal to deliver when the child exits. We never expect a subprocess
+	// to exit; they are pooled and reused. This is done to ensure that if
+	// a subprocess is OOM-killed, this process (and all other stubs,
+	// transitively) will be killed as well. It's simply not possible to
+	// safely handle a single stub getting killed: the exact state of
+	// execution is unknown and not recoverable.
+	return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
+	// Create a BPF program that allows only the system calls needed by the
+	// stub and all its children. This is used to create child stubs
+	// (below), so we must include the ability to fork, but otherwise lock
+	// down available calls only to what is needed.
+	rules := []seccomp.RuleSet{
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				309: {}, // SYS_GETCPU.
+			},
+			Action:   linux.SECCOMP_RET_TRAP,
+			Vsyscall: true,
+		},
+	}
+	if defaultAction != linux.SECCOMP_RET_ALLOW {
+		rules = append(rules, seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_CLONE: []seccomp.Rule{
+					// Allow creation of new subprocesses (used by the master).
+					{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+					// Allow creation of new threads within a single address space (used by address spaces).
+					{seccomp.AllowValue(
+						syscall.CLONE_FILES |
+							syscall.CLONE_FS |
+							syscall.CLONE_SIGHAND |
+							syscall.CLONE_THREAD |
+							syscall.CLONE_PTRACE |
+							syscall.CLONE_VM)},
+				},
+
+				// For the initial process creation.
+				syscall.SYS_WAIT4: {},
+				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+				},
+				syscall.SYS_EXIT: {},
+
+				// For the stub prctl dance (all).
+				syscall.SYS_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+				},
+				syscall.SYS_GETPPID: {},
+
+				// For the stub to stop itself (all).
+				syscall.SYS_GETPID: {},
+				syscall.SYS_KILL: []seccomp.Rule{
+					{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+				},
+
+				// Injected to support the address space operations.
+				syscall.SYS_MMAP:   {},
+				syscall.SYS_MUNMAP: {},
+			},
+			Action: linux.SECCOMP_RET_ALLOW,
+		})
+	}
+	instrs, err := seccomp.BuildProgram(rules, defaultAction)
+	if err != nil {
+		return nil, err
+	}
+
+	// Declare all variables up front in order to ensure that there's no
+	// need for allocations between beforeFork & afterFork.
+	var (
+		pid   uintptr
+		ppid  uintptr
+		errno syscall.Errno
+	)
+
+	// Remember the current pid (the child's future ppid) for the
+	// pdeathsig race.
+	ppid, _, _ = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
+
+	// Among other things, beforeFork masks all signals.
+	beforeFork()
+
+	// Do the clone.
+	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
+	if errno != 0 {
+		afterFork()
+		return nil, errno
+	}
+
+	// Is this the parent?
+	if pid != 0 {
+		// Among other things, restore signal mask.
+		afterFork()
+
+		// Initialize the first thread.
+		t := &thread{
+			tgid: int32(pid),
+			tid:  int32(pid),
+			cpu:  ^uint32(0),
+		}
+		if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
+		}
+		t.attach()
+
+		return t, nil
+	}
+
+	// Move the stub to a new session (and thus a new process group). This
+	// prevents the stub from getting PTY job control signals intended only
+	// for the sentry process. We must call this before restoring signal
+	// mask.
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
+	// afterForkInChild resets all signals to their default dispositions
+	// and restores the signal mask to its pre-fork state.
+	afterForkInChild()
+
+	// Explicitly unmask all signals to ensure that the tracer can see
+	// them.
+	if errno := unmaskAllSignals(); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
+	// Set an aggressive BPF filter for the stub and all its children. See
+	// the description of the BPF program built above.
+	if errno := seccomp.SetFilter(instrs); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
+	// Enable cpuid-faulting; this may fail on older kernels or hardware,
+	// so we just disregard the result. Host CPUID will be enabled.
+	syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
+
+	// Call the stub; should not return.
+	stubCall(stubStart, ppid)
+	panic("unreachable")
+}
+
+// createStub creates a stub process as a child of an existing subprocess.
+//
+// Precondition: the runtime OS thread must be locked.
+func (s *subprocess) createStub() (*thread, error) {
+	// There's no need to lock the runtime thread here, as this can only be
+	// called from a context that is already locked.
+	currentTID := int32(procid.Current())
+	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
+
+	// Pass the expected PPID to the child via R15.
+	regs := t.initRegs
+	regs.R15 = uint64(t.tgid)
+
+	// Call fork in a subprocess.
+	//
+	// The new child must set up PDEATHSIG to ensure it dies if this
+	// process dies. Since this process could die at any time, this cannot
+	// be done via instrumentation from here.
+	//
+	// Instead, we create the child untraced, which will do the PDEATHSIG
+	// setup and then SIGSTOP itself for our attach below.
+	//
+	// See above re: SIGKILL.
+ pid, err := t.syscallIgnoreInterrupt( + ®s, + syscall.SYS_CLONE, + arch.SyscallArgument{Value: uintptr(syscall.SIGKILL | syscall.CLONE_FILES)}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}) + if err != nil { + return nil, err + } + + // Wait for child to enter group-stop, so we don't stop its + // bootstrapping work with t.attach below. + // + // We unfortunately don't have a handy part of memory to write the wait + // status. If the wait succeeds, we'll assume that it was the SIGSTOP. + // If the child actually exited, the attach below will fail. + _, err = t.syscallIgnoreInterrupt( + &t.initRegs, + syscall.SYS_WAIT4, + arch.SyscallArgument{Value: uintptr(pid)}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: syscall.WALL | syscall.WUNTRACED}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}) + if err != nil { + return nil, err + } + + childT := &thread{ + tgid: int32(pid), + tid: int32(pid), + cpu: ^uint32(0), + } + childT.attach() + + return childT, nil +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go new file mode 100644 index 000000000..1bf7eab28 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -0,0 +1,109 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 linux + +package ptrace + +import ( + "sync" + "sync/atomic" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// maskPool contains reusable CPU masks for setting affinity. Unfortunately, +// runtime.NumCPU doesn't actually record the number of CPUs on the system, it +// just records the number of CPUs available in the scheduler affinity set at +// startup. This may a) change over time and b) gives a number far lower than +// the maximum indexable CPU. To prevent lots of allocation in the hot path, we +// use a pool to store large masks that we can reuse during bind. +var maskPool = sync.Pool{ + New: func() interface{} { + const maxCPUs = 1024 // Not a hard limit; see below. + return make([]uintptr, maxCPUs/64) + }, +} + +// unmaskAllSignals unmasks all signals on the current thread. +// +//go:nosplit +func unmaskAllSignals() syscall.Errno { + var set linux.SignalSet + _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) + return errno +} + +// getCPU gets the current CPU. +// +// Precondition: the current runtime thread should be locked. +func getCPU() (uint32, error) { + var cpu uintptr + if _, _, errno := syscall.RawSyscall( + unix.SYS_GETCPU, + uintptr(unsafe.Pointer(&cpu)), + 0, 0); errno != 0 { + return 0, errno + } + return uint32(cpu), nil +} + +// setCPU sets the CPU affinity. 
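+//
+// The mask is a vector of 64-bit words: CPU n corresponds to bit n%64 of
+// word n/64. For example, CPU 67 sets bit 3 of word 1.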
+func (t *thread) setCPU(cpu uint32) error { + mask := maskPool.Get().([]uintptr) + n := int(cpu / 64) + v := uintptr(1 << uintptr(cpu%64)) + if n >= len(mask) { + // See maskPool note above. We've actually exceeded the number + // of available cores. Grow the mask and return it. + mask = make([]uintptr, n+1) + } + mask[n] |= v + if _, _, errno := syscall.RawSyscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(t.tid), + uintptr(len(mask)*8), + uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { + return errno + } + mask[n] &^= v + maskPool.Put(mask) + return nil +} + +// bind attempts to ensure that the thread is on the same CPU as the current +// thread. This provides no guarantees as it is fundamentally a racy operation: +// CPU sets may change and we may be rescheduled in the middle of this +// operation. As a result, no failures are reported. +// +// Precondition: the current runtime thread should be locked. +func (t *thread) bind() { + currentCPU, err := getCPU() + if err != nil { + return + } + if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { + // Set the affinity on the thread and save the CPU for next + // round; we don't expect CPUs to bounce around too frequently. + // + // (It's worth noting that we could move CPUs between this point + // and when the tracee finishes executing. But that would be + // roughly the status quo anyways -- we're just maximizing our + // chances of colocation, not guaranteeing it.) + t.setCPU(currentCPU) + } +} diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go new file mode 100644 index 000000000..b80a3604d --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -0,0 +1,33 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package ptrace + +import ( + _ "unsafe" // required for go:linkname. +) + +//go:linkname beforeFork syscall.runtime_BeforeFork +func beforeFork() + +//go:linkname afterFork syscall.runtime_AfterFork +func afterFork() + +//go:linkname afterForkInChild syscall.runtime_AfterForkInChild +func afterForkInChild() |
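
One more illustration: the fault-classification heuristic in context.Switch (ptrace.go above) is subtle enough to deserve a standalone restatement. The sketch below is not code from this commit; it re-expresses the heuristic with plain types so it can be run and poked at directly:

package main

import "fmt"

// access mirrors the usermem.AccessType bits that context.Switch fills in.
type access struct{ Read, Write, Execute bool }

// fault is a (faulting address, instruction pointer) pair.
type fault struct{ addr, ip uintptr }

// classify applies the heuristic from context.Switch: it is an execute
// fault iff the faulting address equals the instruction pointer, and a
// write fault iff the identical fault repeats back-to-back (ptrace gives
// no direct way to observe the real access type).
func classify(cur fault, last *fault) access {
	at := access{Read: true}
	if cur.addr == cur.ip {
		at.Execute = true
	}
	if last != nil && *last == cur {
		at.Write = true
	}
	return at
}

func main() {
	f := fault{addr: 0x7f0000001000, ip: 0x401000}
	fmt.Printf("first touch: %+v\n", classify(f, nil)) // read
	fmt.Printf("repeated:    %+v\n", classify(f, &f))  // read, then write
	x := fault{addr: 0x401000, ip: 0x401000}
	fmt.Printf("fetch:       %+v\n", classify(x, nil)) // read + execute
}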