diff options
Diffstat (limited to 'pkg/sentry/platform')
-rw-r--r-- | pkg/sentry/platform/ptrace/BUILD | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess.go | 150 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_amd64.go | 16 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_linux.go | 175 |
5 files changed, 276 insertions, 69 deletions
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index ceee895dc..debae058b 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -19,6 +19,8 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/log", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/filemem", diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index b55b2795a..46a8bda8e 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) { return nil, syscall.EINVAL } rval, err := t.syscallIgnoreInterrupt( - initRegs, + &t.initRegs, syscall.SYS_CLONE, arch.SyscallArgument{Value: uintptr( syscall.CLONE_FILES | diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 035ebc332..6d5ad6b71 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -47,6 +47,11 @@ type thread struct { tgid int32 tid int32 cpu uint32 + + // initRegs are the initial registers for the first thread. + // + // These are used for the register set for system calls. + initRegs syscall.PtraceRegs } // threadPool is a collection of threads. @@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) type subprocess struct { platform.NoAddressSpaceIO - // initRegs are the initial registers for the first thread. - // - // These are used for the register set for system calls. - initRegs syscall.PtraceRegs - // requests is used to signal creation of new threads. requests chan chan *thread @@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // thread, and responding to requests to make additional threads in the // traced process. The process will be killed and reaped when the // request channel is closed, which happens in Release below. - var initRegs syscall.PtraceRegs errChan := make(chan error) requests := make(chan chan *thread) go func() { // S/R-SAFE: Platform-related. @@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { return } - // Grab registers. - // - // Note that we adjust the current register RIP value to be - // just before the current system call executed. This depends - // on the definition of the stub itself. - if err := firstThread.getRegs(&initRegs); err != nil { - panic(fmt.Sprintf("ptrace get regs failed: %v", err)) - } - initRegs.Rip -= initRegsRipAdjustment - // Ready to handle requests. errChan <- nil // Wait for requests to create threads. for r := range requests { - t, err := firstThread.clone(&initRegs) + t, err := firstThread.clone(&firstThread.initRegs) if err != nil { // Should not happen: not recoverable. panic(fmt.Sprintf("error initializing first thread: %v", err)) @@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // (Hopefully nobody tgkilled it with a signal < // SIGSTOP before the SIGSTOP was delivered, in which // case that signal would be delivered before SIGSTOP.) - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) } - // Detach the thread without suppressing the SIGSTOP, - // causing it to enter group-stop. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { - panic(fmt.Sprintf("can't detach new clone: %v", errno)) - } + // Detach the thread. + t.detach() // Return the thread. r <- t @@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // Ready. sp := &subprocess{ - initRegs: initRegs, requests: requests, sysemuThreads: threadPool{ threads: make(map[int32]*thread), @@ -277,16 +262,48 @@ func (t *thread) attach() { // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of // newSubprocess), so we always expect to see signal-delivery-stop with // SIGSTOP. - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig)) } // Initialize options. t.init() + + // Grab registers. + // + // Note that we adjust the current register RIP value to be just before + // the current system call executed. This depends on the definition of + // the stub itself. + if err := t.getRegs(&t.initRegs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + t.initRegs.Rip -= initRegsRipAdjustment } +// detach detachs from the thread. +// +// Because the SIGSTOP is not supressed, the thread will enter group-stop. +func (t *thread) detach() { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { + panic(fmt.Sprintf("can't detach new clone: %v", errno)) + } +} + +// waitOutcome is used for wait below. +type waitOutcome int + +const ( + // stopped indicates that the process was stopped. + stopped waitOutcome = iota + + // killed indicates that the process was killed. + killed +) + // wait waits for a stop event. -func (t *thread) wait() syscall.Signal { +// +// Precondition: outcome is a valid waitOutcome. +func (t *thread) wait(outcome waitOutcome) syscall.Signal { var status syscall.WaitStatus for { @@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal { if int(r) != int(t.tid) { panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) } - if !status.Stopped() { - panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) - } - if status.StopSignal() == 0 { - continue // Spurious stop. + switch outcome { + case stopped: + if !status.Stopped() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + } + stopSig := status.StopSignal() + if stopSig == 0 { + continue // Spurious stop. + } + if stopSig == syscall.SIGTRAP { + // Re-encode the trap cause the way it's expected. + return stopSig | syscall.Signal(status.TrapCause()<<8) + } + // Not a trap signal. + return stopSig + case killed: + if !status.Exited() && !status.Signaled() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) + } + return syscall.Signal(status.ExitStatus()) + default: + // Should not happen. + panic(fmt.Sprintf("unknown outcome: %v", outcome)) } - return status.StopSignal() } } +// destroy kills the thread. +// +// Note that this should not be used in the general case; the death of threads +// will typically cause the death of the parent. This is a utility method for +// manually created threads. +func (t *thread) destroy() { + t.detach() + syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL)) + t.wait(killed) +} + // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. Also, we + // require the SECCOMP option to ensure that seccomp violations + // generate a ptrace event. _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SETOPTIONS, uintptr(t.tid), 0, - syscall.PTRACE_O_TRACESYSGOOD, + syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) @@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } - sig := t.wait() - if sig == (0x80 | syscall.SIGTRAP) { + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { // Reached syscall-enter-stop. break } else { @@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens // between syscall-enter-stop and syscall-exit-stop; it happens *after* // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" - if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) { + if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) } @@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() { // // This function returns true on a system call, false on a signal. func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { - regs := &ac.StateData().Regs - s.resetSysemuRegs(regs) + // Lock the thread for ptrace operations. + runtime.LockOSThread() + defer runtime.UnlockOSThread() // Extract floating point state. fpState := ac.FloatingPointData() fpLen, _ := ac.FeatureSet().ExtendedStateSize() useXsave := ac.FeatureSet().UseXsave() - // Lock the thread for ptrace operations. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - // Grab our thread from the pool. currentTID := int32(procid.Current()) t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread) + // Reset necessary registers. + regs := &ac.StateData().Regs + t.resetSysemuRegs(regs) + // Check for interrupts, and ensure that future interrupts will signal t. if !c.interrupt.Enable(t) { // Pending interrupt; simulate. @@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } // Wait for the syscall-enter stop. - sig := t.wait() + sig := t.wait(stopped) // Refresh all registers. if err := t.getRegs(regs); err != nil { @@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } // Is it a system call? - if sig == (0x80 | syscall.SIGTRAP) { + if sig == (syscallEvent | syscall.SIGTRAP) { // Ensure registers are sane. updateSyscallRegs(regs) return true - } - - if sig == syscall.SIGSTOP { + } else if sig == (seccompEvent | syscall.SIGTRAP) { + // Seccomp is enabled, and caught the system call. This + // is an emulated vsyscall call, since those are caught + // only by seccomp and explicitly set to trace. + updateSyscallRegs(regs) + return true + } else if sig == syscall.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. continue @@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp currentTID := int32(procid.Current()) t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) - return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...) + return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...) } // MapFile implements platform.AddressSpace.MapFile. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 8211215df..c38dc1ff8 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -43,20 +43,20 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { - regs.Cs = s.initRegs.Cs - regs.Ss = s.initRegs.Ss - regs.Ds = s.initRegs.Ds - regs.Es = s.initRegs.Es - regs.Fs = s.initRegs.Fs - regs.Gs = s.initRegs.Gs +func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { + regs.Cs = t.initRegs.Cs + regs.Ss = t.initRegs.Ss + regs.Ds = t.initRegs.Ds + regs.Es = t.initRegs.Es + regs.Fs = t.initRegs.Fs + regs.Gs = t.initRegs.Gs } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { - // Copy initial registers (RIP, segments, etc.). + // Copy initial registers. regs := *initRegs // Set our syscall number. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index b212bbdfe..53adadadd 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -21,14 +21,167 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" ) +const ( + syscallEvent syscall.Signal = 0x80 + seccompEvent syscall.Signal = 0x700 // 0x7 (PTRACE_SECCOMP_EVENT) << 8 + _PTRACE_O_TRACESECCOMP = 0x80 // 1 << 0x7 (PTRACE_SECCOMP_EVENT) +) + +// probeSeccomp returns true iff seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. This check is dynamic +// because kernels have be backported behavior. +// +// See createStub for more information. +// +// Precondition: the runtime OS thread must be locked. +func probeSeccomp() bool { + // Create a completely new, destroyable process. + t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO)) + if err != nil { + panic(fmt.Sprintf("seccomp probe failed: %v", err)) + } + defer t.destroy() + + // Set registers to the yield system call. This call is not allowed + // by the filters specified in the attachThread function. + regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) + if err := t.setRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + + for { + // Attempt an emulation. + if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { + // Did the seccomp errno hook already run? This would + // indicate that seccomp is first in line and we're + // less than 4.8. + if err := t.getRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) + } + if _, err := syscallReturnValue(®s); err == nil { + // The seccomp errno mode ran first, and reset + // the error in the registers. + return false + } + // The seccomp hook did not run yet, and therefore it + // is safe to use RET_KILL mode for dispatched calls. + return true + } + } +} + // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. func createStub() (*thread, error) { + // The exact interactions of ptrace and seccomp are complex, and + // changed in recent kernel versions. Before commit 93e35efb8de45, the + // seccomp check is done before the ptrace emulation check. This means + // that any calls not matching this list will trigger the seccomp + // default action instead of notifying ptrace. + // + // After commit 93e35efb8de45, the seccomp check is done after the + // ptrace emulation check. This simplifies using SYSEMU, since seccomp + // will never run for emulation. Seccomp will only run for injected + // system calls, and thus we can use RET_KILL as our violation action. + var defaultAction uint32 + if probeSeccomp() { + log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") + defaultAction = uint32(linux.SECCOMP_RET_KILL) + } else { + // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. + log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") + defaultAction = uint32(linux.SECCOMP_RET_ALLOW) + } + + // When creating the new child process, we specify SIGKILL as the + // signal to deliver when the child exits. We never expect a subprocess + // to exit; they are pooled and reused. This is done to ensure that if + // a subprocess is OOM-killed, this process (and all other stubs, + // transitively) will be killed as well. It's simply not possible to + // safely handle a single stub getting killed: the exact state of + // execution is unknown and not recoverable. + return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction) +} + +// attachedThread returns a new attached thread. +// +// Precondition: the runtime OS thread must be locked. +func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { + // Create a BPF program that allows only the system calls needed by the + // stub and all its children. This is used to create child stubs + // (below), so we must include the ability to fork, but otherwise lock + // down available calls only to what is needed. + rules := []seccomp.RuleSet{ + // Rules for trapping vsyscall access. + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_TIME: {}, + 309: {}, // SYS_GETCPU. + }, + Action: uint32(linux.SECCOMP_RET_TRACE), + Vsyscall: true, + }, + } + if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) { + rules = append(rules, seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_CLONE: []seccomp.Rule{ + // Allow creation of new subprocesses (used by the master). + {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)}, + // Allow creation of new threads within a single address space (used by addresss spaces). + {seccomp.AllowValue( + syscall.CLONE_FILES | + syscall.CLONE_FS | + syscall.CLONE_SIGHAND | + syscall.CLONE_THREAD | + syscall.CLONE_PTRACE | + syscall.CLONE_VM)}, + }, + + // For the initial process creation. + syscall.SYS_WAIT4: {}, + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + syscall.SYS_EXIT: {}, + + // For the stub prctl dance (all). + syscall.SYS_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)}, + }, + syscall.SYS_GETPPID: {}, + + // For the stub to stop itself (all). + syscall.SYS_GETPID: {}, + syscall.SYS_KILL: []seccomp.Rule{ + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)}, + }, + + // Injected to support the address space operations. + syscall.SYS_MMAP: {}, + syscall.SYS_MUNMAP: {}, + }, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }) + } + instrs, err := seccomp.BuildProgram(rules, defaultAction) + if err != nil { + return nil, err + } + // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( @@ -43,14 +196,8 @@ func createStub() (*thread, error) { // Among other things, beforeFork masks all signals. beforeFork() - // When creating the new child process, we specify SIGKILL as the - // signal to deliver when the child exits. We never expect a subprocess - // to exit; they are pooled and reused. This is done to ensure that if - // a subprocess is OOM-killed, this process (and all other stubs, - // transitively) will be killed as well. It's simply not possible to - // safely handle a single stub getting killed: the exact state of - // execution is unknown and not recoverable. - pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0) + // Do the clone. + pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0) if errno != 0 { afterFork() return nil, errno @@ -67,7 +214,7 @@ func createStub() (*thread, error) { tid: int32(pid), cpu: ^uint32(0), } - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } t.attach() @@ -86,6 +233,12 @@ func createStub() (*thread, error) { syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } + // Set an aggressive BPF filter for the stub and all it's children. See + // the description of the BPF program built above. + if errno := seccomp.SetFilter(instrs); errno != 0 { + syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) + } + // Enable cpuid-faulting; this may fail on older kernels or hardware, // so we just disregard the result. Host CPUID will be enabled. syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0) @@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) { t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) // Pass the expected PPID to the child via R15. - regs := s.initRegs + regs := t.initRegs regs.R15 = uint64(t.tgid) // Call fork in a subprocess. @@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) { // status. If the wait succeeds, we'll assume that it was the SIGSTOP. // If the child actually exited, the attach below will fail. _, err = t.syscallIgnoreInterrupt( - &s.initRegs, + &t.initRegs, syscall.SYS_WAIT4, arch.SyscallArgument{Value: uintptr(pid)}, arch.SyscallArgument{Value: 0}, |