1 files changed, 164 insertions, 11 deletions
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index b212bbdfe..53adadadd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -21,14 +21,167 @@ import (
 	"syscall"
 
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
 )
 
+const (
+	syscallEvent           syscall.Signal = 0x80
+	seccompEvent           syscall.Signal = 0x700 // 0x7 (PTRACE_EVENT_SECCOMP) << 8
+	_PTRACE_O_TRACESECCOMP                = 0x80  // 1 << 0x7 (PTRACE_EVENT_SECCOMP)
+)
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels may have had this behavior backported.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO))
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachedThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation; retry until a syscall stop is observed.
+		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
+}
+
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
 func createStub() (*thread, error) {
+	// The exact interactions of ptrace and seccomp are complex, and
+	// changed in recent kernel versions. Before commit 93e35efb8de45, the
+	// seccomp check is done before the ptrace emulation check. This means
+	// that any calls not matching the attachedThread rules trigger the
+	// seccomp default action instead of notifying ptrace.
+	//
+	// After commit 93e35efb8de45, the seccomp check is done after the
+	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
+	// will never run for emulation. Seccomp will only run for injected
+	// system calls, and thus we can use RET_KILL as our violation action.
+	var defaultAction uint32
+	if probeSeccomp() {
+		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+	} else {
+		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_ALLOW)
+	}
+
+	// When creating the new child process, we specify SIGKILL as the
+	// signal to deliver when the child exits. We never expect a subprocess
+	// to exit; they are pooled and reused. This is done to ensure that if
+	// a subprocess is OOM-killed, this process (and all other stubs,
+	// transitively) will be killed as well. It's simply not possible to
+	// safely handle a single stub getting killed: the exact state of
+	// execution is unknown and not recoverable.
+	return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) {
+	// Create a BPF program that allows only the system calls needed by the
+	// stub and all its children. This is used to create child stubs
+	// (below), so we must include the ability to fork, but otherwise lock
+	// down available calls only to what is needed.
+	rules := []seccomp.RuleSet{
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				309:                      {}, // SYS_GETCPU.
+			},
+			Action:   uint32(linux.SECCOMP_RET_TRACE),
+			Vsyscall: true,
+		},
+	}
+	if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) {
+		rules = append(rules, seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_CLONE: []seccomp.Rule{
+					// Allow creation of new subprocesses (used by the master).
+					{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+					// Allow creation of new threads within a single address space (used by address spaces).
+					{seccomp.AllowValue(
+						syscall.CLONE_FILES |
+							syscall.CLONE_FS |
+							syscall.CLONE_SIGHAND |
+							syscall.CLONE_THREAD |
+							syscall.CLONE_PTRACE |
+							syscall.CLONE_VM)},
+				},
+
+				// For the initial process creation.
+				syscall.SYS_WAIT4: {},
+				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+				},
+				syscall.SYS_EXIT: {},
+
+				// For the stub prctl dance (all).
+				syscall.SYS_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+				},
+				syscall.SYS_GETPPID: {},
+
+				// For the stub to stop itself (all).
+				syscall.SYS_GETPID: {},
+				syscall.SYS_KILL: []seccomp.Rule{
+					{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+				},
+
+				// Injected to support the address space operations.
+				syscall.SYS_MMAP:   {},
+				syscall.SYS_MUNMAP: {},
+			},
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		})
+	}
+	instrs, err := seccomp.BuildProgram(rules, defaultAction)
+	if err != nil {
+		return nil, err
+	}
+
 	// Declare all variables up front in order to ensure that there's no
 	// need for allocations between beforeFork & afterFork.
 	var (
@@ -43,14 +196,8 @@ func createStub() (*thread, error) {
 	// Among other things, beforeFork masks all signals.
 	beforeFork()
 
-	// When creating the new child process, we specify SIGKILL as the
-	// signal to deliver when the child exits. We never expect a subprocess
-	// to exit; they are pooled and reused. This is done to ensure that if
-	// a subprocess is OOM-killed, this process (and all other stubs,
-	// transitively) will be killed as well. It's simply not possible to
-	// safely handle a single stub getting killed: the exact state of
-	// execution is unknown and not recoverable.
-	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+	// Do the clone.
+	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
 	if errno != 0 {
 		afterFork()
 		return nil, errno
@@ -67,7 +214,7 @@ func createStub() (*thread, error) {
 			tid:  int32(pid),
 			cpu:  ^uint32(0),
 		}
-		if sig := t.wait(); sig != syscall.SIGSTOP {
+		if sig := t.wait(stopped); sig != syscall.SIGSTOP {
 			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
 		}
 		t.attach()
@@ -86,6 +233,12 @@ func createStub() (*thread, error) {
 		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
 	}
 
+	// Set an aggressive BPF filter for the stub and all its children. See
+	// the description of the BPF program built above.
+	if errno := seccomp.SetFilter(instrs); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
 	// Enable cpuid-faulting; this may fail on older kernels or hardware,
 	// so we just disregard the result. Host CPUID will be enabled.
 	syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
@@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
 
 	// Pass the expected PPID to the child via R15.
-	regs := s.initRegs
+	regs := t.initRegs
 	regs.R15 = uint64(t.tgid)
 
 	// Call fork in a subprocess.
@@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
 	// If the child actually exited, the attach below will fail.
 	_, err = t.syscallIgnoreInterrupt(
-		&s.initRegs,
+		&t.initRegs,
 		syscall.SYS_WAIT4,
 		arch.SyscallArgument{Value: uintptr(pid)},
 		arch.SyscallArgument{Value: 0},