From 463e73d46d76042c39050d02cf3b0f875e55eb01 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Wed, 10 Oct 2018 22:39:32 -0700
Subject: Add seccomp filter configuration to ptrace stubs.

This is a defense-in-depth measure. If the sentry is compromised, this prevents
system call injection to the stubs. There is some complexity with respect to
ptrace and seccomp interactions, so this protection is not really available
for kernel versions < 4.8; this is detected dynamically.

Note that this also solves the vsyscall emulation issue by adding in
appropriate trapping for those system calls. It does mean that a compromised
sentry could theoretically inject these into the stub (by ignoring the trap
and resuming, thereby allowing execution), but they are harmless.

PiperOrigin-RevId: 216647581
Change-Id: Id06c232cbac1f9489b1803ec97f83097fcba8eb8
---
 pkg/abi/BUILD                                  |   1 +
 pkg/abi/abi_linux.go                           |  20 +++
 pkg/seccomp/BUILD                              |   3 -
 pkg/seccomp/seccomp.go                         | 224 +++++++++++++++++--------
 pkg/seccomp/seccomp_rules.go                   |   8 +-
 pkg/seccomp/seccomp_test.go                    | 172 ++++++++++++++-----
 pkg/seccomp/seccomp_unsafe.go                  |  24 ++-
 pkg/sentry/arch/arch_amd64.go                  |   5 +
 pkg/sentry/platform/ptrace/BUILD               |   2 +
 pkg/sentry/platform/ptrace/ptrace_unsafe.go    |   2 +-
 pkg/sentry/platform/ptrace/subprocess.go       | 150 +++++++++++------
 pkg/sentry/platform/ptrace/subprocess_amd64.go |  16 +-
 pkg/sentry/platform/ptrace/subprocess_linux.go | 175 +++++++++++++++++--
 pkg/sentry/strace/BUILD                        |   1 +
 pkg/sentry/strace/strace.go                    |  11 ++
 15 files changed, 620 insertions(+), 194 deletions(-)
 create mode 100644 pkg/abi/abi_linux.go

diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index c014d2c4b..1ba4f3a46 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "abi",
     srcs = [
         "abi.go",
+        "abi_linux.go",
         "flag.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/pkg/abi",
diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go
new file mode 100644
index 000000000..dd5d67b51
--- /dev/null
+++ b/pkg/abi/abi_linux.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package abi
+
+// Host specifies the host ABI.
+const Host = Linux
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index b3e2f0b38..1975d17a6 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -28,12 +28,9 @@ go_library(
     importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp",
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/abi",
         "//pkg/abi/linux",
         "//pkg/bpf",
         "//pkg/log",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/strace",
     ],
 )
 
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index 49da3c775..a746dc9b3 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -20,31 +20,36 @@ import (
 	"reflect"
 	"sort"
 
-	"gvisor.googlesource.com/gvisor/pkg/abi"
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/bpf"
 	"gvisor.googlesource.com/gvisor/pkg/log"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/strace"
 )
 
 const (
-	// violationLabel is added to the program to take action on a violation.
-	violationLabel = "violation"
-
 	// skipOneInst is the offset to take for skipping one instruction.
 	skipOneInst = 1
+
+	// defaultLabel is the label for the default action.
+	defaultLabel = "default_action"
 )
 
 // Install generates BPF code based on the set of syscalls provided. It only
-// allows syscalls that conform to the specification (*) and generates SIGSYS
+// allows syscalls that conform to the specification and generates SIGSYS
 // trap unless kill is set.
 //
-// (*) The current implementation only checks the syscall number. It does NOT
-// validate any of the arguments.
+// This is a convenience wrapper around BuildProgram and SetFilter.
 func Install(rules SyscallRules, kill bool) error {
 	log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill)
-	instrs, err := buildProgram(rules, kill)
+	defaultAction := uint32(linux.SECCOMP_RET_TRAP)
+	if kill {
+		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+	}
+	instrs, err := BuildProgram([]RuleSet{
+		RuleSet{
+			Rules:  rules,
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		},
+	}, defaultAction)
 	if log.IsLogging(log.Debug) {
 		programStr, errDecode := bpf.DecodeProgram(instrs)
 		if errDecode != nil {
@@ -56,60 +61,84 @@ func Install(rules SyscallRules, kill bool) error {
 		return err
 	}
 
-	if err := seccomp(instrs); err != nil {
-		return err
+	// Perform the actual installation.
+	if errno := SetFilter(instrs); errno != 0 {
+		return fmt.Errorf("Failed to set filter: %v", errno)
 	}
 
 	log.Infof("Seccomp filters installed.")
 	return nil
 }
 
-// buildProgram builds a BPF program that whitelists all given syscall rules.
-func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) {
+// RuleSet is a set of rules and associated action.
+type RuleSet struct {
+	Rules  SyscallRules
+	Action uint32
+
+	// Vsyscall indicates that a check is made for a function being called
+	// from kernel mappings. This is where the vsyscall page is located
+	// (and typically emulated), so this RuleSet will not match any
+	// functions not dispatched from the vsyscall page.
+	Vsyscall bool
+}
+
+// SyscallName gives names to system calls. It is used purely for debugging purposes.
+//
+// An alternate namer can be provided to the package at initialization time.
+var SyscallName = func(sysno uintptr) string {
+	return fmt.Sprintf("syscall_%d", sysno)
+}
+
+// BuildProgram builds a single BPF program covering all of the given RuleSets;
+// a system call that matches none of them results in defaultAction.
+func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) {
 	program := bpf.NewProgramBuilder()
-	violationAction := uint32(linux.SECCOMP_RET_KILL)
-	if !kill {
-		violationAction = linux.SECCOMP_RET_TRAP
-	}
 
 	// Be paranoid and check that syscall is done in the expected architecture.
 	//
 	// A = seccomp_data.arch
-	// if (A != AUDIT_ARCH_X86_64) goto violation
+	// if (A != AUDIT_ARCH_X86_64) goto defaultAction.
 	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
-	// violationLabel is at the bottom of the program. The size of program
+	// defaultLabel is at the bottom of the program. The size of program
 	// may exceeds 255 lines, which is the limit of a condition jump.
 	program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0)
-	program.AddDirectJumpLabel(violationLabel)
-
+	program.AddDirectJumpLabel(defaultLabel)
 	if err := buildIndex(rules, program); err != nil {
 		return nil, err
 	}
 
-	// violation: return violationAction
-	if err := program.AddLabel(violationLabel); err != nil {
+	// Exhausted: return defaultAction.
+	if err := program.AddLabel(defaultLabel); err != nil {
 		return nil, err
 	}
-	program.AddStmt(bpf.Ret|bpf.K, violationAction)
+	program.AddStmt(bpf.Ret|bpf.K, defaultAction)
 
 	return program.Instructions()
 }
 
-// buildIndex builds a BST to quickly search through all syscalls that are whitelisted.
-func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
-	syscalls := []uintptr{}
-	for sysno := range rules {
-		syscalls = append(syscalls, sysno)
+// buildIndex builds a BST to quickly search through all syscalls.
+func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
+	// Build a list of all application system calls, across all given rule
+	// sets. We have a simple BST, but may dispatch individual matchers
+	// with different actions. The matchers are evaluated linearly.
+	requiredSyscalls := make(map[uintptr]struct{})
+	for _, rs := range rules {
+		for sysno := range rs.Rules {
+			requiredSyscalls[sysno] = struct{}{}
+		}
 	}
-
-	t, ok := strace.Lookup(abi.Linux, arch.AMD64)
-	if !ok {
-		panic("Can't find amd64 Linux syscall table")
+	syscalls := make([]uintptr, 0, len(requiredSyscalls))
+	for sysno, _ := range requiredSyscalls {
+		syscalls = append(syscalls, sysno)
 	}
-
 	sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
-	for _, s := range syscalls {
-		log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s])
+	for _, sysno := range syscalls {
+		for _, rs := range rules {
+			// Print only if there is a corresponding set of rules.
+			if _, ok := rs.Rules[sysno]; ok {
+				log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action)
+			}
+		}
 	}
 
 	root := createBST(syscalls)
@@ -119,7 +148,7 @@ func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
 	//
 	// A = seccomp_data.nr
 	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
-	return root.traverse(buildBSTProgram, program, rules)
+	return root.traverse(buildBSTProgram, rules, program)
 }
 
 // createBST converts sorted syscall slice into a balanced BST.
@@ -136,15 +165,23 @@ func createBST(syscalls []uintptr) *node {
 	return &parent
 }
 
-func ruleViolationLabel(sysno uintptr, idx int) string {
-	return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx)
+func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string {
+	return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno)
+}
+
+func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
+	return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx)
 }
 
 func checkArgsLabel(sysno uintptr) string {
 	return fmt.Sprintf("checkArgs_%v", sysno)
 }
 
-func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error {
+// addSyscallArgsCheck adds argument checks for a single system call. It does
+// not insert a jump to the default action at the end and it is the
+// responsibility of the caller to insert an appropriate jump after calling
+// this function.
+func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error {
 	for ruleidx, rule := range rules {
 		labelled := false
 		for i, arg := range rule {
@@ -155,28 +192,29 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_low == low
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
-					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// assert arg_high == high
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
-					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
-
 				default:
 					return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
 				}
 			}
 		}
-		// Matched, allow the syscall.
-		p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
-		// Label the end of the rule if necessary.
+
+		// Matched, emit the given action.
+		p.AddStmt(bpf.Ret|bpf.K, action)
+
+		// Label the end of the rule if necessary. This is added for
+		// the jumps above when the argument check fails.
 		if labelled {
-			if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil {
+			if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
 				return err
 			}
 		}
 	}
-	// Not matched?
-	p.AddDirectJumpLabel(violationLabel)
+
 	return nil
 }
 
@@ -188,16 +226,16 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err
 //   (A > 22) ? goto index_35 : goto index_9
 //
 // index_9:  // SYS_MMAP(9), leaf
-//   A == 9) ? goto argument check : violation
+//   A == 9) ? goto argument check : defaultLabel
 //
 // index_35:  // SYS_NANOSLEEP(35), single child
 //   (A == 35) ? goto argument check : continue
-//   (A > 35) ? goto index_50 : goto violation
+//   (A > 35) ? goto index_50 : goto defaultLabel
 //
 // index_50:  // SYS_LISTEN(50), leaf
-//   (A == 50) ? goto argument check : goto violation
+//   (A == 50) ? goto argument check : goto defaultLabel
 //
-func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error {
+func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
 	// Root node is never referenced by label, skip it.
 	if !n.root {
 		if err := program.AddLabel(n.label()); err != nil {
@@ -209,11 +247,10 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e
 	program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
 	if n.left == nil && n.right == nil {
 		// Leaf nodes don't require extra check.
-		program.AddDirectJumpLabel(violationLabel)
+		program.AddDirectJumpLabel(defaultLabel)
 	} else {
 		// Non-leaf node. Check which turn to take otherwise. Using direct jumps
 		// in case that the offset may exceed the limit of a conditional jump (255)
-		// Note that 'violationLabel' is returned for nil children.
 		program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
 		program.AddDirectJumpLabel(n.right.label())
 		program.AddDirectJumpLabel(n.left.label())
@@ -222,12 +259,60 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e
 	if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
 		return err
 	}
-	// No rules, just allow it and save one jmp.
-	if len(rules[sysno]) == 0 {
-		program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
-		return nil
+
+	emitted := false
+	for ruleSetIdx, rs := range rules {
+		if _, ok := rs.Rules[sysno]; ok {
+			// If there are no rules, then this will always match.
+			// Remember we've done this so that we can emit a
+			// sensible error. We can't catch all overlaps, but we
+			// can catch this one at least.
+			if emitted {
+				return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
+			}
+
+			// Emit a vsyscall check if this rule requires a
+			// Vsyscall match. This rule ensures that the top bit
+			// is set in the instruction pointer, which is where
+			// the vsyscall page will be mapped.
+			if rs.Vsyscall {
+				program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
+				program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
+			}
+
+			// Emit matchers.
+			if len(rs.Rules[sysno]) == 0 {
+				// This is a blanket action.
+				program.AddStmt(bpf.Ret|bpf.K, rs.Action)
+				emitted = true
+			} else {
+				// Add an argument check for these particular
+				// arguments. This will continue execution and
+				// check the next rule set. We need to ensure
+				// that at the very end, we insert a direct
+				// jump label for the unmatched case.
+				if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
+					return err
+				}
+			}
+
+			// If there was a Vsyscall check for this rule, then we
+			// need to add an appropriate label for the jump above.
+			if rs.Vsyscall {
+				if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
+					return err
+				}
+			}
+		}
 	}
-	return addSyscallArgsCheck(program, rules[sysno], sysno)
+
+	// Not matched? We only need to insert a jump to the default label if
+	// no blanket action has been emitted for this call.
+	if !emitted {
+		program.AddDirectJumpLabel(defaultLabel)
+	}
+
+	return nil
 }
 
 // node represents a tree node.
@@ -238,26 +323,27 @@ type node struct {
 	root  bool
 }
 
-// label returns the label corresponding to this node. If node is nil (syscall not present),
-// violationLabel is returned for convenience.
+// label returns the label corresponding to this node.
+//
+// If n is nil, then the defaultLabel is returned.
 func (n *node) label() string {
 	if n == nil {
-		return violationLabel
+		return defaultLabel
 	}
 	return fmt.Sprintf("index_%v", n.value)
 }
 
-type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error
+type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error
 
-func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error {
+func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
 	if n == nil {
 		return nil
 	}
-	if err := fn(p, rules, n); err != nil {
+	if err := fn(n, rules, p); err != nil {
 		return err
 	}
-	if err := n.left.traverse(fn, p, rules); err != nil {
+	if err := n.left.traverse(fn, rules, p); err != nil {
 		return err
 	}
-	return n.right.traverse(fn, p, rules)
+	return n.right.traverse(fn, rules, p)
 }
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 9215e5c90..6b707f195 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -24,9 +24,11 @@ import "fmt"
 //	__u64 args[6];
 // };
 const (
-	seccompDataOffsetNR   = 0
-	seccompDataOffsetArch = 4
-	seccompDataOffsetArgs = 16
+	seccompDataOffsetNR     = 0
+	seccompDataOffsetArch   = 4
+	seccompDataOffsetIPLow  = 8
+	seccompDataOffsetIPHigh = 12
+	seccompDataOffsetArgs   = 16
 )
 
 func seccompDataOffsetArgLow(i int) uint32 {
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 42cf85c03..0188ad4f3 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -76,14 +76,18 @@ func TestBasic(t *testing.T) {
 	}
 
 	for _, test := range []struct {
-		// filters are the set of syscall that are allowed.
-		filters SyscallRules
-		kill    bool
-		specs   []spec
+		ruleSets      []RuleSet
+		defaultAction uint32
+		specs         []spec
 	}{
 		{
-			filters: SyscallRules{1: {}},
-			kill:    false,
+			ruleSets: []RuleSet{
+				{
+					Rules:  SyscallRules{1: {}},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Single syscall allowed",
@@ -98,12 +102,61 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: {},
-				3: {},
-				5: {},
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0x1),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					Rules: SyscallRules{
+						1: {},
+						2: {},
+					},
+					Action: linux.SECCOMP_RET_TRAP,
+				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_KILL,
+			specs: []spec{
+				{
+					desc: "Multiple rulesets allowed (1a)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x1}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Multiple rulesets allowed (1b)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple rulesets allowed (2)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple rulesets allowed (2)",
+					data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_KILL,
+				},
+			},
+		},
+		{
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+						3: {},
+						5: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Multiple syscalls allowed (1)",
@@ -148,8 +201,15 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{1: {}},
-			kill:    false,
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Wrong architecture",
@@ -159,26 +219,38 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{1: {}},
-			kill:    true,
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
-					desc: "Syscall disallowed, action kill",
+					desc: "Syscall disallowed, action trap",
 					data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
-					want: linux.SECCOMP_RET_KILL,
+					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowAny{},
-						AllowValue(0xf),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowAny{},
+								AllowValue(0xf),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Syscall argument allowed",
@@ -193,17 +265,22 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowValue(0xf),
-					},
-					{
-						AllowValue(0xe),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0xf),
+							},
+							{
+								AllowValue(0xe),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Syscall argument allowed, two rules",
@@ -218,16 +295,21 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowValue(0),
-						AllowValue(math.MaxUint64 - 1),
-						AllowValue(math.MaxUint32),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0),
+								AllowValue(math.MaxUint64 - 1),
+								AllowValue(math.MaxUint32),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "64bit syscall argument allowed",
@@ -259,7 +341,7 @@ func TestBasic(t *testing.T) {
 			},
 		},
 	} {
-		instrs, err := buildProgram(test.filters, test.kill)
+		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
 		if err != nil {
 			t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err)
 			continue
@@ -282,6 +364,7 @@ func TestBasic(t *testing.T) {
 	}
 }
 
+// TestRandom tests that randomly generated rules are encoded correctly.
 func TestRandom(t *testing.T) {
 	rand.Seed(time.Now().UnixNano())
 	size := rand.Intn(50) + 1
@@ -294,7 +377,12 @@ func TestRandom(t *testing.T) {
 	}
 
 	fmt.Printf("Testing filters: %v", syscallRules)
-	instrs, err := buildProgram(syscallRules, false)
+	instrs, err := BuildProgram([]RuleSet{
+		RuleSet{
+			Rules:  syscallRules,
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		},
+	}, uint32(linux.SECCOMP_RET_TRAP))
 	if err != nil {
 		t.Fatalf("buildProgram() got error: %v", err)
 	}
@@ -319,8 +407,8 @@ func TestRandom(t *testing.T) {
 	}
 }
 
-// TestReadDeal checks that a process dies when it trips over the filter and that it
-// doesn't die when the filter is not triggered.
+// TestRealDeal checks that a process dies when it trips over the filter and
+// that it doesn't die when the filter is not triggered.
 func TestRealDeal(t *testing.T) {
 	for _, test := range []struct {
 		die  bool
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index 6682f8d9b..ae18534bf 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -17,7 +17,6 @@
 package seccomp
 
 import (
-	"fmt"
 	"syscall"
 	"unsafe"
 
@@ -31,19 +30,28 @@ type sockFprog struct {
 	Filter *linux.BPFInstruction
 }
 
-func seccomp(instrs []linux.BPFInstruction) error {
+// SetFilter installs the given BPF program.
+//
+// This is safe to call from an afterFork context.
+//
+//go:nosplit
+func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
 	// SYS_SECCOMP is not available in syscall package.
 	const SYS_SECCOMP = 317
 
 	// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
-	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 {
-		return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err)
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 {
+		return errno
 	}
-	sockProg := sockFprog{Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0]))}
 
 	// TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available.
-	if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 {
-		return fmt.Errorf("failed to set seccomp filter: %v", err)
+	sockProg := sockFprog{
+		Len:    uint16(len(instrs)),
+		Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
 	}
-	return nil
+	if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 {
+		return errno
+	}
+
+	return 0
 }
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index f1e408af9..5ba6c19ea 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package arch
 
 import (
@@ -26,6 +28,9 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
 )
 
+// Host specifies the host architecture.
+const Host = AMD64
+
 // These constants come directly from Linux.
 const (
 	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index ceee895dc..debae058b 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -19,6 +19,8 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/filemem",
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index b55b2795a..46a8bda8e 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) {
 		return nil, syscall.EINVAL
 	}
 	rval, err := t.syscallIgnoreInterrupt(
-		initRegs,
+		&t.initRegs,
 		syscall.SYS_CLONE,
 		arch.SyscallArgument{Value: uintptr(
 			syscall.CLONE_FILES |
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 035ebc332..6d5ad6b71 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -47,6 +47,11 @@ type thread struct {
 	tgid int32
 	tid  int32
 	cpu  uint32
+
+	// initRegs are the initial registers for the first thread.
+	//
+	// These are used for the register set for system calls.
+	initRegs syscall.PtraceRegs
 }
 
 // threadPool is a collection of threads.
@@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread)
 type subprocess struct {
 	platform.NoAddressSpaceIO
 
-	// initRegs are the initial registers for the first thread.
-	//
-	// These are used for the register set for system calls.
-	initRegs syscall.PtraceRegs
-
 	// requests is used to signal creation of new threads.
 	requests chan chan *thread
 
@@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 	// thread, and responding to requests to make additional threads in the
 	// traced process. The process will be killed and reaped when the
 	// request channel is closed, which happens in Release below.
-	var initRegs syscall.PtraceRegs
 	errChan := make(chan error)
 	requests := make(chan chan *thread)
 	go func() { // S/R-SAFE: Platform-related.
@@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 			return
 		}
 
-		// Grab registers.
-		//
-		// Note that we adjust the current register RIP value to be
-		// just before the current system call executed. This depends
-		// on the definition of the stub itself.
-		if err := firstThread.getRegs(&initRegs); err != nil {
-			panic(fmt.Sprintf("ptrace get regs failed: %v", err))
-		}
-		initRegs.Rip -= initRegsRipAdjustment
-
 		// Ready to handle requests.
 		errChan <- nil
 
 		// Wait for requests to create threads.
 		for r := range requests {
-			t, err := firstThread.clone(&initRegs)
+			t, err := firstThread.clone(&firstThread.initRegs)
 			if err != nil {
 				// Should not happen: not recoverable.
 				panic(fmt.Sprintf("error initializing first thread: %v", err))
@@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 			// (Hopefully nobody tgkilled it with a signal <
 			// SIGSTOP before the SIGSTOP was delivered, in which
 			// case that signal would be delivered before SIGSTOP.)
-			if sig := t.wait(); sig != syscall.SIGSTOP {
+			if sig := t.wait(stopped); sig != syscall.SIGSTOP {
 				panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
 			}
 
-			// Detach the thread without suppressing the SIGSTOP,
-			// causing it to enter group-stop.
-			if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
-				panic(fmt.Sprintf("can't detach new clone: %v", errno))
-			}
+			// Detach the thread.
+			t.detach()
 
 			// Return the thread.
 			r <- t
@@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 
 	// Ready.
 	sp := &subprocess{
-		initRegs: initRegs,
 		requests: requests,
 		sysemuThreads: threadPool{
 			threads: make(map[int32]*thread),
@@ -277,16 +262,48 @@ func (t *thread) attach() {
 	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
 	// newSubprocess), so we always expect to see signal-delivery-stop with
 	// SIGSTOP.
-	if sig := t.wait(); sig != syscall.SIGSTOP {
+	if sig := t.wait(stopped); sig != syscall.SIGSTOP {
 		panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
 	}
 
 	// Initialize options.
 	t.init()
+
+	// Grab registers.
+	//
+	// Note that we adjust the current register RIP value to be just before
+	// the current system call executed. This depends on the definition of
+	// the stub itself.
+	if err := t.getRegs(&t.initRegs); err != nil {
+		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+	}
+	t.initRegs.Rip -= initRegsRipAdjustment
 }
 
+// detach detaches from the thread.
+//
+// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
+func (t *thread) detach() {
+	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
+		panic(fmt.Sprintf("can't detach new clone: %v", errno))
+	}
+}
+
+// waitOutcome is used for wait below.
+type waitOutcome int
+
+const (
+	// stopped indicates that the process was stopped.
+	stopped waitOutcome = iota
+
+	// killed indicates that the process was killed.
+	killed
+)
+
 // wait waits for a stop event.
-func (t *thread) wait() syscall.Signal {
+//
+// Precondition: outcome is a valid waitOutcome.
+func (t *thread) wait(outcome waitOutcome) syscall.Signal {
 	var status syscall.WaitStatus
 
 	for {
@@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal {
 		if int(r) != int(t.tid) {
 			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
 		}
-		if !status.Stopped() {
-			panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
-		}
-		if status.StopSignal() == 0 {
-			continue // Spurious stop.
+		switch outcome {
+		case stopped:
+			if !status.Stopped() {
+				panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+			}
+			stopSig := status.StopSignal()
+			if stopSig == 0 {
+				continue // Spurious stop.
+			}
+			if stopSig == syscall.SIGTRAP {
+				// Re-encode the trap cause the way it's expected.
+				return stopSig | syscall.Signal(status.TrapCause()<<8)
+			}
+			// Not a trap signal.
+			return stopSig
+		case killed:
+			if !status.Exited() && !status.Signaled() {
+				panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+			}
+			return syscall.Signal(status.ExitStatus())
+		default:
+			// Should not happen.
+			panic(fmt.Sprintf("unknown outcome: %v", outcome))
 		}
-		return status.StopSignal()
 	}
 }
 
+// destroy kills the thread.
+//
+// Note that this should not be used in the general case; the death of threads
+// will typically cause the death of the parent. This is a utility method for
+// manually created threads.
+func (t *thread) destroy() {
+	t.detach()
+	syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL))
+	t.wait(killed)
+}
+
 // init initializes trace options.
 func (t *thread) init() {
-	// Set our TRACESYSGOOD option to differeniate real SIGTRAP.
+	// Set our TRACESYSGOOD option to differentiate real SIGTRAP. Also, we
+	// require the SECCOMP option to ensure that seccomp violations
+	// generate a ptrace event.
 	_, _, errno := syscall.RawSyscall6(
 		syscall.SYS_PTRACE,
 		syscall.PTRACE_SETOPTIONS,
 		uintptr(t.tid),
 		0,
-		syscall.PTRACE_O_TRACESYSGOOD,
+		syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP,
 		0, 0)
 	if errno != 0 {
 		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
@@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 		}
 
-		sig := t.wait()
-		if sig == (0x80 | syscall.SIGTRAP) {
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
 			// Reached syscall-enter-stop.
 			break
 		} else {
@@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
 	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
 	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
-	if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) {
+	if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
 		panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
 	}
 
@@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() {
 //
 // This function returns true on a system call, false on a signal.
 func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
-	regs := &ac.StateData().Regs
-	s.resetSysemuRegs(regs)
+	// Lock the thread for ptrace operations.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
 
 	// Extract floating point state.
 	fpState := ac.FloatingPointData()
 	fpLen, _ := ac.FeatureSet().ExtendedStateSize()
 	useXsave := ac.FeatureSet().UseXsave()
 
-	// Lock the thread for ptrace operations.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
 	// Grab our thread from the pool.
 	currentTID := int32(procid.Current())
 	t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread)
 
+	// Reset necessary registers.
+	regs := &ac.StateData().Regs
+	t.resetSysemuRegs(regs)
+
 	// Check for interrupts, and ensure that future interrupts will signal t.
 	if !c.interrupt.Enable(t) {
 		// Pending interrupt; simulate.
@@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		}
 
 		// Wait for the syscall-enter stop.
-		sig := t.wait()
+		sig := t.wait(stopped)
 
 		// Refresh all registers.
 		if err := t.getRegs(regs); err != nil {
@@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 		}
 
 		// Is it a system call?
-		if sig == (0x80 | syscall.SIGTRAP) {
+		if sig == (syscallEvent | syscall.SIGTRAP) {
 			// Ensure registers are sane.
 			updateSyscallRegs(regs)
 			return true
-		}
-
-		if sig == syscall.SIGSTOP {
+		} else if sig == (seccompEvent | syscall.SIGTRAP) {
+			// Seccomp is enabled, and caught the system call. This
+			// is an emulated vsyscall call, since those are caught
+			// only by seccomp and explicitly set to trace.
+			updateSyscallRegs(regs)
+			return true
+		} else if sig == syscall.SIGSTOP {
 			// SIGSTOP was delivered to another thread in the same thread
 			// group, which initiated another group stop. Just ignore it.
 			continue
@@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
 	currentTID := int32(procid.Current())
 	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
 
-	return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...)
+	return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...)
 }
 
 // MapFile implements platform.AddressSpace.MapFile.
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 8211215df..c38dc1ff8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -43,20 +43,20 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
-	regs.Cs = s.initRegs.Cs
-	regs.Ss = s.initRegs.Ss
-	regs.Ds = s.initRegs.Ds
-	regs.Es = s.initRegs.Es
-	regs.Fs = s.initRegs.Fs
-	regs.Gs = s.initRegs.Gs
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+	regs.Cs = t.initRegs.Cs
+	regs.Ss = t.initRegs.Ss
+	regs.Ds = t.initRegs.Ds
+	regs.Es = t.initRegs.Es
+	regs.Fs = t.initRegs.Fs
+	regs.Gs = t.initRegs.Gs
 }
 
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
 func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
-	// Copy initial registers (RIP, segments, etc.).
+	// Copy initial registers.
 	regs := *initRegs
 
 	// Set our syscall number.
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index b212bbdfe..53adadadd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -21,14 +21,167 @@ import (
 	"syscall"
 
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
 )
 
+const (
+	syscallEvent           syscall.Signal = 0x80
+	seccompEvent           syscall.Signal = 0x700 // 0x7 (PTRACE_SECCOMP_EVENT) << 8
+	_PTRACE_O_TRACESECCOMP                = 0x80  // 1 << 0x7 (PTRACE_SECCOMP_EVENT)
+)
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because some kernels have backported this behavior.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO))
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachedThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
+}
+
 // createStub creates a fresh stub processes.
 //
 // Precondition: the runtime OS thread must be locked.
 func createStub() (*thread, error) {
+	// The exact interactions of ptrace and seccomp are complex, and
+	// changed in recent kernel versions. Before commit 93e35efb8de45, the
+	// seccomp check is done before the ptrace emulation check. This means
+	// that any calls not matching this list will trigger the seccomp
+	// default action instead of notifying ptrace.
+	//
+	// After commit 93e35efb8de45, the seccomp check is done after the
+	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
+	// will never run for emulation. Seccomp will only run for injected
+	// system calls, and thus we can use RET_KILL as our violation action.
+	var defaultAction uint32
+	if probeSeccomp() {
+		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+	} else {
+		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_ALLOW)
+	}
+
+	// When creating the new child process, we specify SIGKILL as the
+	// signal to deliver when the child exits. We never expect a subprocess
+	// to exit; they are pooled and reused. This is done to ensure that if
+	// a subprocess is OOM-killed, this process (and all other stubs,
+	// transitively) will be killed as well. It's simply not possible to
+	// safely handle a single stub getting killed: the exact state of
+	// execution is unknown and not recoverable.
+	return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) {
+	// Create a BPF program that allows only the system calls needed by the
+	// stub and all its children. This is used to create child stubs
+	// (below), so we must include the ability to fork, but otherwise lock
+	// down available calls only to what is needed.
+	rules := []seccomp.RuleSet{
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				309:                      {}, // SYS_GETCPU.
+			},
+			Action:   uint32(linux.SECCOMP_RET_TRACE),
+			Vsyscall: true,
+		},
+	}
+	if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) {
+		rules = append(rules, seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_CLONE: []seccomp.Rule{
+					// Allow creation of new subprocesses (used by the master).
+					{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+					// Allow creation of new threads within a single address space (used by address spaces).
+					{seccomp.AllowValue(
+						syscall.CLONE_FILES |
+							syscall.CLONE_FS |
+							syscall.CLONE_SIGHAND |
+							syscall.CLONE_THREAD |
+							syscall.CLONE_PTRACE |
+							syscall.CLONE_VM)},
+				},
+
+				// For the initial process creation.
+				syscall.SYS_WAIT4: {},
+				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+				},
+				syscall.SYS_EXIT: {},
+
+				// For the stub prctl dance (all).
+				syscall.SYS_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+				},
+				syscall.SYS_GETPPID: {},
+
+				// For the stub to stop itself (all).
+				syscall.SYS_GETPID: {},
+				syscall.SYS_KILL: []seccomp.Rule{
+					{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+				},
+
+				// Injected to support the address space operations.
+				syscall.SYS_MMAP:   {},
+				syscall.SYS_MUNMAP: {},
+			},
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		})
+	}
+	instrs, err := seccomp.BuildProgram(rules, defaultAction)
+	if err != nil {
+		return nil, err
+	}
+
 	// Declare all variables up front in order to ensure that there's no
 	// need for allocations between beforeFork & afterFork.
 	var (
@@ -43,14 +196,8 @@ func createStub() (*thread, error) {
 	// Among other things, beforeFork masks all signals.
 	beforeFork()
 
-	// When creating the new child process, we specify SIGKILL as the
-	// signal to deliver when the child exits. We never expect a subprocess
-	// to exit; they are pooled and reused. This is done to ensure that if
-	// a subprocess is OOM-killed, this process (and all other stubs,
-	// transitively) will be killed as well. It's simply not possible to
-	// safely handle a single stub getting killed: the exact state of
-	// execution is unknown and not recoverable.
-	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+	// Do the clone.
+	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
 	if errno != 0 {
 		afterFork()
 		return nil, errno
@@ -67,7 +214,7 @@ func createStub() (*thread, error) {
 			tid:  int32(pid),
 			cpu:  ^uint32(0),
 		}
-		if sig := t.wait(); sig != syscall.SIGSTOP {
+		if sig := t.wait(stopped); sig != syscall.SIGSTOP {
 			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
 		}
 		t.attach()
@@ -86,6 +233,12 @@ func createStub() (*thread, error) {
 		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
 	}
 
+	// Set an aggressive BPF filter for the stub and all its children. See
+	// the description of the BPF program built above.
+	if errno := seccomp.SetFilter(instrs); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
 	// Enable cpuid-faulting; this may fail on older kernels or hardware,
 	// so we just disregard the result. Host CPUID will be enabled.
 	syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
@@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
 
 	// Pass the expected PPID to the child via R15.
-	regs := s.initRegs
+	regs := t.initRegs
 	regs.R15 = uint64(t.tgid)
 
 	// Call fork in a subprocess.
@@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
 	// If the child actually exited, the attach below will fail.
 	_, err = t.syscallIgnoreInterrupt(
-		&s.initRegs,
+		&t.initRegs,
 		syscall.SYS_WAIT4,
 		arch.SyscallArgument{Value: uintptr(pid)},
 		arch.SyscallArgument{Value: 0},
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index e1c8db67a..674554081 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -24,6 +24,7 @@ go_library(
         "//pkg/binary",
         "//pkg/bits",
         "//pkg/eventchannel",
+        "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
         "//pkg/sentry/socket/control",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index f2a22aaa5..a16f5490e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/bits"
 	"gvisor.googlesource.com/gvisor/pkg/eventchannel"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
 	pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto"
@@ -699,3 +700,13 @@ func EnableAll(sinks SinkType) {
 		table.FeatureEnable.EnableAll(flags)
 	}
 }
+
+func init() {
+	t, ok := Lookup(abi.Host, arch.Host)
+	if ok {
+		// Provide the native table as the lookup for seccomp
+		// debugging. This is best-effort. This is provided this way to
+		// avoid dependencies from seccomp to this package.
+		seccomp.SyscallName = t.Name
+	}
+}
-- 
cgit v1.2.3