 pkg/abi/BUILD                                  |   1
 pkg/abi/abi_linux.go                           |  20
 pkg/seccomp/BUILD                              |   3
 pkg/seccomp/seccomp.go                         | 224
 pkg/seccomp/seccomp_rules.go                   |   8
 pkg/seccomp/seccomp_test.go                    | 172
 pkg/seccomp/seccomp_unsafe.go                  |  24
 pkg/sentry/arch/arch_amd64.go                  |   5
 pkg/sentry/platform/ptrace/BUILD               |   2
 pkg/sentry/platform/ptrace/ptrace_unsafe.go    |   2
 pkg/sentry/platform/ptrace/subprocess.go       | 150
 pkg/sentry/platform/ptrace/subprocess_amd64.go |  16
 pkg/sentry/platform/ptrace/subprocess_linux.go | 175
 pkg/sentry/strace/BUILD                        |   1
 pkg/sentry/strace/strace.go                    |  11
 15 files changed, 620 insertions(+), 194 deletions(-)
diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index c014d2c4b..1ba4f3a46 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -6,6 +6,7 @@ go_library(
name = "abi",
srcs = [
"abi.go",
+ "abi_linux.go",
"flag.go",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/abi",
diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go
new file mode 100644
index 000000000..dd5d67b51
--- /dev/null
+++ b/pkg/abi/abi_linux.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package abi
+
+// Host specifies the host ABI.
+const Host = Linux
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index b3e2f0b38..1975d17a6 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -28,12 +28,9 @@ go_library(
importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp",
visibility = ["//visibility:public"],
deps = [
- "//pkg/abi",
"//pkg/abi/linux",
"//pkg/bpf",
"//pkg/log",
- "//pkg/sentry/arch",
- "//pkg/sentry/strace",
],
)
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index 49da3c775..a746dc9b3 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -20,31 +20,36 @@ import (
"reflect"
"sort"
- "gvisor.googlesource.com/gvisor/pkg/abi"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/bpf"
"gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
)
const (
- // violationLabel is added to the program to take action on a violation.
- violationLabel = "violation"
-
// skipOneInst is the offset to take for skipping one instruction.
skipOneInst = 1
+
+ // defaultLabel is the label for the default action.
+ defaultLabel = "default_action"
)
// Install generates BPF code based on the set of syscalls provided. It only
-// allows syscalls that conform to the specification (*) and generates SIGSYS
+// allows syscalls that conform to the specification and generates SIGSYS
// trap unless kill is set.
//
-// (*) The current implementation only checks the syscall number. It does NOT
-// validate any of the arguments.
+// This is a convenience wrapper around BuildProgram and SetFilter.
func Install(rules SyscallRules, kill bool) error {
log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill)
- instrs, err := buildProgram(rules, kill)
+ defaultAction := uint32(linux.SECCOMP_RET_TRAP)
+ if kill {
+ defaultAction = uint32(linux.SECCOMP_RET_KILL)
+ }
+ instrs, err := BuildProgram([]RuleSet{
+ RuleSet{
+ Rules: rules,
+ Action: uint32(linux.SECCOMP_RET_ALLOW),
+ },
+ }, defaultAction)
if log.IsLogging(log.Debug) {
programStr, errDecode := bpf.DecodeProgram(instrs)
if errDecode != nil {
@@ -56,60 +61,84 @@ func Install(rules SyscallRules, kill bool) error {
return err
}
- if err := seccomp(instrs); err != nil {
- return err
+ // Perform the actual installation.
+ if errno := SetFilter(instrs); errno != 0 {
+ return fmt.Errorf("failed to set filter: %v", errno)
}
log.Infof("Seccomp filters installed.")
return nil
}
-// buildProgram builds a BPF program that whitelists all given syscall rules.
-func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) {
+// RuleSet is a set of rules and associated action.
+type RuleSet struct {
+ Rules SyscallRules
+ Action uint32
+
+ // Vsyscall indicates that a check is made for a function being called
+ // from kernel mappings. This is where the vsyscall page is located
+ // (and typically emulated), so this RuleSet will not match any
+ // functions not dispatched from the vsyscall page.
+ Vsyscall bool
+}
+
+// SyscallName gives names to system calls. It is used purely for debugging purposes.
+//
+// An alternate namer can be provided to the package at initialization time.
+var SyscallName = func(sysno uintptr) string {
+ return fmt.Sprintf("syscall_%d", sysno)
+}
+
+// BuildProgram builds a BPF program from the given slice of RuleSets and
+// their associated actions. The single generated program covers all RuleSets.
+func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) {
program := bpf.NewProgramBuilder()
- violationAction := uint32(linux.SECCOMP_RET_KILL)
- if !kill {
- violationAction = linux.SECCOMP_RET_TRAP
- }
// Be paranoid and check that syscall is done in the expected architecture.
//
// A = seccomp_data.arch
- // if (A != AUDIT_ARCH_X86_64) goto violation
+ // if (A != AUDIT_ARCH_X86_64) goto defaultAction.
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
- // violationLabel is at the bottom of the program. The size of program
+ // defaultLabel is at the bottom of the program. The size of the program
// may exceed 255 instructions, which is the limit of a conditional jump.
program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0)
- program.AddDirectJumpLabel(violationLabel)
-
+ program.AddDirectJumpLabel(defaultLabel)
if err := buildIndex(rules, program); err != nil {
return nil, err
}
- // violation: return violationAction
- if err := program.AddLabel(violationLabel); err != nil {
+ // Exhausted: return defaultAction.
+ if err := program.AddLabel(defaultLabel); err != nil {
return nil, err
}
- program.AddStmt(bpf.Ret|bpf.K, violationAction)
+ program.AddStmt(bpf.Ret|bpf.K, defaultAction)
return program.Instructions()
}
-// buildIndex builds a BST to quickly search through all syscalls that are whitelisted.
-func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
- syscalls := []uintptr{}
- for sysno := range rules {
- syscalls = append(syscalls, sysno)
+// buildIndex builds a BST to quickly search through all syscalls.
+func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
+ // Build a list of all application system calls, across all given rule
+ // sets. We have a simple BST, but may dispatch individual matchers
+ // with different actions. The matchers are evaluated linearly.
+ requiredSyscalls := make(map[uintptr]struct{})
+ for _, rs := range rules {
+ for sysno := range rs.Rules {
+ requiredSyscalls[sysno] = struct{}{}
+ }
}
-
- t, ok := strace.Lookup(abi.Linux, arch.AMD64)
- if !ok {
- panic("Can't find amd64 Linux syscall table")
+ syscalls := make([]uintptr, 0, len(requiredSyscalls))
+ for sysno := range requiredSyscalls {
+ syscalls = append(syscalls, sysno)
}
-
sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
- for _, s := range syscalls {
- log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s])
+ for _, sysno := range syscalls {
+ for _, rs := range rules {
+ // Print only if there is a corresponding set of rules.
+ if _, ok := rs.Rules[sysno]; ok {
+ log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action)
+ }
+ }
}
root := createBST(syscalls)
@@ -119,7 +148,7 @@ func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
//
// A = seccomp_data.nr
program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
- return root.traverse(buildBSTProgram, program, rules)
+ return root.traverse(buildBSTProgram, rules, program)
}
// createBST converts sorted syscall slice into a balanced BST.
@@ -136,15 +165,23 @@ func createBST(syscalls []uintptr) *node {
return &parent
}
-func ruleViolationLabel(sysno uintptr, idx int) string {
- return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx)
+func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string {
+ return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno)
+}
+
+func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
+ return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx)
}
func checkArgsLabel(sysno uintptr) string {
return fmt.Sprintf("checkArgs_%v", sysno)
}
-func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error {
+// addSyscallArgsCheck adds argument checks for a single system call. It does
+// not insert a jump to the default action at the end and it is the
+// responsibility of the caller to insert an appropriate jump after calling
+// this function.
+func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error {
for ruleidx, rule := range rules {
labelled := false
for i, arg := range rule {
@@ -155,28 +192,29 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err
high, low := uint32(a>>32), uint32(a)
// assert arg_low == low
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
- p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx))
+ p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
// assert arg_high == high
p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
- p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx))
+ p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
labelled = true
-
default:
return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
}
}
}
- // Matched, allow the syscall.
- p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
- // Label the end of the rule if necessary.
+
+ // Matched, emit the given action.
+ p.AddStmt(bpf.Ret|bpf.K, action)
+
+ // Label the end of the rule if necessary. This is added for
+ // the jumps above when the argument check fails.
if labelled {
- if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil {
+ if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
return err
}
}
}
- // Not matched?
- p.AddDirectJumpLabel(violationLabel)
+
return nil
}
@@ -188,16 +226,16 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err
// (A > 22) ? goto index_35 : goto index_9
//
// index_9: // SYS_MMAP(9), leaf
-// A == 9) ? goto argument check : violation
+// (A == 9) ? goto argument check : goto defaultLabel
//
// index_35: // SYS_NANOSLEEP(35), single child
// (A == 35) ? goto argument check : continue
-// (A > 35) ? goto index_50 : goto violation
+// (A > 35) ? goto index_50 : goto defaultLabel
//
// index_50: // SYS_LISTEN(50), leaf
-// (A == 50) ? goto argument check : goto violation
+// (A == 50) ? goto argument check : goto defaultLabel
//
-func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error {
+func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
// Root node is never referenced by label, skip it.
if !n.root {
if err := program.AddLabel(n.label()); err != nil {
@@ -209,11 +247,10 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e
program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
if n.left == nil && n.right == nil {
// Leaf nodes don't require extra check.
- program.AddDirectJumpLabel(violationLabel)
+ program.AddDirectJumpLabel(defaultLabel)
} else {
// Non-leaf node. Check which turn to take otherwise. Using direct jumps
// in case that the offset may exceed the limit of a conditional jump (255)
- // Note that 'violationLabel' is returned for nil children.
program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
program.AddDirectJumpLabel(n.right.label())
program.AddDirectJumpLabel(n.left.label())
@@ -222,12 +259,60 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e
if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
return err
}
- // No rules, just allow it and save one jmp.
- if len(rules[sysno]) == 0 {
- program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
- return nil
+
+ emitted := false
+ for ruleSetIdx, rs := range rules {
+ if _, ok := rs.Rules[sysno]; ok {
+ // If there are no rules, then this will always match.
+ // Remember we've done this so that we can emit a
+ // sensible error. We can't catch all overlaps, but we
+ // can catch this one at least.
+ if emitted {
+ return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
+ }
+
+ // Emit a vsyscall check if this rule requires a
+ // Vsyscall match. This check ensures that the top bit
+ // of the instruction pointer is set, as it is for
+ // kernel addresses such as the vsyscall page.
+ if rs.Vsyscall {
+ program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
+ program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
+ }
+
+ // Emit matchers.
+ if len(rs.Rules[sysno]) == 0 {
+ // This is a blanket action.
+ program.AddStmt(bpf.Ret|bpf.K, rs.Action)
+ emitted = true
+ } else {
+ // Add an argument check for these particular
+ // arguments. On mismatch, execution continues
+ // to the next rule set. We must ensure that,
+ // at the very end, a direct jump to the
+ // default label covers the unmatched case.
+ if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
+ return err
+ }
+ }
+
+ // If there was a Vsyscall check for this rule, then we
+ // need to add an appropriate label for the jump above.
+ if rs.Vsyscall {
+ if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
+ return err
+ }
+ }
+ }
}
- return addSyscallArgsCheck(program, rules[sysno], sysno)
+
+ // Not matched? We only need to insert a jump to the default label if
+ // no blanket action has been emitted for this call.
+ if !emitted {
+ program.AddDirectJumpLabel(defaultLabel)
+ }
+
+ return nil
}
// node represents a tree node.
@@ -238,26 +323,27 @@ type node struct {
root bool
}
-// label returns the label corresponding to this node. If node is nil (syscall not present),
-// violationLabel is returned for convenience.
+// label returns the label corresponding to this node.
+//
+// If n is nil, then the defaultLabel is returned.
func (n *node) label() string {
if n == nil {
- return violationLabel
+ return defaultLabel
}
return fmt.Sprintf("index_%v", n.value)
}
-type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error
+type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error
-func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error {
+func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
if n == nil {
return nil
}
- if err := fn(p, rules, n); err != nil {
+ if err := fn(n, rules, p); err != nil {
return err
}
- if err := n.left.traverse(fn, p, rules); err != nil {
+ if err := n.left.traverse(fn, rules, p); err != nil {
return err
}
- return n.right.traverse(fn, p, rules)
+ return n.right.traverse(fn, rules, p)
}
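
(A usage sketch, not part of this change: the new BuildProgram API composes several rule sets into one program, with earlier rule sets matched first for a given syscall number. The syscalls and actions below are illustrative only.)

package main

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/seccomp"
)

func buildExample() ([]linux.BPFInstruction, error) {
	return seccomp.BuildProgram([]seccomp.RuleSet{
		{
			// Allow read(2) only on fd 0; on an argument mismatch,
			// evaluation falls through to the next rule set.
			Rules: seccomp.SyscallRules{
				syscall.SYS_READ: []seccomp.Rule{
					{seccomp.AllowValue(0)},
				},
			},
			Action: uint32(linux.SECCOMP_RET_ALLOW),
		},
		{
			// Blanket-trap any other read(2) and all write(2) calls.
			Rules: seccomp.SyscallRules{
				syscall.SYS_READ:  {},
				syscall.SYS_WRITE: {},
			},
			Action: uint32(linux.SECCOMP_RET_TRAP),
		},
	}, uint32(linux.SECCOMP_RET_KILL))
}
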
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 9215e5c90..6b707f195 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -24,9 +24,11 @@ import "fmt"
// __u64 args[6];
// };
const (
- seccompDataOffsetNR = 0
- seccompDataOffsetArch = 4
- seccompDataOffsetArgs = 16
+ seccompDataOffsetNR = 0
+ seccompDataOffsetArch = 4
+ seccompDataOffsetIPLow = 8
+ seccompDataOffsetIPHigh = 12
+ seccompDataOffsetArgs = 16
)
func seccompDataOffsetArgLow(i int) uint32 {
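
(Worked example, not part of this change: the offsets follow struct seccomp_data's layout, so the instruction pointer's low word sits at byte 8, its high word at byte 12, and args[i] occupies bytes 16+8*i through 23+8*i. A self-checking sketch using a Go mirror of the struct:)

package main

import (
	"fmt"
	"unsafe"
)

// seccompData mirrors struct seccomp_data from linux/seccomp.h.
type seccompData struct {
	nr                 int32
	arch               uint32
	instructionPointer uint64
	args               [6]uint64
}

func main() {
	var d seccompData
	fmt.Println(unsafe.Offsetof(d.arch))               // 4
	fmt.Println(unsafe.Offsetof(d.instructionPointer)) // 8 (low word); high word at 12
	fmt.Println(unsafe.Offsetof(d.args))               // 16; args[i] low at 16+8*i, high at 20+8*i
}
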
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 42cf85c03..0188ad4f3 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -76,14 +76,18 @@ func TestBasic(t *testing.T) {
}
for _, test := range []struct {
- // filters are the set of syscall that are allowed.
- filters SyscallRules
- kill bool
- specs []spec
+ ruleSets []RuleSet
+ defaultAction uint32
+ specs []spec
}{
{
- filters: SyscallRules{1: {}},
- kill: false,
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{1: {}},
+ Action: linux.SECCOMP_RET_ALLOW,
+ },
+ },
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "Single syscall allowed",
@@ -98,12 +102,61 @@ func TestBasic(t *testing.T) {
},
},
{
- filters: SyscallRules{
- 1: {},
- 3: {},
- 5: {},
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: []Rule{
+ {
+ AllowValue(0x1),
+ },
+ },
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ },
+ {
+ Rules: SyscallRules{
+ 1: {},
+ 2: {},
+ },
+ Action: linux.SECCOMP_RET_TRAP,
+ },
},
- kill: false,
+ defaultAction: linux.SECCOMP_RET_KILL,
+ specs: []spec{
+ {
+ desc: "Multiple rulesets allowed (1a)",
+ data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x1}},
+ want: linux.SECCOMP_RET_ALLOW,
+ },
+ {
+ desc: "Multiple rulesets allowed (1b)",
+ data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+ want: linux.SECCOMP_RET_TRAP,
+ },
+ {
+ desc: "Multiple rulesets allowed (2)",
+ data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+ want: linux.SECCOMP_RET_TRAP,
+ },
+ {
+ desc: "Multiple rulesets allowed (2)",
+ data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64},
+ want: linux.SECCOMP_RET_KILL,
+ },
+ },
+ },
+ {
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: {},
+ 3: {},
+ 5: {},
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ },
+ },
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "Multiple syscalls allowed (1)",
@@ -148,8 +201,15 @@ func TestBasic(t *testing.T) {
},
},
{
- filters: SyscallRules{1: {}},
- kill: false,
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: {},
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ },
+ },
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "Wrong architecture",
@@ -159,26 +219,38 @@ func TestBasic(t *testing.T) {
},
},
{
- filters: SyscallRules{1: {}},
- kill: true,
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: {},
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ },
+ },
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
- desc: "Syscall disallowed, action kill",
+ desc: "Syscall disallowed, action trap",
data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
- want: linux.SECCOMP_RET_KILL,
+ want: linux.SECCOMP_RET_TRAP,
},
},
},
{
- filters: SyscallRules{
- 1: []Rule{
- {
- AllowAny{},
- AllowValue(0xf),
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: []Rule{
+ {
+ AllowAny{},
+ AllowValue(0xf),
+ },
+ },
},
+ Action: linux.SECCOMP_RET_ALLOW,
},
},
- kill: false,
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "Syscall argument allowed",
@@ -193,17 +265,22 @@ func TestBasic(t *testing.T) {
},
},
{
- filters: SyscallRules{
- 1: []Rule{
- {
- AllowValue(0xf),
- },
- {
- AllowValue(0xe),
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: []Rule{
+ {
+ AllowValue(0xf),
+ },
+ {
+ AllowValue(0xe),
+ },
+ },
},
+ Action: linux.SECCOMP_RET_ALLOW,
},
},
- kill: false,
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "Syscall argument allowed, two rules",
@@ -218,16 +295,21 @@ func TestBasic(t *testing.T) {
},
},
{
- filters: SyscallRules{
- 1: []Rule{
- {
- AllowValue(0),
- AllowValue(math.MaxUint64 - 1),
- AllowValue(math.MaxUint32),
+ ruleSets: []RuleSet{
+ {
+ Rules: SyscallRules{
+ 1: []Rule{
+ {
+ AllowValue(0),
+ AllowValue(math.MaxUint64 - 1),
+ AllowValue(math.MaxUint32),
+ },
+ },
},
+ Action: linux.SECCOMP_RET_ALLOW,
},
},
- kill: false,
+ defaultAction: linux.SECCOMP_RET_TRAP,
specs: []spec{
{
desc: "64bit syscall argument allowed",
@@ -259,7 +341,7 @@ func TestBasic(t *testing.T) {
},
},
} {
- instrs, err := buildProgram(test.filters, test.kill)
+ instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
if err != nil {
t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err)
continue
@@ -282,6 +364,7 @@ func TestBasic(t *testing.T) {
}
}
+// TestRandom tests that randomly generated rules are encoded correctly.
func TestRandom(t *testing.T) {
rand.Seed(time.Now().UnixNano())
size := rand.Intn(50) + 1
@@ -294,7 +377,12 @@ func TestRandom(t *testing.T) {
}
fmt.Printf("Testing filters: %v", syscallRules)
- instrs, err := buildProgram(syscallRules, false)
+ instrs, err := BuildProgram([]RuleSet{
+ RuleSet{
+ Rules: syscallRules,
+ Action: uint32(linux.SECCOMP_RET_ALLOW),
+ },
+ }, uint32(linux.SECCOMP_RET_TRAP))
if err != nil {
t.Fatalf("buildProgram() got error: %v", err)
}
@@ -319,8 +407,8 @@ func TestRandom(t *testing.T) {
}
}
-// TestReadDeal checks that a process dies when it trips over the filter and that it
-// doesn't die when the filter is not triggered.
+// TestRealDeal checks that a process dies when it trips over the filter and
+// that it doesn't die when the filter is not triggered.
func TestRealDeal(t *testing.T) {
for _, test := range []struct {
die bool
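
(Sketch, not part of this change: each spec above is evaluated with the in-process BPF interpreter rather than by the kernel. bpf.Compile, bpf.Exec, and the dataAsInput marshalling helper are assumptions here, named for illustration only.)

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/bpf"
)

// evaluate compiles the generated instructions, then runs the interpreter
// over a single marshalled seccomp_data record.
func evaluate(instrs []linux.BPFInstruction, data *seccompData) (uint32, error) {
	p, err := bpf.Compile(instrs)
	if err != nil {
		return 0, err
	}
	// dataAsInput (hypothetical helper) serializes seccomp_data as the
	// little-endian byte view that Ld|Abs|W loads read.
	return bpf.Exec(p, dataAsInput(data))
}
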
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index 6682f8d9b..ae18534bf 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -17,7 +17,6 @@
package seccomp
import (
- "fmt"
"syscall"
"unsafe"
@@ -31,19 +30,28 @@ type sockFprog struct {
Filter *linux.BPFInstruction
}
-func seccomp(instrs []linux.BPFInstruction) error {
+// SetFilter installs the given BPF program.
+//
+// This is safe to call from an afterFork context.
+//
+//go:nosplit
+func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
// SYS_SECCOMP is not available in syscall package.
const SYS_SECCOMP = 317
// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
- if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 {
- return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err)
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 {
+ return errno
}
- sockProg := sockFprog{Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0]))}
// TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available.
- if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 {
- return fmt.Errorf("failed to set seccomp filter: %v", err)
+ sockProg := sockFprog{
+ Len: uint16(len(instrs)),
+ Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
}
- return nil
+ if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 {
+ return errno
+ }
+
+ return 0
}
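
(Usage sketch, not part of this change: SetFilter returns a raw errno so it can run in an afterFork context without allocating; ordinary callers go through the Install wrapper instead. The whitelist below is illustrative.)

package main

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/seccomp"
)

func installExample() error {
	// Allow read/write/exit_group and trap everything else
	// (kill=false selects SECCOMP_RET_TRAP as the default action).
	rules := seccomp.SyscallRules{
		syscall.SYS_READ:       {},
		syscall.SYS_WRITE:      {},
		syscall.SYS_EXIT_GROUP: {},
	}
	return seccomp.Install(rules, false)
}
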
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index f1e408af9..5ba6c19ea 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build amd64
+
package arch
import (
@@ -26,6 +28,9 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
+// Host specifies the host architecture.
+const Host = AMD64
+
// These constants come directly from Linux.
const (
// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index ceee895dc..debae058b 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -19,6 +19,8 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/filemem",
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index b55b2795a..46a8bda8e 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) {
return nil, syscall.EINVAL
}
rval, err := t.syscallIgnoreInterrupt(
- initRegs,
+ &t.initRegs,
syscall.SYS_CLONE,
arch.SyscallArgument{Value: uintptr(
syscall.CLONE_FILES |
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 035ebc332..6d5ad6b71 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -47,6 +47,11 @@ type thread struct {
tgid int32
tid int32
cpu uint32
+
+ // initRegs are the initial registers for the first thread.
+ //
+ // These are used for the register set for system calls.
+ initRegs syscall.PtraceRegs
}
// threadPool is a collection of threads.
@@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread)
type subprocess struct {
platform.NoAddressSpaceIO
- // initRegs are the initial registers for the first thread.
- //
- // These are used for the register set for system calls.
- initRegs syscall.PtraceRegs
-
// requests is used to signal creation of new threads.
requests chan chan *thread
@@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// thread, and responding to requests to make additional threads in the
// traced process. The process will be killed and reaped when the
// request channel is closed, which happens in Release below.
- var initRegs syscall.PtraceRegs
errChan := make(chan error)
requests := make(chan chan *thread)
go func() { // S/R-SAFE: Platform-related.
@@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
return
}
- // Grab registers.
- //
- // Note that we adjust the current register RIP value to be
- // just before the current system call executed. This depends
- // on the definition of the stub itself.
- if err := firstThread.getRegs(&initRegs); err != nil {
- panic(fmt.Sprintf("ptrace get regs failed: %v", err))
- }
- initRegs.Rip -= initRegsRipAdjustment
-
// Ready to handle requests.
errChan <- nil
// Wait for requests to create threads.
for r := range requests {
- t, err := firstThread.clone(&initRegs)
+ t, err := firstThread.clone(&firstThread.initRegs)
if err != nil {
// Should not happen: not recoverable.
panic(fmt.Sprintf("error initializing first thread: %v", err))
@@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// (Hopefully nobody tgkilled it with a signal <
// SIGSTOP before the SIGSTOP was delivered, in which
// case that signal would be delivered before SIGSTOP.)
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
}
- // Detach the thread without suppressing the SIGSTOP,
- // causing it to enter group-stop.
- if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
- panic(fmt.Sprintf("can't detach new clone: %v", errno))
- }
+ // Detach the thread.
+ t.detach()
// Return the thread.
r <- t
@@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// Ready.
sp := &subprocess{
- initRegs: initRegs,
requests: requests,
sysemuThreads: threadPool{
threads: make(map[int32]*thread),
@@ -277,16 +262,48 @@ func (t *thread) attach() {
// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
// newSubprocess), so we always expect to see signal-delivery-stop with
// SIGSTOP.
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
}
// Initialize options.
t.init()
+
+ // Grab registers.
+ //
+ // Note that we adjust the current register RIP value to be just before
+ // the current system call executed. This depends on the definition of
+ // the stub itself.
+ if err := t.getRegs(&t.initRegs); err != nil {
+ panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+ }
+ t.initRegs.Rip -= initRegsRipAdjustment
}
+// detach detaches from the thread.
+//
+// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
+func (t *thread) detach() {
+ if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
+ panic(fmt.Sprintf("can't detach new clone: %v", errno))
+ }
+}
+
+// waitOutcome is used for wait below.
+type waitOutcome int
+
+const (
+ // stopped indicates that the process was stopped.
+ stopped waitOutcome = iota
+
+ // killed indicates that the process was killed.
+ killed
+)
+
// wait waits for a stop event.
-func (t *thread) wait() syscall.Signal {
+//
+// Precondition: outcome is a valid waitOutcome.
+func (t *thread) wait(outcome waitOutcome) syscall.Signal {
var status syscall.WaitStatus
for {
@@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal {
if int(r) != int(t.tid) {
panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
}
- if !status.Stopped() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
- }
- if status.StopSignal() == 0 {
- continue // Spurious stop.
+ switch outcome {
+ case stopped:
+ if !status.Stopped() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+ }
+ stopSig := status.StopSignal()
+ if stopSig == 0 {
+ continue // Spurious stop.
+ }
+ if stopSig == syscall.SIGTRAP {
+ // Re-encode the trap cause the way it's expected.
+ return stopSig | syscall.Signal(status.TrapCause()<<8)
+ }
+ // Not a trap signal.
+ return stopSig
+ case killed:
+ if !status.Exited() && !status.Signaled() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+ }
+ return syscall.Signal(status.ExitStatus())
+ default:
+ // Should not happen.
+ panic(fmt.Sprintf("unknown outcome: %v", outcome))
}
- return status.StopSignal()
}
}
+// destroy kills the thread.
+//
+// Note that this should not be used in the general case; the death of threads
+// will typically cause the death of the parent. This is a utility method for
+// manually created threads.
+func (t *thread) destroy() {
+ t.detach()
+ syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL))
+ t.wait(killed)
+}
+
// init initializes trace options.
func (t *thread) init() {
- // Set our TRACESYSGOOD option to differeniate real SIGTRAP.
+ // Set our TRACESYSGOOD option to differentiate real SIGTRAP. Also, we
+ // require the SECCOMP option to ensure that seccomp violations
+ // generate a ptrace event.
_, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
syscall.PTRACE_SETOPTIONS,
uintptr(t.tid),
0,
- syscall.PTRACE_O_TRACESYSGOOD,
+ syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP,
0, 0)
if errno != 0 {
panic(fmt.Sprintf("ptrace set options failed: %v", errno))
@@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
}
- sig := t.wait()
- if sig == (0x80 | syscall.SIGTRAP) {
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
// Reached syscall-enter-stop.
break
} else {
@@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
// between syscall-enter-stop and syscall-exit-stop; it happens *after*
// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
- if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) {
+ if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
}
@@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() {
//
// This function returns true on a system call, false on a signal.
func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
- regs := &ac.StateData().Regs
- s.resetSysemuRegs(regs)
+ // Lock the thread for ptrace operations.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
// Extract floating point state.
fpState := ac.FloatingPointData()
fpLen, _ := ac.FeatureSet().ExtendedStateSize()
useXsave := ac.FeatureSet().UseXsave()
- // Lock the thread for ptrace operations.
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
// Grab our thread from the pool.
currentTID := int32(procid.Current())
t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread)
+ // Reset necessary registers.
+ regs := &ac.StateData().Regs
+ t.resetSysemuRegs(regs)
+
// Check for interrupts, and ensure that future interrupts will signal t.
if !c.interrupt.Enable(t) {
// Pending interrupt; simulate.
@@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
// Wait for the syscall-enter stop.
- sig := t.wait()
+ sig := t.wait(stopped)
// Refresh all registers.
if err := t.getRegs(regs); err != nil {
@@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
// Is it a system call?
- if sig == (0x80 | syscall.SIGTRAP) {
+ if sig == (syscallEvent | syscall.SIGTRAP) {
// Ensure registers are sane.
updateSyscallRegs(regs)
return true
- }
-
- if sig == syscall.SIGSTOP {
+ } else if sig == (seccompEvent | syscall.SIGTRAP) {
+ // Seccomp is enabled, and caught the system call. This
+ // is an emulated vsyscall call, since those are caught
+ // only by seccomp and explicitly set to trace.
+ updateSyscallRegs(regs)
+ return true
+ } else if sig == syscall.SIGSTOP {
// SIGSTOP was delivered to another thread in the same thread
// group, which initiated another group stop. Just ignore it.
continue
@@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
currentTID := int32(procid.Current())
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
- return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...)
+ return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...)
}
// MapFile implements platform.AddressSpace.MapFile.
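
(Sketch, not part of this change: wait() re-encodes SIGTRAP stops as (trap cause << 8) | SIGTRAP, which is what makes the syscallEvent/seccompEvent comparisons above work. A condensed view of the dispatch, using the constants defined in subprocess_linux.go below:)

package main

import "syscall"

const (
	syscallEvent syscall.Signal = 0x80     // PTRACE_O_TRACESYSGOOD marker.
	seccompEvent syscall.Signal = 0x7 << 8 // PTRACE_EVENT_SECCOMP << 8.
)

// classifyStop sketches how switchToApp distinguishes stop reasons.
func classifyStop(sig syscall.Signal) string {
	switch {
	case sig == syscallEvent|syscall.SIGTRAP:
		return "syscall-enter/exit stop"
	case sig == seccompEvent|syscall.SIGTRAP:
		return "seccomp SECCOMP_RET_TRACE stop (e.g. emulated vsyscall)"
	case sig == syscall.SIGSTOP:
		return "group stop (ignored)"
	default:
		return "signal delivery to forward to the context"
	}
}
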
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 8211215df..c38dc1ff8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -43,20 +43,20 @@ const (
// resetSysemuRegs sets up emulation registers.
//
// This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
- regs.Cs = s.initRegs.Cs
- regs.Ss = s.initRegs.Ss
- regs.Ds = s.initRegs.Ds
- regs.Es = s.initRegs.Es
- regs.Fs = s.initRegs.Fs
- regs.Gs = s.initRegs.Gs
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+ regs.Cs = t.initRegs.Cs
+ regs.Ss = t.initRegs.Ss
+ regs.Ds = t.initRegs.Ds
+ regs.Es = t.initRegs.Es
+ regs.Fs = t.initRegs.Fs
+ regs.Gs = t.initRegs.Gs
}
// createSyscallRegs sets up syscall registers.
//
// This should be called to generate registers for a system call.
func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
- // Copy initial registers (RIP, segments, etc.).
+ // Copy initial registers.
regs := *initRegs
// Set our syscall number.
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index b212bbdfe..53adadadd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -21,14 +21,167 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
)
+const (
+ syscallEvent syscall.Signal = 0x80
+ seccompEvent syscall.Signal = 0x700 // 0x7 (PTRACE_EVENT_SECCOMP) << 8
+ _PTRACE_O_TRACESECCOMP = 0x80 // 1 << 0x7 (PTRACE_EVENT_SECCOMP)
+)
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is
+// dynamic because the behavior may have been backported to older kernels.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+ // Create a completely new, destroyable process.
+ t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO))
+ if err != nil {
+ panic(fmt.Sprintf("seccomp probe failed: %v", err))
+ }
+ defer t.destroy()
+
+ // Set registers to the yield system call. This call is not allowed
+ // by the filters specified in the attachedThread function.
+ regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+ if err := t.setRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+ }
+
+ for {
+ // Attempt an emulation.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+ }
+
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
+ // Did the seccomp errno hook already run? This would
+ // indicate that seccomp is first in line and we're
+ // running on a kernel older than 4.8.
+ if err := t.getRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+ }
+ if _, err := syscallReturnValue(&regs); err == nil {
+ // The seccomp errno mode ran first, and reset
+ // the error in the registers.
+ return false
+ }
+ // The seccomp hook did not run yet, and therefore it
+ // is safe to use RET_KILL mode for dispatched calls.
+ return true
+ }
+ }
+}
+
// createStub creates a fresh stub process.
//
// Precondition: the runtime OS thread must be locked.
func createStub() (*thread, error) {
+ // The exact interactions of ptrace and seccomp are complex, and
+ // changed in recent kernel versions. Before commit 93e35efb8de45, the
+ // seccomp check is done before the ptrace emulation check. This means
+ // that any calls not matching this list will trigger the seccomp
+ // default action instead of notifying ptrace.
+ //
+ // After commit 93e35efb8de45, the seccomp check is done after the
+ // ptrace emulation check. This simplifies using SYSEMU, since seccomp
+ // will never run for emulation. Seccomp will only run for injected
+ // system calls, and thus we can use RET_KILL as our violation action.
+ var defaultAction uint32
+ if probeSeccomp() {
+ log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+ defaultAction = uint32(linux.SECCOMP_RET_KILL)
+ } else {
+ // We must rely on SYSEMU behavior; seccomp tracing with SYSEMU is broken.
+ log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+ defaultAction = uint32(linux.SECCOMP_RET_ALLOW)
+ }
+
+ // When creating the new child process, we specify SIGKILL as the
+ // signal to deliver when the child exits. We never expect a subprocess
+ // to exit; they are pooled and reused. This is done to ensure that if
+ // a subprocess is OOM-killed, this process (and all other stubs,
+ // transitively) will be killed as well. It's simply not possible to
+ // safely handle a single stub getting killed: the exact state of
+ // execution is unknown and not recoverable.
+ return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) {
+ // Create a BPF program that allows only the system calls needed by the
+ // stub and all its children. This is used to create child stubs
+ // (below), so we must include the ability to fork, but otherwise lock
+ // down available calls only to what is needed.
+ rules := []seccomp.RuleSet{
+ // Rules for trapping vsyscall access.
+ seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_GETTIMEOFDAY: {},
+ syscall.SYS_TIME: {},
+ 309: {}, // SYS_GETCPU.
+ },
+ Action: uint32(linux.SECCOMP_RET_TRACE),
+ Vsyscall: true,
+ },
+ }
+ if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) {
+ rules = append(rules, seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_CLONE: []seccomp.Rule{
+ // Allow creation of new subprocesses (used by the master).
+ {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+ // Allow creation of new threads within a single address space (used by address spaces).
+ {seccomp.AllowValue(
+ syscall.CLONE_FILES |
+ syscall.CLONE_FS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_THREAD |
+ syscall.CLONE_PTRACE |
+ syscall.CLONE_VM)},
+ },
+
+ // For the initial process creation.
+ syscall.SYS_WAIT4: {},
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ },
+ syscall.SYS_EXIT: {},
+
+ // For the stub prctl dance (all).
+ syscall.SYS_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+ },
+ syscall.SYS_GETPPID: {},
+
+ // For the stub to stop itself (all).
+ syscall.SYS_GETPID: {},
+ syscall.SYS_KILL: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+ },
+
+ // Injected to support the address space operations.
+ syscall.SYS_MMAP: {},
+ syscall.SYS_MUNMAP: {},
+ },
+ Action: uint32(linux.SECCOMP_RET_ALLOW),
+ })
+ }
+ instrs, err := seccomp.BuildProgram(rules, defaultAction)
+ if err != nil {
+ return nil, err
+ }
+
// Declare all variables up front in order to ensure that there's no
// need for allocations between beforeFork & afterFork.
var (
@@ -43,14 +196,8 @@ func createStub() (*thread, error) {
// Among other things, beforeFork masks all signals.
beforeFork()
- // When creating the new child process, we specify SIGKILL as the
- // signal to deliver when the child exits. We never expect a subprocess
- // to exit; they are pooled and reused. This is done to ensure that if
- // a subprocess is OOM-killed, this process (and all other stubs,
- // transitively) will be killed as well. It's simply not possible to
- // safely handle a single stub getting killed: the exact state of
- // execution is unknown and not recoverable.
- pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+ // Do the clone.
+ pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
if errno != 0 {
afterFork()
return nil, errno
@@ -67,7 +214,7 @@ func createStub() (*thread, error) {
tid: int32(pid),
cpu: ^uint32(0),
}
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
}
t.attach()
@@ -86,6 +233,12 @@ func createStub() (*thread, error) {
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
}
+ // Set an aggressive BPF filter for the stub and all its children. See
+ // the description of the BPF program built above.
+ if errno := seccomp.SetFilter(instrs); errno != 0 {
+ syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+ }
+
// Enable cpuid-faulting; this may fail on older kernels or hardware,
// so we just disregard the result. Host CPUID will be enabled.
syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
@@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) {
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
// Pass the expected PPID to the child via R15.
- regs := s.initRegs
+ regs := t.initRegs
regs.R15 = uint64(t.tgid)
// Call fork in a subprocess.
@@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) {
// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
// If the child actually exited, the attach below will fail.
_, err = t.syscallIgnoreInterrupt(
- &s.initRegs,
+ &t.initRegs,
syscall.SYS_WAIT4,
arch.SyscallArgument{Value: uintptr(pid)},
arch.SyscallArgument{Value: 0},
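
(Sketch, not part of this change: the "stub prctl dance" whitelisted above is the usual parent-death protocol. The real stub is implemented separately and differs in detail; the expected PPID arrives via R15 in this change.)

package main

import "syscall"

// stubStartup sketches the startup protocol that the PRCTL/GETPPID/GETPID/
// KILL rules above permit: die with the parent, verify the expected parent,
// then stop so the tracer can attach.
func stubStartup(expectedPPID int) {
	// Request SIGKILL when the parent dies.
	syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0)

	// Close the race: if the parent died before the prctl took effect,
	// the current parent no longer matches, so exit.
	if syscall.Getppid() != expectedPPID {
		syscall.RawSyscall(syscall.SYS_EXIT, 0, 0, 0)
	}

	// Stop ourselves; kill(getpid(), SIGSTOP) is the only kill the
	// filter allows.
	syscall.Kill(syscall.Getpid(), syscall.SIGSTOP)
}
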
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index e1c8db67a..674554081 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -24,6 +24,7 @@ go_library(
"//pkg/binary",
"//pkg/bits",
"//pkg/eventchannel",
+ "//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
"//pkg/sentry/socket/control",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index f2a22aaa5..a16f5490e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -28,6 +28,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/bits"
"gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto"
@@ -699,3 +700,13 @@ func EnableAll(sinks SinkType) {
table.FeatureEnable.EnableAll(flags)
}
}
+
+func init() {
+ t, ok := Lookup(abi.Host, arch.Host)
+ if ok {
+ // Provide the native table as the lookup for seccomp
+ // debugging. This is best-effort, and is wired up this way to
+ // avoid a dependency from seccomp on this package.
+ seccomp.SyscallName = t.Name
+ }
+}