-rw-r--r--  pkg/abi/BUILD                                   |   1
-rw-r--r--  pkg/abi/abi_linux.go                            |  20
-rw-r--r--  pkg/seccomp/BUILD                               |   3
-rw-r--r--  pkg/seccomp/seccomp.go                          | 224
-rw-r--r--  pkg/seccomp/seccomp_rules.go                    |   8
-rw-r--r--  pkg/seccomp/seccomp_test.go                     | 172
-rw-r--r--  pkg/seccomp/seccomp_unsafe.go                   |  24
-rw-r--r--  pkg/sentry/arch/arch_amd64.go                   |   5
-rw-r--r--  pkg/sentry/platform/ptrace/BUILD                |   2
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace_unsafe.go     |   2
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess.go        | 150
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_amd64.go  |  16
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux.go  | 175
-rw-r--r--  pkg/sentry/strace/BUILD                         |   1
-rw-r--r--  pkg/sentry/strace/strace.go                     |  11
15 files changed, 620 insertions(+), 194 deletions(-)
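Editor's note: the diff below replaces the seccomp package's single allow-list with stackable rule sets, each carrying its own BPF action, compiled into one program. As a quick orientation, here is a minimal sketch of how a caller might use the new API (BuildProgram, SetFilter, and RuleSet, all introduced in this change); the specific syscalls, rules, and the installExampleFilter name are illustrative only, not taken from this commit:

package stub

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/seccomp"
)

// installExampleFilter is hypothetical: it allows read(2) only on fd 0,
// traps write(2) so the violation surfaces as SIGSYS, and kills the task
// on everything else.
func installExampleFilter() error {
	instrs, err := seccomp.BuildProgram([]seccomp.RuleSet{
		{
			Rules: seccomp.SyscallRules{
				syscall.SYS_READ: []seccomp.Rule{
					{seccomp.AllowValue(0)}, // first argument must be 0
				},
			},
			Action: uint32(linux.SECCOMP_RET_ALLOW),
		},
		{
			Rules:  seccomp.SyscallRules{syscall.SYS_WRITE: {}},
			Action: uint32(linux.SECCOMP_RET_TRAP),
		},
	}, uint32(linux.SECCOMP_RET_KILL))
	if err != nil {
		return err
	}
	// SetFilter returns a syscall.Errno, which satisfies the error interface.
	if errno := seccomp.SetFilter(instrs); errno != 0 {
		return errno
	}
	return nil
}

Rule sets are evaluated in order for a given syscall, and anything that matches no rule set falls through to the default action (RET_KILL here).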
diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD
index c014d2c4b..1ba4f3a46 100644
--- a/pkg/abi/BUILD
+++ b/pkg/abi/BUILD
@@ -6,6 +6,7 @@ go_library(
     name = "abi",
     srcs = [
         "abi.go",
+        "abi_linux.go",
         "flag.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/pkg/abi",
diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go
new file mode 100644
index 000000000..dd5d67b51
--- /dev/null
+++ b/pkg/abi/abi_linux.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package abi
+
+// Host specifies the host ABI.
+const Host = Linux
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
index b3e2f0b38..1975d17a6 100644
--- a/pkg/seccomp/BUILD
+++ b/pkg/seccomp/BUILD
@@ -28,12 +28,9 @@ go_library(
     importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp",
     visibility = ["//visibility:public"],
     deps = [
-        "//pkg/abi",
         "//pkg/abi/linux",
         "//pkg/bpf",
         "//pkg/log",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/strace",
     ],
 )
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index 49da3c775..a746dc9b3 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -20,31 +20,36 @@ import (
 	"reflect"
 	"sort"
 
-	"gvisor.googlesource.com/gvisor/pkg/abi"
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/bpf"
 	"gvisor.googlesource.com/gvisor/pkg/log"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/strace"
 )
 
 const (
-	// violationLabel is added to the program to take action on a violation.
-	violationLabel = "violation"
-
 	// skipOneInst is the offset to take for skipping one instruction.
 	skipOneInst = 1
+
+	// defaultLabel is the label for the default action.
+	defaultLabel = "default_action"
 )
 
 // Install generates BPF code based on the set of syscalls provided. It only
-// allows syscalls that conform to the specification (*) and generates SIGSYS
+// allows syscalls that conform to the specification and generates SIGSYS
 // trap unless kill is set.
 //
-// (*) The current implementation only checks the syscall number. It does NOT
-// validate any of the arguments.
+// This is a convenience wrapper around BuildProgram and SetFilter.
 func Install(rules SyscallRules, kill bool) error {
 	log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill)
-	instrs, err := buildProgram(rules, kill)
+	defaultAction := uint32(linux.SECCOMP_RET_TRAP)
+	if kill {
+		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+	}
+	instrs, err := BuildProgram([]RuleSet{
+		RuleSet{
+			Rules:  rules,
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		},
+	}, defaultAction)
 	if log.IsLogging(log.Debug) {
 		programStr, errDecode := bpf.DecodeProgram(instrs)
 		if errDecode != nil {
@@ -56,60 +61,84 @@ func Install(rules SyscallRules, kill bool) error {
 		return err
 	}
 
-	if err := seccomp(instrs); err != nil {
-		return err
+	// Perform the actual installation.
+	if errno := SetFilter(instrs); errno != 0 {
+		return fmt.Errorf("failed to set filter: %v", errno)
 	}
 
 	log.Infof("Seccomp filters installed.")
 	return nil
 }
 
-// buildProgram builds a BPF program that whitelists all given syscall rules.
-func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) {
+// RuleSet is a set of rules and associated action.
+type RuleSet struct {
+	Rules  SyscallRules
+	Action uint32
+
+	// Vsyscall indicates that a check is made for a function being called
+	// from kernel mappings. This is where the vsyscall page is located
+	// (and typically emulated), so this RuleSet will not match any
+	// functions not dispatched from the vsyscall page.
+	Vsyscall bool
+}
+
+// SyscallName gives names to system calls. It is used purely for debugging purposes.
+//
+// An alternate namer can be provided to the package at initialization time.
+var SyscallName = func(sysno uintptr) string {
+	return fmt.Sprintf("syscall_%d", sysno)
+}
+
+// BuildProgram builds a BPF program from the given rule sets and their
+// associated actions. The single generated program covers all provided
+// RuleSets.
+func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) {
 	program := bpf.NewProgramBuilder()
-	violationAction := uint32(linux.SECCOMP_RET_KILL)
-	if !kill {
-		violationAction = linux.SECCOMP_RET_TRAP
-	}
 
 	// Be paranoid and check that syscall is done in the expected architecture.
 	//
 	// A = seccomp_data.arch
-	// if (A != AUDIT_ARCH_X86_64) goto violation
+	// if (A != AUDIT_ARCH_X86_64) goto defaultAction.
 	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch)
-	// violationLabel is at the bottom of the program. The size of program
+	// defaultLabel is at the bottom of the program. The size of the program
 	// may exceed 255 lines, which is the limit of a conditional jump.
 	program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0)
-	program.AddDirectJumpLabel(violationLabel)
-
+	program.AddDirectJumpLabel(defaultLabel)
 	if err := buildIndex(rules, program); err != nil {
 		return nil, err
 	}
 
-	// violation: return violationAction
-	if err := program.AddLabel(violationLabel); err != nil {
+	// Exhausted: return defaultAction.
+	if err := program.AddLabel(defaultLabel); err != nil {
 		return nil, err
 	}
-	program.AddStmt(bpf.Ret|bpf.K, violationAction)
+	program.AddStmt(bpf.Ret|bpf.K, defaultAction)
 
 	return program.Instructions()
 }
 
-// buildIndex builds a BST to quickly search through all syscalls that are whitelisted.
-func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
-	syscalls := []uintptr{}
-	for sysno := range rules {
-		syscalls = append(syscalls, sysno)
+// buildIndex builds a BST to quickly search through all syscalls.
+func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error {
+	// Build a list of all application system calls, across all given rule
+	// sets. We have a simple BST, but may dispatch individual matchers
+	// with different actions. The matchers are evaluated linearly.
+	requiredSyscalls := make(map[uintptr]struct{})
+	for _, rs := range rules {
+		for sysno := range rs.Rules {
+			requiredSyscalls[sysno] = struct{}{}
+		}
 	}
-
-	t, ok := strace.Lookup(abi.Linux, arch.AMD64)
-	if !ok {
-		panic("Can't find amd64 Linux syscall table")
+	syscalls := make([]uintptr, 0, len(requiredSyscalls))
+	for sysno := range requiredSyscalls {
+		syscalls = append(syscalls, sysno)
 	}
-
 	sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
-	for _, s := range syscalls {
-		log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s])
+	for _, sysno := range syscalls {
+		for _, rs := range rules {
+			// Print only if there is a corresponding set of rules.
+			if _, ok := rs.Rules[sysno]; ok {
+				log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action)
+			}
+		}
 	}
 
 	root := createBST(syscalls)
@@ -119,7 +148,7 @@ func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error {
 	//
 	// A = seccomp_data.nr
 	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR)
-	return root.traverse(buildBSTProgram, program, rules)
+	return root.traverse(buildBSTProgram, rules, program)
 }
 
 // createBST converts sorted syscall slice into a balanced BST.
@@ -136,15 +165,23 @@ func createBST(syscalls []uintptr) *node {
 	return &parent
 }
 
-func ruleViolationLabel(sysno uintptr, idx int) string {
-	return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx)
+func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string {
+	return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno)
+}
+
+func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string {
+	return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx)
 }
 
 func checkArgsLabel(sysno uintptr) string {
 	return fmt.Sprintf("checkArgs_%v", sysno)
 }
 
-func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error {
+// addSyscallArgsCheck adds argument checks for a single system call. It does
+// not insert a jump to the default action at the end and it is the
+// responsibility of the caller to insert an appropriate jump after calling
+// this function.
+func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error {
 	for ruleidx, rule := range rules {
 		labelled := false
 		for i, arg := range rule {
@@ -155,28 +192,29 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error {
 					high, low := uint32(a>>32), uint32(a)
 					// assert arg_low == low
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i))
-					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					// assert arg_high == high
 					p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i))
-					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx))
+					p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx))
 					labelled = true
-
 				default:
 					return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a))
 				}
 			}
 		}
-		// Matched, allow the syscall.
-		p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
-		// Label the end of the rule if necessary.
+
+		// Matched, emit the given action.
+		p.AddStmt(bpf.Ret|bpf.K, action)
+
+		// Label the end of the rule if necessary. This is added for
+		// the jumps above when the argument check fails.
 		if labelled {
-			if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil {
+			if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil {
 				return err
 			}
 		}
 	}
-	// Not matched?
-	p.AddDirectJumpLabel(violationLabel)
+
 	return nil
 }
 
@@ -188,16 +226,16 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error {
 // (A > 22) ? goto index_35 : goto index_9
 //
 // index_9:  // SYS_MMAP(9), leaf
-// A == 9) ? goto argument check : violation
+// (A == 9) ? goto argument check : defaultLabel
 //
 // index_35:  // SYS_NANOSLEEP(35), single child
 // (A == 35) ? goto argument check : continue
-// (A > 35) ? goto index_50 : goto violation
+// (A > 35) ? goto index_50 : goto defaultLabel
 //
 // index_50:  // SYS_LISTEN(50), leaf
-// (A == 50) ? goto argument check : goto violation
+// (A == 50) ? goto argument check : goto defaultLabel
 //
-func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error {
+func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error {
 	// Root node is never referenced by label, skip it.
 	if !n.root {
 		if err := program.AddLabel(n.label()); err != nil {
@@ -209,11 +247,10 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error {
 	program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0)
 	if n.left == nil && n.right == nil {
 		// Leaf nodes don't require extra check.
-		program.AddDirectJumpLabel(violationLabel)
+		program.AddDirectJumpLabel(defaultLabel)
 	} else {
 		// Non-leaf node. Check which turn to take otherwise. Using direct jumps
 		// in case that the offset may exceed the limit of a conditional jump (255)
-		// Note that 'violationLabel' is returned for nil children.
 		program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst)
 		program.AddDirectJumpLabel(n.right.label())
 		program.AddDirectJumpLabel(n.left.label())
@@ -222,12 +259,60 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error {
 	if err := program.AddLabel(checkArgsLabel(sysno)); err != nil {
 		return err
 	}
-	// No rules, just allow it and save one jmp.
-	if len(rules[sysno]) == 0 {
-		program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW)
-		return nil
+
+	emitted := false
+	for ruleSetIdx, rs := range rules {
+		if _, ok := rs.Rules[sysno]; ok {
+			// If there are no rules, then this will always match.
+			// Remember we've done this so that we can emit a
+			// sensible error. We can't catch all overlaps, but we
+			// can catch this one at least.
+			if emitted {
+				return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx)
+			}
+
+			// Emit a vsyscall check if this rule requires a
+			// Vsyscall match. This rule ensures that the top bit
+			// is set in the instruction pointer, which is where
+			// the vsyscall page will be mapped.
+			if rs.Vsyscall {
+				program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh)
+				program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno))
+			}
+
+			// Emit matchers.
+			if len(rs.Rules[sysno]) == 0 {
+				// This is a blanket action.
+				program.AddStmt(bpf.Ret|bpf.K, rs.Action)
+				emitted = true
+			} else {
+				// Add an argument check for these particular
+				// arguments. This will continue execution and
+				// check the next rule set. We need to ensure
+				// that at the very end, we insert a direct
+				// jump label for the unmatched case.
+				if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil {
+					return err
+				}
+			}
+
+			// If there was a Vsyscall check for this rule, then we
+			// need to add an appropriate label for the jump above.
+			if rs.Vsyscall {
+				if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil {
+					return err
+				}
+			}
+		}
 	}
-	return addSyscallArgsCheck(program, rules[sysno], sysno)
+
+	// Not matched? We only need to insert a jump to the default label if
+	// no default action has been emitted for this call.
+	if !emitted {
+		program.AddDirectJumpLabel(defaultLabel)
+	}
+
+	return nil
 }
 
 // node represents a tree node.
@@ -238,26 +323,27 @@ type node struct {
 	root  bool
 }
 
-// label returns the label corresponding to this node. If node is nil (syscall not present),
-// violationLabel is returned for convenience.
+// label returns the label corresponding to this node.
+//
+// If n is nil, then the defaultLabel is returned.
 func (n *node) label() string {
 	if n == nil {
-		return violationLabel
+		return defaultLabel
 	}
 	return fmt.Sprintf("index_%v", n.value)
 }
 
-type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error
+type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error
 
-func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error {
+func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error {
 	if n == nil {
 		return nil
 	}
-	if err := fn(p, rules, n); err != nil {
+	if err := fn(n, rules, p); err != nil {
 		return err
 	}
-	if err := n.left.traverse(fn, p, rules); err != nil {
+	if err := n.left.traverse(fn, rules, p); err != nil {
 		return err
 	}
-	return n.right.traverse(fn, p, rules)
+	return n.right.traverse(fn, rules, p)
 }
diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go
index 9215e5c90..6b707f195 100644
--- a/pkg/seccomp/seccomp_rules.go
+++ b/pkg/seccomp/seccomp_rules.go
@@ -24,9 +24,11 @@ import "fmt"
 //	__u64 args[6];
 // };
 const (
-	seccompDataOffsetNR   = 0
-	seccompDataOffsetArch = 4
-	seccompDataOffsetArgs = 16
+	seccompDataOffsetNR     = 0
+	seccompDataOffsetArch   = 4
+	seccompDataOffsetIPLow  = 8
+	seccompDataOffsetIPHigh = 12
+	seccompDataOffsetArgs   = 16
 )
 
 func seccompDataOffsetArgLow(i int) uint32 {
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 42cf85c03..0188ad4f3 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -76,14 +76,18 @@ func TestBasic(t *testing.T) {
 	}
 
 	for _, test := range []struct {
-		// filters are the set of syscall that are allowed.
-		filters SyscallRules
-		kill    bool
-		specs   []spec
+		ruleSets      []RuleSet
+		defaultAction uint32
+		specs         []spec
 	}{
 		{
-			filters: SyscallRules{1: {}},
-			kill:    false,
+			ruleSets: []RuleSet{
+				{
+					Rules:  SyscallRules{1: {}},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Single syscall allowed",
@@ -98,12 +102,61 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: {},
-				3: {},
-				5: {},
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0x1),
+							},
+						},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					Rules: SyscallRules{
+						1: {},
+						2: {},
+					},
+					Action: linux.SECCOMP_RET_TRAP,
+				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_KILL,
+			specs: []spec{
+				{
+					desc: "Multiple rulesets allowed (1a)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x1}},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Multiple rulesets allowed (1b)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple rulesets allowed (2)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple rulesets allowed (3)",
+					data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_KILL,
+				},
+			},
+		},
+		{
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+						3: {},
+						5: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Multiple syscalls allowed (1)",
@@ -148,8 +201,15 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{1: {}},
-			kill:    false,
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Wrong architecture",
@@ -159,26 +219,38 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{1: {}},
-			kill:    true,
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: {},
+					},
+					Action: linux.SECCOMP_RET_ALLOW,
+				},
+			},
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
-					desc: "Syscall disallowed, action kill",
+					desc: "Syscall disallowed, action trap",
 					data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
-					want: linux.SECCOMP_RET_KILL,
+					want: linux.SECCOMP_RET_TRAP,
 				},
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowAny{},
-						AllowValue(0xf),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowAny{},
+								AllowValue(0xf),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Syscall argument allowed",
@@ -193,17 +265,22 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowValue(0xf),
-					},
-					{
-						AllowValue(0xe),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0xf),
+							},
+							{
+								AllowValue(0xe),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "Syscall argument allowed, two rules",
@@ -218,16 +295,21 @@ func TestBasic(t *testing.T) {
 			},
 		},
 		{
-			filters: SyscallRules{
-				1: []Rule{
-					{
-						AllowValue(0),
-						AllowValue(math.MaxUint64 - 1),
-						AllowValue(math.MaxUint32),
+			ruleSets: []RuleSet{
+				{
+					Rules: SyscallRules{
+						1: []Rule{
+							{
+								AllowValue(0),
+								AllowValue(math.MaxUint64 - 1),
+								AllowValue(math.MaxUint32),
+							},
+						},
 					},
+					Action: linux.SECCOMP_RET_ALLOW,
 				},
 			},
-			kill: false,
+			defaultAction: linux.SECCOMP_RET_TRAP,
 			specs: []spec{
 				{
 					desc: "64bit syscall argument allowed",
@@ -259,7 +341,7 @@ func TestBasic(t *testing.T) {
 			},
 		},
 	} {
-		instrs, err := buildProgram(test.filters, test.kill)
+		instrs, err := BuildProgram(test.ruleSets, test.defaultAction)
 		if err != nil {
 			t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err)
 			continue
@@ -282,6 +364,7 @@ func TestBasic(t *testing.T) {
 	}
 }
 
+// TestRandom tests that randomly generated rules are encoded correctly.
 func TestRandom(t *testing.T) {
 	rand.Seed(time.Now().UnixNano())
 	size := rand.Intn(50) + 1
@@ -294,7 +377,12 @@ func TestRandom(t *testing.T) {
 	}
 
 	fmt.Printf("Testing filters: %v", syscallRules)
-	instrs, err := buildProgram(syscallRules, false)
+	instrs, err := BuildProgram([]RuleSet{
+		RuleSet{
+			Rules:  syscallRules,
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		},
+	}, uint32(linux.SECCOMP_RET_TRAP))
 	if err != nil {
 		t.Fatalf("buildProgram() got error: %v", err)
 	}
@@ -319,8 +407,8 @@ func TestRandom(t *testing.T) {
 	}
 }
 
-// TestReadDeal checks that a process dies when it trips over the filter and that it
-// doesn't die when the filter is not triggered.
+// TestRealDeal checks that a process dies when it trips over the filter and
+// that it doesn't die when the filter is not triggered.
 func TestRealDeal(t *testing.T) {
 	for _, test := range []struct {
 		die bool
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index 6682f8d9b..ae18534bf 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -17,7 +17,6 @@
 package seccomp
 
 import (
-	"fmt"
 	"syscall"
 	"unsafe"
 
@@ -31,19 +30,28 @@ type sockFprog struct {
 	Filter *linux.BPFInstruction
 }
 
-func seccomp(instrs []linux.BPFInstruction) error {
+// SetFilter installs the given BPF program.
+//
+// This is safe to call from an afterFork context.
+//
+//go:nosplit
+func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
 	// SYS_SECCOMP is not available in syscall package.
 	const SYS_SECCOMP = 317
 
 	// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
-	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 {
-		return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err)
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 {
+		return errno
 	}
 
-	sockProg := sockFprog{Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0]))}
 	// TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available.
-	if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 {
-		return fmt.Errorf("failed to set seccomp filter: %v", err)
+	sockProg := sockFprog{
+		Len:    uint16(len(instrs)),
+		Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
 	}
-	return nil
+	if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 {
+		return errno
+	}
+
+	return 0
 }
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index f1e408af9..5ba6c19ea 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// +build amd64
+
 package arch
 
 import (
@@ -26,6 +28,9 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
 )
 
+// Host specifies the host architecture.
+const Host = AMD64
+
 // These constants come directly from Linux.
 const (
 	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index ceee895dc..debae058b 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -19,6 +19,8 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/filemem",
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index b55b2795a..46a8bda8e 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) {
 		return nil, syscall.EINVAL
 	}
 	rval, err := t.syscallIgnoreInterrupt(
-		initRegs,
+		&t.initRegs,
 		syscall.SYS_CLONE,
 		arch.SyscallArgument{Value: uintptr(
 			syscall.CLONE_FILES |
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 035ebc332..6d5ad6b71 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -47,6 +47,11 @@ type thread struct {
 	tgid int32
 	tid  int32
 	cpu  uint32
+
+	// initRegs are the initial registers for the first thread.
+	//
+	// These are used for the register set for system calls.
+	initRegs syscall.PtraceRegs
 }
 
 // threadPool is a collection of threads.
@@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread)
 type subprocess struct {
 	platform.NoAddressSpaceIO
 
-	// initRegs are the initial registers for the first thread.
-	//
-	// These are used for the register set for system calls.
-	initRegs syscall.PtraceRegs
-
 	// requests is used to signal creation of new threads.
 	requests chan chan *thread
 
@@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 	// thread, and responding to requests to make additional threads in the
 	// traced process. The process will be killed and reaped when the
 	// request channel is closed, which happens in Release below.
-	var initRegs syscall.PtraceRegs
 	errChan := make(chan error)
 	requests := make(chan chan *thread)
 	go func() { // S/R-SAFE: Platform-related.
@@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
 			return
 		}
 
-		// Grab registers.
-		//
-		// Note that we adjust the current register RIP value to be
-		// just before the current system call executed. This depends
-		// on the definition of the stub itself.
-		if err := firstThread.getRegs(&initRegs); err != nil {
-			panic(fmt.Sprintf("ptrace get regs failed: %v", err))
-		}
-		initRegs.Rip -= initRegsRipAdjustment
-
 		// Ready to handle requests.
 		errChan <- nil
 
 		// Wait for requests to create threads.
 		for r := range requests {
-			t, err := firstThread.clone(&initRegs)
+			t, err := firstThread.clone(&firstThread.initRegs)
 			if err != nil {
 				// Should not happen: not recoverable.
panic(fmt.Sprintf("error initializing first thread: %v", err)) @@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // (Hopefully nobody tgkilled it with a signal < // SIGSTOP before the SIGSTOP was delivered, in which // case that signal would be delivered before SIGSTOP.) - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) } - // Detach the thread without suppressing the SIGSTOP, - // causing it to enter group-stop. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { - panic(fmt.Sprintf("can't detach new clone: %v", errno)) - } + // Detach the thread. + t.detach() // Return the thread. r <- t @@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // Ready. sp := &subprocess{ - initRegs: initRegs, requests: requests, sysemuThreads: threadPool{ threads: make(map[int32]*thread), @@ -277,16 +262,48 @@ func (t *thread) attach() { // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of // newSubprocess), so we always expect to see signal-delivery-stop with // SIGSTOP. - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig)) } // Initialize options. t.init() + + // Grab registers. + // + // Note that we adjust the current register RIP value to be just before + // the current system call executed. This depends on the definition of + // the stub itself. + if err := t.getRegs(&t.initRegs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + t.initRegs.Rip -= initRegsRipAdjustment } +// detach detachs from the thread. +// +// Because the SIGSTOP is not supressed, the thread will enter group-stop. +func (t *thread) detach() { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { + panic(fmt.Sprintf("can't detach new clone: %v", errno)) + } +} + +// waitOutcome is used for wait below. +type waitOutcome int + +const ( + // stopped indicates that the process was stopped. + stopped waitOutcome = iota + + // killed indicates that the process was killed. + killed +) + // wait waits for a stop event. -func (t *thread) wait() syscall.Signal { +// +// Precondition: outcome is a valid waitOutcome. +func (t *thread) wait(outcome waitOutcome) syscall.Signal { var status syscall.WaitStatus for { @@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal { if int(r) != int(t.tid) { panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) } - if !status.Stopped() { - panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) - } - if status.StopSignal() == 0 { - continue // Spurious stop. + switch outcome { + case stopped: + if !status.Stopped() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + } + stopSig := status.StopSignal() + if stopSig == 0 { + continue // Spurious stop. + } + if stopSig == syscall.SIGTRAP { + // Re-encode the trap cause the way it's expected. + return stopSig | syscall.Signal(status.TrapCause()<<8) + } + // Not a trap signal. 
+			return stopSig
+		case killed:
+			if !status.Exited() && !status.Signaled() {
+				panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+			}
+			return syscall.Signal(status.ExitStatus())
+		default:
+			// Should not happen.
+			panic(fmt.Sprintf("unknown outcome: %v", outcome))
 		}
-		return status.StopSignal()
 	}
 }
 
+// destroy kills the thread.
+//
+// Note that this should not be used in the general case; the death of threads
+// will typically cause the death of the parent. This is a utility method for
+// manually created threads.
+func (t *thread) destroy() {
+	t.detach()
+	syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL))
+	t.wait(killed)
+}
+
 // init initializes trace options.
 func (t *thread) init() {
-	// Set our TRACESYSGOOD option to differentiate real SIGTRAP.
+	// Set our TRACESYSGOOD option to differentiate real SIGTRAP. Also, we
+	// require the SECCOMP option to ensure that seccomp violations
+	// generate a ptrace event.
 	_, _, errno := syscall.RawSyscall6(
 		syscall.SYS_PTRACE,
 		syscall.PTRACE_SETOPTIONS,
 		uintptr(t.tid),
 		0,
-		syscall.PTRACE_O_TRACESYSGOOD,
+		syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP,
 		0, 0)
 	if errno != 0 {
 		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
@@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 		panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
 	}
 
-	sig := t.wait()
-	if sig == (0x80 | syscall.SIGTRAP) {
+	sig := t.wait(stopped)
+	if sig == (syscallEvent | syscall.SIGTRAP) {
 		// Reached syscall-enter-stop.
 		break
 	} else {
@@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
 	// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
 	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
 	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
-	if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) {
+	if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
 		panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
 	}
 
@@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() {
 //
 // This function returns true on a system call, false on a signal.
 func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
-	regs := &ac.StateData().Regs
-	s.resetSysemuRegs(regs)
+	// Lock the thread for ptrace operations.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
 
 	// Extract floating point state.
 	fpState := ac.FloatingPointData()
 	fpLen, _ := ac.FeatureSet().ExtendedStateSize()
 	useXsave := ac.FeatureSet().UseXsave()
 
-	// Lock the thread for ptrace operations.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
 	// Grab our thread from the pool.
 	currentTID := int32(procid.Current())
 	t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread)
 
+	// Reset necessary registers.
+	regs := &ac.StateData().Regs
+	t.resetSysemuRegs(regs)
+
 	// Check for interrupts, and ensure that future interrupts will signal t.
 	if !c.interrupt.Enable(t) {
 		// Pending interrupt; simulate.
@@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	}
 
 	// Wait for the syscall-enter stop.
-	sig := t.wait()
+	sig := t.wait(stopped)
 
 	// Refresh all registers.
 	if err := t.getRegs(regs); err != nil {
@@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
 	}
 
 	// Is it a system call?
-	if sig == (0x80 | syscall.SIGTRAP) {
+	if sig == (syscallEvent | syscall.SIGTRAP) {
 		// Ensure registers are sane.
 		updateSyscallRegs(regs)
 		return true
-	}
-
-	if sig == syscall.SIGSTOP {
+	} else if sig == (seccompEvent | syscall.SIGTRAP) {
+		// Seccomp is enabled, and caught the system call. This
+		// is an emulated vsyscall call, since those are caught
+		// only by seccomp and explicitly set to trace.
+		updateSyscallRegs(regs)
+		return true
+	} else if sig == syscall.SIGSTOP {
 		// SIGSTOP was delivered to another thread in the same thread
 		// group, which initiated another group stop. Just ignore it.
 		continue
@@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
 	currentTID := int32(procid.Current())
 	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
 
-	return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...)
+	return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...)
 }
 
 // MapFile implements platform.AddressSpace.MapFile.
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 8211215df..c38dc1ff8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -43,20 +43,20 @@ const (
 // resetSysemuRegs sets up emulation registers.
 //
 // This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
-	regs.Cs = s.initRegs.Cs
-	regs.Ss = s.initRegs.Ss
-	regs.Ds = s.initRegs.Ds
-	regs.Es = s.initRegs.Es
-	regs.Fs = s.initRegs.Fs
-	regs.Gs = s.initRegs.Gs
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+	regs.Cs = t.initRegs.Cs
+	regs.Ss = t.initRegs.Ss
+	regs.Ds = t.initRegs.Ds
+	regs.Es = t.initRegs.Es
+	regs.Fs = t.initRegs.Fs
+	regs.Gs = t.initRegs.Gs
 }
 
 // createSyscallRegs sets up syscall registers.
 //
 // This should be called to generate registers for a system call.
 func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
-	// Copy initial registers (RIP, segments, etc.).
+	// Copy initial registers.
 	regs := *initRegs
 
 	// Set our syscall number.
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index b212bbdfe..53adadadd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -21,14 +21,167 @@ import (
 	"syscall"
 
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
 )
 
+const (
+	syscallEvent           syscall.Signal = 0x80
+	seccompEvent           syscall.Signal = 0x700 // 0x7 (PTRACE_EVENT_SECCOMP) << 8
+	_PTRACE_O_TRACESECCOMP                = 0x80  // 1 << 0x7 (PTRACE_EVENT_SECCOMP)
+)
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is
+// dynamic because this behavior may have been backported to older kernels.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+	// Create a completely new, destroyable process.
+	t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO))
+	if err != nil {
+		panic(fmt.Sprintf("seccomp probe failed: %v", err))
+	}
+	defer t.destroy()
+
+	// Set registers to the yield system call. This call is not allowed
+	// by the filters specified in the attachedThread function.
+	regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+	if err := t.setRegs(&regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Attempt an emulation.
+		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		sig := t.wait(stopped)
+		if sig == (syscallEvent | syscall.SIGTRAP) {
+			// Did the seccomp errno hook already run? This would
+			// indicate that seccomp is first in line and we're
+			// less than 4.8.
+			if err := t.getRegs(&regs); err != nil {
+				panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+			}
+			if _, err := syscallReturnValue(&regs); err == nil {
+				// The seccomp errno mode ran first, and reset
+				// the error in the registers.
+				return false
+			}
+			// The seccomp hook did not run yet, and therefore it
+			// is safe to use RET_KILL mode for dispatched calls.
+			return true
+		}
+	}
+}
+
 // createStub creates a fresh stub process.
 //
 // Precondition: the runtime OS thread must be locked.
 func createStub() (*thread, error) {
+	// The exact interactions of ptrace and seccomp are complex, and
+	// changed in recent kernel versions. Before commit 93e35efb8de45, the
+	// seccomp check is done before the ptrace emulation check. This means
+	// that any calls not matching this list will trigger the seccomp
+	// default action instead of notifying ptrace.
+	//
+	// After commit 93e35efb8de45, the seccomp check is done after the
+	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
+	// will never run for emulation. Seccomp will only run for injected
+	// system calls, and thus we can use RET_KILL as our violation action.
+	var defaultAction uint32
+	if probeSeccomp() {
+		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+	} else {
+		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+		defaultAction = uint32(linux.SECCOMP_RET_ALLOW)
+	}
+
+	// When creating the new child process, we specify SIGKILL as the
+	// signal to deliver when the child exits. We never expect a subprocess
+	// to exit; they are pooled and reused. This is done to ensure that if
+	// a subprocess is OOM-killed, this process (and all other stubs,
+	// transitively) will be killed as well. It's simply not possible to
+	// safely handle a single stub getting killed: the exact state of
+	// execution is unknown and not recoverable.
+	return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) {
+	// Create a BPF program that allows only the system calls needed by the
+	// stub and all its children. This is used to create child stubs
+	// (below), so we must include the ability to fork, but otherwise lock
+	// down available calls only to what is needed.
+	rules := []seccomp.RuleSet{
+		// Rules for trapping vsyscall access.
+		seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_GETTIMEOFDAY: {},
+				syscall.SYS_TIME:         {},
+				309:                      {}, // SYS_GETCPU.
+			},
+			Action:   uint32(linux.SECCOMP_RET_TRACE),
+			Vsyscall: true,
+		},
+	}
+	if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) {
+		rules = append(rules, seccomp.RuleSet{
+			Rules: seccomp.SyscallRules{
+				syscall.SYS_CLONE: []seccomp.Rule{
+					// Allow creation of new subprocesses (used by the master).
+					{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+					// Allow creation of new threads within a single address space (used by address spaces).
+					{seccomp.AllowValue(
+						syscall.CLONE_FILES |
+							syscall.CLONE_FS |
+							syscall.CLONE_SIGHAND |
+							syscall.CLONE_THREAD |
+							syscall.CLONE_PTRACE |
+							syscall.CLONE_VM)},
+				},
+
+				// For the initial process creation.
+				syscall.SYS_WAIT4: {},
+				syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+				},
+				syscall.SYS_EXIT: {},
+
+				// For the stub prctl dance (all).
+				syscall.SYS_PRCTL: []seccomp.Rule{
+					{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+				},
+				syscall.SYS_GETPPID: {},
+
+				// For the stub to stop itself (all).
+				syscall.SYS_GETPID: {},
+				syscall.SYS_KILL: []seccomp.Rule{
+					{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+				},
+
+				// Injected to support the address space operations.
+				syscall.SYS_MMAP:   {},
+				syscall.SYS_MUNMAP: {},
+			},
+			Action: uint32(linux.SECCOMP_RET_ALLOW),
+		})
+	}
+	instrs, err := seccomp.BuildProgram(rules, defaultAction)
+	if err != nil {
+		return nil, err
+	}
+
 	// Declare all variables up front in order to ensure that there's no
 	// need for allocations between beforeFork & afterFork.
 	var (
@@ -43,14 +196,8 @@ func createStub() (*thread, error) {
 	// Among other things, beforeFork masks all signals.
 	beforeFork()
 
-	// When creating the new child process, we specify SIGKILL as the
-	// signal to deliver when the child exits. We never expect a subprocess
-	// to exit; they are pooled and reused. This is done to ensure that if
-	// a subprocess is OOM-killed, this process (and all other stubs,
-	// transitively) will be killed as well. It's simply not possible to
-	// safely handle a single stub getting killed: the exact state of
-	// execution is unknown and not recoverable.
-	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+	// Do the clone.
+	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
 	if errno != 0 {
 		afterFork()
 		return nil, errno
@@ -67,7 +214,7 @@ func createStub() (*thread, error) {
 		tid: int32(pid),
 		cpu: ^uint32(0),
 	}
-	if sig := t.wait(); sig != syscall.SIGSTOP {
+	if sig := t.wait(stopped); sig != syscall.SIGSTOP {
 		return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
 	}
 	t.attach()
@@ -86,6 +233,12 @@ func createStub() (*thread, error) {
 		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
 	}
 
+	// Set an aggressive BPF filter for the stub and all its children. See
+	// the description of the BPF program built above.
+	if errno := seccomp.SetFilter(instrs); errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
 	// Enable cpuid-faulting; this may fail on older kernels or hardware,
 	// so we just disregard the result. Host CPUID will be enabled.
 	syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
@@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
 
 	// Pass the expected PPID to the child via R15.
-	regs := s.initRegs
+	regs := t.initRegs
 	regs.R15 = uint64(t.tgid)
 
 	// Call fork in a subprocess.
@@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) {
 	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
 	// If the child actually exited, the attach below will fail.
 	_, err = t.syscallIgnoreInterrupt(
-		&s.initRegs,
+		&t.initRegs,
 		syscall.SYS_WAIT4,
 		arch.SyscallArgument{Value: uintptr(pid)},
 		arch.SyscallArgument{Value: 0},
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index e1c8db67a..674554081 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -24,6 +24,7 @@ go_library(
         "//pkg/binary",
         "//pkg/bits",
         "//pkg/eventchannel",
+        "//pkg/seccomp",
         "//pkg/sentry/arch",
         "//pkg/sentry/kernel",
         "//pkg/sentry/socket/control",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index f2a22aaa5..a16f5490e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/bits"
 	"gvisor.googlesource.com/gvisor/pkg/eventchannel"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
 	pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto"
@@ -699,3 +700,13 @@ func EnableAll(sinks SinkType) {
 		table.FeatureEnable.EnableAll(flags)
 	}
 }
+
+func init() {
+	t, ok := Lookup(abi.Host, arch.Host)
+	if ok {
+		// Provide the native table as the lookup for seccomp
+		// debugging. This is best-effort. It is provided this way to
+		// avoid dependencies from seccomp to this package.
+		seccomp.SyscallName = t.Name
+	}
+}
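Editor's note on the vsyscall trapping above: the Vsyscall rule sets key off the high 32 bits of seccomp_data.instruction_pointer (seccompDataOffsetIPHigh, added in seccomp_rules.go). The vsyscall page lives at the fixed kernel-half address 0xffffffffff600000 on x86-64 Linux, so the top bit of that high word is set for any call dispatched from it. A rough userspace sketch of the equivalent check (the function name here is ours, for illustration only):

package main

import "fmt"

// fromVsyscallPage mirrors the BPF test emitted for Vsyscall rule sets:
// load the high 32 bits of the instruction pointer and test the top bit
// (0x80000000), which is set for kernel-half addresses such as the
// vsyscall page.
func fromVsyscallPage(ip uint64) bool {
	return uint32(ip>>32)&0x80000000 != 0
}

func main() {
	fmt.Println(fromVsyscallPage(0xffffffffff600000)) // true: vsyscall page
	fmt.Println(fromVsyscallPage(0x00007f0000000000)) // false: ordinary userspace address
}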