From 463e73d46d76042c39050d02cf3b0f875e55eb01 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 10 Oct 2018 22:39:32 -0700 Subject: Add seccomp filter configuration to ptrace stubs. This is a defense-in-depth measure. If the sentry is compromised, this prevents system call injection to the stubs. There is some complexity with respect to ptrace and seccomp interactions, so this protection is not really available for kernel versions < 4.8; this is detected dynamically. Note that this also solves the vsyscall emulation issue by adding in appropriate trapping for those system calls. It does mean that a compromised sentry could theoretically inject these into the stub (ignoring the trap and resume, thereby allowing execution), but they are harmless. PiperOrigin-RevId: 216647581 Change-Id: Id06c232cbac1f9489b1803ec97f83097fcba8eb8 --- pkg/seccomp/BUILD | 3 - pkg/seccomp/seccomp.go | 224 +++++++++++++++++++++++++++++------------- pkg/seccomp/seccomp_rules.go | 8 +- pkg/seccomp/seccomp_test.go | 172 ++++++++++++++++++++++++-------- pkg/seccomp/seccomp_unsafe.go | 24 +++-- 5 files changed, 306 insertions(+), 125 deletions(-) (limited to 'pkg/seccomp') diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index b3e2f0b38..1975d17a6 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -28,12 +28,9 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp", visibility = ["//visibility:public"], deps = [ - "//pkg/abi", "//pkg/abi/linux", "//pkg/bpf", "//pkg/log", - "//pkg/sentry/arch", - "//pkg/sentry/strace", ], ) diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 49da3c775..a746dc9b3 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -20,31 +20,36 @@ import ( "reflect" "sort" - "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/strace" ) const ( - // violationLabel is added to the program to take action on a violation. - violationLabel = "violation" - // skipOneInst is the offset to take for skipping one instruction. skipOneInst = 1 + + // defaultLabel is the label for the default action. + defaultLabel = "default_action" ) // Install generates BPF code based on the set of syscalls provided. It only -// allows syscalls that conform to the specification (*) and generates SIGSYS +// allows syscalls that conform to the specification and generates SIGSYS // trap unless kill is set. // -// (*) The current implementation only checks the syscall number. It does NOT -// validate any of the arguments. +// This is a convenience wrapper around BuildProgram and SetFilter. func Install(rules SyscallRules, kill bool) error { log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill) - instrs, err := buildProgram(rules, kill) + defaultAction := uint32(linux.SECCOMP_RET_TRAP) + if kill { + defaultAction = uint32(linux.SECCOMP_RET_KILL) + } + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: rules, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }, + }, defaultAction) if log.IsLogging(log.Debug) { programStr, errDecode := bpf.DecodeProgram(instrs) if errDecode != nil { @@ -56,60 +61,84 @@ func Install(rules SyscallRules, kill bool) error { return err } - if err := seccomp(instrs); err != nil { - return err + // Perform the actual installation. + if errno := SetFilter(instrs); errno != 0 { + return fmt.Errorf("Failed to set filter: %v", errno) } log.Infof("Seccomp filters installed.") return nil } -// buildProgram builds a BPF program that whitelists all given syscall rules. -func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) { +// RuleSet is a set of rules and associated action. +type RuleSet struct { + Rules SyscallRules + Action uint32 + + // Vsyscall indicates that a check is made for a function being called + // from kernel mappings. This is where the vsyscall page is located + // (and typically) emulated, so this RuleSet will not match any + // functions not dispatched from the vsyscall page. + Vsyscall bool +} + +// SyscallName gives names to system calls. It is used purely for debugging purposes. +// +// An alternate namer can be provided to the package at initialization time. +var SyscallName = func(sysno uintptr) string { + return fmt.Sprintf("syscall_%d", sysno) +} + +// BuildProgram builds a BPF program from the given map of actions to matching +// SyscallRules. The single generated program covers all provided RuleSets. +func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) { program := bpf.NewProgramBuilder() - violationAction := uint32(linux.SECCOMP_RET_KILL) - if !kill { - violationAction = linux.SECCOMP_RET_TRAP - } // Be paranoid and check that syscall is done in the expected architecture. // // A = seccomp_data.arch - // if (A != AUDIT_ARCH_X86_64) goto violation + // if (A != AUDIT_ARCH_X86_64) goto defaultAction. program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) - // violationLabel is at the bottom of the program. The size of program + // defaultLabel is at the bottom of the program. The size of program // may exceeds 255 lines, which is the limit of a condition jump. program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0) - program.AddDirectJumpLabel(violationLabel) - + program.AddDirectJumpLabel(defaultLabel) if err := buildIndex(rules, program); err != nil { return nil, err } - // violation: return violationAction - if err := program.AddLabel(violationLabel); err != nil { + // Exhausted: return defaultAction. + if err := program.AddLabel(defaultLabel); err != nil { return nil, err } - program.AddStmt(bpf.Ret|bpf.K, violationAction) + program.AddStmt(bpf.Ret|bpf.K, defaultAction) return program.Instructions() } -// buildIndex builds a BST to quickly search through all syscalls that are whitelisted. -func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { - syscalls := []uintptr{} - for sysno := range rules { - syscalls = append(syscalls, sysno) +// buildIndex builds a BST to quickly search through all syscalls. +func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error { + // Build a list of all application system calls, across all given rule + // sets. We have a simple BST, but may dispatch individual matchers + // with different actions. The matchers are evaluated linearly. + requiredSyscalls := make(map[uintptr]struct{}) + for _, rs := range rules { + for sysno := range rs.Rules { + requiredSyscalls[sysno] = struct{}{} + } } - - t, ok := strace.Lookup(abi.Linux, arch.AMD64) - if !ok { - panic("Can't find amd64 Linux syscall table") + syscalls := make([]uintptr, 0, len(requiredSyscalls)) + for sysno, _ := range requiredSyscalls { + syscalls = append(syscalls, sysno) } - sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) - for _, s := range syscalls { - log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s]) + for _, sysno := range syscalls { + for _, rs := range rules { + // Print only if there is a corresponding set of rules. + if _, ok := rs.Rules[sysno]; ok { + log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action) + } + } } root := createBST(syscalls) @@ -119,7 +148,7 @@ func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { // // A = seccomp_data.nr program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) - return root.traverse(buildBSTProgram, program, rules) + return root.traverse(buildBSTProgram, rules, program) } // createBST converts sorted syscall slice into a balanced BST. @@ -136,15 +165,23 @@ func createBST(syscalls []uintptr) *node { return &parent } -func ruleViolationLabel(sysno uintptr, idx int) string { - return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx) +func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string { + return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno) +} + +func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string { + return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx) } func checkArgsLabel(sysno uintptr) string { return fmt.Sprintf("checkArgs_%v", sysno) } -func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error { +// addSyscallArgsCheck adds argument checks for a single system call. It does +// not insert a jump to the default action at the end and it is the +// responsibility of the caller to insert an appropriate jump after calling +// this function. +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error { for ruleidx, rule := range rules { labelled := false for i, arg := range rule { @@ -155,28 +192,29 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err high, low := uint32(a>>32), uint32(a) // assert arg_low == low p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) - p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) // assert arg_high == high p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) - p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) labelled = true - default: return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a)) } } } - // Matched, allow the syscall. - p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) - // Label the end of the rule if necessary. + + // Matched, emit the given action. + p.AddStmt(bpf.Ret|bpf.K, action) + + // Label the end of the rule if necessary. This is added for + // the jumps above when the argument check fails. if labelled { - if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil { + if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil { return err } } } - // Not matched? - p.AddDirectJumpLabel(violationLabel) + return nil } @@ -188,16 +226,16 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err // (A > 22) ? goto index_35 : goto index_9 // // index_9: // SYS_MMAP(9), leaf -// A == 9) ? goto argument check : violation +// A == 9) ? goto argument check : defaultLabel // // index_35: // SYS_NANOSLEEP(35), single child // (A == 35) ? goto argument check : continue -// (A > 35) ? goto index_50 : goto violation +// (A > 35) ? goto index_50 : goto defaultLabel // // index_50: // SYS_LISTEN(50), leaf -// (A == 50) ? goto argument check : goto violation +// (A == 50) ? goto argument check : goto defaultLabel // -func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error { +func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error { // Root node is never referenced by label, skip it. if !n.root { if err := program.AddLabel(n.label()); err != nil { @@ -209,11 +247,10 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0) if n.left == nil && n.right == nil { // Leaf nodes don't require extra check. - program.AddDirectJumpLabel(violationLabel) + program.AddDirectJumpLabel(defaultLabel) } else { // Non-leaf node. Check which turn to take otherwise. Using direct jumps // in case that the offset may exceed the limit of a conditional jump (255) - // Note that 'violationLabel' is returned for nil children. program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst) program.AddDirectJumpLabel(n.right.label()) program.AddDirectJumpLabel(n.left.label()) @@ -222,12 +259,60 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e if err := program.AddLabel(checkArgsLabel(sysno)); err != nil { return err } - // No rules, just allow it and save one jmp. - if len(rules[sysno]) == 0 { - program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) - return nil + + emitted := false + for ruleSetIdx, rs := range rules { + if _, ok := rs.Rules[sysno]; ok { + // If there are no rules, then this will always match. + // Remember we've done this so that we can emit a + // sensible error. We can't catch all overlaps, but we + // can catch this one at least. + if emitted { + return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx) + } + + // Emit a vsyscall check if this rule requires a + // Vsyscall match. This rule ensures that the top bit + // is set in the instruction pointer, which is where + // the vsyscall page will be mapped. + if rs.Vsyscall { + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh) + program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno)) + } + + // Emit matchers. + if len(rs.Rules[sysno]) == 0 { + // This is a blanket action. + program.AddStmt(bpf.Ret|bpf.K, rs.Action) + emitted = true + } else { + // Add an argument check for these particular + // arguments. This will continue execution and + // check the next rule set. We need to ensure + // that at the very end, we insert a direct + // jump label for the unmatched case. + if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil { + return err + } + } + + // If there was a Vsyscall check for this rule, then we + // need to add an appropriate label for the jump above. + if rs.Vsyscall { + if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil { + return err + } + } + } } - return addSyscallArgsCheck(program, rules[sysno], sysno) + + // Not matched? We only need to insert a jump to the default label if + // not default action has been emitted for this call. + if !emitted { + program.AddDirectJumpLabel(defaultLabel) + } + + return nil } // node represents a tree node. @@ -238,26 +323,27 @@ type node struct { root bool } -// label returns the label corresponding to this node. If node is nil (syscall not present), -// violationLabel is returned for convenience. +// label returns the label corresponding to this node. +// +// If n is nil, then the defaultLabel is returned. func (n *node) label() string { if n == nil { - return violationLabel + return defaultLabel } return fmt.Sprintf("index_%v", n.value) } -type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error +type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error -func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error { +func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error { if n == nil { return nil } - if err := fn(p, rules, n); err != nil { + if err := fn(n, rules, p); err != nil { return err } - if err := n.left.traverse(fn, p, rules); err != nil { + if err := n.left.traverse(fn, rules, p); err != nil { return err } - return n.right.traverse(fn, p, rules) + return n.right.traverse(fn, rules, p) } diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 9215e5c90..6b707f195 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -24,9 +24,11 @@ import "fmt" // __u64 args[6]; // }; const ( - seccompDataOffsetNR = 0 - seccompDataOffsetArch = 4 - seccompDataOffsetArgs = 16 + seccompDataOffsetNR = 0 + seccompDataOffsetArch = 4 + seccompDataOffsetIPLow = 8 + seccompDataOffsetIPHigh = 12 + seccompDataOffsetArgs = 16 ) func seccompDataOffsetArgLow(i int) uint32 { diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 42cf85c03..0188ad4f3 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -76,14 +76,18 @@ func TestBasic(t *testing.T) { } for _, test := range []struct { - // filters are the set of syscall that are allowed. - filters SyscallRules - kill bool - specs []spec + ruleSets []RuleSet + defaultAction uint32 + specs []spec }{ { - filters: SyscallRules{1: {}}, - kill: false, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{1: {}}, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Single syscall allowed", @@ -98,12 +102,61 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: {}, - 3: {}, - 5: {}, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0x1), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + { + Rules: SyscallRules{ + 1: {}, + 2: {}, + }, + Action: linux.SECCOMP_RET_TRAP, + }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_KILL, + specs: []spec{ + { + desc: "Multiple rulesets allowed (1a)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x1}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "Multiple rulesets allowed (1b)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "Multiple rulesets allowed (2)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "Multiple rulesets allowed (2)", + data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_KILL, + }, + }, + }, + { + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + 3: {}, + 5: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Multiple syscalls allowed (1)", @@ -148,8 +201,15 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{1: {}}, - kill: false, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Wrong architecture", @@ -159,26 +219,38 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{1: {}}, - kill: true, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { - desc: "Syscall disallowed, action kill", + desc: "Syscall disallowed, action trap", data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64}, - want: linux.SECCOMP_RET_KILL, + want: linux.SECCOMP_RET_TRAP, }, }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowAny{}, - AllowValue(0xf), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowAny{}, + AllowValue(0xf), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Syscall argument allowed", @@ -193,17 +265,22 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowValue(0xf), - }, - { - AllowValue(0xe), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0xf), + }, + { + AllowValue(0xe), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Syscall argument allowed, two rules", @@ -218,16 +295,21 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowValue(0), - AllowValue(math.MaxUint64 - 1), - AllowValue(math.MaxUint32), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0), + AllowValue(math.MaxUint64 - 1), + AllowValue(math.MaxUint32), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "64bit syscall argument allowed", @@ -259,7 +341,7 @@ func TestBasic(t *testing.T) { }, }, } { - instrs, err := buildProgram(test.filters, test.kill) + instrs, err := BuildProgram(test.ruleSets, test.defaultAction) if err != nil { t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err) continue @@ -282,6 +364,7 @@ func TestBasic(t *testing.T) { } } +// TestRandom tests that randomly generated rules are encoded correctly. func TestRandom(t *testing.T) { rand.Seed(time.Now().UnixNano()) size := rand.Intn(50) + 1 @@ -294,7 +377,12 @@ func TestRandom(t *testing.T) { } fmt.Printf("Testing filters: %v", syscallRules) - instrs, err := buildProgram(syscallRules, false) + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: syscallRules, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }, + }, uint32(linux.SECCOMP_RET_TRAP)) if err != nil { t.Fatalf("buildProgram() got error: %v", err) } @@ -319,8 +407,8 @@ func TestRandom(t *testing.T) { } } -// TestReadDeal checks that a process dies when it trips over the filter and that it -// doesn't die when the filter is not triggered. +// TestReadDeal checks that a process dies when it trips over the filter and +// that it doesn't die when the filter is not triggered. func TestRealDeal(t *testing.T) { for _, test := range []struct { die bool diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index 6682f8d9b..ae18534bf 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -17,7 +17,6 @@ package seccomp import ( - "fmt" "syscall" "unsafe" @@ -31,19 +30,28 @@ type sockFprog struct { Filter *linux.BPFInstruction } -func seccomp(instrs []linux.BPFInstruction) error { +// SetFilter installs the given BPF program. +// +// This is safe to call from an afterFork context. +// +//go:nosplit +func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { // SYS_SECCOMP is not available in syscall package. const SYS_SECCOMP = 317 // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. - if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 { - return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err) + if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 { + return errno } - sockProg := sockFprog{Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0]))} // TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available. - if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 { - return fmt.Errorf("failed to set seccomp filter: %v", err) + sockProg := sockFprog{ + Len: uint16(len(instrs)), + Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - return nil + if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 { + return errno + } + + return 0 } -- cgit v1.2.3