diff options
Diffstat (limited to 'pkg/seccomp/seccomp.go')
-rw-r--r-- | pkg/seccomp/seccomp.go | 404 |
1 files changed, 404 insertions, 0 deletions
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go new file mode 100644 index 000000000..55fd6967e --- /dev/null +++ b/pkg/seccomp/seccomp.go @@ -0,0 +1,404 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccomp provides basic seccomp filters for x86_64 (little endian). +package seccomp + +import ( + "fmt" + "reflect" + "sort" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/log" +) + +const ( + // skipOneInst is the offset to take for skipping one instruction. + skipOneInst = 1 + + // defaultLabel is the label for the default action. + defaultLabel = "default_action" +) + +// Install generates BPF code based on the set of syscalls provided. It only +// allows syscalls that conform to the specification. Syscalls that violate the +// specification will trigger RET_KILL_PROCESS, except for the cases below. +// +// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the +// following cases: +// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the +// offending thread and often keeps the sentry hanging. +// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is +// much easier to debug then RET_KILL_PROCESS which can't be caught. +// +// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, +// making it possible for the process to continue running after a violation. +// However, it will leave a SECCOMP audit event trail behind. In any case, the +// syscall is still blocked from executing. +func Install(rules SyscallRules) error { + defaultAction, err := defaultAction() + if err != nil { + return err + } + + // Uncomment to get stack trace when there is a violation. + // defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP) + + log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction) + + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: rules, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, defaultAction) + if log.IsLogging(log.Debug) { + programStr, errDecode := bpf.DecodeProgram(instrs) + if errDecode != nil { + programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr) + } + log.Debugf("Seccomp program dump:\n%s", programStr) + } + if err != nil { + return err + } + + // Perform the actual installation. + if errno := SetFilter(instrs); errno != 0 { + return fmt.Errorf("Failed to set filter: %v", errno) + } + + log.Infof("Seccomp filters installed.") + return nil +} + +func defaultAction() (linux.BPFAction, error) { + available, err := isKillProcessAvailable() + if err != nil { + return 0, err + } + if available { + return linux.SECCOMP_RET_KILL_PROCESS, nil + } + return linux.SECCOMP_RET_TRAP, nil +} + +// RuleSet is a set of rules and associated action. +type RuleSet struct { + Rules SyscallRules + Action linux.BPFAction + + // Vsyscall indicates that a check is made for a function being called + // from kernel mappings. This is where the vsyscall page is located + // (and typically) emulated, so this RuleSet will not match any + // functions not dispatched from the vsyscall page. + Vsyscall bool +} + +// SyscallName gives names to system calls. It is used purely for debugging purposes. +// +// An alternate namer can be provided to the package at initialization time. +var SyscallName = func(sysno uintptr) string { + return fmt.Sprintf("syscall_%d", sysno) +} + +// BuildProgram builds a BPF program from the given map of actions to matching +// SyscallRules. The single generated program covers all provided RuleSets. +func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) { + program := bpf.NewProgramBuilder() + + // Be paranoid and check that syscall is done in the expected architecture. + // + // A = seccomp_data.arch + // if (A != AUDIT_ARCH) goto defaultAction. + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) + // defaultLabel is at the bottom of the program. The size of program + // may exceeds 255 lines, which is the limit of a condition jump. + program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0) + program.AddDirectJumpLabel(defaultLabel) + if err := buildIndex(rules, program); err != nil { + return nil, err + } + + // Exhausted: return defaultAction. + if err := program.AddLabel(defaultLabel); err != nil { + return nil, err + } + program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction)) + + return program.Instructions() +} + +// buildIndex builds a BST to quickly search through all syscalls. +func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error { + // Build a list of all application system calls, across all given rule + // sets. We have a simple BST, but may dispatch individual matchers + // with different actions. The matchers are evaluated linearly. + requiredSyscalls := make(map[uintptr]struct{}) + for _, rs := range rules { + for sysno := range rs.Rules { + requiredSyscalls[sysno] = struct{}{} + } + } + syscalls := make([]uintptr, 0, len(requiredSyscalls)) + for sysno, _ := range requiredSyscalls { + syscalls = append(syscalls, sysno) + } + sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) + for _, sysno := range syscalls { + for _, rs := range rules { + // Print only if there is a corresponding set of rules. + if _, ok := rs.Rules[sysno]; ok { + log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action) + } + } + } + + root := createBST(syscalls) + root.root = true + + // Load syscall number into A and run through BST. + // + // A = seccomp_data.nr + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) + return root.traverse(buildBSTProgram, rules, program) +} + +// createBST converts sorted syscall slice into a balanced BST. +// Panics if syscalls is empty. +func createBST(syscalls []uintptr) *node { + i := len(syscalls) / 2 + parent := node{value: syscalls[i]} + if i > 0 { + parent.left = createBST(syscalls[:i]) + } + if i+1 < len(syscalls) { + parent.right = createBST(syscalls[i+1:]) + } + return &parent +} + +func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string { + return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno) +} + +func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string { + return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx) +} + +func ruleLabel(ruleSetIdx int, sysno uintptr, idx int, name string) string { + return fmt.Sprintf("rule_%v_%v_%v_%v", ruleSetIdx, sysno, idx, name) +} + +func checkArgsLabel(sysno uintptr) string { + return fmt.Sprintf("checkArgs_%v", sysno) +} + +// addSyscallArgsCheck adds argument checks for a single system call. It does +// not insert a jump to the default action at the end and it is the +// responsibility of the caller to insert an appropriate jump after calling +// this function. +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error { + for ruleidx, rule := range rules { + labelled := false + for i, arg := range rule { + if arg != nil { + switch a := arg.(type) { + case AllowAny: + case AllowValue: + dataOffsetLow := seccompDataOffsetArgLow(i) + dataOffsetHigh := seccompDataOffsetArgHigh(i) + if i == RuleIP { + dataOffsetLow = seccompDataOffsetIPLow + dataOffsetHigh = seccompDataOffsetIPHigh + } + high, low := uint32(a>>32), uint32(a) + // assert arg_low == low + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + // assert arg_high == high + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + labelled = true + case GreaterThan: + dataOffsetLow := seccompDataOffsetArgLow(i) + dataOffsetHigh := seccompDataOffsetArgHigh(i) + if i == RuleIP { + dataOffsetLow = seccompDataOffsetIPLow + dataOffsetHigh = seccompDataOffsetIPHigh + } + labelGood := fmt.Sprintf("gt%v", i) + high, low := uint32(a>>32), uint32(a) + // assert arg_high < high + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetHigh) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jge|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + // arg_high > high + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood)) + // arg_low < low + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, dataOffsetLow) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jgt|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + p.AddLabel(ruleLabel(ruleSetIdx, sysno, ruleidx, labelGood)) + labelled = true + default: + return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a)) + } + } + } + + // Matched, emit the given action. + p.AddStmt(bpf.Ret|bpf.K, uint32(action)) + + // Label the end of the rule if necessary. This is added for + // the jumps above when the argument check fails. + if labelled { + if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil { + return err + } + } + } + + return nil +} + +// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code +// is as follows: +// +// // SYS_PIPE(22), root +// (A == 22) ? goto argument check : continue +// (A > 22) ? goto index_35 : goto index_9 +// +// index_9: // SYS_MMAP(9), leaf +// A == 9) ? goto argument check : defaultLabel +// +// index_35: // SYS_NANOSLEEP(35), single child +// (A == 35) ? goto argument check : continue +// (A > 35) ? goto index_50 : goto defaultLabel +// +// index_50: // SYS_LISTEN(50), leaf +// (A == 50) ? goto argument check : goto defaultLabel +// +func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error { + // Root node is never referenced by label, skip it. + if !n.root { + if err := program.AddLabel(n.label()); err != nil { + return err + } + } + + sysno := n.value + program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0) + if n.left == nil && n.right == nil { + // Leaf nodes don't require extra check. + program.AddDirectJumpLabel(defaultLabel) + } else { + // Non-leaf node. Check which turn to take otherwise. Using direct jumps + // in case that the offset may exceed the limit of a conditional jump (255) + program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst) + program.AddDirectJumpLabel(n.right.label()) + program.AddDirectJumpLabel(n.left.label()) + } + + if err := program.AddLabel(checkArgsLabel(sysno)); err != nil { + return err + } + + emitted := false + for ruleSetIdx, rs := range rules { + if _, ok := rs.Rules[sysno]; ok { + // If there are no rules, then this will always match. + // Remember we've done this so that we can emit a + // sensible error. We can't catch all overlaps, but we + // can catch this one at least. + if emitted { + return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx) + } + + // Emit a vsyscall check if this rule requires a + // Vsyscall match. This rule ensures that the top bit + // is set in the instruction pointer, which is where + // the vsyscall page will be mapped. + if rs.Vsyscall { + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh) + program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno)) + } + + // Emit matchers. + if len(rs.Rules[sysno]) == 0 { + // This is a blanket action. + program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action)) + emitted = true + } else { + // Add an argument check for these particular + // arguments. This will continue execution and + // check the next rule set. We need to ensure + // that at the very end, we insert a direct + // jump label for the unmatched case. + if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil { + return err + } + } + + // If there was a Vsyscall check for this rule, then we + // need to add an appropriate label for the jump above. + if rs.Vsyscall { + if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil { + return err + } + } + } + } + + // Not matched? We only need to insert a jump to the default label if + // not default action has been emitted for this call. + if !emitted { + program.AddDirectJumpLabel(defaultLabel) + } + + return nil +} + +// node represents a tree node. +type node struct { + value uintptr + left *node + right *node + root bool +} + +// label returns the label corresponding to this node. +// +// If n is nil, then the defaultLabel is returned. +func (n *node) label() string { + if n == nil { + return defaultLabel + } + return fmt.Sprintf("index_%v", n.value) +} + +type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error + +func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error { + if n == nil { + return nil + } + if err := fn(n, rules, p); err != nil { + return err + } + if err := n.left.traverse(fn, rules, p); err != nil { + return err + } + return n.right.traverse(fn, rules, p) +} |