diff options
author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/seccomp | |
parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/seccomp')
-rw-r--r-- | pkg/seccomp/seccomp.go | 375 | ||||
-rw-r--r-- | pkg/seccomp/seccomp_amd64.go | 26 | ||||
-rw-r--r-- | pkg/seccomp/seccomp_arm64.go | 26 | ||||
-rw-r--r-- | pkg/seccomp/seccomp_rules.go | 132 | ||||
-rwxr-xr-x | pkg/seccomp/seccomp_state_autogen.go | 4 | ||||
-rw-r--r-- | pkg/seccomp/seccomp_unsafe.go | 70 |
6 files changed, 633 insertions, 0 deletions
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go new file mode 100644 index 000000000..cc142a497 --- /dev/null +++ b/pkg/seccomp/seccomp.go @@ -0,0 +1,375 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccomp provides basic seccomp filters for x86_64 (little endian). +package seccomp + +import ( + "fmt" + "reflect" + "sort" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +const ( + // skipOneInst is the offset to take for skipping one instruction. + skipOneInst = 1 + + // defaultLabel is the label for the default action. + defaultLabel = "default_action" +) + +// Install generates BPF code based on the set of syscalls provided. It only +// allows syscalls that conform to the specification. Syscalls that violate the +// specification will trigger RET_KILL_PROCESS, except for the cases below. +// +// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the +// following cases: +// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the +// offending thread and often keeps the sentry hanging. +// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is +// much easier to debug then RET_KILL_PROCESS which can't be caught. +// +// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, +// making it possible for the process to continue running after a violation. +// However, it will leave a SECCOMP audit event trail behind. In any case, the +// syscall is still blocked from executing. +func Install(rules SyscallRules) error { + defaultAction, err := defaultAction() + if err != nil { + return err + } + + // Uncomment to get stack trace when there is a violation. + // defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP) + + log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction) + + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: rules, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, defaultAction) + if log.IsLogging(log.Debug) { + programStr, errDecode := bpf.DecodeProgram(instrs) + if errDecode != nil { + programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr) + } + log.Debugf("Seccomp program dump:\n%s", programStr) + } + if err != nil { + return err + } + + // Perform the actual installation. + if errno := SetFilter(instrs); errno != 0 { + return fmt.Errorf("Failed to set filter: %v", errno) + } + + log.Infof("Seccomp filters installed.") + return nil +} + +func defaultAction() (linux.BPFAction, error) { + available, err := isKillProcessAvailable() + if err != nil { + return 0, err + } + if available { + return linux.SECCOMP_RET_KILL_PROCESS, nil + } + return linux.SECCOMP_RET_TRAP, nil +} + +// RuleSet is a set of rules and associated action. +type RuleSet struct { + Rules SyscallRules + Action linux.BPFAction + + // Vsyscall indicates that a check is made for a function being called + // from kernel mappings. This is where the vsyscall page is located + // (and typically) emulated, so this RuleSet will not match any + // functions not dispatched from the vsyscall page. + Vsyscall bool +} + +// SyscallName gives names to system calls. It is used purely for debugging purposes. +// +// An alternate namer can be provided to the package at initialization time. +var SyscallName = func(sysno uintptr) string { + return fmt.Sprintf("syscall_%d", sysno) +} + +// BuildProgram builds a BPF program from the given map of actions to matching +// SyscallRules. The single generated program covers all provided RuleSets. +func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) { + program := bpf.NewProgramBuilder() + + // Be paranoid and check that syscall is done in the expected architecture. + // + // A = seccomp_data.arch + // if (A != AUDIT_ARCH) goto defaultAction. + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) + // defaultLabel is at the bottom of the program. The size of program + // may exceeds 255 lines, which is the limit of a condition jump. + program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, skipOneInst, 0) + program.AddDirectJumpLabel(defaultLabel) + if err := buildIndex(rules, program); err != nil { + return nil, err + } + + // Exhausted: return defaultAction. + if err := program.AddLabel(defaultLabel); err != nil { + return nil, err + } + program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction)) + + return program.Instructions() +} + +// buildIndex builds a BST to quickly search through all syscalls. +func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error { + // Build a list of all application system calls, across all given rule + // sets. We have a simple BST, but may dispatch individual matchers + // with different actions. The matchers are evaluated linearly. + requiredSyscalls := make(map[uintptr]struct{}) + for _, rs := range rules { + for sysno := range rs.Rules { + requiredSyscalls[sysno] = struct{}{} + } + } + syscalls := make([]uintptr, 0, len(requiredSyscalls)) + for sysno, _ := range requiredSyscalls { + syscalls = append(syscalls, sysno) + } + sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) + for _, sysno := range syscalls { + for _, rs := range rules { + // Print only if there is a corresponding set of rules. + if _, ok := rs.Rules[sysno]; ok { + log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action) + } + } + } + + root := createBST(syscalls) + root.root = true + + // Load syscall number into A and run through BST. + // + // A = seccomp_data.nr + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) + return root.traverse(buildBSTProgram, rules, program) +} + +// createBST converts sorted syscall slice into a balanced BST. +// Panics if syscalls is empty. +func createBST(syscalls []uintptr) *node { + i := len(syscalls) / 2 + parent := node{value: syscalls[i]} + if i > 0 { + parent.left = createBST(syscalls[:i]) + } + if i+1 < len(syscalls) { + parent.right = createBST(syscalls[i+1:]) + } + return &parent +} + +func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string { + return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno) +} + +func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string { + return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx) +} + +func checkArgsLabel(sysno uintptr) string { + return fmt.Sprintf("checkArgs_%v", sysno) +} + +// addSyscallArgsCheck adds argument checks for a single system call. It does +// not insert a jump to the default action at the end and it is the +// responsibility of the caller to insert an appropriate jump after calling +// this function. +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error { + for ruleidx, rule := range rules { + labelled := false + for i, arg := range rule { + if arg != nil { + switch a := arg.(type) { + case AllowAny: + case AllowValue: + high, low := uint32(a>>32), uint32(a) + // assert arg_low == low + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + // assert arg_high == high + p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) + labelled = true + default: + return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a)) + } + } + } + + // Matched, emit the given action. + p.AddStmt(bpf.Ret|bpf.K, uint32(action)) + + // Label the end of the rule if necessary. This is added for + // the jumps above when the argument check fails. + if labelled { + if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil { + return err + } + } + } + + return nil +} + +// buildBSTProgram converts a binary tree started in 'root' into BPF code. The ouline of the code +// is as follows: +// +// // SYS_PIPE(22), root +// (A == 22) ? goto argument check : continue +// (A > 22) ? goto index_35 : goto index_9 +// +// index_9: // SYS_MMAP(9), leaf +// A == 9) ? goto argument check : defaultLabel +// +// index_35: // SYS_NANOSLEEP(35), single child +// (A == 35) ? goto argument check : continue +// (A > 35) ? goto index_50 : goto defaultLabel +// +// index_50: // SYS_LISTEN(50), leaf +// (A == 50) ? goto argument check : goto defaultLabel +// +func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error { + // Root node is never referenced by label, skip it. + if !n.root { + if err := program.AddLabel(n.label()); err != nil { + return err + } + } + + sysno := n.value + program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0) + if n.left == nil && n.right == nil { + // Leaf nodes don't require extra check. + program.AddDirectJumpLabel(defaultLabel) + } else { + // Non-leaf node. Check which turn to take otherwise. Using direct jumps + // in case that the offset may exceed the limit of a conditional jump (255) + program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst) + program.AddDirectJumpLabel(n.right.label()) + program.AddDirectJumpLabel(n.left.label()) + } + + if err := program.AddLabel(checkArgsLabel(sysno)); err != nil { + return err + } + + emitted := false + for ruleSetIdx, rs := range rules { + if _, ok := rs.Rules[sysno]; ok { + // If there are no rules, then this will always match. + // Remember we've done this so that we can emit a + // sensible error. We can't catch all overlaps, but we + // can catch this one at least. + if emitted { + return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx) + } + + // Emit a vsyscall check if this rule requires a + // Vsyscall match. This rule ensures that the top bit + // is set in the instruction pointer, which is where + // the vsyscall page will be mapped. + if rs.Vsyscall { + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh) + program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno)) + } + + // Emit matchers. + if len(rs.Rules[sysno]) == 0 { + // This is a blanket action. + program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action)) + emitted = true + } else { + // Add an argument check for these particular + // arguments. This will continue execution and + // check the next rule set. We need to ensure + // that at the very end, we insert a direct + // jump label for the unmatched case. + if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil { + return err + } + } + + // If there was a Vsyscall check for this rule, then we + // need to add an appropriate label for the jump above. + if rs.Vsyscall { + if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil { + return err + } + } + } + } + + // Not matched? We only need to insert a jump to the default label if + // not default action has been emitted for this call. + if !emitted { + program.AddDirectJumpLabel(defaultLabel) + } + + return nil +} + +// node represents a tree node. +type node struct { + value uintptr + left *node + right *node + root bool +} + +// label returns the label corresponding to this node. +// +// If n is nil, then the defaultLabel is returned. +func (n *node) label() string { + if n == nil { + return defaultLabel + } + return fmt.Sprintf("index_%v", n.value) +} + +type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error + +func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error { + if n == nil { + return nil + } + if err := fn(n, rules, p); err != nil { + return err + } + if err := n.left.traverse(fn, rules, p); err != nil { + return err + } + return n.right.traverse(fn, rules, p) +} diff --git a/pkg/seccomp/seccomp_amd64.go b/pkg/seccomp/seccomp_amd64.go new file mode 100644 index 000000000..02dfb8d9f --- /dev/null +++ b/pkg/seccomp/seccomp_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package seccomp + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +const ( + LINUX_AUDIT_ARCH = linux.AUDIT_ARCH_X86_64 + SYS_SECCOMP = 317 +) diff --git a/pkg/seccomp/seccomp_arm64.go b/pkg/seccomp/seccomp_arm64.go new file mode 100644 index 000000000..b575bcdbf --- /dev/null +++ b/pkg/seccomp/seccomp_arm64.go @@ -0,0 +1,26 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package seccomp + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +const ( + LINUX_AUDIT_ARCH = linux.AUDIT_ARCH_AARCH64 + SYS_SECCOMP = 277 +) diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go new file mode 100644 index 000000000..29eec8db1 --- /dev/null +++ b/pkg/seccomp/seccomp_rules.go @@ -0,0 +1,132 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccomp + +import "fmt" + +// The offsets are based on the following struct in include/linux/seccomp.h. +// struct seccomp_data { +// int nr; +// __u32 arch; +// __u64 instruction_pointer; +// __u64 args[6]; +// }; +const ( + seccompDataOffsetNR = 0 + seccompDataOffsetArch = 4 + seccompDataOffsetIPLow = 8 + seccompDataOffsetIPHigh = 12 + seccompDataOffsetArgs = 16 +) + +func seccompDataOffsetArgLow(i int) uint32 { + return uint32(seccompDataOffsetArgs + i*8) +} + +func seccompDataOffsetArgHigh(i int) uint32 { + return seccompDataOffsetArgLow(i) + 4 +} + +// AllowAny is marker to indicate any value will be accepted. +type AllowAny struct{} + +func (a AllowAny) String() (s string) { + return "*" +} + +// AllowValue specifies a value that needs to be strictly matched. +type AllowValue uintptr + +func (a AllowValue) String() (s string) { + return fmt.Sprintf("%#x ", uintptr(a)) +} + +// Rule stores the whitelist of syscall arguments. +// +// For example: +// rule := Rule { +// AllowValue(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0 +// } +type Rule [6]interface{} + +func (r Rule) String() (s string) { + if len(r) == 0 { + return + } + s += "( " + for _, arg := range r { + if arg != nil { + s += fmt.Sprintf("%v ", arg) + } + } + s += ")" + return +} + +// SyscallRules stores a map of OR'ed whitelist rules indexed by the syscall number. +// If the 'Rules' is empty, we treat it as any argument is allowed. +// +// For example: +// rules := SyscallRules{ +// syscall.SYS_FUTEX: []Rule{ +// { +// AllowAny{}, +// AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), +// }, // OR +// { +// AllowAny{}, +// AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), +// }, +// }, +// syscall.SYS_GETPID: []Rule{}, +// } +type SyscallRules map[uintptr][]Rule + +// NewSyscallRules returns a new SyscallRules. +func NewSyscallRules() SyscallRules { + return make(map[uintptr][]Rule) +} + +// AddRule adds the given rule. It will create a new entry for a new syscall, otherwise +// it will append to the existing rules. +func (sr SyscallRules) AddRule(sysno uintptr, r Rule) { + if cur, ok := sr[sysno]; ok { + // An empty rules means allow all. Honor it when more rules are added. + if len(cur) == 0 { + sr[sysno] = append(sr[sysno], Rule{}) + } + sr[sysno] = append(sr[sysno], r) + } else { + sr[sysno] = []Rule{r} + } +} + +// Merge merges the given SyscallRules. +func (sr SyscallRules) Merge(rules SyscallRules) { + for sysno, rs := range rules { + if cur, ok := sr[sysno]; ok { + // An empty rules means allow all. Honor it when more rules are added. + if len(cur) == 0 { + sr[sysno] = append(sr[sysno], Rule{}) + } + if len(rs) == 0 { + rs = []Rule{{}} + } + sr[sysno] = append(sr[sysno], rs...) + } else { + sr[sysno] = rs + } + } +} diff --git a/pkg/seccomp/seccomp_state_autogen.go b/pkg/seccomp/seccomp_state_autogen.go new file mode 100755 index 000000000..0fc23d1a8 --- /dev/null +++ b/pkg/seccomp/seccomp_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package seccomp + diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go new file mode 100644 index 000000000..ebb6397e8 --- /dev/null +++ b/pkg/seccomp/seccomp_unsafe.go @@ -0,0 +1,70 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccomp + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// sockFprog is sock_fprog taken from <linux/filter.h>. +type sockFprog struct { + Len uint16 + pad [6]byte + Filter *linux.BPFInstruction +} + +// SetFilter installs the given BPF program. +// +// This is safe to call from an afterFork context. +// +//go:nosplit +func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { + // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. + if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 { + return errno + } + + sockProg := sockFprog{ + Len: uint16(len(instrs)), + Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), + } + return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) +} + +func isKillProcessAvailable() (bool, error) { + action := uint32(linux.SECCOMP_RET_KILL_PROCESS) + if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { + // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. + // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. + if errno == syscall.EINVAL || errno == syscall.EOPNOTSUPP { + return false, nil + } + return false, errno + } + return true, nil +} + +// seccomp calls seccomp(2). This is safe to call from an afterFork context. +// +//go:nosplit +func seccomp(op, flags uint32, ptr unsafe.Pointer) syscall.Errno { + if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 { + return errno + } + return 0 +} |