summaryrefslogtreecommitdiffhomepage
path: root/pkg/seccomp
diff options
context:
space:
mode:
author: Googler <noreply@google.com>, 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com>, 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/seccomp
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/seccomp')
-rw-r--r-- pkg/seccomp/BUILD 48
-rw-r--r-- pkg/seccomp/seccomp.go 210
-rw-r--r-- pkg/seccomp/seccomp_test.go 268
-rw-r--r-- pkg/seccomp/seccomp_test_victim.go 112
-rw-r--r-- pkg/seccomp/seccomp_unsafe.go 49
5 files changed, 687 insertions, 0 deletions
diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD
new file mode 100644
index 000000000..1e19b1d25
--- /dev/null
+++ b/pkg/seccomp/BUILD
@@ -0,0 +1,48 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_library", "go_test")
+
+# victim is the test-only helper binary built from seccomp_test_victim.go. It
+# depends on the :seccomp library, which it uses to install filters on itself.
+go_binary(
+ name = "victim",
+ testonly = 1,
+ srcs = ["seccomp_test_victim.go"],
+ deps = [":seccomp"],
+)
+
+# victim_data embeds the victim binary for the tests: the data is declared in
+# package seccomp under the variable name victimData.
+go_embed_data(
+ name = "victim_data",
+ testonly = 1,
+ src = "victim",
+ package = "seccomp",
+ var = "victimData",
+)
+
+# seccomp is the library target. It is publicly visible and imported as
+# gvisor.googlesource.com/gvisor/pkg/seccomp.
+go_library(
+ name = "seccomp",
+ srcs = [
+ "seccomp.go",
+ "seccomp_unsafe.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp",
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/bpf",
+ "//pkg/log",
+ ],
+)
+
+# seccomp_test builds the unit tests together with the embedded victim binary
+# (":victim_data"). The library is embedded rather than listed in deps; the
+# tests are declared in package seccomp and call unexported functions such as
+# buildProgram.
+go_test(
+ name = "seccomp_test",
+ size = "small",
+ srcs = [
+ "seccomp_test.go",
+ ":victim_data",
+ ],
+ embed = [":seccomp"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/bpf",
+ ],
+)
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
new file mode 100644
index 000000000..7ee63140c
--- /dev/null
+++ b/pkg/seccomp/seccomp.go
@@ -0,0 +1,210 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package seccomp provides basic seccomp filters.
+package seccomp
+
+import (
+ "fmt"
+ "sort"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+const (
+ // violationLabel names the jump target that buildProgram places in front of
+ // the instruction returning the violation action.
+ violationLabel = "violation"
+
+ // allowLabel names the jump target that buildProgram places in front of the
+ // instruction returning SECCOMP_RET_ALLOW.
+ allowLabel = "allow"
+)
+
+// Install generates BPF code based on the set of syscalls provided and
+// installs it as the seccomp filter of the calling process. It only allows
+// syscalls that conform to the specification (*) and generates SIGSYS trap
+// unless kill is set.
+//
+// The syscalls slice may be unsorted and may contain duplicates; it is not
+// modified.
+//
+// (*) The current implementation only checks the syscall number. It does NOT
+// validate any of the arguments.
+func Install(syscalls []uintptr, kill bool) error {
+	// Sort a private copy: sorting in place would silently reorder the
+	// caller's slice.
+	allowed := make([]uintptr, len(syscalls))
+	copy(allowed, syscalls)
+
+	// Sort syscalls and remove duplicates to build the BST.
+	sort.Slice(allowed, func(i, j int) bool { return allowed[i] < allowed[j] })
+	allowed = filterUnique(allowed)
+
+	log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(allowed), kill)
+	for _, s := range allowed {
+		log.Infof("syscall filter: %v", s)
+	}
+
+	instrs, err := buildProgram(allowed, kill)
+	if err != nil {
+		return err
+	}
+	if log.IsLogging(log.Debug) {
+		// A decode failure is not fatal: it is reported inside the dump and
+		// installation continues.
+		programStr, err := bpf.DecodeProgram(instrs)
+		if err != nil {
+			programStr = fmt.Sprintf("Error: %v\n%s", err, programStr)
+		}
+		log.Debugf("Seccomp program dump:\n%s", programStr)
+	}
+
+	if err := seccomp(instrs); err != nil {
+		return err
+	}
+
+	log.Infof("Seccomp filters installed.")
+	return nil
+}
+
+// buildProgram assembles the BPF program that whitelists the given syscalls.
+// The program first verifies the architecture, then searches the syscall
+// index, and ends with two return instructions: one for violations
+// (SECCOMP_RET_KILL when kill is set, SECCOMP_RET_TRAP otherwise) and one that
+// allows the call.
+//
+// Precondition: syscalls must be sorted and unique.
+func buildProgram(syscalls []uintptr, kill bool) ([]linux.BPFInstruction, error) {
+	const archOffset = 4 // offsetof(seccomp_data, arch)
+
+	violationAction := uint32(linux.SECCOMP_RET_TRAP)
+	if kill {
+		violationAction = linux.SECCOMP_RET_KILL
+	}
+
+	program := bpf.NewProgramBuilder()
+
+	// emitReturn binds label to the next instruction and makes that
+	// instruction return the given action.
+	emitReturn := func(label string, action uint32) error {
+		if err := program.AddLabel(label); err != nil {
+			return err
+		}
+		program.AddStmt(bpf.Ret|bpf.K, action)
+		return nil
+	}
+
+	// Be paranoid and check that syscall is done in the expected architecture.
+	//
+	// A = seccomp_data.arch
+	// if (A != AUDIT_ARCH_X86_64) goto violation
+	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, archOffset)
+	program.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, 0, violationLabel)
+
+	if err := buildIndex(syscalls, program); err != nil {
+		return nil, err
+	}
+
+	// violation: return violationAction
+	if err := emitReturn(violationLabel, violationAction); err != nil {
+		return nil, err
+	}
+
+	// allow: return SECCOMP_RET_ALLOW
+	if err := emitReturn(allowLabel, linux.SECCOMP_RET_ALLOW); err != nil {
+		return nil, err
+	}
+
+	return program.Instructions()
+}
+
+// filterUnique returns a new slice holding each distinct system call number
+// once, in input order. The input is left untouched.
+//
+// Precondition: syscalls must be sorted.
+func filterUnique(syscalls []uintptr) []uintptr {
+	unique := make([]uintptr, 0, len(syscalls))
+	for _, nr := range syscalls {
+		// Because the input is sorted, a duplicate can only equal the value
+		// appended last.
+		if last := len(unique) - 1; last < 0 || unique[last] != nr {
+			unique = append(unique, nr)
+		}
+	}
+	return unique
+}
+
+// buildIndex emits the code that looks the current syscall number up in a BST
+// of the whitelisted syscalls.
+//
+// Precondition: syscalls must be sorted and unique.
+func buildIndex(syscalls []uintptr, program *bpf.ProgramBuilder) error {
+	const nrOffset = 0 // seccomp_data.nr is loaded from offset 0
+
+	tree := createBST(syscalls)
+
+	// Load syscall number into A and run through BST.
+	//
+	// A = seccomp_data.nr
+	program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, nrOffset)
+
+	return tree.buildBSTProgram(program, true)
+}
+
+// createBST converts sorted syscall slice into a balanced BST. The middle
+// element becomes the root, the elements before it form the left subtree and
+// the elements after it form the right subtree.
+//
+// It returns nil when syscalls is empty. A nil tree is valid: buildBSTProgram
+// emits nothing for a nil node and label maps it to violationLabel, so an
+// empty whitelist yields a program that rejects every syscall instead of
+// panicking.
+func createBST(syscalls []uintptr) *node {
+	if len(syscalls) == 0 {
+		return nil
+	}
+	mid := len(syscalls) / 2
+	return &node{
+		value: syscalls[mid],
+		left:  createBST(syscalls[:mid]),
+		right: createBST(syscalls[mid+1:]),
+	}
+}
+
+// node represents a tree node.
+type node struct {
+	// value is the syscall number stored in this node.
+	value uintptr
+	// left is the subtree of smaller syscall numbers; nil if there are none.
+	left *node
+	// right is the subtree of larger syscall numbers; nil if there are none.
+	right *node
+}
+
+// label returns the jump label of this node, "index_" followed by the node's
+// value. A nil node stands for a syscall that is not present, so it maps to
+// violationLabel; this lets callers pass missing children without checking.
+func (n *node) label() string {
+	if n != nil {
+		return fmt.Sprintf("index_%v", n.value)
+	}
+	return violationLabel
+}
+
+// buildBSTProgram emits the BPF code for the binary tree rooted at n, visiting
+// the node itself first, then its left subtree, then its right subtree. root
+// must be true only for the top-level call. The outline of the code is as
+// follows:
+//
+// // SYS_PIPE(22), root
+// (A == 22) ? goto allow : continue
+// (A > 22) ? goto index_35 : goto index_9
+//
+// index_9: // SYS_MMAP(9), leaf
+// (A == 9) ? goto allow : goto violation
+//
+// index_35: // SYS_NANOSLEEP(35), single child
+// (A == 35) ? goto allow : continue
+// (A > 35) ? goto index_50 : goto violation
+//
+// index_50: // SYS_LISTEN(50), leaf
+// (A == 50) ? goto allow : goto violation
+//
+func (n *node) buildBSTProgram(program *bpf.ProgramBuilder, root bool) error {
+	if n == nil {
+		return nil
+	}
+
+	// Only non-root nodes are jump targets, so only they get a label.
+	if !root {
+		if err := program.AddLabel(n.label()); err != nil {
+			return err
+		}
+	}
+
+	nr := uint32(n.value)
+	isLeaf := n.left == nil && n.right == nil
+
+	// A leaf settles the search: the syscall either matches or is a violation.
+	if isLeaf {
+		program.AddJumpLabels(bpf.Jmp|bpf.Jeq|bpf.K, nr, allowLabel, violationLabel)
+		return nil
+	}
+
+	// Inner node: allow on an exact match, otherwise descend right for larger
+	// numbers and left for smaller ones. A missing child resolves to
+	// violationLabel through label().
+	program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, nr, allowLabel, 0)
+	program.AddJumpLabels(bpf.Jmp|bpf.Jgt|bpf.K, nr, n.right.label(), n.left.label())
+
+	for _, child := range []*node{n.left, n.right} {
+		if err := child.buildBSTProgram(program, false); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
new file mode 100644
index 000000000..c700d88d6
--- /dev/null
+++ b/pkg/seccomp/seccomp_test.go
@@ -0,0 +1,268 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seccomp
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "math/rand"
+ "os"
+ "os/exec"
+ "sort"
+ "strings"
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+)
+
+// seccompData is the input record that the tests feed to the generated BPF
+// program.
+// NOTE(review): the field order looks like the kernel's seccomp_data (nr at
+// offset 0, arch at offset 4, the offsets buildProgram loads from) — confirm
+// that binary.Marshal packs the fields without padding.
+type seccompData struct {
+ // nr is the syscall number presented to the filter.
+ nr uint32
+ // arch is the architecture identifier that the filter compares against
+ // AUDIT_ARCH_X86_64.
+ arch uint32
+ // instructionPointer and args are left at zero by the tests in this file.
+ instructionPointer uint64
+ args [6]uint64
+}
+
+// newVictim writes the embedded victim binary (victimData) to a fresh
+// temporary file, marks it executable and returns its path. The caller is
+// responsible for removing the file. On any failure the temporary file is
+// removed and an error is returned.
+func newVictim() (string, error) {
+	f, err := ioutil.TempFile("", "victim")
+	if err != nil {
+		return "", err
+	}
+	path := f.Name()
+
+	_, err = io.Copy(f, bytes.NewBuffer(victimData))
+	// Always close the file, and report a close failure unless the copy
+	// already failed: a close error means the binary may be incomplete.
+	if cerr := f.Close(); err == nil {
+		err = cerr
+	}
+	if err == nil {
+		err = os.Chmod(path, 0755)
+	}
+	if err != nil {
+		os.Remove(path)
+		return "", err
+	}
+	return path, nil
+}
+
+// asInput serializes d in little-endian byte order and wraps the bytes in a
+// bpf.Input that the BPF interpreter can read from.
+func (d *seccompData) asInput() bpf.Input {
+	encoded := binary.Marshal(nil, binary.LittleEndian, d)
+	return bpf.InputBytes{encoded, binary.LittleEndian}
+}
+
+// TestBasic builds a program for each set of allowed syscalls, runs it against
+// hand-written inputs and checks the returned seccomp action.
+func TestBasic(t *testing.T) {
+	type spec struct {
+		// desc is the test's description.
+		desc string
+
+		// data is the input data.
+		data seccompData
+
+		// want is the expected return value of the BPF program.
+		want uint32
+	}
+
+	for _, test := range []struct {
+		// filters are the set of syscall that are allowed.
+		filters []uintptr
+		// kill selects SECCOMP_RET_KILL instead of SECCOMP_RET_TRAP as the
+		// violation action.
+		kill bool
+		// specs are the inputs to run through the program built from filters.
+		specs []spec
+	}{
+		{
+			filters: []uintptr{1},
+			kill:    false,
+			specs: []spec{
+				{
+					desc: "Single syscall allowed",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Single syscall disallowed",
+					data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			filters: []uintptr{1, 3, 5},
+			kill:    false,
+			specs: []spec{
+				{
+					desc: "Multiple syscalls allowed (1)",
+					data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Multiple syscalls allowed (3)",
+					data: seccompData{nr: 3, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Multiple syscalls allowed (5)",
+					data: seccompData{nr: 5, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_ALLOW,
+				},
+				{
+					desc: "Multiple syscalls disallowed (0)",
+					data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple syscalls disallowed (2)",
+					data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple syscalls disallowed (4)",
+					data: seccompData{nr: 4, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple syscalls disallowed (6)",
+					data: seccompData{nr: 6, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+				{
+					desc: "Multiple syscalls disallowed (100)",
+					data: seccompData{nr: 100, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			filters: []uintptr{1},
+			kill:    false,
+			specs: []spec{
+				{
+					desc: "Wrong architecture",
+					data: seccompData{nr: 1, arch: 123},
+					want: linux.SECCOMP_RET_TRAP,
+				},
+			},
+		},
+		{
+			filters: []uintptr{1},
+			kill:    true,
+			specs: []spec{
+				{
+					desc: "Syscall disallowed, action kill",
+					data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64},
+					want: linux.SECCOMP_RET_KILL,
+				},
+			},
+		},
+	} {
+		// buildProgram requires sorted input.
+		sort.Slice(test.filters, func(i, j int) bool { return test.filters[i] < test.filters[j] })
+		instrs, err := buildProgram(test.filters, test.kill)
+		if err != nil {
+			t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err)
+			continue
+		}
+		p, err := bpf.Compile(instrs)
+		if err != nil {
+			t.Errorf("%s: bpf.Compile() got error: %v", test.specs[0].desc, err)
+			continue
+		}
+		for _, spec := range test.specs {
+			got, err := bpf.Exec(p, spec.data.asInput())
+			if err != nil {
+				t.Errorf("%s: bpf.Exec() got error: %v", spec.desc, err)
+				continue
+			}
+			if got != spec.want {
+				t.Errorf("%s: bpf.Exec() = %d, want: %d", spec.desc, got, spec.want)
+			}
+		}
+	}
+}
+
+// TestRandom whitelists a random set of 1 to 50 distinct syscall numbers below
+// 200 and checks the program's verdict for every number in [0, 200): allowed
+// if the number is in the set, trapped otherwise.
+func TestRandom(t *testing.T) {
+	rand.Seed(time.Now().UnixNano())
+	size := rand.Intn(50) + 1
+	syscalls := make([]uintptr, 0, size)
+	syscallMap := make(map[uintptr]struct{})
+	for len(syscalls) < size {
+		n := uintptr(rand.Intn(200))
+		if _, ok := syscallMap[n]; !ok {
+			syscalls = append(syscalls, n)
+			syscallMap[n] = struct{}{}
+		}
+	}
+
+	// buildProgram requires sorted input.
+	sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] })
+	// The set is random, so record it in the test log to make a failure
+	// reproducible.
+	t.Logf("Testing filters: %v", syscalls)
+	instrs, err := buildProgram(syscalls, false)
+	if err != nil {
+		t.Fatalf("buildProgram() got error: %v", err)
+	}
+	p, err := bpf.Compile(instrs)
+	if err != nil {
+		t.Fatalf("bpf.Compile() got error: %v", err)
+	}
+	for i := uint32(0); i < 200; i++ {
+		data := seccompData{nr: i, arch: linux.AUDIT_ARCH_X86_64}
+		got, err := bpf.Exec(p, data.asInput())
+		if err != nil {
+			t.Errorf("bpf.Exec() got error: %v, for syscall %d", err, i)
+			continue
+		}
+		want := uint32(linux.SECCOMP_RET_TRAP)
+		if _, ok := syscallMap[uintptr(i)]; ok {
+			want = linux.SECCOMP_RET_ALLOW
+		}
+		if got != want {
+			t.Errorf("bpf.Exec() = %d, want: %d, for syscall %d", got, want, i)
+		}
+	}
+}
+
+// TestRealDeal checks that a process dies when it trips over the filter and that it
+// doesn't die when the filter is not triggered. It runs the victim binary once
+// with -die=true and once with -die=false and inspects the exit status and the
+// combined output.
+func TestRealDeal(t *testing.T) {
+	for _, test := range []struct {
+		// die is passed to the victim as the -die flag.
+		die bool
+		// want is a substring that the victim's combined output must contain.
+		want string
+	}{
+		{die: true, want: "bad system call"},
+		{die: false, want: "Syscall was allowed!!!"},
+	} {
+		victim, err := newVictim()
+		if err != nil {
+			t.Fatalf("unable to get victim: %v", err)
+		}
+		dieFlag := fmt.Sprintf("-die=%v", test.die)
+		out, err := exec.Command(victim, dieFlag).CombinedOutput()
+		// Best-effort cleanup. Remove the binary as soon as it has run: a defer
+		// inside the loop would keep every copy until the test returns.
+		os.Remove(victim)
+
+		if test.die {
+			if err == nil {
+				t.Errorf("victim was not killed as expected, output: %s", out)
+				continue
+			}
+		} else if err != nil {
+			t.Errorf("victim failed to execute, err: %v", err)
+			continue
+		}
+		if !strings.Contains(string(out), test.want) {
+			t.Errorf("Victim output is wrong, got: %s, want: %s", out, test.want)
+		}
+	}
+}
diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go
new file mode 100644
index 000000000..fe3f96901
--- /dev/null
+++ b/pkg/seccomp/seccomp_test_victim.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Test binary used to test that seccomp filters are properly constructed and
+// indeed kill the process on violation.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// main installs a seccomp filter for a fixed list of syscalls and then issues
+// a raw openat syscall.
+//
+// With -die=false, SYS_OPENAT is added to the allowed list, so the call goes
+// through and the final message is printed. With -die=true it is left out, so
+// the call violates the filter. The filter is installed with kill=false.
+func main() {
+	dieFlag := flag.Bool("die", false, "trips over the filter if true")
+	flag.Parse()
+
+	syscalls := []uintptr{
+		syscall.SYS_ACCEPT,
+		syscall.SYS_ARCH_PRCTL,
+		syscall.SYS_BIND,
+		syscall.SYS_BRK,
+		syscall.SYS_CLOCK_GETTIME,
+		syscall.SYS_CLONE,
+		syscall.SYS_CLOSE,
+		syscall.SYS_DUP,
+		syscall.SYS_DUP2,
+		syscall.SYS_EPOLL_CREATE1,
+		syscall.SYS_EPOLL_CTL,
+		syscall.SYS_EPOLL_WAIT,
+		syscall.SYS_EXIT,
+		syscall.SYS_EXIT_GROUP,
+		syscall.SYS_FALLOCATE,
+		syscall.SYS_FCHMOD,
+		syscall.SYS_FCNTL,
+		syscall.SYS_FSTAT,
+		syscall.SYS_FSYNC,
+		syscall.SYS_FTRUNCATE,
+		syscall.SYS_FUTEX,
+		syscall.SYS_GETDENTS64,
+		syscall.SYS_GETPEERNAME,
+		syscall.SYS_GETPID,
+		syscall.SYS_GETSOCKNAME,
+		syscall.SYS_GETSOCKOPT,
+		syscall.SYS_GETTID,
+		syscall.SYS_GETTIMEOFDAY,
+		syscall.SYS_LISTEN,
+		syscall.SYS_LSEEK,
+		syscall.SYS_MADVISE,
+		syscall.SYS_MINCORE,
+		syscall.SYS_MMAP,
+		syscall.SYS_MPROTECT,
+		syscall.SYS_MUNLOCK,
+		syscall.SYS_MUNMAP,
+		syscall.SYS_NANOSLEEP,
+		syscall.SYS_NEWFSTATAT,
+		syscall.SYS_OPEN,
+		syscall.SYS_POLL,
+		syscall.SYS_PREAD64,
+		syscall.SYS_PSELECT6,
+		syscall.SYS_PWRITE64,
+		syscall.SYS_READ,
+		syscall.SYS_READLINKAT,
+		syscall.SYS_READV,
+		syscall.SYS_RECVMSG,
+		syscall.SYS_RENAMEAT,
+		syscall.SYS_RESTART_SYSCALL,
+		syscall.SYS_RT_SIGACTION,
+		syscall.SYS_RT_SIGPROCMASK,
+		syscall.SYS_RT_SIGRETURN,
+		syscall.SYS_SCHED_YIELD,
+		syscall.SYS_SENDMSG,
+		syscall.SYS_SETITIMER,
+		syscall.SYS_SET_ROBUST_LIST,
+		syscall.SYS_SETSOCKOPT,
+		syscall.SYS_SHUTDOWN,
+		syscall.SYS_SIGALTSTACK,
+		syscall.SYS_SOCKET,
+		syscall.SYS_SYNC_FILE_RANGE,
+		syscall.SYS_TGKILL,
+		syscall.SYS_UTIMENSAT,
+		syscall.SYS_WRITE,
+		syscall.SYS_WRITEV,
+	}
+	die := *dieFlag
+	if !die {
+		// Allow the syscall issued below, so that the process survives it.
+		syscalls = append(syscalls, syscall.SYS_OPENAT)
+	}
+
+	if err := seccomp.Install(syscalls, false); err != nil {
+		fmt.Printf("Failed to install seccomp: %v\n", err)
+		os.Exit(1)
+	}
+	fmt.Printf("Filters installed\n")
+
+	// The result is deliberately ignored; presumably only whether the filter
+	// lets the call through matters here, not whether openat itself succeeds.
+	syscall.RawSyscall(syscall.SYS_OPENAT, 0, 0, 0)
+	fmt.Printf("Syscall was allowed!!!\n")
+}
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
new file mode 100644
index 000000000..6682f8d9b
--- /dev/null
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package seccomp
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+// sockFprog is sock_fprog taken from <linux/filter.h>. A pointer to it is
+// passed to the seccomp syscall as the filter argument.
+type sockFprog struct {
+ // Len is the number of instructions that Filter points to.
+ Len uint16
+ // pad spells out the 6 bytes between the 2-byte Len and Filter, so that
+ // Filter starts at offset 8.
+ pad [6]byte
+ // Filter points to the first instruction of the program.
+ Filter *linux.BPFInstruction
+}
+
+// seccomp installs instrs as the seccomp filter of the calling process. It
+// first sets PR_SET_NO_NEW_PRIVS and then issues the seccomp syscall with
+// SECCOMP_SET_MODE_FILTER and SECCOMP_FILTER_FLAG_TSYNC (see seccomp(2)).
+//
+// It returns an error if instrs is empty, if it holds more instructions than
+// sock_fprog's 16-bit length field can describe, or if either syscall fails.
+func seccomp(instrs []linux.BPFInstruction) error {
+	// SYS_SECCOMP is not available in syscall package.
+	const SYS_SECCOMP = 317
+
+	// maxInstrs is the largest instruction count that fits in sockFprog.Len.
+	const maxInstrs = 1<<16 - 1
+
+	// Reject programs that cannot be described faithfully: an empty slice has
+	// no first element to point to, and a longer one would have its length
+	// silently truncated, installing only part of the filter.
+	if len(instrs) == 0 || len(instrs) > maxInstrs {
+		return fmt.Errorf("seccomp program has %d instructions, want between 1 and %d", len(instrs), maxInstrs)
+	}
+
+	// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
+	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 {
+		return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err)
+	}
+	sockProg := sockFprog{Len: uint16(len(instrs)), Filter: &instrs[0]}
+
+	// TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available.
+	if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 {
+		return fmt.Errorf("failed to set seccomp filter: %v", err)
+	}
+	return nil
+}