summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/seccomp.go
diff options
context:
space:
mode:
authorGoogler <noreply@google.com>2018-04-27 10:37:02 -0700
committerAdin Scannell <ascannell@google.com>2018-04-28 01:44:26 -0400
commitd02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/seccomp.go
parentf70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/kernel/seccomp.go')
-rw-r--r--pkg/sentry/kernel/seccomp.go205
1 files changed, 205 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
new file mode 100644
index 000000000..b7c4a507f
--- /dev/null
+++ b/pkg/sentry/kernel/seccomp.go
@@ -0,0 +1,205 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const maxSyscallFilterInstructions = 1 << 15
+
+type seccompResult int
+
+const (
+ // seccompResultDeny indicates that a syscall should not be executed.
+ seccompResultDeny seccompResult = iota
+
+ // seccompResultAllow indicates that a syscall should be executed.
+ seccompResultAllow
+
+ // seccompResultKill indicates that the task should be killed immediately,
+ // with the exit status indicating that the task was killed by SIGSYS.
+ seccompResultKill
+
+ // seccompResultTrace indicates that a ptracer was successfully notified as
+ // a result of a SECCOMP_RET_TRACE.
+ seccompResultTrace
+)
+
+// seccompData is equivalent to struct seccomp_data, which contains the data
+// passed to seccomp-bpf filters.
+type seccompData struct {
+ // nr is the system call number.
+ nr int32
+
+ // arch is an AUDIT_ARCH_* value indicating the system call convention.
+ arch uint32
+
+ // instructionPointer is the value of the instruction pointer at the time
+ // of the system call.
+ instructionPointer uint64
+
+ // args contains the first 6 system call arguments.
+ args [6]uint64
+}
+
+func (d *seccompData) asBPFInput() bpf.Input {
+ return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
+}
+
+func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
+ si := &arch.SignalInfo{
+ Signo: int32(linux.SIGSYS),
+ Errno: errno,
+ Code: arch.SYS_SECCOMP,
+ }
+ si.SetCallAddr(uint64(ip))
+ si.SetSyscall(sysno)
+ si.SetArch(t.SyscallTable().AuditNumber)
+ return si
+}
+
+// checkSeccompSyscall applies the task's seccomp filters before the execution
+// of syscall sysno at instruction pointer ip. (These parameters must be passed
+// in because vsyscalls do not use the values in t.Arch().)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult {
+ result := t.evaluateSyscallFilters(sysno, args, ip)
+ switch result & linux.SECCOMP_RET_ACTION {
+ case linux.SECCOMP_RET_TRAP:
+ // "Results in the kernel sending a SIGSYS signal to the triggering
+ // task without executing the system call. ... The SECCOMP_RET_DATA
+ // portion of the return value will be passed as si_errno." -
+ // Documentation/prctl/seccomp_filter.txt
+ t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip))
+ return seccompResultDeny
+
+ case linux.SECCOMP_RET_ERRNO:
+ // "Results in the lower 16-bits of the return value being passed to
+ // userland as the errno without executing the system call."
+ t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA))
+ return seccompResultDeny
+
+ case linux.SECCOMP_RET_TRACE:
+ // "When returned, this value will cause the kernel to attempt to
+ // notify a ptrace()-based tracer prior to executing the system call.
+ // If there is no tracer present, -ENOSYS is returned to userland and
+ // the system call is not executed."
+ if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) {
+ return seccompResultTrace
+ }
+ // This useless-looking temporary is needed because Go.
+ tmp := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-tmp)
+ return seccompResultDeny
+
+ case linux.SECCOMP_RET_ALLOW:
+ // "Results in the system call being executed."
+ return seccompResultAllow
+
+ case linux.SECCOMP_RET_KILL:
+ // "Results in the task exiting immediately without executing the
+ // system call. The exit status of the task will be SIGSYS, not
+ // SIGKILL."
+ fallthrough
+ default: // consistent with Linux
+ return seccompResultKill
+ }
+}
+
+func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
+ data := seccompData{
+ nr: sysno,
+ arch: t.tc.st.AuditNumber,
+ instructionPointer: uint64(ip),
+ }
+ // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
+ // we can't do any slicing tricks or even use copy/append here.
+ for i, arg := range args {
+ if i >= len(data.args) {
+ break
+ }
+ data.args[i] = arg.Uint64()
+ }
+ input := data.asBPFInput()
+
+ ret := uint32(linux.SECCOMP_RET_ALLOW)
+ // "Every filter successfully installed will be evaluated (in reverse
+ // order) for each system call the task makes." - kernel/seccomp.c
+ for i := len(t.syscallFilters) - 1; i >= 0; i-- {
+ thisRet, err := bpf.Exec(t.syscallFilters[i], input)
+ if err != nil {
+ t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
+ thisRet = linux.SECCOMP_RET_KILL
+ }
+ // "If multiple filters exist, the return value for the evaluation of a
+ // given system call will always use the highest precedent value." -
+ // Documentation/prctl/seccomp_filter.txt
+ //
+ // (Note that this contradicts prctl(2): "If the filters permit prctl()
+ // calls, then additional filters can be added; they are run in order
+ // until the first non-allow result is seen." prctl(2) is incorrect.)
+ //
+ // "The ordering ensures that a min_t() over composed return values
+ // always selects the least permissive choice." -
+ // include/uapi/linux/seccomp.h
+ if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
+ ret = thisRet
+ }
+ }
+
+ return ret
+}
+
+// AppendSyscallFilter adds BPF program p as a system call filter.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) AppendSyscallFilter(p bpf.Program) error {
+ // Cap the combined length of all syscall filters (plus a penalty of 4
+ // instructions per filter beyond the first) to
+ // maxSyscallFilterInstructions. (This restriction is inherited from
+ // Linux.)
+ totalLength := p.Length()
+ for _, f := range t.syscallFilters {
+ totalLength += f.Length() + 4
+ }
+ if totalLength > maxSyscallFilterInstructions {
+ return syserror.ENOMEM
+ }
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.syscallFilters = append(t.syscallFilters, p)
+ return nil
+}
+
+// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current
+// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
+// and /proc/[pid]/status.
+func (t *Task) SeccompMode() int {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if len(t.syscallFilters) > 0 {
+ return linux.SECCOMP_MODE_FILTER
+ }
+ return linux.SECCOMP_MODE_NONE
+}