diff options
author | Googler <noreply@google.com> | 2018-04-27 10:37:02 -0700 |
---|---|---|
committer | Adin Scannell <ascannell@google.com> | 2018-04-28 01:44:26 -0400 |
commit | d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch) | |
tree | 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/seccomp.go | |
parent | f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff) |
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/kernel/seccomp.go')
-rw-r--r-- | pkg/sentry/kernel/seccomp.go | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..b7c4a507f --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,205 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const maxSyscallFilterInstructions = 1 << 15 + +type seccompResult int + +const ( + // seccompResultDeny indicates that a syscall should not be executed. + seccompResultDeny seccompResult = iota + + // seccompResultAllow indicates that a syscall should be executed. + seccompResultAllow + + // seccompResultKill indicates that the task should be killed immediately, + // with the exit status indicating that the task was killed by SIGSYS. + seccompResultKill + + // seccompResultTrace indicates that a ptracer was successfully notified as + // a result of a SECCOMP_RET_TRACE. + seccompResultTrace +) + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult { + result := t.evaluateSyscallFilters(sysno, args, ip) + switch result & linux.SECCOMP_RET_ACTION { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip)) + return seccompResultDeny + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA)) + return seccompResultDeny + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." + if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) { + return seccompResultTrace + } + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return seccompResultDeny + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + return seccompResultAllow + + case linux.SECCOMP_RET_KILL: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." + fallthrough + default: // consistent with Linux + return seccompResultKill + } +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(t.syscallFilters) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(t.syscallFilters[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = linux.SECCOMP_RET_KILL + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program) error { + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to + // maxSyscallFilterInstructions. (This restriction is inherited from + // Linux.) + totalLength := p.Length() + for _, f := range t.syscallFilters { + totalLength += f.Length() + 4 + } + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + t.mu.Lock() + defer t.mu.Unlock() + t.syscallFilters = append(t.syscallFilters, p) + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. +func (t *Task) SeccompMode() int { + t.mu.Lock() + defer t.mu.Unlock() + if len(t.syscallFilters) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} |