diff options
author | Brian Geffon <bgeffon@google.com> | 2018-08-02 08:09:03 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-08-02 08:10:30 -0700 |
commit | cf44aff6e08b0e19935d5cd98455b4af98fd8794 (patch) | |
tree | b4c95523871f54a8ec739a426bb0cc84f7f11b48 | |
parent | 3cd7824410302da00d1c8c8323db8959a124814a (diff) |
Add seccomp(2) support.
Add support for the seccomp syscall and the flag SECCOMP_FILTER_FLAG_TSYNC.
PiperOrigin-RevId: 207101507
Change-Id: I5eb8ba9d5ef71b0e683930a6429182726dc23175
-rw-r--r-- | pkg/sentry/kernel/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/seccomp.go | 62 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 19 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 5 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_syscall.go | 4 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_prctl.go | 32 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_seccomp.go | 82 |
9 files changed, 160 insertions, 47 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c4a7dacb2..1c1633068 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -47,6 +47,7 @@ go_stateify( ], out = "kernel_autogen_state.go", imports = [ + "gvisor.googlesource.com/gvisor/pkg/bpf", "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", "gvisor.googlesource.com/gvisor/pkg/tcpip", diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index b7c4a507f..d77c05e2f 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -144,10 +144,15 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i input := data.asBPFInput() ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + // "Every filter successfully installed will be evaluated (in reverse // order) for each system call the task makes." - kernel/seccomp.c - for i := len(t.syscallFilters) - 1; i >= 0; i-- { - thisRet, err := bpf.Exec(t.syscallFilters[i], input) + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) thisRet = linux.SECCOMP_RET_KILL @@ -180,15 +185,53 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // maxSyscallFilterInstructions. (This restriction is inherited from // Linux.) totalLength := p.Length() - for _, f := range t.syscallFilters { - totalLength += f.Length() + 4 + var newFilters []bpf.Program + + // While syscallFilters are an atomic.Value we must take the mutex to + // prevent our read-copy-update from happening while another task + // is syncing syscall filters to us, this keeps the filters in a + // consistent state. + t.mu.Lock() + defer t.mu.Unlock() + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) } + if totalLength > maxSyscallFilterInstructions { return syserror.ENOMEM } - t.mu.Lock() - defer t.mu.Unlock() - t.syscallFilters = append(t.syscallFilters, p) + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + return nil +} + +// SyncSyscallFiltersToThreadGroup will copy this task's filters to all other +// threads in our thread group. +func (t *Task) SyncSyscallFiltersToThreadGroup() error { + f := t.syscallFilters.Load() + + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot.ThreadID() != t.ThreadID() { + // We must take the other task's mutex to prevent it from + // appending to its own syscall filters while we're syncing. + ot.mu.Lock() + var copiedFilters []bpf.Program + if f != nil { + copiedFilters = append(copiedFilters, f.([]bpf.Program)...) + } + ot.syscallFilters.Store(copiedFilters) + ot.mu.Unlock() + } + } return nil } @@ -196,9 +239,8 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) // and /proc/[pid]/status. func (t *Task) SeccompMode() int { - t.mu.Lock() - defer t.mu.Unlock() - if len(t.syscallFilters) > 0 { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { return linux.SECCOMP_MODE_FILTER } return linux.SECCOMP_MODE_NONE diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 7f6735320..e705260da 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -355,11 +355,11 @@ type Task struct { parentDeathSignal linux.Signal // syscallFilters is all seccomp-bpf syscall filters applicable to the - // task, in the order in which they were installed. + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by mu. // - // syscallFilters is protected by mu. syscallFilters is owned by the task - // goroutine. - syscallFilters []bpf.Program + // syscallFilters is owned by the task goroutine. + syscallFilters atomic.Value `state:".([]bpf.Program)"` // If cleartid is non-zero, treat it as a pointer to a ThreadID in the // task's virtual address space; when the task exits, set the pointed-to @@ -469,6 +469,17 @@ func (t *Task) loadLogPrefix(prefix string) { t.logPrefix.Store(prefix) } +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + // afterLoad is invoked by stateify. func (t *Task) afterLoad() { t.interruptChan = make(chan struct{}, 1) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index a61283267..3b77a4965 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -280,7 +280,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - // Documentation/prctl/seccomp_filter.txt - nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } if opts.Vfork { nt.vforkParent = t } diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 79f4ff60c..92ca0acd9 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -194,7 +194,7 @@ func (t *Task) doSyscall() taskRunState { // Check seccomp filters. The nil check is for performance (as seccomp use // is rare), not needed for correctness. - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { case seccompResultDeny: t.Debugf("Syscall %d: denied by seccomp", sysno) @@ -334,7 +334,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { // to syscall ABI because they both use RDI, RSI, and RDX for the first three // arguments and none of the vsyscalls uses more than two arguments. args := t.Arch().SyscallArgs() - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { case seccompResultDeny: t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 574621ad2..32fca3811 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -42,6 +42,7 @@ go_library( "sys_rlimit.go", "sys_rusage.go", "sys_sched.go", + "sys_seccomp.go", "sys_sem.go", "sys_shm.go", "sys_signal.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index edfcdca3f..c102af101 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -359,6 +359,7 @@ var AMD64 = &kernel.SyscallTable{ 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module // "Backports." + 317: Seccomp, 318: GetRandom, }, diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 2ca7471cf..911fef658 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -18,29 +18,13 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. -type userSockFprog struct { - // Len is the length of the filter in BPF instructions. - Len uint16 - - _ [6]byte // padding for alignment - - // Filter is a user pointer to the struct sock_filter array that makes up - // the filter program. Filter is a uint64 rather than a usermem.Addr - // because usermem.Addr is actually uintptr, which is not a fixed-size - // type, and encoding/binary.Read objects to this. - Filter uint64 -} - // Prctl implements linux syscall prctl(2). // It has a list of subfunctions which operate on the process. The arguments are // all based on each subfunction. @@ -143,20 +127,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Unsupported mode. return 0, nil, syscall.EINVAL } - var fprog userSockFprog - if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil { - return 0, nil, err - } - filter := make([]linux.BPFInstruction, int(fprog.Len)) - if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { - return 0, nil, err - } - compiledFilter, err := bpf.Compile(filter) - if err != nil { - t.Debugf("Invalid seccomp-bpf filter: %v", err) - return 0, nil, syscall.EINVAL - } - return 0, nil, t.AppendSyscallFilter(compiledFilter) + + return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) case linux.PR_GET_SECCOMP: return uintptr(t.SeccompMode()), nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go new file mode 100644 index 000000000..4323a4df4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +type userSockFprog struct { + // Len is the length of the filter in BPF instructions. + Len uint16 + + _ [6]byte // padding for alignment + + // Filter is a user pointer to the struct sock_filter array that makes up + // the filter program. Filter is a uint64 rather than a usermem.Addr + // because usermem.Addr is actually uintptr, which is not a fixed-size + // type, and encoding/binary.Read objects to this. + Filter uint64 +} + +// seccomp applies a seccomp policy to the current task. +func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { + // We only support SECCOMP_SET_MODE_FILTER at the moment. + if mode != linux.SECCOMP_SET_MODE_FILTER { + // Unsupported mode. + return syscall.EINVAL + } + + tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 + + // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. + if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { + // Unsupported flag. + return syscall.EINVAL + } + + var fprog userSockFprog + if _, err := t.CopyIn(addr, &fprog); err != nil { + return err + } + filter := make([]linux.BPFInstruction, int(fprog.Len)) + if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + return err + } + compiledFilter, err := bpf.Compile(filter) + if err != nil { + t.Debugf("Invalid seccomp-bpf filter: %v", err) + return syscall.EINVAL + } + + err = t.AppendSyscallFilter(compiledFilter) + if err == nil && tsync { + // Now we must copy this seccomp program to all other threads. + err = t.SyncSyscallFiltersToThreadGroup() + } + return err +} + +// Seccomp implements linux syscall seccomp(2). +func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer()) +} |