From cf44aff6e08b0e19935d5cd98455b4af98fd8794 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 2 Aug 2018 08:09:03 -0700 Subject: Add seccomp(2) support. Add support for the seccomp syscall and the flag SECCOMP_FILTER_FLAG_TSYNC. PiperOrigin-RevId: 207101507 Change-Id: I5eb8ba9d5ef71b0e683930a6429182726dc23175 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/seccomp.go | 62 ++++++++++++++++++++++++++++++++------- pkg/sentry/kernel/task.go | 19 +++++++++--- pkg/sentry/kernel/task_clone.go | 5 +++- pkg/sentry/kernel/task_syscall.go | 4 +-- 5 files changed, 74 insertions(+), 17 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c4a7dacb2..1c1633068 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -47,6 +47,7 @@ go_stateify( ], out = "kernel_autogen_state.go", imports = [ + "gvisor.googlesource.com/gvisor/pkg/bpf", "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", "gvisor.googlesource.com/gvisor/pkg/tcpip", diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index b7c4a507f..d77c05e2f 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -144,10 +144,15 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i input := data.asBPFInput() ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + // "Every filter successfully installed will be evaluated (in reverse // order) for each system call the task makes." - kernel/seccomp.c - for i := len(t.syscallFilters) - 1; i >= 0; i-- { - thisRet, err := bpf.Exec(t.syscallFilters[i], input) + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) thisRet = linux.SECCOMP_RET_KILL @@ -180,15 +185,53 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // maxSyscallFilterInstructions. (This restriction is inherited from // Linux.) totalLength := p.Length() - for _, f := range t.syscallFilters { - totalLength += f.Length() + 4 + var newFilters []bpf.Program + + // While syscallFilters are an atomic.Value we must take the mutex to + // prevent our read-copy-update from happening while another task + // is syncing syscall filters to us, this keeps the filters in a + // consistent state. + t.mu.Lock() + defer t.mu.Unlock() + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) } + if totalLength > maxSyscallFilterInstructions { return syserror.ENOMEM } - t.mu.Lock() - defer t.mu.Unlock() - t.syscallFilters = append(t.syscallFilters, p) + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + return nil +} + +// SyncSyscallFiltersToThreadGroup will copy this task's filters to all other +// threads in our thread group. +func (t *Task) SyncSyscallFiltersToThreadGroup() error { + f := t.syscallFilters.Load() + + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot.ThreadID() != t.ThreadID() { + // We must take the other task's mutex to prevent it from + // appending to its own syscall filters while we're syncing. + ot.mu.Lock() + var copiedFilters []bpf.Program + if f != nil { + copiedFilters = append(copiedFilters, f.([]bpf.Program)...) + } + ot.syscallFilters.Store(copiedFilters) + ot.mu.Unlock() + } + } return nil } @@ -196,9 +239,8 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) // and /proc/[pid]/status. func (t *Task) SeccompMode() int { - t.mu.Lock() - defer t.mu.Unlock() - if len(t.syscallFilters) > 0 { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { return linux.SECCOMP_MODE_FILTER } return linux.SECCOMP_MODE_NONE diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 7f6735320..e705260da 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -355,11 +355,11 @@ type Task struct { parentDeathSignal linux.Signal // syscallFilters is all seccomp-bpf syscall filters applicable to the - // task, in the order in which they were installed. + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by mu. // - // syscallFilters is protected by mu. syscallFilters is owned by the task - // goroutine. - syscallFilters []bpf.Program + // syscallFilters is owned by the task goroutine. + syscallFilters atomic.Value `state:".([]bpf.Program)"` // If cleartid is non-zero, treat it as a pointer to a ThreadID in the // task's virtual address space; when the task exits, set the pointed-to @@ -469,6 +469,17 @@ func (t *Task) loadLogPrefix(prefix string) { t.logPrefix.Store(prefix) } +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + // afterLoad is invoked by stateify. func (t *Task) afterLoad() { t.interruptChan = make(chan struct{}, 1) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index a61283267..3b77a4965 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -280,7 +280,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - // Documentation/prctl/seccomp_filter.txt - nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } if opts.Vfork { nt.vforkParent = t } diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 79f4ff60c..92ca0acd9 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -194,7 +194,7 @@ func (t *Task) doSyscall() taskRunState { // Check seccomp filters. The nil check is for performance (as seccomp use // is rare), not needed for correctness. - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { case seccompResultDeny: t.Debugf("Syscall %d: denied by seccomp", sysno) @@ -334,7 +334,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { // to syscall ABI because they both use RDI, RSI, and RDX for the first three // arguments and none of the vsyscalls uses more than two arguments. args := t.Arch().SyscallArgs() - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { case seccompResultDeny: t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) -- cgit v1.2.3