author     Brian Geffon <bgeffon@google.com>   2018-08-02 08:09:03 -0700
committer  Shentubot <shentubot@google.com>    2018-08-02 08:10:30 -0700
commit     cf44aff6e08b0e19935d5cd98455b4af98fd8794
tree       b4c95523871f54a8ec739a426bb0cc84f7f11b48
parent     3cd7824410302da00d1c8c8323db8959a124814a
Add seccomp(2) support.
Add support for the seccomp syscall and the flag SECCOMP_FILTER_FLAG_TSYNC.

PiperOrigin-RevId: 207101507
Change-Id: I5eb8ba9d5ef71b0e683930a6429182726dc23175
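
Not part of the commit itself, but as context for what the new path supports: below is a minimal userspace sketch, assuming Linux/amd64 UAPI values (SYS_seccomp = 317, matching the syscall table entry added in this change; SECCOMP_SET_MODE_FILTER = 1; SECCOMP_FILTER_FLAG_TSYNC = 1; PR_SET_NO_NEW_PRIVS = 38), that installs a trivial allow-all filter with TSYNC so it applies to every thread in the group. All names here are illustrative, not gVisor APIs.

package main

import (
	"fmt"
	"runtime"
	"syscall"
	"unsafe"
)

const (
	sysSeccomp             = 317 // amd64
	seccompSetModeFilter   = 1   // SECCOMP_SET_MODE_FILTER
	seccompFilterFlagTsync = 1   // SECCOMP_FILTER_FLAG_TSYNC
	seccompRetAllow        = 0x7fff0000
	prSetNoNewPrivs        = 38 // PR_SET_NO_NEW_PRIVS
)

// sockFilter and sockFprog mirror Linux's struct sock_filter / struct
// sock_fprog (compare userSockFprog in sys_seccomp.go below).
type sockFilter struct {
	Code uint16
	Jt   uint8
	Jf   uint8
	K    uint32
}

type sockFprog struct {
	Len    uint16
	_      [6]byte // alignment padding
	Filter *sockFilter
}

func main() {
	// Stay on one OS thread so the no_new_privs bit and the seccomp call
	// apply to the same thread; TSYNC then propagates the filter to the
	// rest of the thread group.
	runtime.LockOSThread()

	// A single BPF_RET|BPF_K instruction returning SECCOMP_RET_ALLOW.
	insns := []sockFilter{{Code: 0x06, K: seccompRetAllow}}
	fprog := sockFprog{Len: uint16(len(insns)), Filter: &insns[0]}

	// Installing a filter without CAP_SYS_ADMIN requires no_new_privs.
	if _, _, errno := syscall.RawSyscall6(syscall.SYS_PRCTL, prSetNoNewPrivs, 1, 0, 0, 0, 0); errno != 0 {
		panic(errno)
	}
	if _, _, errno := syscall.RawSyscall(sysSeccomp, seccompSetModeFilter,
		seccompFilterFlagTsync, uintptr(unsafe.Pointer(&fprog))); errno != 0 {
		panic(errno)
	}
	fmt.Println("allow-all filter installed for every thread in the group")
}
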
-rw-r--r--  pkg/sentry/kernel/BUILD                       1
-rw-r--r--  pkg/sentry/kernel/seccomp.go                 62
-rw-r--r--  pkg/sentry/kernel/task.go                    19
-rw-r--r--  pkg/sentry/kernel/task_clone.go               5
-rw-r--r--  pkg/sentry/kernel/task_syscall.go             4
-rw-r--r--  pkg/sentry/syscalls/linux/BUILD               1
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go          1
-rw-r--r--  pkg/sentry/syscalls/linux/sys_prctl.go       32
-rw-r--r--  pkg/sentry/syscalls/linux/sys_seccomp.go     82
9 files changed, 160 insertions(+), 47 deletions(-)
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c4a7dacb2..1c1633068 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -47,6 +47,7 @@ go_stateify(
],
out = "kernel_autogen_state.go",
imports = [
+ "gvisor.googlesource.com/gvisor/pkg/bpf",
"gvisor.googlesource.com/gvisor/pkg/sentry/arch",
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs",
"gvisor.googlesource.com/gvisor/pkg/tcpip",
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index b7c4a507f..d77c05e2f 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -144,10 +144,15 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i
input := data.asBPFInput()
ret := uint32(linux.SECCOMP_RET_ALLOW)
+ f := t.syscallFilters.Load()
+ if f == nil {
+ return ret
+ }
+
// "Every filter successfully installed will be evaluated (in reverse
// order) for each system call the task makes." - kernel/seccomp.c
- for i := len(t.syscallFilters) - 1; i >= 0; i-- {
- thisRet, err := bpf.Exec(t.syscallFilters[i], input)
+ for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- {
+ thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input)
if err != nil {
t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
thisRet = linux.SECCOMP_RET_KILL
@@ -180,15 +185,53 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error {
// maxSyscallFilterInstructions. (This restriction is inherited from
// Linux.)
totalLength := p.Length()
- for _, f := range t.syscallFilters {
- totalLength += f.Length() + 4
+ var newFilters []bpf.Program
+
+ // Although syscallFilters is an atomic.Value, we must take the mutex to
+ // prevent our read-copy-update from racing with another task syncing
+ // syscall filters to us; this keeps the filters in a consistent state.
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if sf := t.syscallFilters.Load(); sf != nil {
+ oldFilters := sf.([]bpf.Program)
+ for _, f := range oldFilters {
+ totalLength += f.Length() + 4
+ }
+ newFilters = append(newFilters, oldFilters...)
}
+
if totalLength > maxSyscallFilterInstructions {
return syserror.ENOMEM
}
- t.mu.Lock()
- defer t.mu.Unlock()
- t.syscallFilters = append(t.syscallFilters, p)
+
+ newFilters = append(newFilters, p)
+ t.syscallFilters.Store(newFilters)
+ return nil
+}
+
+// SyncSyscallFiltersToThreadGroup copies this task's filters to all other
+// tasks in its thread group.
+func (t *Task) SyncSyscallFiltersToThreadGroup() error {
+ f := t.syscallFilters.Load()
+
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+
+ // Note: No new privs is always assumed to be set.
+ for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() {
+ if ot.ThreadID() != t.ThreadID() {
+ // We must take the other task's mutex to prevent it from
+ // appending to its own syscall filters while we're syncing.
+ ot.mu.Lock()
+ var copiedFilters []bpf.Program
+ if f != nil {
+ copiedFilters = append(copiedFilters, f.([]bpf.Program)...)
+ }
+ ot.syscallFilters.Store(copiedFilters)
+ ot.mu.Unlock()
+ }
+ }
return nil
}
@@ -196,9 +239,8 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error {
// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
// and /proc/[pid]/status.
func (t *Task) SeccompMode() int {
- t.mu.Lock()
- defer t.mu.Unlock()
- if len(t.syscallFilters) > 0 {
+ f := t.syscallFilters.Load()
+ if f != nil && len(f.([]bpf.Program)) > 0 {
return linux.SECCOMP_MODE_FILTER
}
return linux.SECCOMP_MODE_NONE
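
The seccomp.go changes above replace the mutex-guarded slice with a copy-on-write scheme: readers Load() the current []bpf.Program from an atomic.Value without taking the task mutex, while writers take mu, copy the slice, extend the copy, and Store() the new snapshot. A minimal standalone sketch of that pattern (illustrative names only; strings stand in for bpf.Program):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// filterSet holds an immutable snapshot of installed filters. Readers never
// mutate a stored slice; writers replace it wholesale.
type filterSet struct {
	mu      sync.Mutex   // serializes writers only
	filters atomic.Value // stores []string (stand-in for []bpf.Program)
}

// load is the lock-free fast path used on every "syscall".
func (s *filterSet) load() []string {
	if f := s.filters.Load(); f != nil {
		return f.([]string)
	}
	return nil
}

// append performs the read-copy-update: copy the old slice, extend the copy,
// and publish it atomically so concurrent readers see either the old or the
// new snapshot, never a partially written one.
func (s *filterSet) append(f string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	old := s.load()
	next := append(append([]string(nil), old...), f)
	s.filters.Store(next)
}

func main() {
	var s filterSet
	s.append("filter-1")
	s.append("filter-2")
	// Evaluate in reverse installation order, as the kernel comment quoted
	// in evaluateSyscallFilters describes.
	fs := s.load()
	for i := len(fs) - 1; i >= 0; i-- {
		fmt.Println(fs[i])
	}
}
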
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 7f6735320..e705260da 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -355,11 +355,11 @@ type Task struct {
parentDeathSignal linux.Signal
// syscallFilters is all seccomp-bpf syscall filters applicable to the
- // task, in the order in which they were installed.
+ // task, in the order in which they were installed. The type of the atomic
+ // is []bpf.Program. Writing needs to be protected by mu.
//
- // syscallFilters is protected by mu. syscallFilters is owned by the task
- // goroutine.
- syscallFilters []bpf.Program
+ // syscallFilters is owned by the task goroutine.
+ syscallFilters atomic.Value `state:".([]bpf.Program)"`
// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
// task's virtual address space; when the task exits, set the pointed-to
@@ -469,6 +469,17 @@ func (t *Task) loadLogPrefix(prefix string) {
t.logPrefix.Store(prefix)
}
+func (t *Task) saveSyscallFilters() []bpf.Program {
+ if f := t.syscallFilters.Load(); f != nil {
+ return f.([]bpf.Program)
+ }
+ return nil
+}
+
+func (t *Task) loadSyscallFilters(filters []bpf.Program) {
+ t.syscallFilters.Store(filters)
+}
+
// afterLoad is invoked by stateify.
func (t *Task) afterLoad() {
t.interruptChan = make(chan struct{}, 1)
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index a61283267..3b77a4965 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -280,7 +280,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// "If fork/clone and execve are allowed by @prog, any child processes will
// be constrained to the same filters and system call ABI as the parent." -
// Documentation/prctl/seccomp_filter.txt
- nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...)
+ if f := t.syscallFilters.Load(); f != nil {
+ copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
+ nt.syscallFilters.Store(copiedFilters)
+ }
if opts.Vfork {
nt.vforkParent = t
}
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 79f4ff60c..92ca0acd9 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,7 +194,7 @@ func (t *Task) doSyscall() taskRunState {
// Check seccomp filters. The nil check is for performance (as seccomp use
// is rare), not needed for correctness.
- if t.syscallFilters != nil {
+ if t.syscallFilters.Load() != nil {
switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
case seccompResultDeny:
t.Debugf("Syscall %d: denied by seccomp", sysno)
@@ -334,7 +334,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// to syscall ABI because they both use RDI, RSI, and RDX for the first three
// arguments and none of the vsyscalls uses more than two arguments.
args := t.Arch().SyscallArgs()
- if t.syscallFilters != nil {
+ if t.syscallFilters.Load() != nil {
switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
case seccompResultDeny:
t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 574621ad2..32fca3811 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,6 +42,7 @@ go_library(
"sys_rlimit.go",
"sys_rusage.go",
"sys_sched.go",
+ "sys_seccomp.go",
"sys_sem.go",
"sys_shm.go",
"sys_signal.go",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index edfcdca3f..c102af101 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -359,6 +359,7 @@ var AMD64 = &kernel.SyscallTable{
312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace
313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module
// "Backports."
+ 317: Seccomp,
318: GetRandom,
},
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 2ca7471cf..911fef658 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -18,29 +18,13 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
- "gvisor.googlesource.com/gvisor/pkg/bpf"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
-// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
-type userSockFprog struct {
- // Len is the length of the filter in BPF instructions.
- Len uint16
-
- _ [6]byte // padding for alignment
-
- // Filter is a user pointer to the struct sock_filter array that makes up
- // the filter program. Filter is a uint64 rather than a usermem.Addr
- // because usermem.Addr is actually uintptr, which is not a fixed-size
- // type, and encoding/binary.Read objects to this.
- Filter uint64
-}
-
// Prctl implements linux syscall prctl(2).
// It has a list of subfunctions which operate on the process. The arguments are
// all based on each subfunction.
@@ -143,20 +127,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Unsupported mode.
return 0, nil, syscall.EINVAL
}
- var fprog userSockFprog
- if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil {
- return 0, nil, err
- }
- filter := make([]linux.BPFInstruction, int(fprog.Len))
- if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
- return 0, nil, err
- }
- compiledFilter, err := bpf.Compile(filter)
- if err != nil {
- t.Debugf("Invalid seccomp-bpf filter: %v", err)
- return 0, nil, syscall.EINVAL
- }
- return 0, nil, t.AppendSyscallFilter(compiledFilter)
+
+ return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer())
case linux.PR_GET_SECCOMP:
return uintptr(t.SeccompMode()), nil, nil
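
With the prctl change above, PR_SET_SECCOMP simply delegates to the same helper as seccomp(2), and PR_GET_SECCOMP reports the resulting mode via Task.SeccompMode. A small hedged sketch of querying that mode from userspace, assuming the Linux UAPI values PR_GET_SECCOMP = 21 and SECCOMP_MODE_FILTER = 2 (illustrative code, not part of this change):

package main

import (
	"fmt"
	"syscall"
)

const (
	prGetSeccomp      = 21 // PR_GET_SECCOMP
	seccompModeFilter = 2  // SECCOMP_MODE_FILTER
)

// seccompMode queries the calling thread's seccomp mode, the same value
// Task.SeccompMode returns for PR_GET_SECCOMP above.
func seccompMode() (int, error) {
	mode, _, errno := syscall.RawSyscall6(syscall.SYS_PRCTL, prGetSeccomp, 0, 0, 0, 0, 0)
	if errno != 0 {
		return 0, errno
	}
	return int(mode), nil
}

func main() {
	mode, err := seccompMode()
	if err != nil {
		fmt.Println("PR_GET_SECCOMP:", err)
		return
	}
	// After installing a filter via either prctl(PR_SET_SECCOMP,
	// SECCOMP_MODE_FILTER, ...) or seccomp(SECCOMP_SET_MODE_FILTER, ...),
	// this reports SECCOMP_MODE_FILTER (2).
	fmt.Println("seccomp mode:", mode, "filtered:", mode == seccompModeFilter)
}
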
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go
new file mode 100644
index 000000000..4323a4df4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_seccomp.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
+type userSockFprog struct {
+ // Len is the length of the filter in BPF instructions.
+ Len uint16
+
+ _ [6]byte // padding for alignment
+
+ // Filter is a user pointer to the struct sock_filter array that makes up
+ // the filter program. Filter is a uint64 rather than a usermem.Addr
+ // because usermem.Addr is actually uintptr, which is not a fixed-size
+ // type, and encoding/binary.Read objects to this.
+ Filter uint64
+}
+
+// seccomp applies a seccomp policy to the current task.
+func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error {
+ // We only support SECCOMP_SET_MODE_FILTER at the moment.
+ if mode != linux.SECCOMP_SET_MODE_FILTER {
+ // Unsupported mode.
+ return syscall.EINVAL
+ }
+
+ tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0
+
+ // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC.
+ if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 {
+ // Unsupported flag.
+ return syscall.EINVAL
+ }
+
+ var fprog userSockFprog
+ if _, err := t.CopyIn(addr, &fprog); err != nil {
+ return err
+ }
+ filter := make([]linux.BPFInstruction, int(fprog.Len))
+ if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil {
+ return err
+ }
+ compiledFilter, err := bpf.Compile(filter)
+ if err != nil {
+ t.Debugf("Invalid seccomp-bpf filter: %v", err)
+ return syscall.EINVAL
+ }
+
+ err = t.AppendSyscallFilter(compiledFilter)
+ if err == nil && tsync {
+ // Now we must copy this seccomp program to all other threads.
+ err = t.SyncSyscallFiltersToThreadGroup()
+ }
+ return err
+}
+
+// Seccomp implements linux syscall seccomp(2).
+func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer())
+}