summary | refs | log | tree | commit | diff | homepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
author: Brian Geffon <bgeffon@google.com> 2018-08-02 08:09:03 -0700
committer: Shentubot <shentubot@google.com> 2018-08-02 08:10:30 -0700
commit: cf44aff6e08b0e19935d5cd98455b4af98fd8794 (patch)
tree: b4c95523871f54a8ec739a426bb0cc84f7f11b48 /pkg/sentry/kernel
parent: 3cd7824410302da00d1c8c8323db8959a124814a (diff)
Add seccomp(2) support.
Add support for the seccomp syscall and the flag SECCOMP_FILTER_FLAG_TSYNC.

PiperOrigin-RevId: 207101507
Change-Id: I5eb8ba9d5ef71b0e683930a6429182726dc23175
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- pkg/sentry/kernel/BUILD          |  1
-rw-r--r-- pkg/sentry/kernel/seccomp.go     | 62
-rw-r--r-- pkg/sentry/kernel/task.go        | 19
-rw-r--r-- pkg/sentry/kernel/task_clone.go  |  5
-rw-r--r-- pkg/sentry/kernel/task_syscall.go|  4
5 files changed, 74 insertions(+), 17 deletions(-)
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c4a7dacb2..1c1633068 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -47,6 +47,7 @@ go_stateify(
],
out = "kernel_autogen_state.go",
imports = [
+ "gvisor.googlesource.com/gvisor/pkg/bpf",
"gvisor.googlesource.com/gvisor/pkg/sentry/arch",
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs",
"gvisor.googlesource.com/gvisor/pkg/tcpip",
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index b7c4a507f..d77c05e2f 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -144,10 +144,15 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i
input := data.asBPFInput()
ret := uint32(linux.SECCOMP_RET_ALLOW)
+ f := t.syscallFilters.Load()
+ if f == nil {
+ return ret
+ }
+
// "Every filter successfully installed will be evaluated (in reverse
// order) for each system call the task makes." - kernel/seccomp.c
- for i := len(t.syscallFilters) - 1; i >= 0; i-- {
- thisRet, err := bpf.Exec(t.syscallFilters[i], input)
+ for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- {
+ thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input)
if err != nil {
t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
thisRet = linux.SECCOMP_RET_KILL
@@ -180,15 +185,53 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error {
// maxSyscallFilterInstructions. (This restriction is inherited from
// Linux.)
totalLength := p.Length()
- for _, f := range t.syscallFilters {
- totalLength += f.Length() + 4
+ var newFilters []bpf.Program
+
+ // While syscallFilters are an atomic.Value we must take the mutex to
+ // prevent our read-copy-update from happening while another task
+ // is syncing syscall filters to us, this keeps the filters in a
+ // consistent state.
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if sf := t.syscallFilters.Load(); sf != nil {
+ oldFilters := sf.([]bpf.Program)
+ for _, f := range oldFilters {
+ totalLength += f.Length() + 4
+ }
+ newFilters = append(newFilters, oldFilters...)
}
+
if totalLength > maxSyscallFilterInstructions {
return syserror.ENOMEM
}
- t.mu.Lock()
- defer t.mu.Unlock()
- t.syscallFilters = append(t.syscallFilters, p)
+
+ newFilters = append(newFilters, p)
+ t.syscallFilters.Store(newFilters)
+ return nil
+}
+
+// SyncSyscallFiltersToThreadGroup will copy this task's filters to all other
+// threads in our thread group.
+func (t *Task) SyncSyscallFiltersToThreadGroup() error {
+ f := t.syscallFilters.Load()
+
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+
+ // Note: No new privs is always assumed to be set.
+ for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() {
+ if ot.ThreadID() != t.ThreadID() {
+ // We must take the other task's mutex to prevent it from
+ // appending to its own syscall filters while we're syncing.
+ ot.mu.Lock()
+ var copiedFilters []bpf.Program
+ if f != nil {
+ copiedFilters = append(copiedFilters, f.([]bpf.Program)...)
+ }
+ ot.syscallFilters.Store(copiedFilters)
+ ot.mu.Unlock()
+ }
+ }
return nil
}
@@ -196,9 +239,8 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error {
// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
// and /proc/[pid]/status.
func (t *Task) SeccompMode() int {
- t.mu.Lock()
- defer t.mu.Unlock()
- if len(t.syscallFilters) > 0 {
+ f := t.syscallFilters.Load()
+ if f != nil && len(f.([]bpf.Program)) > 0 {
return linux.SECCOMP_MODE_FILTER
}
return linux.SECCOMP_MODE_NONE
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 7f6735320..e705260da 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -355,11 +355,11 @@ type Task struct {
parentDeathSignal linux.Signal
// syscallFilters is all seccomp-bpf syscall filters applicable to the
- // task, in the order in which they were installed.
+ // task, in the order in which they were installed. The type of the atomic
+ // is []bpf.Program. Writing needs to be protected by mu.
//
- // syscallFilters is protected by mu. syscallFilters is owned by the task
- // goroutine.
- syscallFilters []bpf.Program
+ // syscallFilters is owned by the task goroutine.
+ syscallFilters atomic.Value `state:".([]bpf.Program)"`
// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
// task's virtual address space; when the task exits, set the pointed-to
@@ -469,6 +469,17 @@ func (t *Task) loadLogPrefix(prefix string) {
t.logPrefix.Store(prefix)
}
+func (t *Task) saveSyscallFilters() []bpf.Program {
+ if f := t.syscallFilters.Load(); f != nil {
+ return f.([]bpf.Program)
+ }
+ return nil
+}
+
+func (t *Task) loadSyscallFilters(filters []bpf.Program) {
+ t.syscallFilters.Store(filters)
+}
+
// afterLoad is invoked by stateify.
func (t *Task) afterLoad() {
t.interruptChan = make(chan struct{}, 1)
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index a61283267..3b77a4965 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -280,7 +280,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// "If fork/clone and execve are allowed by @prog, any child processes will
// be constrained to the same filters and system call ABI as the parent." -
// Documentation/prctl/seccomp_filter.txt
- nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...)
+ if f := t.syscallFilters.Load(); f != nil {
+ copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
+ nt.syscallFilters.Store(copiedFilters)
+ }
if opts.Vfork {
nt.vforkParent = t
}
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index 79f4ff60c..92ca0acd9 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -194,7 +194,7 @@ func (t *Task) doSyscall() taskRunState {
// Check seccomp filters. The nil check is for performance (as seccomp use
// is rare), not needed for correctness.
- if t.syscallFilters != nil {
+ if t.syscallFilters.Load() != nil {
switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
case seccompResultDeny:
t.Debugf("Syscall %d: denied by seccomp", sysno)
@@ -334,7 +334,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
// to syscall ABI because they both use RDI, RSI, and RDX for the first three
// arguments and none of the vsyscalls uses more than two arguments.
args := t.Arch().SyscallArgs()
- if t.syscallFilters != nil {
+ if t.syscallFilters.Load() != nil {
switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
case seccompResultDeny:
t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))