summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
authorAdin Scannell <ascannell@google.com>2018-10-10 22:39:32 -0700
committerShentubot <shentubot@google.com>2018-10-10 22:40:28 -0700
commit463e73d46d76042c39050d02cf3b0f875e55eb01 (patch)
treedbaac54c225820d0850925a8cde4d80861fce686 /pkg/sentry
parente21ba16d9cf7ba4f2d5f65651e06ab592032ef86 (diff)
Add seccomp filter configuration to ptrace stubs.
This is a defense-in-depth measure. If the sentry is compromised, this prevents system call injection to the stubs. There is some complexity with respect to ptrace and seccomp interactions, so this protection is not really available for kernel versions < 4.8; this is detected dynamically. Note that this also solves the vsyscall emulation issue by adding in appropriate trapping for those system calls. It does mean that a compromised sentry could theoretically inject these into the stub (ignoring the trap and resume, thereby allowing execution), but they are harmless. PiperOrigin-RevId: 216647581 Change-Id: Id06c232cbac1f9489b1803ec97f83097fcba8eb8
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/arch/arch_amd64.go5
-rw-r--r--pkg/sentry/platform/ptrace/BUILD2
-rw-r--r--pkg/sentry/platform/ptrace/ptrace_unsafe.go2
-rw-r--r--pkg/sentry/platform/ptrace/subprocess.go150
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_amd64.go16
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_linux.go175
-rw-r--r--pkg/sentry/strace/BUILD1
-rw-r--r--pkg/sentry/strace/strace.go11
8 files changed, 293 insertions, 69 deletions
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index f1e408af9..5ba6c19ea 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build amd64
+
package arch
import (
@@ -26,6 +28,9 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
+// Host specifies the host architecture.
+const Host = AMD64
+
// These constants come directly from Linux.
const (
// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index ceee895dc..debae058b 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -19,6 +19,8 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/filemem",
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index b55b2795a..46a8bda8e 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) {
return nil, syscall.EINVAL
}
rval, err := t.syscallIgnoreInterrupt(
- initRegs,
+ &t.initRegs,
syscall.SYS_CLONE,
arch.SyscallArgument{Value: uintptr(
syscall.CLONE_FILES |
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 035ebc332..6d5ad6b71 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -47,6 +47,11 @@ type thread struct {
tgid int32
tid int32
cpu uint32
+
+ // initRegs are the initial registers for the first thread.
+ //
+ // These are used for the register set for system calls.
+ initRegs syscall.PtraceRegs
}
// threadPool is a collection of threads.
@@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread)
type subprocess struct {
platform.NoAddressSpaceIO
- // initRegs are the initial registers for the first thread.
- //
- // These are used for the register set for system calls.
- initRegs syscall.PtraceRegs
-
// requests is used to signal creation of new threads.
requests chan chan *thread
@@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// thread, and responding to requests to make additional threads in the
// traced process. The process will be killed and reaped when the
// request channel is closed, which happens in Release below.
- var initRegs syscall.PtraceRegs
errChan := make(chan error)
requests := make(chan chan *thread)
go func() { // S/R-SAFE: Platform-related.
@@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
return
}
- // Grab registers.
- //
- // Note that we adjust the current register RIP value to be
- // just before the current system call executed. This depends
- // on the definition of the stub itself.
- if err := firstThread.getRegs(&initRegs); err != nil {
- panic(fmt.Sprintf("ptrace get regs failed: %v", err))
- }
- initRegs.Rip -= initRegsRipAdjustment
-
// Ready to handle requests.
errChan <- nil
// Wait for requests to create threads.
for r := range requests {
- t, err := firstThread.clone(&initRegs)
+ t, err := firstThread.clone(&firstThread.initRegs)
if err != nil {
// Should not happen: not recoverable.
panic(fmt.Sprintf("error initializing first thread: %v", err))
@@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// (Hopefully nobody tgkilled it with a signal <
// SIGSTOP before the SIGSTOP was delivered, in which
// case that signal would be delivered before SIGSTOP.)
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
}
- // Detach the thread without suppressing the SIGSTOP,
- // causing it to enter group-stop.
- if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
- panic(fmt.Sprintf("can't detach new clone: %v", errno))
- }
+ // Detach the thread.
+ t.detach()
// Return the thread.
r <- t
@@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
// Ready.
sp := &subprocess{
- initRegs: initRegs,
requests: requests,
sysemuThreads: threadPool{
threads: make(map[int32]*thread),
@@ -277,16 +262,48 @@ func (t *thread) attach() {
// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
// newSubprocess), so we always expect to see signal-delivery-stop with
// SIGSTOP.
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
}
// Initialize options.
t.init()
+
+ // Grab registers.
+ //
+ // Note that we adjust the current register RIP value to be just before
+ // the current system call executed. This depends on the definition of
+ // the stub itself.
+ if err := t.getRegs(&t.initRegs); err != nil {
+ panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+ }
+ t.initRegs.Rip -= initRegsRipAdjustment
}
+// detach detaches from the thread.
+//
+// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
+func (t *thread) detach() {
+ if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
+ panic(fmt.Sprintf("can't detach new clone: %v", errno))
+ }
+}
+
+// waitOutcome is used for wait below.
+type waitOutcome int
+
+const (
+ // stopped indicates that the process was stopped.
+ stopped waitOutcome = iota
+
+ // killed indicates that the process was killed.
+ killed
+)
+
// wait waits for a stop event.
-func (t *thread) wait() syscall.Signal {
+//
+// Precondition: outcome is a valid waitOutcome.
+func (t *thread) wait(outcome waitOutcome) syscall.Signal {
var status syscall.WaitStatus
for {
@@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal {
if int(r) != int(t.tid) {
panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
}
- if !status.Stopped() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
- }
- if status.StopSignal() == 0 {
- continue // Spurious stop.
+ switch outcome {
+ case stopped:
+ if !status.Stopped() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+ }
+ stopSig := status.StopSignal()
+ if stopSig == 0 {
+ continue // Spurious stop.
+ }
+ if stopSig == syscall.SIGTRAP {
+ // Re-encode the trap cause the way it's expected.
+ return stopSig | syscall.Signal(status.TrapCause()<<8)
+ }
+ // Not a trap signal.
+ return stopSig
+ case killed:
+ if !status.Exited() && !status.Signaled() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+ }
+ return syscall.Signal(status.ExitStatus())
+ default:
+ // Should not happen.
+ panic(fmt.Sprintf("unknown outcome: %v", outcome))
}
- return status.StopSignal()
}
}
+// destroy kills the thread.
+//
+// Note that this should not be used in the general case; the death of threads
+// will typically cause the death of the parent. This is a utility method for
+// manually created threads.
+func (t *thread) destroy() {
+ t.detach()
+ syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL))
+ t.wait(killed)
+}
+
// init initializes trace options.
func (t *thread) init() {
- // Set our TRACESYSGOOD option to differeniate real SIGTRAP.
+ // Set our TRACESYSGOOD option to differentiate real SIGTRAP. Also, we
+ // require the SECCOMP option to ensure that seccomp violations
+ // generate a ptrace event.
_, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
syscall.PTRACE_SETOPTIONS,
uintptr(t.tid),
0,
- syscall.PTRACE_O_TRACESYSGOOD,
+ syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP,
0, 0)
if errno != 0 {
panic(fmt.Sprintf("ptrace set options failed: %v", errno))
@@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
}
- sig := t.wait()
- if sig == (0x80 | syscall.SIGTRAP) {
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
// Reached syscall-enter-stop.
break
} else {
@@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
// between syscall-enter-stop and syscall-exit-stop; it happens *after*
// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
- if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) {
+ if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
}
@@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() {
//
// This function returns true on a system call, false on a signal.
func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
- regs := &ac.StateData().Regs
- s.resetSysemuRegs(regs)
+ // Lock the thread for ptrace operations.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
// Extract floating point state.
fpState := ac.FloatingPointData()
fpLen, _ := ac.FeatureSet().ExtendedStateSize()
useXsave := ac.FeatureSet().UseXsave()
- // Lock the thread for ptrace operations.
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
// Grab our thread from the pool.
currentTID := int32(procid.Current())
t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread)
+ // Reset necessary registers.
+ regs := &ac.StateData().Regs
+ t.resetSysemuRegs(regs)
+
// Check for interrupts, and ensure that future interrupts will signal t.
if !c.interrupt.Enable(t) {
// Pending interrupt; simulate.
@@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
// Wait for the syscall-enter stop.
- sig := t.wait()
+ sig := t.wait(stopped)
// Refresh all registers.
if err := t.getRegs(regs); err != nil {
@@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
}
// Is it a system call?
- if sig == (0x80 | syscall.SIGTRAP) {
+ if sig == (syscallEvent | syscall.SIGTRAP) {
// Ensure registers are sane.
updateSyscallRegs(regs)
return true
- }
-
- if sig == syscall.SIGSTOP {
+ } else if sig == (seccompEvent | syscall.SIGTRAP) {
+ // Seccomp is enabled, and caught the system call. This
+ // is an emulated vsyscall call, since those are caught
+ // only by seccomp and explicitly set to trace.
+ updateSyscallRegs(regs)
+ return true
+ } else if sig == syscall.SIGSTOP {
// SIGSTOP was delivered to another thread in the same thread
// group, which initiated another group stop. Just ignore it.
continue
@@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp
currentTID := int32(procid.Current())
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
- return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...)
+ return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...)
}
// MapFile implements platform.AddressSpace.MapFile.
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 8211215df..c38dc1ff8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -43,20 +43,20 @@ const (
// resetSysemuRegs sets up emulation registers.
//
// This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
- regs.Cs = s.initRegs.Cs
- regs.Ss = s.initRegs.Ss
- regs.Ds = s.initRegs.Ds
- regs.Es = s.initRegs.Es
- regs.Fs = s.initRegs.Fs
- regs.Gs = s.initRegs.Gs
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+ regs.Cs = t.initRegs.Cs
+ regs.Ss = t.initRegs.Ss
+ regs.Ds = t.initRegs.Ds
+ regs.Es = t.initRegs.Es
+ regs.Fs = t.initRegs.Fs
+ regs.Gs = t.initRegs.Gs
}
// createSyscallRegs sets up syscall registers.
//
// This should be called to generate registers for a system call.
func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
- // Copy initial registers (RIP, segments, etc.).
+ // Copy initial registers.
regs := *initRegs
// Set our syscall number.
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index b212bbdfe..53adadadd 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -21,14 +21,167 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
)
+const (
+ syscallEvent syscall.Signal = 0x80
+ seccompEvent syscall.Signal = 0x700 // 0x7 (PTRACE_SECCOMP_EVENT) << 8
+ _PTRACE_O_TRACESECCOMP = 0x80 // 1 << 0x7 (PTRACE_SECCOMP_EVENT)
+)
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is dynamic
+// because kernels may have had this behavior backported.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+ // Create a completely new, destroyable process.
+ t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO))
+ if err != nil {
+ panic(fmt.Sprintf("seccomp probe failed: %v", err))
+ }
+ defer t.destroy()
+
+ // Set registers to the yield system call. This call is not allowed
+ // by the filters specified in the attachedThread function.
+ regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+ if err := t.setRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+ }
+
+ for {
+ // Attempt an emulation.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+ }
+
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
+ // Did the seccomp errno hook already run? This would
+ // indicate that seccomp is first in line and we're
+ // less than 4.8.
+ if err := t.getRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+ }
+ if _, err := syscallReturnValue(&regs); err == nil {
+ // The seccomp errno mode ran first, and reset
+ // the error in the registers.
+ return false
+ }
+ // The seccomp hook did not run yet, and therefore it
+ // is safe to use RET_KILL mode for dispatched calls.
+ return true
+ }
+ }
+}
+
// createStub creates a fresh stub processes.
//
// Precondition: the runtime OS thread must be locked.
func createStub() (*thread, error) {
+ // The exact interactions of ptrace and seccomp are complex, and
+ // changed in recent kernel versions. Before commit 93e35efb8de45, the
+ // seccomp check is done before the ptrace emulation check. This means
+ // that any calls not matching this list will trigger the seccomp
+ // default action instead of notifying ptrace.
+ //
+ // After commit 93e35efb8de45, the seccomp check is done after the
+ // ptrace emulation check. This simplifies using SYSEMU, since seccomp
+ // will never run for emulation. Seccomp will only run for injected
+ // system calls, and thus we can use RET_KILL as our violation action.
+ var defaultAction uint32
+ if probeSeccomp() {
+ log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+ defaultAction = uint32(linux.SECCOMP_RET_KILL)
+ } else {
+ // We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+ log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+ defaultAction = uint32(linux.SECCOMP_RET_ALLOW)
+ }
+
+ // When creating the new child process, we specify SIGKILL as the
+ // signal to deliver when the child exits. We never expect a subprocess
+ // to exit; they are pooled and reused. This is done to ensure that if
+ // a subprocess is OOM-killed, this process (and all other stubs,
+ // transitively) will be killed as well. It's simply not possible to
+ // safely handle a single stub getting killed: the exact state of
+ // execution is unknown and not recoverable.
+ return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) {
+ // Create a BPF program that allows only the system calls needed by the
+ // stub and all its children. This is used to create child stubs
+ // (below), so we must include the ability to fork, but otherwise lock
+ // down available calls only to what is needed.
+ rules := []seccomp.RuleSet{
+ // Rules for trapping vsyscall access.
+ seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_GETTIMEOFDAY: {},
+ syscall.SYS_TIME: {},
+ 309: {}, // SYS_GETCPU.
+ },
+ Action: uint32(linux.SECCOMP_RET_TRACE),
+ Vsyscall: true,
+ },
+ }
+ if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) {
+ rules = append(rules, seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_CLONE: []seccomp.Rule{
+ // Allow creation of new subprocesses (used by the master).
+ {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+ // Allow creation of new threads within a single address space (used by address spaces).
+ {seccomp.AllowValue(
+ syscall.CLONE_FILES |
+ syscall.CLONE_FS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_THREAD |
+ syscall.CLONE_PTRACE |
+ syscall.CLONE_VM)},
+ },
+
+ // For the initial process creation.
+ syscall.SYS_WAIT4: {},
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ },
+ syscall.SYS_EXIT: {},
+
+ // For the stub prctl dance (all).
+ syscall.SYS_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+ },
+ syscall.SYS_GETPPID: {},
+
+ // For the stub to stop itself (all).
+ syscall.SYS_GETPID: {},
+ syscall.SYS_KILL: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+ },
+
+ // Injected to support the address space operations.
+ syscall.SYS_MMAP: {},
+ syscall.SYS_MUNMAP: {},
+ },
+ Action: uint32(linux.SECCOMP_RET_ALLOW),
+ })
+ }
+ instrs, err := seccomp.BuildProgram(rules, defaultAction)
+ if err != nil {
+ return nil, err
+ }
+
// Declare all variables up front in order to ensure that there's no
// need for allocations between beforeFork & afterFork.
var (
@@ -43,14 +196,8 @@ func createStub() (*thread, error) {
// Among other things, beforeFork masks all signals.
beforeFork()
- // When creating the new child process, we specify SIGKILL as the
- // signal to deliver when the child exits. We never expect a subprocess
- // to exit; they are pooled and reused. This is done to ensure that if
- // a subprocess is OOM-killed, this process (and all other stubs,
- // transitively) will be killed as well. It's simply not possible to
- // safely handle a single stub getting killed: the exact state of
- // execution is unknown and not recoverable.
- pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+ // Do the clone.
+ pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
if errno != 0 {
afterFork()
return nil, errno
@@ -67,7 +214,7 @@ func createStub() (*thread, error) {
tid: int32(pid),
cpu: ^uint32(0),
}
- if sig := t.wait(); sig != syscall.SIGSTOP {
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
}
t.attach()
@@ -86,6 +233,12 @@ func createStub() (*thread, error) {
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
}
+ // Set an aggressive BPF filter for the stub and all its children. See
+ // the description of the BPF program built above.
+ if errno := seccomp.SetFilter(instrs); errno != 0 {
+ syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+ }
+
// Enable cpuid-faulting; this may fail on older kernels or hardware,
// so we just disregard the result. Host CPUID will be enabled.
syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
@@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) {
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
// Pass the expected PPID to the child via R15.
- regs := s.initRegs
+ regs := t.initRegs
regs.R15 = uint64(t.tgid)
// Call fork in a subprocess.
@@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) {
// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
// If the child actually exited, the attach below will fail.
_, err = t.syscallIgnoreInterrupt(
- &s.initRegs,
+ &t.initRegs,
syscall.SYS_WAIT4,
arch.SyscallArgument{Value: uintptr(pid)},
arch.SyscallArgument{Value: 0},
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index e1c8db67a..674554081 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -24,6 +24,7 @@ go_library(
"//pkg/binary",
"//pkg/bits",
"//pkg/eventchannel",
+ "//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/kernel",
"//pkg/sentry/socket/control",
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index f2a22aaa5..a16f5490e 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -28,6 +28,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/bits"
"gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto"
@@ -699,3 +700,13 @@ func EnableAll(sinks SinkType) {
table.FeatureEnable.EnableAll(flags)
}
}
+
+func init() {
+ t, ok := Lookup(abi.Host, arch.Host)
+ if ok {
+ // Provide the native table as the lookup for seccomp
+ // debugging. This is best-effort. This is provided this way to
+ // avoid dependencies from seccomp to this package.
+ seccomp.SyscallName = t.Name
+ }
+}