summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/task_syscall.go
diff options
context:
space:
mode:
authorGoogler <noreply@google.com>2018-04-27 10:37:02 -0700
committerAdin Scannell <ascannell@google.com>2018-04-28 01:44:26 -0400
commitd02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/task_syscall.go
parentf70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/kernel/task_syscall.go')
-rw-r--r--pkg/sentry/kernel/task_syscall.go434
1 files changed, 434 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
new file mode 100644
index 000000000..79f4ff60c
--- /dev/null
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -0,0 +1,434 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "os"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel
+// include/linux/errno.h. These errnos are never returned to userspace
+// directly, but are used to communicate the expected behavior of an
+// interrupted syscall from the syscall to signal handling.
+type SyscallRestartErrno int
+
+// These numeric values are significant because ptrace syscall exit tracing can
+// observe them.
+//
+// For all of the following errnos, if the syscall is not interrupted by a
+// signal delivered to a user handler, the syscall is restarted.
+const (
+ // ERESTARTSYS is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler without SA_RESTART set, and restarted otherwise.
+ ERESTARTSYS = SyscallRestartErrno(512)
+
+ // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
+ // should always be restarted.
+ ERESTARTNOINTR = SyscallRestartErrno(513)
+
+ // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler, and restarted otherwise.
+ ERESTARTNOHAND = SyscallRestartErrno(514)
+
+ // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
+ // that it should be restarted using a custom function. The interrupted
+ // syscall must register a custom restart function by calling
+ // Task.SetRestartSyscallFn.
+ ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
+)
+
+// Error implements error.Error.
+func (e SyscallRestartErrno) Error() string {
+ // Descriptions are borrowed from strace.
+ switch e {
+ case ERESTARTSYS:
+ return "to be restarted if SA_RESTART is set"
+ case ERESTARTNOINTR:
+ return "to be restarted"
+ case ERESTARTNOHAND:
+ return "to be restarted if no handler"
+ case ERESTART_RESTARTBLOCK:
+ return "interrupted by signal"
+ default:
+ return "(unknown interrupt error)"
+ }
+}
+
+// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
+// rv, the value in a syscall return register.
+func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
+ switch int(rv) {
+ case -int(ERESTARTSYS):
+ return ERESTARTSYS, true
+ case -int(ERESTARTNOINTR):
+ return ERESTARTNOINTR, true
+ case -int(ERESTARTNOHAND):
+ return ERESTARTNOHAND, true
+ case -int(ERESTART_RESTARTBLOCK):
+ return ERESTART_RESTARTBLOCK, true
+ default:
+ return 0, false
+ }
+}
+
+// SyscallRestartBlock represents the restart block for a syscall restartable
+// with a custom function. It encapsulates the state required to restart a
+// syscall across a S/R.
+type SyscallRestartBlock interface {
+ Restart(t *Task) (uintptr, error)
+}
+
+// SyscallControl is returned by syscalls to control the behavior of
+// Task.doSyscallInvoke.
+type SyscallControl struct {
+ // next is the state that the task goroutine should switch to. If next is
+ // nil, the task goroutine should continue to syscall exit as usual.
+ next taskRunState
+
+ // If ignoreReturn is true, Task.doSyscallInvoke should not store any value
+ // in the task's syscall return value register.
+ ignoreReturn bool
+}
+
+var (
+ // CtrlDoExit is returned by the implementations of the exit and exit_group
+ // syscalls to enter the task exit path directly, skipping syscall exit
+ // tracing.
+ CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}
+
+ // ctrlStopAndReinvokeSyscall is returned by syscalls using the external
+ // feature before syscall execution. This causes Task.doSyscallInvoke
+ // to return runSyscallReinvoke, allowing Task.run to check for stops
+ // before immediately re-invoking the syscall (skipping the re-checking
+ // of seccomp filters and ptrace which would confuse userspace
+ // tracing).
+ ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}
+
+ // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
+ // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
+ // than tail-calling it, allowing stops to be checked before syscall exit.
+ ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
+)
+
+func (t *Task) invokeExternal() {
+ t.BeginExternalStop()
+ go func() { // S/R-SAFE: External control flow.
+ defer t.EndExternalStop()
+ t.SyscallTable().External(t.Kernel())
+ }()
+}
+
+func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
+ s := t.SyscallTable()
+
+ fe := s.FeatureEnable.Word(sysno)
+
+ var straceContext interface{}
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
+ }
+
+ if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
+ t.invokeExternal()
+ // Ensure we check for stops, then invoke the syscall again.
+ ctrl = ctrlStopAndReinvokeSyscall
+ } else {
+ fn := s.Lookup(sysno)
+ if fn != nil {
+ // Call our syscall implementation.
+ rval, ctrl, err = fn(t, args)
+ } else {
+ // Use the missing function if not found.
+ rval, err = t.SyscallTable().Missing(t, sysno, args)
+ }
+ }
+
+ if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
+ t.invokeExternal()
+ // Don't reinvoke the syscall.
+ }
+
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
+ }
+
+ return
+}
+
+// doSyscall is the entry point for an invocation of a system call specified by
+// the current state of t's registers.
+//
+// The syscall path is very hot; avoid defer.
+func (t *Task) doSyscall() taskRunState {
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+
+ // Tracers expect to see this between when the task traps into the kernel
+ // to perform a syscall and when the syscall is actually invoked.
+ // This useless-looking temporary is needed because Go.
+ tmp := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-tmp)
+
+ // Check seccomp filters. The nil check is for performance (as seccomp use
+ // is rare), not needed for correctness.
+ if t.syscallFilters != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
+ case seccompResultDeny:
+ t.Debugf("Syscall %d: denied by seccomp", sysno)
+ return (*runSyscallExit)(nil)
+ case seccompResultAllow:
+ // ok
+ case seccompResultKill:
+ t.Debugf("Syscall %d: killed by seccomp", sysno)
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ case seccompResultTrace:
+ t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
+ return (*runSyscallAfterPtraceEventSeccomp)(nil)
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doSyscallEnter(sysno, args)
+}
+
+type runSyscallAfterPtraceEventSeccomp struct{}
+
+func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
+ // ptrace(2)
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "The tracer can skip the system call by changing the syscall number to
+ // -1." - Documentation/prctl/seccomp_filter.txt
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil).execute(t)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallEnter(sysno, args)
+}
+
+func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ if next, ok := t.ptraceSyscallEnter(); ok {
+ return next
+ }
+ return t.doSyscallInvoke(sysno, args)
+}
+
+type runSyscallAfterSyscallEnterStop struct{}
+
+func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(sigPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+type runSyscallAfterSysemuStop struct{}
+
+func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(sigPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+
+ if ctrl != nil {
+ if !ctrl.ignoreReturn {
+ t.Arch().SetReturn(rval)
+ }
+ if ctrl.next != nil {
+ return ctrl.next
+ }
+ } else if err != nil {
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ t.haveSyscallReturn = true
+ } else {
+ t.Arch().SetReturn(rval)
+ }
+
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+type runSyscallReinvoke struct{}
+
+func (*runSyscallReinvoke) execute(t *Task) taskRunState {
+ if t.killed() {
+ // It's possible that since the last execution, the task has
+ // been forcible killed. Invoking the system call here could
+ // result in an infinite loop if it is again preempted by an
+ // external stop and reinvoked.
+ return (*runInterrupt)(nil)
+ }
+
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+type runSyscallExit struct{}
+
+func (*runSyscallExit) execute(t *Task) taskRunState {
+ t.ptraceSyscallExit()
+ return (*runApp)(nil)
+}
+
+// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
+// indicated by an execution fault at address addr. doVsyscall returns the
+// task's next run state.
+func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
+ // Grab the caller up front, to make sure there's a sensible stack.
+ caller := t.Arch().Native(uintptr(0))
+ if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ return (*runApp)(nil)
+ }
+
+ // For _vsyscalls_, there is no need to translate System V calling convention
+ // to syscall ABI because they both use RDI, RSI, and RDX for the first three
+ // arguments and none of the vsyscalls uses more than two arguments.
+ args := t.Arch().SyscallArgs()
+ if t.syscallFilters != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
+ case seccompResultDeny:
+ t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
+ return (*runApp)(nil)
+ case seccompResultAllow:
+ // ok
+ case seccompResultTrace:
+ t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
+ return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doVsyscallInvoke(sysno, args, caller)
+}
+
+type runVsyscallAfterPtraceEventSeccomp struct {
+ addr usermem.Addr
+ sysno uintptr
+ caller interface{}
+}
+
+func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "... the syscall may not be changed to another system call using the
+ // orig_rax register. It may only be changed to -1 order [sic] to skip the
+ // currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
+ // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
+ // causes do_exit(SIGSYS), and changing sp is ignored.
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr {
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ }
+ if sysno == ^uintptr(0) {
+ return (*runApp)(nil)
+ }
+ return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
+}
+
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+ if ctrl != nil {
+ t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(0)
+ } else if err == nil {
+ t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(uintptr(rval))
+ } else {
+ t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
+ if err == syserror.EFAULT {
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ // A return is not emulated in this case.
+ return (*runApp)(nil)
+ }
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ }
+ t.Arch().SetIP(t.Arch().Value(caller))
+ t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
+ return (*runApp)(nil)
+}
+
+// ExtractErrno extracts an integer error number from the error.
+// The syscall number is purely for context in the error case. Use -1 if
+// syscall number is unknown.
+func (t *Task) ExtractErrno(err error, sysno int) int {
+ switch err := err.(type) {
+ case nil:
+ return 0
+ case syscall.Errno:
+ return int(err)
+ case SyscallRestartErrno:
+ return int(err)
+ case *memmap.BusError:
+ // Bus errors may generate SIGBUS, but for syscalls they still
+ // return EFAULT. See case in task_run.go where the fault is
+ // handled (and the SIGBUS is delivered).
+ return int(syscall.EFAULT)
+ case *os.PathError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.LinkError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.SyscallError:
+ return t.ExtractErrno(err.Err, sysno)
+ default:
+ if errno, ok := syserror.TranslateError(err); ok {
+ return int(errno)
+ }
+ }
+ panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
+}