diff options
author | kevin.xu <cming.xu@gmail.com> | 2020-04-27 21:51:31 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-27 21:51:31 +0800 |
commit | e896ca54db67524afc20b644d43c72185e72dc0e (patch) | |
tree | 2a16f3a62a5cafd098f1f028c621f1b655589d69 /pkg/sentry/platform/ptrace | |
parent | 1f19624fa127d7d59cabe29593cc80b7fe6c81f8 (diff) | |
parent | 3c67754663f424f2ebbc0ff2a4c80e30618d5355 (diff) |
Merge pull request #1 from google/master
catch up
Diffstat (limited to 'pkg/sentry/platform/ptrace')
-rw-r--r-- | pkg/sentry/platform/ptrace/BUILD | 10 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace.go | 17 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_amd64.go | 19 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_arm64.go | 5 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go | 62 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_unsafe.go | 6 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/stub_amd64.s | 29 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/stub_arm64.s | 30 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/stub_unsafe.go | 4 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess.go | 54 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_amd64.go | 130 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_arm64.go | 62 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_linux.go | 93 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go | 23 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2 |
15 files changed, 357 insertions, 189 deletions
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index ebcc8c098..30402c2df 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,6 +9,7 @@ go_library( "ptrace.go", "ptrace_amd64.go", "ptrace_arm64.go", + "ptrace_arm64_unsafe.go", "ptrace_unsafe.go", "stub_amd64.s", "stub_arm64.s", @@ -20,18 +21,19 @@ go_library( "subprocess_linux_unsafe.go", "subprocess_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", "//pkg/log", "//pkg/procid", + "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/hostcpu", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", + "//pkg/sync", + "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 7b120a15d..08d055e05 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -46,13 +46,13 @@ package ptrace import ( "os" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) var ( @@ -177,6 +177,9 @@ func (c *context) Interrupt() { c.interrupt.NotifyInterrupt() } +// Release implements platform.Context.Release(). +func (c *context) Release() {} + // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr @@ -248,6 +251,16 @@ func (*constructor) OpenDevice() (*os.File, error) { return nil, nil } +// Flags implements platform.Constructor.Flags(). +func (*constructor) Requirements() platform.Requirements { + // TODO(b/75837838): Also set a new PID namespace so that we limit + // access to other host processes. + return platform.Requirements{ + RequiresCapSysPtrace: true, + RequiresCurrentPIDNS: true, + } +} + func init() { platform.Register("ptrace", &constructor{}) } diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go index db0212538..3b9a870a5 100644 --- a/pkg/sentry/platform/ptrace/ptrace_amd64.go +++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go @@ -15,9 +15,8 @@ package ptrace import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // fpRegSet returns the GETREGSET/SETREGSET register set type to be used. @@ -28,6 +27,20 @@ func fpRegSet(useXsave bool) uintptr { return linux.NT_PRFPREG } -func stackPointer(r *syscall.PtraceRegs) uintptr { +func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Rsp) } + +// x86 use the fs_base register to store the TLS pointer which can be +// get/set in "func (t *thread) get/setRegs(regs *arch.Registers)". +// So both of the get/setTLS() operations are noop here. + +// getTLS gets the thread local storage register. +func (t *thread) getTLS(tls *uint64) error { + return nil +} + +// setTLS sets the thread local storage register. +func (t *thread) setTLS(tls *uint64) error { + return nil +} diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go index 4db28c534..5c869926a 100644 --- a/pkg/sentry/platform/ptrace/ptrace_arm64.go +++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go @@ -15,9 +15,8 @@ package ptrace import ( - "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // fpRegSet returns the GETREGSET/SETREGSET register set type to be used. @@ -25,6 +24,6 @@ func fpRegSet(_ bool) uintptr { return linux.NT_PRFPREG } -func stackPointer(r *syscall.PtraceRegs) uintptr { +func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Sp) } diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go new file mode 100644 index 000000000..32b8a6be9 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go @@ -0,0 +1,62 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ptrace + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// getTLS gets the thread local storage register. +func (t *thread) getTLS(tls *uint64) error { + iovec := syscall.Iovec{ + Base: (*byte)(unsafe.Pointer(tls)), + Len: uint64(unsafe.Sizeof(*tls)), + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETREGSET, + uintptr(t.tid), + linux.NT_ARM_TLS, + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// setTLS sets the thread local storage register. +func (t *thread) setTLS(tls *uint64) error { + iovec := syscall.Iovec{ + Base: (*byte)(unsafe.Pointer(tls)), + Len: uint64(unsafe.Sizeof(*tls)), + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETREGSET, + uintptr(t.tid), + linux.NT_ARM_TLS, + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 72c7ec564..8b72d24e8 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -20,11 +20,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // getRegs gets the general purpose register set. -func (t *thread) getRegs(regs *syscall.PtraceRegs) error { +func (t *thread) getRegs(regs *arch.Registers) error { iovec := syscall.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), @@ -43,7 +43,7 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error { } // setRegs sets the general purpose register set. -func (t *thread) setRegs(regs *syscall.PtraceRegs) error { +func (t *thread) setRegs(regs *arch.Registers) error { iovec := syscall.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 64c718d21..16f9c523e 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -64,6 +64,8 @@ begin: CMPQ AX, $0 JL error + MOVQ $0, BX + // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by @@ -73,23 +75,26 @@ begin: MOVQ $SIGSTOP, SI SYSCALL - // The tracer may "detach" and/or allow code execution here in three cases: - // - // 1. New (traced) stub threads are explicitly detached by the - // goroutine in newSubprocess. However, they are detached while in - // group-stop, so they do not execute code here. - // - // 2. If a tracer thread exits, it implicitly detaches from the stub, - // potentially allowing code execution here. However, the Go runtime - // never exits individual threads, so this case never occurs. - // - // 3. subprocess.createStub clones a new stub process that is untraced, + // The sentry sets BX to 1 when creating stub process. + CMPQ BX, $1 + JE clone + + // Notify the Sentry that syscall exited. +done: + INT $3 + // Be paranoid. + JMP done +clone: + // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R15 has been updated with the expected PPID. - JMP begin + CMPQ AX, $0 + JE begin + // The clone syscall returns a non-zero value. + JMP done error: // Exit with -errno. MOVQ AX, DI diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s index 2c5e4d5cb..6162df02a 100644 --- a/pkg/sentry/platform/ptrace/stub_arm64.s +++ b/pkg/sentry/platform/ptrace/stub_arm64.s @@ -59,6 +59,8 @@ begin: CMP $0x0, R0 BLT error + MOVD $0, R9 + // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by @@ -66,22 +68,26 @@ begin: MOVD $SYS_KILL, R8 MOVD $SIGSTOP, R1 SVC - // The tracer may "detach" and/or allow code execution here in three cases: - // - // 1. New (traced) stub threads are explicitly detached by the - // goroutine in newSubprocess. However, they are detached while in - // group-stop, so they do not execute code here. - // - // 2. If a tracer thread exits, it implicitly detaches from the stub, - // potentially allowing code execution here. However, the Go runtime - // never exits individual threads, so this case never occurs. - // - // 3. subprocess.createStub clones a new stub process that is untraced, + + // The sentry sets R9 to 1 when creating stub process. + CMP $1, R9 + BEQ clone + +done: + // Notify the Sentry that syscall exited. + BRK $3 + B done // Be paranoid. +clone: + // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R7 has been updated with the expected PPID. - B begin + CMP $0, R0 + BEQ begin + + // The clone system call returned a non-zero value. + B done error: // Exit with -errno. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index aa1b87237..341dde143 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -19,8 +19,8 @@ import ( "syscall" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/usermem" ) // stub is defined in arch-specific assembly. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index b699b057d..2389423b0 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -18,14 +18,15 @@ import ( "fmt" "os" "runtime" - "sync" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // Linux kernel errnos which "should never be seen by user programs", but will @@ -62,7 +63,7 @@ type thread struct { // initRegs are the initial registers for the first thread. // // These are used for the register set for system calls. - initRegs syscall.PtraceRegs + initRegs arch.Registers } // threadPool is a collection of threads. @@ -316,7 +317,7 @@ const ( ) func (t *thread) dumpAndPanic(message string) { - var regs syscall.PtraceRegs + var regs arch.Registers message += "\n" if err := t.getRegs(®s); err == nil { message += dumpRegs(®s) @@ -331,11 +332,12 @@ func (t *thread) unexpectedStubExit() { msg, err := t.getEventMessage() status := syscall.WaitStatus(msg) if status.Signaled() && status.Signal() == syscall.SIGKILL { - // SIGKILL can be only sent by an user or OOM-killer. In both + // SIGKILL can be only sent by a user or OOM-killer. In both // these cases, we don't need to panic. There is no reasons to // think that something wrong in gVisor. log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid) - syscall.Kill(os.Getpid(), syscall.SIGKILL) + pid := os.Getpid() + syscall.Tgkill(pid, pid, syscall.Signal(syscall.SIGKILL)) } t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err)) } @@ -421,20 +423,22 @@ func (t *thread) init() { // This is _not_ for use by application system calls, rather it is for use when // a system call must be injected into the remote context (e.g. mmap, munmap). // Note that clones are handled separately. -func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { +func (t *thread) syscall(regs *arch.Registers) (uintptr, error) { // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } for { - // Execute the syscall instruction. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + // Execute the syscall instruction. The task has to stop on the + // trap instruction which is right after the syscall + // instruction. + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } sig := t.wait(stopped) - if sig == (syscallEvent | syscall.SIGTRAP) { + if sig == syscall.SIGTRAP { // Reached syscall-enter-stop. break } else { @@ -446,18 +450,6 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { } } - // Complete the actual system call. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { - panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) - } - - // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens - // between syscall-enter-stop and syscall-exit-stop; it happens *after* - // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" - if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { - t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) - } - // Grab registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) @@ -469,7 +461,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // syscallIgnoreInterrupt ignores interrupts on the system call thread and // restarts the syscall if the kernel indicates that should happen. func (t *thread) syscallIgnoreInterrupt( - initRegs *syscall.PtraceRegs, + initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { for { @@ -514,6 +506,9 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { regs := &ac.StateData().Regs t.resetSysemuRegs(regs) + // Extract TLS register + tls := uint64(ac.TLS()) + // Check for interrupts, and ensure that future interrupts will signal t. if !c.interrupt.Enable(t) { // Pending interrupt; simulate. @@ -534,20 +529,23 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil { panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err)) } + if err := t.setTLS(&tls); err != nil { + panic(fmt.Sprintf("ptrace set tls (%+v) failed: %v", tls, err)) + } for { // Start running until the next system call. if isSingleStepping(regs) { if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_SYSEMU_SINGLESTEP, + unix.PTRACE_SYSEMU_SINGLESTEP, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } else { if _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_SYSEMU, + unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } @@ -563,6 +561,12 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil { panic(fmt.Sprintf("ptrace get fpregs failed: %v", err)) } + if err := t.getTLS(&tls); err != nil { + panic(fmt.Sprintf("ptrace get tls failed: %v", err)) + } + if !ac.SetTLS(uintptr(tls)) { + panic(fmt.Sprintf("tls value %v is invalid", tls)) + } // Is it a system call? if sig == (syscallEvent | syscall.SIGTRAP) { diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 4649a94a7..84b699f0d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -21,6 +21,9 @@ import ( "strings" "syscall" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -38,7 +41,7 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *arch.Registers) { regs.Cs = t.initRegs.Cs regs.Ss = t.initRegs.Ss regs.Ds = t.initRegs.Ds @@ -50,7 +53,7 @@ func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. -func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { +func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers. regs := *initRegs @@ -79,18 +82,18 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch } // isSingleStepping determines if the registers indicate single-stepping. -func isSingleStepping(regs *syscall.PtraceRegs) bool { +func isSingleStepping(regs *arch.Registers) bool { return (regs.Eflags & arch.X86TrapFlag) != 0 } // updateSyscallRegs updates registers after finishing sysemu. -func updateSyscallRegs(regs *syscall.PtraceRegs) { +func updateSyscallRegs(regs *arch.Registers) { // Ptrace puts -ENOSYS in rax on syscall-enter-stop. regs.Rax = regs.Orig_rax } // syscallReturnValue extracts a sensible return from registers. -func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { +func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Rax) if rval < 0 { return 0, syscall.Errno(-rval) @@ -98,7 +101,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { return uintptr(rval), nil } -func dumpRegs(regs *syscall.PtraceRegs) string { +func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") @@ -139,7 +142,118 @@ func (t *thread) adjustInitRegsRip() { t.initRegs.Rip -= initRegsRipAdjustment } -// Pass the expected PPID to the child via R15 when creating stub process -func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { +// Pass the expected PPID to the child via R15 when creating stub process. +func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.R15 = uint64(ppid) + // Rbx has to be set to 1 when creating stub process. + initregs.Rbx = 1 +} + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Rip = signalInfo.Addr() + regs.Rsp -= 8 + } +} + +// enableCpuidFault enables cpuid-faulting. +// +// This may fail on older kernels or hardware, so we just disregard the result. +// Host CPUID will be enabled. +// +// This is safe to call in an afterFork context. +// +//go:nosplit +func enableCpuidFault() { + syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { + rules = append(rules, + // Rules for trapping vsyscall access. + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_TIME: {}, + unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64. + }, + Action: linux.SECCOMP_RET_TRAP, + Vsyscall: true, + }) + if defaultAction != linux.SECCOMP_RET_ALLOW { + rules = append(rules, + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }) + } + return rules +} + +// probeSeccomp returns true iff seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. This check is dynamic +// because kernels have be backported behavior. +// +// See createStub for more information. +// +// Precondition: the runtime OS thread must be locked. +func probeSeccomp() bool { + // Create a completely new, destroyable process. + t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) + if err != nil { + panic(fmt.Sprintf("seccomp probe failed: %v", err)) + } + defer t.destroy() + + // Set registers to the yield system call. This call is not allowed + // by the filters specified in the attachThread function. + regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) + if err := t.setRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + + for { + // Attempt an emulation. + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { + // Did the seccomp errno hook already run? This would + // indicate that seccomp is first in line and we're + // less than 4.8. + if err := t.getRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) + } + if _, err := syscallReturnValue(®s); err == nil { + // The seccomp errno mode ran first, and reset + // the error in the registers. + return false + } + // The seccomp hook did not run yet, and therefore it + // is safe to use RET_KILL mode for dispatched calls. + return true + } + } } diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index bec884ba5..bd618fae8 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -17,8 +17,12 @@ package ptrace import ( + "fmt" + "strings" "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -37,13 +41,13 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. -func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { +func (t *thread) resetSysemuRegs(regs *arch.Registers) { } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. -func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { +func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers (Pc, Sp, etc.). regs := *initRegs @@ -74,7 +78,7 @@ func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch } // isSingleStepping determines if the registers indicate single-stepping. -func isSingleStepping(regs *syscall.PtraceRegs) bool { +func isSingleStepping(regs *arch.Registers) bool { // Refer to the ARM SDM D2.12.3: software step state machine // return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1). // @@ -85,13 +89,13 @@ func isSingleStepping(regs *syscall.PtraceRegs) bool { } // updateSyscallRegs updates registers after finishing sysemu. -func updateSyscallRegs(regs *syscall.PtraceRegs) { +func updateSyscallRegs(regs *arch.Registers) { // No special work is necessary. return } // syscallReturnValue extracts a sensible return from registers. -func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { +func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Regs[0]) if rval < 0 { return 0, syscall.Errno(-rval) @@ -99,7 +103,7 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { return uintptr(rval), nil } -func dumpRegs(regs *syscall.PtraceRegs) string { +func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") @@ -121,6 +125,50 @@ func (t *thread) adjustInitRegsRip() { } // Pass the expected PPID to the child via X7 when creating stub process -func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { +func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.Regs[7] = uint64(ppid) + // R9 has to be set to 1 when creating stub process. + initregs.Regs[9] = 1 +} + +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Pc = signalInfo.Addr() + regs.Sp -= 8 + } +} + +// Noop on arm64. +// +//go:nosplit +func enableCpuidFault() { +} + +// appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. +// Ref attachedThread() for more detail. +func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { + return rules +} + +// probeSeccomp returns true if seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. +// +// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so +// probeSeccomp can always return true. +func probeSeccomp() bool { + return true } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 3782d4332..2ce528601 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -29,75 +29,6 @@ import ( const syscallEvent syscall.Signal = 0x80 -// probeSeccomp returns true iff seccomp is run after ptrace notifications, -// which is generally the case for kernel version >= 4.8. This check is dynamic -// because kernels have be backported behavior. -// -// See createStub for more information. -// -// Precondition: the runtime OS thread must be locked. -func probeSeccomp() bool { - // Create a completely new, destroyable process. - t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) - if err != nil { - panic(fmt.Sprintf("seccomp probe failed: %v", err)) - } - defer t.destroy() - - // Set registers to the yield system call. This call is not allowed - // by the filters specified in the attachThread function. - regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) - if err := t.setRegs(®s); err != nil { - panic(fmt.Sprintf("ptrace set regs failed: %v", err)) - } - - for { - // Attempt an emulation. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { - panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) - } - - sig := t.wait(stopped) - if sig == (syscallEvent | syscall.SIGTRAP) { - // Did the seccomp errno hook already run? This would - // indicate that seccomp is first in line and we're - // less than 4.8. - if err := t.getRegs(®s); err != nil { - panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) - } - if _, err := syscallReturnValue(®s); err == nil { - // The seccomp errno mode ran first, and reset - // the error in the registers. - return false - } - // The seccomp hook did not run yet, and therefore it - // is safe to use RET_KILL mode for dispatched calls. - return true - } - } -} - -// patchSignalInfo patches the signal info to account for hitting the seccomp -// filters from vsyscall emulation, specified below. We allow for SIGSYS as a -// synchronous trap, but patch the structure to appear like a SIGSEGV with the -// Rip as the faulting address. -// -// Note that this should only be called after verifying that the signalInfo has -// been generated by the kernel. -func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { - if linux.Signal(signalInfo.Signo) == linux.SIGSYS { - signalInfo.Signo = int32(linux.SIGSEGV) - - // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered - // with the si_call_addr field pointing to the current RIP. This field - // aligns with the si_addr field for a SIGSEGV, so we don't need to touch - // anything there. We do need to unwind emulation however, so we set the - // instruction pointer to the faulting value, and "unpop" the stack. - regs.Rip = signalInfo.Addr() - regs.Rsp -= 8 - } -} - // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. @@ -143,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // stub and all its children. This is used to create child stubs // (below), so we must include the ability to fork, but otherwise lock // down available calls only to what is needed. - rules := []seccomp.RuleSet{ - // Rules for trapping vsyscall access. - { - Rules: seccomp.SyscallRules{ - syscall.SYS_GETTIMEOFDAY: {}, - syscall.SYS_TIME: {}, - 309: {}, // SYS_GETCPU. - }, - Action: linux.SECCOMP_RET_TRAP, - Vsyscall: true, - }, - } + rules := []seccomp.RuleSet{} if defaultAction != linux.SECCOMP_RET_ALLOW { rules = append(rules, seccomp.RuleSet{ Rules: seccomp.SyscallRules{ @@ -173,10 +93,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // For the initial process creation. syscall.SYS_WAIT4: {}, - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, - }, - syscall.SYS_EXIT: {}, + syscall.SYS_EXIT: {}, // For the stub prctl dance (all). syscall.SYS_PRCTL: []seccomp.Rule{ @@ -197,6 +114,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro Action: linux.SECCOMP_RET_ALLOW, }) } + rules = appendArchSeccompRules(rules, defaultAction) instrs, err := seccomp.BuildProgram(rules, defaultAction) if err != nil { return nil, err @@ -267,9 +185,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } - // Enable cpuid-faulting; this may fail on older kernels or hardware, - // so we just disregard the result. Host CPUID will be enabled. - syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) + // Enable cpuid-faulting. + enableCpuidFault() // Call the stub; should not return. stubCall(stubStart, ppid) diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index de6783fb0..245b20722 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -18,13 +18,14 @@ package ptrace import ( - "sync" "sync/atomic" "syscall" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sync" ) // maskPool contains reusable CPU masks for setting affinity. Unfortunately, @@ -49,20 +50,6 @@ func unmaskAllSignals() syscall.Errno { return errno } -// getCPU gets the current CPU. -// -// Precondition: the current runtime thread should be locked. -func getCPU() (uint32, error) { - var cpu uintptr - if _, _, errno := syscall.RawSyscall( - unix.SYS_GETCPU, - uintptr(unsafe.Pointer(&cpu)), - 0, 0); errno != 0 { - return 0, errno - } - return uint32(cpu), nil -} - // setCPU sets the CPU affinity. func (t *thread) setCPU(cpu uint32) error { mask := maskPool.Get().([]uintptr) @@ -93,10 +80,8 @@ func (t *thread) setCPU(cpu uint32) error { // // Precondition: the current runtime thread should be locked. func (t *thread) bind() { - currentCPU, err := getCPU() - if err != nil { - return - } + currentCPU := hostcpu.GetCPU() + if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { // Set the affinity on the thread and save the CPU for next // round; we don't expect CPUs to bounce around too frequently. diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index b80a3604d..2ae6b9f9d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.14 +// +build !go1.15 // Check go:linkname function signatures when updating Go version. |