From 1c9da886e72aebc1c44c66715e3ec45f6d5eff5b Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Fri, 9 Aug 2019 13:16:46 -0700 Subject: Add initial ptrace stub and syscall support for arm64. Signed-off-by: Haibo Xu Change-Id: I1dbd23bb240cca71d0cc30fc75ca5be28cb4c37c PiperOrigin-RevId: 262619519 --- pkg/abi/linux/elf.go | 14 +++ pkg/sentry/platform/ptrace/BUILD | 6 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_amd64.go | 33 ++++++ pkg/sentry/platform/ptrace/ptrace_arm64.go | 30 +++++ pkg/sentry/platform/ptrace/ptrace_unsafe.go | 46 +++----- pkg/sentry/platform/ptrace/stub_arm64.s | 106 +++++++++++++++++ pkg/sentry/platform/ptrace/subprocess.go | 12 +- pkg/sentry/platform/ptrace/subprocess_amd64.go | 24 ++-- pkg/sentry/platform/ptrace/subprocess_arm64.go | 126 +++++++++++++++++++++ pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- .../ptrace/subprocess_linux_amd64_unsafe.go | 109 ------------------ .../platform/ptrace/subprocess_linux_unsafe.go | 110 ++++++++++++++++++ 13 files changed, 469 insertions(+), 151 deletions(-) create mode 100644 pkg/sentry/platform/ptrace/ptrace_amd64.go create mode 100644 pkg/sentry/platform/ptrace/ptrace_arm64.go create mode 100644 pkg/sentry/platform/ptrace/stub_arm64.s create mode 100644 pkg/sentry/platform/ptrace/subprocess_arm64.go delete mode 100644 pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go create mode 100644 pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index fb1c679d2..40f0459a0 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -89,3 +89,17 @@ const ( // AT_SYSINFO_EHDR is the address of the VDSO. AT_SYSINFO_EHDR = 33 ) + +// ELF ET_CORE and ptrace GETREGSET/SETREGSET register set types. +// +// See include/uapi/linux/elf.h. +const ( + // NT_PRSTATUS is for general purpose register. + NT_PRSTATUS = 0x1 + + // NT_PRFPREG is for float point register. 
+ NT_PRFPREG = 0x2 + + // NT_X86_XSTATE is for x86 extended state using xsave. + NT_X86_XSTATE = 0x202 +) diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 1b6c54e96..ebcc8c098 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -7,13 +7,17 @@ go_library( srcs = [ "filters.go", "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", "ptrace_unsafe.go", "stub_amd64.s", + "stub_arm64.s", "stub_unsafe.go", "subprocess.go", "subprocess_amd64.go", + "subprocess_arm64.go", "subprocess_linux.go", - "subprocess_linux_amd64_unsafe.go", + "subprocess_linux_unsafe.go", "subprocess_unsafe.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace", diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 6fd30ed25..7b120a15d 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -60,7 +60,7 @@ var ( // maximum user address. This is valid only after a call to stubInit. // // We attempt to link the stub here, and adjust downward as needed. - stubStart uintptr = 0x7fffffff0000 + stubStart uintptr = stubInitAddress // stubEnd is the first byte past the end of the stub, as with // stubStart this is valid only after a call to stubInit. diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go new file mode 100644 index 000000000..db0212538 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// fpRegSet returns the GETREGSET/SETREGSET register set type to be used. +func fpRegSet(useXsave bool) uintptr { + if useXsave { + return linux.NT_X86_XSTATE + } + return linux.NT_PRFPREG +} + +func stackPointer(r *syscall.PtraceRegs) uintptr { + return uintptr(r.Rsp) +} diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go new file mode 100644 index 000000000..4db28c534 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// fpRegSet returns the GETREGSET/SETREGSET register set type to be used. 
+func fpRegSet(_ bool) uintptr { + return linux.NT_PRFPREG +} + +func stackPointer(r *syscall.PtraceRegs) uintptr { + return uintptr(r.Sp) +} diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 2706039a5..47957bb3b 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -18,37 +18,23 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/usermem" ) -// GETREGSET/SETREGSET register set types. -// -// See include/uapi/linux/elf.h. -const ( - // _NT_PRFPREG is for x86 floating-point state without using xsave. - _NT_PRFPREG = 0x2 - - // _NT_X86_XSTATE is for x86 extended state using xsave. - _NT_X86_XSTATE = 0x202 -) - -// fpRegSet returns the GETREGSET/SETREGSET register set type to be used. -func fpRegSet(useXsave bool) uintptr { - if useXsave { - return _NT_X86_XSTATE - } - return _NT_PRFPREG -} - -// getRegs sets the regular register set. +// getRegs gets the general purpose register set. func (t *thread) getRegs(regs *syscall.PtraceRegs) error { + iovec := syscall.Iovec{ + Base: (*byte)(unsafe.Pointer(regs)), + Len: uint64(unsafe.Sizeof(*regs)), + } _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_GETREGS, + syscall.PTRACE_GETREGSET, uintptr(t.tid), - 0, - uintptr(unsafe.Pointer(regs)), + linux.NT_PRSTATUS, + uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno @@ -56,14 +42,18 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error { return nil } -// setRegs sets the regular register set. +// setRegs sets the general purpose register set. 
func (t *thread) setRegs(regs *syscall.PtraceRegs) error { + iovec := syscall.Iovec{ + Base: (*byte)(unsafe.Pointer(regs)), + Len: uint64(unsafe.Sizeof(*regs)), + } _, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, - syscall.PTRACE_SETREGS, + syscall.PTRACE_SETREGSET, uintptr(t.tid), - 0, - uintptr(unsafe.Pointer(regs)), + linux.NT_PRSTATUS, + uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno @@ -131,7 +121,7 @@ func (t *thread) getSignalInfo(si *arch.SignalInfo) error { // // Precondition: the OS thread must be locked and own t. func (t *thread) clone() (*thread, error) { - r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp() + r, ok := usermem.Addr(stackPointer(&t.initRegs)).RoundUp() if !ok { return nil, syscall.EINVAL } diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s new file mode 100644 index 000000000..2c5e4d5cb --- /dev/null +++ b/pkg/sentry/platform/ptrace/stub_arm64.s @@ -0,0 +1,106 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +#define SYS_GETPID 172 +#define SYS_EXIT 93 +#define SYS_KILL 129 +#define SYS_GETPPID 173 +#define SYS_PRCTL 167 + +#define SIGKILL 9 +#define SIGSTOP 19 + +#define PR_SET_PDEATHSIG 1 + +// stub bootstraps the child and sends itself SIGSTOP to wait for attach. +// +// R7 contains the expected PPID. 
+// +// This should not be used outside the context of a new ptrace child (as the +// function is otherwise a bunch of nonsense). +TEXT ·stub(SB),NOSPLIT,$0 +begin: + // N.B. This loop only executes in the context of a single-threaded + // fork child. + + MOVD $SYS_PRCTL, R8 + MOVD $PR_SET_PDEATHSIG, R0 + MOVD $SIGKILL, R1 + SVC + + CMN $4095, R0 + BCS error + + // If the parent already died before we called PR_SET_PDEATHSIG then + // we'll have an unexpected PPID. + MOVD $SYS_GETPPID, R8 + SVC + + CMP R0, R7 + BNE parent_dead + + MOVD $SYS_GETPID, R8 + SVC + + CMP $0x0, R0 + BLT error + + // SIGSTOP to wait for attach. + // + // The SVC instruction will be used for future syscall injection by + // thread.syscall. + MOVD $SYS_KILL, R8 + MOVD $SIGSTOP, R1 + SVC + // The tracer may "detach" and/or allow code execution here in three cases: + // + // 1. New (traced) stub threads are explicitly detached by the + // goroutine in newSubprocess. However, they are detached while in + // group-stop, so they do not execute code here. + // + // 2. If a tracer thread exits, it implicitly detaches from the stub, + // potentially allowing code execution here. However, the Go runtime + // never exits individual threads, so this case never occurs. + // + // 3. subprocess.createStub clones a new stub process that is untraced, + // thus executing this code. We set up the PDEATHSIG before SIGSTOPing + // ourselves for attach by the tracer. + // + // R7 has been updated with the expected PPID. + B begin + +error: + // Exit with -errno. + NEG R0, R0 + MOVD $SYS_EXIT, R8 + SVC + HLT + +parent_dead: + MOVD $SYS_EXIT, R8 + MOVD $1, R0 + SVC + HLT + +// stubCall calls the stub function at the given address with the given PPID. +// +// This is a distinct function because stub, above, may be mapped at any +// arbitrary location, and stub has a specific binary API (see above). 
+TEXT ·stubCall(SB),NOSPLIT,$0-16 + MOVD addr+0(FP), R0 + MOVD pid+8(FP), R7 + B (R0) diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 15e84735e..79501682d 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -28,6 +28,16 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usermem" ) +// Linux kernel errnos which "should never be seen by user programs", but will +// be revealed to ptrace syscall exit tracing. +// +// These constants are only used in subprocess.go. +const ( + ERESTARTSYS = syscall.Errno(512) + ERESTARTNOINTR = syscall.Errno(513) + ERESTARTNOHAND = syscall.Errno(514) +) + // globalPool exists to solve two distinct problems: // // 1) Subprocesses can't always be killed properly (see Release). @@ -282,7 +292,7 @@ func (t *thread) grabInitRegs() { if err := t.getRegs(&t.initRegs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } - t.initRegs.Rip -= initRegsRipAdjustment + t.adjustInitRegsRip() } // detach detaches from the thread. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index a70512913..4649a94a7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -28,20 +28,13 @@ const ( // maximumUserAddress is the largest possible user address. maximumUserAddress = 0x7ffffffff000 + // stubInitAddress is the initial attempt link address for the stub. + stubInitAddress = 0x7fffffff0000 + // initRegsRipAdjustment is the size of the syscall instruction. initRegsRipAdjustment = 2 ) -// Linux kernel errnos which "should never be seen by user programs", but will -// be revealed to ptrace syscall exit tracing. -// -// These constants are used in subprocess.go. -const ( - ERESTARTSYS = syscall.Errno(512) - ERESTARTNOINTR = syscall.Errno(513) - ERESTARTNOHAND = syscall.Errno(514) -) - // resetSysemuRegs sets up emulation registers. 
// // This should be called prior to calling sysemu. @@ -139,3 +132,14 @@ func dumpRegs(regs *syscall.PtraceRegs) string { return m.String() } + +// adjustInitRegsRip adjusts the current register RIP value to +// be just before the system call instruction execution +func (t *thread) adjustInitRegsRip() { + t.initRegs.Rip -= initRegsRipAdjustment +} + +// Pass the expected PPID to the child via R15 when creating stub process +func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { + initregs.R15 = uint64(ppid) +} diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go new file mode 100644 index 000000000..bec884ba5 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -0,0 +1,126 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ptrace + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +const ( + // maximumUserAddress is the largest possible user address. + maximumUserAddress = 0xfffffffff000 + + // stubInitAddress is the initial attempt link address for the stub. + // Only support 48bits VA currently. + stubInitAddress = 0xffffffff0000 + + // initRegsRipAdjustment is the size of the svc instruction. + initRegsRipAdjustment = 4 +) + +// resetSysemuRegs sets up emulation registers. +// +// This should be called prior to calling sysemu. 
+func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { +} + +// createSyscallRegs sets up syscall registers. +// +// This should be called to generate registers for a system call. +func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { + // Copy initial registers (Pc, Sp, etc.). + regs := *initRegs + + // Set our syscall number. + // r8 for the syscall number. + // r0-r6 is used to store the parameters. + regs.Regs[8] = uint64(sysno) + if len(args) >= 1 { + regs.Regs[0] = args[0].Uint64() + } + if len(args) >= 2 { + regs.Regs[1] = args[1].Uint64() + } + if len(args) >= 3 { + regs.Regs[2] = args[2].Uint64() + } + if len(args) >= 4 { + regs.Regs[3] = args[3].Uint64() + } + if len(args) >= 5 { + regs.Regs[4] = args[4].Uint64() + } + if len(args) >= 6 { + regs.Regs[5] = args[5].Uint64() + } + + return regs +} + +// isSingleStepping determines if the registers indicate single-stepping. +func isSingleStepping(regs *syscall.PtraceRegs) bool { + // Refer to the ARM SDM D2.12.3: software step state machine + // return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1). + // + // Since the host Linux kernel will set MDSCR_EL1.SS on our behalf + // when we call a single-step ptrace command, we only need to check + // the Pstate.SS bit here. + return (regs.Pstate & arch.ARMTrapFlag) != 0 +} + +// updateSyscallRegs updates registers after finishing sysemu. +func updateSyscallRegs(regs *syscall.PtraceRegs) { + // No special work is necessary. + return +} + +// syscallReturnValue extracts a sensible return from registers. 
+func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { + rval := int64(regs.Regs[0]) + if rval < 0 { + return 0, syscall.Errno(-rval) + } + return uintptr(rval), nil +} + +func dumpRegs(regs *syscall.PtraceRegs) string { + var m strings.Builder + + fmt.Fprintf(&m, "Registers:\n") + + for i := 0; i < 31; i++ { + fmt.Fprintf(&m, "\tRegs[%d]\t = %016x\n", i, regs.Regs[i]) + } + fmt.Fprintf(&m, "\tSp\t = %016x\n", regs.Sp) + fmt.Fprintf(&m, "\tPc\t = %016x\n", regs.Pc) + fmt.Fprintf(&m, "\tPstate\t = %016x\n", regs.Pstate) + + return m.String() +} + +// adjustInitRegsRip adjusts the current register PC value to +// be just before the system call instruction execution +func (t *thread) adjustInitRegsRip() { + t.initRegs.Pc -= initRegsRipAdjustment +} + +// Pass the expected PPID to the child via X7 when creating stub process +func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) { + initregs.Regs[7] = uint64(ppid) +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 87ded0bbd..f09b0b3d0 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -284,7 +284,7 @@ func (s *subprocess) createStub() (*thread, error) { // Pass the expected PPID to the child via R15. regs := t.initRegs - regs.R15 = uint64(t.tgid) + initChildProcessPPID(&regs, t.tgid) // Call fork in a subprocess. // diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go deleted file mode 100644 index e977992f9..000000000 --- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 linux - -package ptrace - -import ( - "sync" - "sync/atomic" - "syscall" - "unsafe" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" -) - -// maskPool contains reusable CPU masks for setting affinity. Unfortunately, -// runtime.NumCPU doesn't actually record the number of CPUs on the system, it -// just records the number of CPUs available in the scheduler affinity set at -// startup. This may a) change over time and b) gives a number far lower than -// the maximum indexable CPU. To prevent lots of allocation in the hot path, we -// use a pool to store large masks that we can reuse during bind. -var maskPool = sync.Pool{ - New: func() interface{} { - const maxCPUs = 1024 // Not a hard limit; see below. - return make([]uintptr, maxCPUs/64) - }, -} - -// unmaskAllSignals unmasks all signals on the current thread. -// -//go:nosplit -func unmaskAllSignals() syscall.Errno { - var set linux.SignalSet - _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) - return errno -} - -// getCPU gets the current CPU. -// -// Precondition: the current runtime thread should be locked. -func getCPU() (uint32, error) { - var cpu uintptr - if _, _, errno := syscall.RawSyscall( - unix.SYS_GETCPU, - uintptr(unsafe.Pointer(&cpu)), - 0, 0); errno != 0 { - return 0, errno - } - return uint32(cpu), nil -} - -// setCPU sets the CPU affinity. 
-func (t *thread) setCPU(cpu uint32) error { - mask := maskPool.Get().([]uintptr) - n := int(cpu / 64) - v := uintptr(1 << uintptr(cpu%64)) - if n >= len(mask) { - // See maskPool note above. We've actually exceeded the number - // of available cores. Grow the mask and return it. - mask = make([]uintptr, n+1) - } - mask[n] |= v - if _, _, errno := syscall.RawSyscall( - unix.SYS_SCHED_SETAFFINITY, - uintptr(t.tid), - uintptr(len(mask)*8), - uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { - return errno - } - mask[n] &^= v - maskPool.Put(mask) - return nil -} - -// bind attempts to ensure that the thread is on the same CPU as the current -// thread. This provides no guarantees as it is fundamentally a racy operation: -// CPU sets may change and we may be rescheduled in the middle of this -// operation. As a result, no failures are reported. -// -// Precondition: the current runtime thread should be locked. -func (t *thread) bind() { - currentCPU, err := getCPU() - if err != nil { - return - } - if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { - // Set the affinity on the thread and save the CPU for next - // round; we don't expect CPUs to bounce around too frequently. - // - // (It's worth noting that we could move CPUs between this point - // and when the tracee finishes executing. But that would be - // roughly the status quo anyways -- we're just maximizing our - // chances of colocation, not guaranteeing it.) - t.setCPU(currentCPU) - } -} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go new file mode 100644 index 000000000..de6783fb0 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux +// +build amd64 arm64 + +package ptrace + +import ( + "sync" + "sync/atomic" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// maskPool contains reusable CPU masks for setting affinity. Unfortunately, +// runtime.NumCPU doesn't actually record the number of CPUs on the system, it +// just records the number of CPUs available in the scheduler affinity set at +// startup. This may a) change over time and b) gives a number far lower than +// the maximum indexable CPU. To prevent lots of allocation in the hot path, we +// use a pool to store large masks that we can reuse during bind. +var maskPool = sync.Pool{ + New: func() interface{} { + const maxCPUs = 1024 // Not a hard limit; see below. + return make([]uintptr, maxCPUs/64) + }, +} + +// unmaskAllSignals unmasks all signals on the current thread. +// +//go:nosplit +func unmaskAllSignals() syscall.Errno { + var set linux.SignalSet + _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) + return errno +} + +// getCPU gets the current CPU. +// +// Precondition: the current runtime thread should be locked. +func getCPU() (uint32, error) { + var cpu uintptr + if _, _, errno := syscall.RawSyscall( + unix.SYS_GETCPU, + uintptr(unsafe.Pointer(&cpu)), + 0, 0); errno != 0 { + return 0, errno + } + return uint32(cpu), nil +} + +// setCPU sets the CPU affinity. 
+func (t *thread) setCPU(cpu uint32) error { + mask := maskPool.Get().([]uintptr) + n := int(cpu / 64) + v := uintptr(1 << uintptr(cpu%64)) + if n >= len(mask) { + // See maskPool note above. We've actually exceeded the number + // of available cores. Grow the mask and return it. + mask = make([]uintptr, n+1) + } + mask[n] |= v + if _, _, errno := syscall.RawSyscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(t.tid), + uintptr(len(mask)*8), + uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { + return errno + } + mask[n] &^= v + maskPool.Put(mask) + return nil +} + +// bind attempts to ensure that the thread is on the same CPU as the current +// thread. This provides no guarantees as it is fundamentally a racy operation: +// CPU sets may change and we may be rescheduled in the middle of this +// operation. As a result, no failures are reported. +// +// Precondition: the current runtime thread should be locked. +func (t *thread) bind() { + currentCPU, err := getCPU() + if err != nil { + return + } + if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { + // Set the affinity on the thread and save the CPU for next + // round; we don't expect CPUs to bounce around too frequently. + // + // (It's worth noting that we could move CPUs between this point + // and when the tracee finishes executing. But that would be + // roughly the status quo anyways -- we're just maximizing our + // chances of colocation, not guaranteeing it.) + t.setCPU(currentCPU) + } +} -- cgit v1.2.3