diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 31 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_impl_amd64.s | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_const.go | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go | 4 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 22 | ||||
-rw-r--r-- | pkg/sentry/platform/ring0/defs_impl_amd64.go | 39 | ||||
-rw-r--r-- | pkg/sentry/platform/ring0/entry_impl_amd64.s | 12 | ||||
-rw-r--r-- | pkg/sentry/platform/ring0/entry_impl_arm64.s | 1 |
8 files changed, 95 insertions, 16 deletions
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 03a98512e..0a54dd30d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -83,5 +83,34 @@ func bluepillStopGuest(c *vCPU) { // //go:nosplit func bluepillReadyStopGuest(c *vCPU) bool { - return c.runData.readyForInterruptInjection != 0 + if c.runData.readyForInterruptInjection == 0 { + return false + } + + if c.runData.ifFlag == 0 { + // This is impossible if readyForInterruptInjection is 1. + throw("interrupts are disabled") + } + + // Disable interrupts if we are in the kernel space. + // + // When the Sentry switches into the kernel mode, it disables + // interrupts. But when goruntime switches on a goroutine which has + // been saved in the host mode, it restores flags and this enables + // interrupts. See the comment of UserFlagsSet for more details. + uregs := userRegs{} + err := c.getUserRegisters(&uregs) + if err != 0 { + throw("failed to get user registers") + } + + if ring0.IsKernelFlags(uregs.RFLAGS) { + uregs.RFLAGS &^= ring0.KernelFlagsClear + err = c.setUserRegisters(&uregs) + if err != 0 { + throw("failed to set user registers") + } + return false + } + return true } diff --git a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s index 8fba53479..3851b5017 100644 --- a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s @@ -17,6 +17,7 @@ // Bits. #define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 #define _KERNEL_FLAGS 0x02 // Vectors. 
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 5c4b18899..5f627a016 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -32,6 +32,7 @@ const ( _KVM_SET_REGS = 0x4090ae82 _KVM_SET_SREGS = 0x4138ae84 _KVM_GET_REGS = 0x8090ae81 + _KVM_GET_SREGS = 0x8138ae83 _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 _KVM_SET_CPUID2 = 0x4008ae90 _KVM_SET_SIGNAL_MASK = 0x4004ae8b diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 6849ab113..54e721bb1 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -153,8 +153,8 @@ func (c *vCPU) initArchState() error { } // Set the user registers. - if err := c.setUserRegisters(&kernelUserRegs); err != nil { - return err + if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) } // Allocate some floating point state save area for the local vCPU. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 290f035dd..330f29065 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -137,15 +137,17 @@ func (c *vCPU) setSignalMask() error { } // setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { +// +//go:nosplit +func (c *vCPU) setUserRegisters(uregs *userRegs) syscall.Errno { if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, uintptr(c.fd), _KVM_SET_REGS, uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) + return errno } - return nil + return 0 } // getUserRegisters reloads user registers in the vCPU. @@ -175,3 +177,17 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { } return nil } + +// getSystemRegisters gets system registers. 
+// +//go:nosplit +func (c *vCPU) getSystemRegisters(sregs *systemRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return errno + } + return 0 +} diff --git a/pkg/sentry/platform/ring0/defs_impl_amd64.go b/pkg/sentry/platform/ring0/defs_impl_amd64.go index 7a06eb316..795cf86ba 100644 --- a/pkg/sentry/platform/ring0/defs_impl_amd64.go +++ b/pkg/sentry/platform/ring0/defs_impl_amd64.go @@ -270,6 +270,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) fmt.Fprintf(w, "\n// Vectors.\n") @@ -343,7 +344,9 @@ const ( _RFLAGS_AC = 1 << 18 _RFLAGS_NT = 1 << 14 - _RFLAGS_IOPL = 3 << 12 + _RFLAGS_IOPL0 = 1 << 12 + _RFLAGS_IOPL1 = 1 << 13 + _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 _RFLAGS_DF = 1 << 10 _RFLAGS_IF = 1 << 9 _RFLAGS_STEP = 1 << 8 @@ -371,15 +374,45 @@ const ( KernelFlagsSet = _RFLAGS_RESERVED // UserFlagsSet are always set in userspace. - UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + // + // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege + // level. The Current Privilege Level (CPL) of the task must be less + // than or equal to the IOPL in order for the task or program to access + // I/O ports. + // + // Here, _RFLAGS_IOPL0 is used only to determine whether the task is + // running in the kernel or userspace mode. In the user mode, the CPL is + // always 3 and it doesn't matter what IOPL is set if it is below CPL. + // + // We need to have one bit which will be always different in user and + // kernel modes. And we have to remember that even though we have + // KernelFlagsClear, we still can see some of these flags in the kernel + // mode. 
This can happen when the goruntime switches on a goroutine + // which has been saved in the host mode. On restore, the popf + // instruction is used to restore flags and this means that all flags + // that the goroutine has in the host mode will be restored in the + // kernel mode. + // + // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set + // it in the user mode. So if this flag is set, the task is running in + // the user mode and if it isn't set, the task is running in the kernel + // mode. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 // KernelFlagsClear should always be clear in the kernel. KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT // UserFlagsClear are always cleared in userspace. - UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 ) +// IsKernelFlags returns true if rflags corresponds to the kernel mode. +// +//go:nosplit +func IsKernelFlags(rflags uint64) bool { + return rflags&_RFLAGS_IOPL0 == 0 +} + // Vector is an exception vector. type Vector uintptr diff --git a/pkg/sentry/platform/ring0/entry_impl_amd64.s b/pkg/sentry/platform/ring0/entry_impl_amd64.s index 3df00dee8..1216f3843 100644 --- a/pkg/sentry/platform/ring0/entry_impl_amd64.s +++ b/pkg/sentry/platform/ring0/entry_impl_amd64.s @@ -17,6 +17,7 @@ // Bits. #define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 #define _KERNEL_FLAGS 0x02 // Vectors. @@ -263,13 +264,10 @@ TEXT ·Start(SB),NOSPLIT,$0 // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 - // Interrupts are always disabled while we're executing in kernel mode - // and always enabled while executing in user mode. Therefore, we can - // reliably look at the flags in R11 to determine where this syscall - // was from. - TESTL $_RFLAGS_IF, R11 + // _RFLAGS_IOPL0 is always set in the user mode and it is never set in + // the kernel mode. See the comment of UserFlagsSet for more details. 
+ TESTL $_RFLAGS_IOPL0, R11 JZ kernel - user: SWAP_GS() MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. @@ -348,7 +346,7 @@ TEXT ·exception(SB),NOSPLIT,$0 // ERROR_CODE (sp+8) // VECTOR (sp+0) // - TESTL $_RFLAGS_IF, 32(SP) + TESTL $_RFLAGS_IOPL0, 32(SP) JZ kernel user: diff --git a/pkg/sentry/platform/ring0/entry_impl_arm64.s b/pkg/sentry/platform/ring0/entry_impl_arm64.s index f57fc369a..d654ebcb0 100644 --- a/pkg/sentry/platform/ring0/entry_impl_arm64.s +++ b/pkg/sentry/platform/ring0/entry_impl_arm64.s @@ -17,6 +17,7 @@ // Bits. #define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 #define _KERNEL_FLAGS 0x02 // Vectors. |