diff options
Diffstat (limited to 'pkg/sentry')
35 files changed, 681 insertions, 466 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 85278b389..f660f1614 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -9,7 +9,6 @@ go_library( "arch.go", "arch_aarch64.go", "arch_amd64.go", - "arch_amd64.s", "arch_arm64.go", "arch_state_x86.go", "arch_x86.go", @@ -36,8 +35,8 @@ go_library( "//pkg/log", "//pkg/marshal", "//pkg/marshal/primitive", + "//pkg/sentry/arch/fpu", "//pkg/sentry/limits", - "//pkg/sync", "//pkg/syserror", "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 3443b9e1b..921151137 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -50,12 +51,6 @@ func (a Arch) String() string { } } -// FloatingPointData is a generic type, and will always be passed as a pointer. -// We rely on the individual arch implementations to meet all the necessary -// requirements. For example, on x86 the region must be 16-byte aligned and 512 -// bytes in size. -type FloatingPointData []byte - // Context provides architecture-dependent information for a specific thread. // // NOTE(b/34169503): Currently we use uintptr here to refer to a generic native @@ -187,7 +182,7 @@ type Context interface { ClearSingleStep() // FloatingPointData will be passed to underlying save routines. - FloatingPointData() FloatingPointData + FloatingPointData() *fpu.State // NewMmapLayout returns a layout for a new MM, where MinAddr for the // returned layout must be no lower than min, and MaxAddr for the returned @@ -221,16 +216,6 @@ type Context interface { // number of bytes read. PtraceSetRegs(src io.Reader) (int, error) - // PtraceGetFPRegs implements ptrace(PTRACE_GETFPREGS) by writing the - // floating-point registers represented by this Context to addr in dst and - // returning the number of bytes written. - PtraceGetFPRegs(dst io.Writer) (int, error) - - // PtraceSetFPRegs implements ptrace(PTRACE_SETFPREGS) by reading - // floating-point registers from src into this Context and returning the - // number of bytes read. - PtraceSetFPRegs(src io.Reader) (int, error) - // PtraceGetRegSet implements ptrace(PTRACE_GETREGSET) by writing the // register set given by architecture-defined value regset from this // Context to dst and returning the number of bytes written, which must be @@ -365,18 +350,3 @@ func (a SyscallArgument) SizeT() uint { func (a SyscallArgument) ModeT() uint { return uint(uint16(a.Value)) } - -// ErrFloatingPoint indicates a failed restore due to unusable floating point -// state. -type ErrFloatingPoint struct { - // supported is the supported floating point state. - supported uint64 - - // saved is the saved floating point state. - saved uint64 -} - -// Error returns a sensible description of the restore error. -func (e ErrFloatingPoint) Error() string { - return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supported, e.saved) -} diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 6b81e9708..08789f517 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/syserror" ) @@ -40,65 +41,11 @@ type Registers struct { const ( // SyscallWidth is the width of insturctions. SyscallWidth = 4 - - // fpsimdMagic is the magic number which is used in fpsimd_context. - fpsimdMagic = 0x46508001 - - // fpsimdContextSize is the size of fpsimd_context. - fpsimdContextSize = 0x210 ) // ARMTrapFlag is the mask for the trap flag. const ARMTrapFlag = uint64(1) << 21 -// aarch64FPState is aarch64 floating point state. -type aarch64FPState []byte - -// initAarch64FPState sets up initial state. -// -// Related code in Linux kernel: fpsimd_flush_thread(). -// FPCR = FPCR_RM_RN (0x0 << 22). -// -// Currently, aarch64FPState is only a space of 0x210 length for fpstate. -// The fp head is useless in sentry/ptrace/kvm. -// -func initAarch64FPState(data aarch64FPState) { -} - -func newAarch64FPStateSlice() []byte { - return alignedBytes(4096, 16)[:fpsimdContextSize] -} - -// newAarch64FPState returns an initialized floating point state. -// -// The returned state is large enough to store all floating point state -// supported by host, even if the app won't use much of it due to a restricted -// FeatureSet. -func newAarch64FPState() aarch64FPState { - f := aarch64FPState(newAarch64FPStateSlice()) - initAarch64FPState(f) - return f -} - -// fork creates and returns an identical copy of the aarch64 floating point state. -func (f aarch64FPState) fork() aarch64FPState { - n := aarch64FPState(newAarch64FPStateSlice()) - copy(n, f) - return n -} - -// FloatingPointData returns the raw data pointer. -func (f aarch64FPState) FloatingPointData() FloatingPointData { - return ([]byte)(f) -} - -// NewFloatingPointData returns a new floating point data blob. -// -// This is primarily for use in tests. -func NewFloatingPointData() FloatingPointData { - return ([]byte)(newAarch64FPState()) -} - // State contains the common architecture bits for aarch64 (the build tag of this // file ensures it's only built on aarch64). // @@ -108,7 +55,7 @@ type State struct { Regs Registers // Our floating point state. - aarch64FPState `state:"wait"` + fpState fpu.State `state:"wait"` // FeatureSet is a pointer to the currently active feature set. FeatureSet *cpuid.FeatureSet @@ -162,10 +109,10 @@ func (s State) Proto() *rpb.Registers { // Fork creates and returns an identical copy of the state. func (s *State) Fork() State { return State{ - Regs: s.Regs, - aarch64FPState: s.aarch64FPState.fork(), - FeatureSet: s.FeatureSet, - OrigR0: s.OrigR0, + Regs: s.Regs, + fpState: s.fpState.Fork(), + FeatureSet: s.FeatureSet, + OrigR0: s.OrigR0, } } @@ -318,10 +265,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context { case ARM64: return &context64{ State{ - aarch64FPState: newAarch64FPState(), - FeatureSet: fs, + fpState: fpu.NewState(), + FeatureSet: fs, }, - []aarch64FPState(nil), + []fpu.State(nil), } } panic(fmt.Sprintf("unknown architecture %v", arch)) diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 15d8ddb40..2571be60f 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -105,7 +106,7 @@ const ( // +stateify savable type context64 struct { State - sigFPState []x86FPState // fpstate to be restored on sigreturn. + sigFPState []fpu.State // fpstate to be restored on sigreturn. } // Arch implements Context.Arch. @@ -113,14 +114,18 @@ func (c *context64) Arch() Arch { return AMD64 } -func (c *context64) copySigFPState() []x86FPState { - var sigfps []x86FPState +func (c *context64) copySigFPState() []fpu.State { + var sigfps []fpu.State for _, s := range c.sigFPState { - sigfps = append(sigfps, s.fork()) + sigfps = append(sigfps, s.Fork()) } return sigfps } +func (c *context64) FloatingPointData() *fpu.State { + return &c.State.fpState +} + // Fork returns an exact copy of this context. func (c *context64) Fork() Context { return &context64{ diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 0c61a3ff7..14ad9483b 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -79,7 +80,7 @@ const ( // +stateify savable type context64 struct { State - sigFPState []aarch64FPState // fpstate to be restored on sigreturn. + sigFPState []fpu.State // fpstate to be restored on sigreturn. } // Arch implements Context.Arch. @@ -87,10 +88,10 @@ func (c *context64) Arch() Arch { return ARM64 } -func (c *context64) copySigFPState() []aarch64FPState { - var sigfps []aarch64FPState +func (c *context64) copySigFPState() []fpu.State { + var sigfps []fpu.State for _, s := range c.sigFPState { - sigfps = append(sigfps, s.fork()) + sigfps = append(sigfps, s.Fork()) } return sigfps } @@ -286,3 +287,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { // TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64. return nil } + +func (c *context64) FloatingPointData() *fpu.State { + return &c.State.fpState +} diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 840e53d33..b2b94c304 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -16,59 +16,7 @@ package arch -import ( - "gvisor.dev/gvisor/pkg/cpuid" - "gvisor.dev/gvisor/pkg/usermem" -) - -// XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 -// and SSE state, so this is the equivalent XSTATE_BV value. -const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE - // afterLoadFPState is invoked by afterLoad. func (s *State) afterLoadFPState() { - old := s.x86FPState - - // Recreate the slice. This is done to ensure that it is aligned - // appropriately in memory, and large enough to accommodate any new - // state that may be saved by the new CPU. Even if extraneous new state - // is saved, the state we care about is guaranteed to be a subset of - // new state. Later optimizations can use less space when using a - // smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has - // more info. - s.x86FPState = newX86FPState() - - // x86FPState always contains all the FP state supported by the host. - // We may have come from a newer machine that supports additional state - // which we cannot restore. - // - // The x86 FP state areas are backwards compatible, so we can simply - // truncate the additional floating point state. - // - // Applications should not depend on the truncated state because it - // should relate only to features that were not exposed in the app - // FeatureSet. However, because we do not *prevent* them from using - // this state, we must verify here that there is no in-use state - // (according to XSTATE_BV) which we do not support. - if len(s.x86FPState) < len(old) { - // What do we support? - supportedBV := fxsaveBV - if fs := cpuid.HostFeatureSet(); fs.UseXsave() { - supportedBV = fs.ValidXCR0Mask() - } - - // What was in use? - savedBV := fxsaveBV - if len(old) >= xstateBVOffset+8 { - savedBV = usermem.ByteOrder.Uint64(old[xstateBVOffset:]) - } - - // Supported features must be a superset of saved features. - if savedBV&^supportedBV != 0 { - panic(ErrFloatingPoint{supported: supportedBV, saved: savedBV}) - } - } - - // Copy to the new, aligned location. - copy(s.x86FPState, old) + s.fpState.AfterLoad() } diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 91edf0703..e8e52d3a8 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -24,10 +24,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Registers represents the CPU registers for this architecture. @@ -111,57 +110,6 @@ var ( X86TrapFlag uint64 = (1 << 8) ) -// x86FPState is x86 floating point state. -type x86FPState []byte - -// initX86FPState (defined in asm files) sets up initial state. -func initX86FPState(data *byte, useXsave bool) - -func newX86FPStateSlice() []byte { - size, align := cpuid.HostFeatureSet().ExtendedStateSize() - capacity := size - // Always use at least 4096 bytes. - // - // For the KVM platform, this state is a fixed 4096 bytes, so make sure - // that the underlying array is at _least_ that size otherwise we will - // corrupt random memory. This is not a pleasant thing to debug. - if capacity < 4096 { - capacity = 4096 - } - return alignedBytes(capacity, align)[:size] -} - -// newX86FPState returns an initialized floating point state. -// -// The returned state is large enough to store all floating point state -// supported by host, even if the app won't use much of it due to a restricted -// FeatureSet. Since they may still be able to see state not advertised by -// CPUID we must ensure it does not contain any sentry state. -func newX86FPState() x86FPState { - f := x86FPState(newX86FPStateSlice()) - initX86FPState(&f.FloatingPointData()[0], cpuid.HostFeatureSet().UseXsave()) - return f -} - -// fork creates and returns an identical copy of the x86 floating point state. -func (f x86FPState) fork() x86FPState { - n := x86FPState(newX86FPStateSlice()) - copy(n, f) - return n -} - -// FloatingPointData returns the raw data pointer. -func (f x86FPState) FloatingPointData() FloatingPointData { - return []byte(f) -} - -// NewFloatingPointData returns a new floating point data blob. -// -// This is primarily for use in tests. -func NewFloatingPointData() FloatingPointData { - return (FloatingPointData)(newX86FPState()) -} - // Proto returns a protobuf representation of the system registers in State. func (s State) Proto() *rpb.Registers { regs := &rpb.AMD64Registers{ @@ -200,7 +148,7 @@ func (s State) Proto() *rpb.Registers { func (s *State) Fork() State { return State{ Regs: s.Regs, - x86FPState: s.x86FPState.fork(), + fpState: s.fpState.Fork(), FeatureSet: s.FeatureSet, } } @@ -393,149 +341,6 @@ func isValidSegmentBase(reg uint64) bool { return reg < uint64(maxAddr64) } -// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type -// manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently, -// ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area. -const ptraceFPRegsSize = 512 - -// PtraceGetFPRegs implements Context.PtraceGetFPRegs. -func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) { - return dst.Write(s.x86FPState[:ptraceFPRegsSize]) -} - -// PtraceSetFPRegs implements Context.PtraceSetFPRegs. -func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) { - var f [ptraceFPRegsSize]byte - n, err := io.ReadFull(src, f[:]) - if err != nil { - return 0, err - } - // Force reserved bits in MXCSR to 0. This is consistent with Linux. - sanitizeMXCSR(x86FPState(f[:])) - // N.B. this only copies the beginning of the FP state, which - // corresponds to the FXSAVE area. - copy(s.x86FPState, f[:]) - return n, nil -} - -const ( - // mxcsrOffset is the offset in bytes of the MXCSR field from the start of - // the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE - // Area") - mxcsrOffset = 24 - - // mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the - // start of the FXSAVE area. - mxcsrMaskOffset = 28 -) - -var ( - mxcsrMask uint32 - initMXCSRMask sync.Once -) - -// sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR -// generates a general-protection fault (#GP) in response to an attempt to set -// any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section -// 10.5.1.2 "SSE State") -func sanitizeMXCSR(f x86FPState) { - mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:]) - initMXCSRMask.Do(func() { - temp := x86FPState(alignedBytes(uint(ptraceFPRegsSize), 16)) - initX86FPState(&temp.FloatingPointData()[0], false /* useXsave */) - mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) - if mxcsrMask == 0 { - // "If the value of the MXCSR_MASK field is 00000000H, then the - // MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM - // Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR - // Register" - mxcsrMask = 0xffbf - } - }) - mxcsr &= mxcsrMask - usermem.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr) -} - -const ( - // minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal - // to the size of the XSAVE legacy area (512 bytes) plus the size of the - // XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's - // X86_XSTATE_SSE_SIZE. - minXstateBytes = 512 + 64 - - // userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD - // field in Linux's struct user_xstateregs, which is the type manipulated - // by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently, - // userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET. - userXstateXCR0Offset = 464 - - // xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86 - // XSAVE area. - xstateBVOffset = 512 - - // xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the - // XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is - // a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE - // header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header". - // Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP - // exceptions resulting from invalid values; we aren't. Linux also never - // uses the compacted format when doing XSAVE and doesn't even define the - // compaction extensions to XSAVE as a CPU feature, so for simplicity we - // assume no one is using them. - xsaveHeaderZeroedOffset = 512 + 8 - xsaveHeaderZeroedBytes = 64 - 8 -) - -func (s *State) ptraceGetXstateRegs(dst io.Writer, maxlen int) (int, error) { - // N.B. s.x86FPState may contain more state than the application - // expects. We only copy the subset that would be in their XSAVE area. - ess, _ := s.FeatureSet.ExtendedStateSize() - f := make([]byte, ess) - copy(f, s.x86FPState) - // "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are - // reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE - // Area". Linux uses the first 8 bytes of this area to store the OS XSTATE - // mask. GDB relies on this: see - // gdb/x86-linux-nat.c:x86_linux_read_description(). - usermem.ByteOrder.PutUint64(f[userXstateXCR0Offset:], s.FeatureSet.ValidXCR0Mask()) - if len(f) > maxlen { - f = f[:maxlen] - } - return dst.Write(f) -} - -func (s *State) ptraceSetXstateRegs(src io.Reader, maxlen int) (int, error) { - // Allow users to pass an xstate register set smaller than ours (they can - // mask bits out of XSTATE_BV), as long as it's at least minXstateBytes. - // Also allow users to pass a register set larger than ours; anything after - // their ExtendedStateSize will be ignored. (I think Linux technically - // permits setting a register set smaller than minXstateBytes, but it has - // the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().) - if maxlen < minXstateBytes { - return 0, unix.EFAULT - } - ess, _ := s.FeatureSet.ExtendedStateSize() - if maxlen > int(ess) { - maxlen = int(ess) - } - f := make([]byte, maxlen) - if _, err := io.ReadFull(src, f); err != nil { - return 0, err - } - // Force reserved bits in MXCSR to 0. This is consistent with Linux. - sanitizeMXCSR(x86FPState(f)) - // Users can't enable *more* XCR0 bits than what we, and the CPU, support. - xstateBV := usermem.ByteOrder.Uint64(f[xstateBVOffset:]) - xstateBV &= s.FeatureSet.ValidXCR0Mask() - usermem.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV) - // Force XCOMP_BV and reserved bytes in the XSAVE header to 0. - reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes] - for i := range reserved { - reserved[i] = 0 - } - return copy(s.x86FPState, f), nil -} - // Register sets defined in include/uapi/linux/elf.h. const ( _NT_PRSTATUS = 1 @@ -552,12 +357,9 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, } return s.PtraceGetRegs(dst) case _NT_PRFPREG: - if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT - } - return s.PtraceGetFPRegs(dst) + return s.fpState.PtraceGetFPRegs(dst, maxlen) case _NT_X86_XSTATE: - return s.ptraceGetXstateRegs(dst, maxlen) + return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet) default: return 0, syserror.EINVAL } @@ -572,12 +374,9 @@ func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, } return s.PtraceSetRegs(src) case _NT_PRFPREG: - if maxlen < ptraceFPRegsSize { - return 0, syserror.EFAULT - } - return s.PtraceSetFPRegs(src) + return s.fpState.PtraceSetFPRegs(src, maxlen) case _NT_X86_XSTATE: - return s.ptraceSetXstateRegs(src, maxlen) + return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet) default: return 0, syserror.EINVAL } @@ -609,10 +408,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context { case AMD64: return &context64{ State{ - x86FPState: newX86FPState(), + fpState: fpu.NewState(), FeatureSet: fs, }, - []x86FPState(nil), + []fpu.State(nil), } } panic(fmt.Sprintf("unknown architecture %v", arch)) diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go index 0c73fcbfb..5d7b99bd9 100644 --- a/pkg/sentry/arch/arch_x86_impl.go +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -18,6 +18,7 @@ package arch import ( "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // State contains the common architecture bits for X86 (the build tag of this @@ -29,7 +30,7 @@ type State struct { Regs Registers // Our floating point state. - x86FPState `state:"wait"` + fpState fpu.State `state:"wait"` // FeatureSet is a pointer to the currently active feature set. FeatureSet *cpuid.FeatureSet diff --git a/pkg/sentry/arch/fpu/BUILD b/pkg/sentry/arch/fpu/BUILD new file mode 100644 index 000000000..0a5395267 --- /dev/null +++ b/pkg/sentry/arch/fpu/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "fpu", + srcs = [ + "fpu.go", + "fpu_amd64.go", + "fpu_amd64.s", + "fpu_arm64.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/cpuid", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/arch/fpu/fpu.go b/pkg/sentry/arch/fpu/fpu.go new file mode 100644 index 000000000..867d309a3 --- /dev/null +++ b/pkg/sentry/arch/fpu/fpu.go @@ -0,0 +1,54 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fpu provides basic floating point helpers. +package fpu + +import ( + "fmt" + "reflect" +) + +// State represents floating point state. +// +// This is a simple byte slice, but may have architecture-specific methods +// attached to it. +type State []byte + +// ErrLoadingState indicates a failed restore due to unusable floating point +// state. +type ErrLoadingState struct { + // supported is the supported floating point state. + supportedFeatures uint64 + + // saved is the saved floating point state. + savedFeatures uint64 +} + +// Error returns a sensible description of the restore error. +func (e ErrLoadingState) Error() string { + return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supportedFeatures, e.savedFeatures) +} + +// alignedBytes returns a slice of size bytes, aligned in memory to the given +// alignment. This is used because we require certain structures to be aligned +// in a specific way (for example, the X86 floating point data). +func alignedBytes(size, alignment uint) []byte { + data := make([]byte, size+alignment-1) + offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment)) + if offset == 0 { + return data[:size:size] + } + return data[alignment-offset:][:size:size] +} diff --git a/pkg/sentry/arch/fpu/fpu_amd64.go b/pkg/sentry/arch/fpu/fpu_amd64.go new file mode 100644 index 000000000..3a62f51be --- /dev/null +++ b/pkg/sentry/arch/fpu/fpu_amd64.go @@ -0,0 +1,280 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 i386 + +package fpu + +import ( + "io" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// initX86FPState (defined in asm files) sets up initial state. +func initX86FPState(data *byte, useXsave bool) + +func newX86FPStateSlice() State { + size, align := cpuid.HostFeatureSet().ExtendedStateSize() + capacity := size + // Always use at least 4096 bytes. + // + // For the KVM platform, this state is a fixed 4096 bytes, so make sure + // that the underlying array is at _least_ that size otherwise we will + // corrupt random memory. This is not a pleasant thing to debug. + if capacity < 4096 { + capacity = 4096 + } + return alignedBytes(capacity, align)[:size] +} + +// NewState returns an initialized floating point state. +// +// The returned state is large enough to store all floating point state +// supported by host, even if the app won't use much of it due to a restricted +// FeatureSet. Since they may still be able to see state not advertised by +// CPUID we must ensure it does not contain any sentry state. +func NewState() State { + f := newX86FPStateSlice() + initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave()) + return f +} + +// Fork creates and returns an identical copy of the x86 floating point state. +func (s *State) Fork() State { + n := newX86FPStateSlice() + copy(n, *s) + return n +} + +// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type +// manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently, +// ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area. +const ptraceFPRegsSize = 512 + +// PtraceGetFPRegs implements Context.PtraceGetFPRegs. +func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { + if maxlen < ptraceFPRegsSize { + return 0, syserror.EFAULT + } + + return dst.Write((*s)[:ptraceFPRegsSize]) +} + +// PtraceSetFPRegs implements Context.PtraceSetFPRegs. +func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) { + if maxlen < ptraceFPRegsSize { + return 0, syserror.EFAULT + } + + var f [ptraceFPRegsSize]byte + n, err := io.ReadFull(src, f[:]) + if err != nil { + return 0, err + } + // Force reserved bits in MXCSR to 0. This is consistent with Linux. + sanitizeMXCSR(State(f[:])) + // N.B. this only copies the beginning of the FP state, which + // corresponds to the FXSAVE area. + copy(*s, f[:]) + return n, nil +} + +const ( + // mxcsrOffset is the offset in bytes of the MXCSR field from the start of + // the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE + // Area") + mxcsrOffset = 24 + + // mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the + // start of the FXSAVE area. + mxcsrMaskOffset = 28 +) + +var ( + mxcsrMask uint32 + initMXCSRMask sync.Once +) + +const ( + // minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal + // to the size of the XSAVE legacy area (512 bytes) plus the size of the + // XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's + // X86_XSTATE_SSE_SIZE. + minXstateBytes = 512 + 64 + + // userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD + // field in Linux's struct user_xstateregs, which is the type manipulated + // by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently, + // userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET. + userXstateXCR0Offset = 464 + + // xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86 + // XSAVE area. + xstateBVOffset = 512 + + // xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the + // XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is + // a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE + // header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header". + // Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP + // exceptions resulting from invalid values; we aren't. Linux also never + // uses the compacted format when doing XSAVE and doesn't even define the + // compaction extensions to XSAVE as a CPU feature, so for simplicity we + // assume no one is using them. + xsaveHeaderZeroedOffset = 512 + 8 + xsaveHeaderZeroedBytes = 64 - 8 +) + +// sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR +// generates a general-protection fault (#GP) in response to an attempt to set +// any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section +// 10.5.1.2 "SSE State") +func sanitizeMXCSR(f State) { + mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:]) + initMXCSRMask.Do(func() { + temp := State(alignedBytes(uint(ptraceFPRegsSize), 16)) + initX86FPState(&temp[0], false /* useXsave */) + mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) + if mxcsrMask == 0 { + // "If the value of the MXCSR_MASK field is 00000000H, then the + // MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM + // Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR + // Register" + mxcsrMask = 0xffbf + } + }) + mxcsr &= mxcsrMask + usermem.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr) +} + +// PtraceGetXstateRegs implements ptrace(PTRACE_GETREGS, NT_X86_XSTATE) by +// writing the floating point registers from this state to dst and returning the +// number of bytes written, which must be less than or equal to maxlen. +func (s *State) PtraceGetXstateRegs(dst io.Writer, maxlen int, featureSet *cpuid.FeatureSet) (int, error) { + // N.B. s.x86FPState may contain more state than the application + // expects. We only copy the subset that would be in their XSAVE area. + ess, _ := featureSet.ExtendedStateSize() + f := make([]byte, ess) + copy(f, *s) + // "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are + // reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE + // Area". Linux uses the first 8 bytes of this area to store the OS XSTATE + // mask. GDB relies on this: see + // gdb/x86-linux-nat.c:x86_linux_read_description(). + usermem.ByteOrder.PutUint64(f[userXstateXCR0Offset:], featureSet.ValidXCR0Mask()) + if len(f) > maxlen { + f = f[:maxlen] + } + return dst.Write(f) +} + +// PtraceSetXstateRegs implements ptrace(PTRACE_SETREGS, NT_X86_XSTATE) by +// reading floating point registers from src and returning the number of bytes +// read, which must be less than or equal to maxlen. +func (s *State) PtraceSetXstateRegs(src io.Reader, maxlen int, featureSet *cpuid.FeatureSet) (int, error) { + // Allow users to pass an xstate register set smaller than ours (they can + // mask bits out of XSTATE_BV), as long as it's at least minXstateBytes. + // Also allow users to pass a register set larger than ours; anything after + // their ExtendedStateSize will be ignored. (I think Linux technically + // permits setting a register set smaller than minXstateBytes, but it has + // the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().) + if maxlen < minXstateBytes { + return 0, unix.EFAULT + } + ess, _ := featureSet.ExtendedStateSize() + if maxlen > int(ess) { + maxlen = int(ess) + } + f := make([]byte, maxlen) + if _, err := io.ReadFull(src, f); err != nil { + return 0, err + } + // Force reserved bits in MXCSR to 0. This is consistent with Linux. + sanitizeMXCSR(State(f)) + // Users can't enable *more* XCR0 bits than what we, and the CPU, support. + xstateBV := usermem.ByteOrder.Uint64(f[xstateBVOffset:]) + xstateBV &= featureSet.ValidXCR0Mask() + usermem.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV) + // Force XCOMP_BV and reserved bytes in the XSAVE header to 0. + reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes] + for i := range reserved { + reserved[i] = 0 + } + return copy(*s, f), nil +} + +// BytePointer returns a pointer to the first byte of the state. +// +//go:nosplit +func (s *State) BytePointer() *byte { + return &(*s)[0] +} + +// XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 +// and SSE state, so this is the equivalent XSTATE_BV value. +const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE + +// AfterLoad converts the loaded state to the format that compatible with the +// current processor. +func (s *State) AfterLoad() { + old := *s + + // Recreate the slice. This is done to ensure that it is aligned + // appropriately in memory, and large enough to accommodate any new + // state that may be saved by the new CPU. Even if extraneous new state + // is saved, the state we care about is guaranteed to be a subset of + // new state. Later optimizations can use less space when using a + // smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has + // more info. + *s = NewState() + + // x86FPState always contains all the FP state supported by the host. + // We may have come from a newer machine that supports additional state + // which we cannot restore. + // + // The x86 FP state areas are backwards compatible, so we can simply + // truncate the additional floating point state. + // + // Applications should not depend on the truncated state because it + // should relate only to features that were not exposed in the app + // FeatureSet. However, because we do not *prevent* them from using + // this state, we must verify here that there is no in-use state + // (according to XSTATE_BV) which we do not support. + if len(*s) < len(old) { + // What do we support? + supportedBV := fxsaveBV + if fs := cpuid.HostFeatureSet(); fs.UseXsave() { + supportedBV = fs.ValidXCR0Mask() + } + + // What was in use? + savedBV := fxsaveBV + if len(old) >= xstateBVOffset+8 { + savedBV = usermem.ByteOrder.Uint64(old[xstateBVOffset:]) + } + + // Supported features must be a superset of saved features. + if savedBV&^supportedBV != 0 { + panic(ErrLoadingState{supportedFeatures: supportedBV, savedFeatures: savedBV}) + } + } + + // Copy to the new, aligned location. + copy(*s, old) +} diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/fpu/fpu_amd64.s index 6c10336e7..6c10336e7 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/fpu/fpu_amd64.s diff --git a/pkg/sentry/arch/fpu/fpu_arm64.go b/pkg/sentry/arch/fpu/fpu_arm64.go new file mode 100644 index 000000000..d2f62631d --- /dev/null +++ b/pkg/sentry/arch/fpu/fpu_arm64.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package fpu + +const ( + // fpsimdMagic is the magic number which is used in fpsimd_context. + fpsimdMagic = 0x46508001 + + // fpsimdContextSize is the size of fpsimd_context. + fpsimdContextSize = 0x210 +) + +// initAarch64FPState sets up initial state. +// +// Related code in Linux kernel: fpsimd_flush_thread(). +// FPCR = FPCR_RM_RN (0x0 << 22). +// +// Currently, aarch64FPState is only a space of 0x210 length for fpstate. +// The fp head is useless in sentry/ptrace/kvm. +// +func initAarch64FPState(data *State) { +} + +func newAarch64FPStateSlice() []byte { + return alignedBytes(4096, 16)[:fpsimdContextSize] +} + +// NewState returns an initialized floating point state. +// +// The returned state is large enough to store all floating point state +// supported by host, even if the app won't use much of it due to a restricted +// FeatureSet. +func NewState() State { + f := State(newAarch64FPStateSlice()) + initAarch64FPState(&f) + return f +} + +// Fork creates and returns an identical copy of the aarch64 floating point state. +func (s *State) Fork() State { + n := State(newAarch64FPStateSlice()) + copy(n, *s) + return n +} + +// BytePointer returns a pointer to the first byte of the state. +func (s *State) BytePointer() *byte { + return &(*s)[0] +} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index e6557cab6..ee3743483 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/usermem" ) @@ -98,7 +99,7 @@ func (c *context64) NewSignalStack() NativeSignalStack { const _FP_XSTATE_MAGIC2_SIZE = 4 func (c *context64) fpuFrameSize() (size int, useXsave bool) { - size = len(c.x86FPState) + size = len(c.fpState) if size > 512 { // Make room for the magic cookie at the end of the xsave frame. size += _FP_XSTATE_MAGIC2_SIZE @@ -226,10 +227,10 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Ss = userDS // Save the thread's floating point state. - c.sigFPState = append(c.sigFPState, c.x86FPState) + c.sigFPState = append(c.sigFPState, c.fpState) // Signal handler gets a clean floating point state. - c.x86FPState = newX86FPState() + c.fpState = fpu.NewState() return nil } @@ -273,7 +274,7 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt // Restore floating point state. l := len(c.sigFPState) if l > 0 { - c.x86FPState = c.sigFPState[l-1] + c.fpState = c.sigFPState[l-1] // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' to be zero value. c.sigFPState[l-1] = nil diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 4491008c2..53281dcba 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/usermem" ) @@ -139,9 +140,9 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Regs[30] = uint64(act.Restorer) // Save the thread's floating point state. - c.sigFPState = append(c.sigFPState, c.aarch64FPState) + c.sigFPState = append(c.sigFPState, c.fpState) // Signal handler gets a clean floating point state. - c.aarch64FPState = newAarch64FPState() + c.fpState = fpu.NewState() return nil } @@ -166,7 +167,7 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt // Restore floating point state. l := len(c.sigFPState) if l > 0 { - c.aarch64FPState = c.sigFPState[l-1] + c.fpState = c.sigFPState[l-1] // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' to be zero value. c.sigFPState[l-1] = nil diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 82eda3e43..0ed7aafa5 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -380,16 +380,17 @@ func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length in return nil } -// WriteOut implements fs.InodeOperations.WriteOut. -func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { +// WriteDirtyPagesAndAttrs will write the dirty pages and attributes to the +// gofer without calling Fsync on the remote file. +func (c *CachingInodeOperations) WriteDirtyPagesAndAttrs(ctx context.Context, inode *fs.Inode) error { c.attrMu.Lock() + defer c.attrMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() // Write dirty pages back. - c.dataMu.Lock() err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt) - c.dataMu.Unlock() if err != nil { - c.attrMu.Unlock() return err } @@ -399,12 +400,18 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) // Write out cached attributes. if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil { - c.attrMu.Unlock() return err } c.dirtyAttr = fs.AttrMask{} - c.attrMu.Unlock() + return nil +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + if err := c.WriteDirtyPagesAndAttrs(ctx, inode); err != nil { + return err + } // Fsync the remote file. return c.backingFile.Sync(ctx) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 06d450ba6..8f5a87120 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -204,20 +204,8 @@ func (f *fileOperations) readdirAll(ctx context.Context) (map[string]fs.DentAttr return entries, nil } -// maybeSync will call FSync on the file if either the cache policy or file -// flags require it. +// maybeSync will call FSync on the file if the file flags require it. func (f *fileOperations) maybeSync(ctx context.Context, file *fs.File, offset, n int64) error { - if n == 0 { - // Nothing to sync. - return nil - } - - if f.inodeOperations.session().cachePolicy.writeThrough(file.Dirent.Inode) { - // Call WriteOut directly, as some "writethrough" filesystems - // do not support sync. - return f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode) - } - flags := file.Flags() var syncType fs.SyncType switch { @@ -254,6 +242,19 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I n, err = src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } + if n == 0 { + // Nothing written. We are done. + return 0, err + } + + // Write the dirty pages and attributes if cache policy tells us to. + if f.inodeOperations.session().cachePolicy.writeThrough(file.Dirent.Inode) { + if werr := f.inodeOperations.cachingInodeOps.WriteDirtyPagesAndAttrs(ctx, file.Dirent.Inode); werr != nil { + // Report no bytes written since the write faild. + return 0, werr + } + } + // We may need to sync the written bytes. if syncErr := f.maybeSync(ctx, file, offset, n); syncErr != nil { // Sync failed. Report 0 bytes written, since none of them are diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index c34451269..43c3c5a2d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -783,7 +783,15 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { - if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { + // If the parent is a setgid directory, use the parent's GID + // rather than the caller's and enable setgid. + kgid := creds.EffectiveKGID + mode := opts.Mode + if atomic.LoadUint32(&parent.mode)&linux.S_ISGID != 0 { + kgid = auth.KGID(atomic.LoadUint32(&parent.gid)) + mode |= linux.S_ISGID + } + if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil { if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { return err } @@ -1145,7 +1153,15 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving name := rp.Component() // We only want the access mode for creating the file. createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + + // If the parent is a setgid directory, use the parent's GID rather + // than the caller's. + kgid := creds.EffectiveKGID + if atomic.LoadUint32(&d.mode)&linux.S_ISGID != 0 { + kgid = auth.KGID(atomic.LoadUint32(&d.gid)) + } + + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) if err != nil { dirfile.close(ctx) return nil, err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 71569dc65..692da02c1 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1102,10 +1102,26 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs d.metadataMu.Lock() defer d.metadataMu.Unlock() + + // As with Linux, if the UID, GID, or file size is changing, we have to + // clear permission bits. Note that when set, clearSGID causes + // permissions to be updated, but does not modify stat.Mask, as + // modification would cause an extra inotify flag to be set. + clearSGID := stat.Mask&linux.STATX_UID != 0 && stat.UID != atomic.LoadUint32(&d.uid) || + stat.Mask&linux.STATX_GID != 0 && stat.GID != atomic.LoadUint32(&d.gid) || + stat.Mask&linux.STATX_SIZE != 0 + if clearSGID { + if stat.Mask&linux.STATX_MODE != 0 { + stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) + } else { + stat.Mode = uint16(vfs.ClearSUIDAndSGID(atomic.LoadUint32(&d.mode))) + } + } + if !d.isSynthetic() { if stat.Mask != 0 { if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, + Permissions: stat.Mask&linux.STATX_MODE != 0 || clearSGID, UID: stat.Mask&linux.STATX_UID != 0, GID: stat.Mask&linux.STATX_GID != 0, Size: stat.Mask&linux.STATX_SIZE != 0, @@ -1140,7 +1156,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } } - if stat.Mask&linux.STATX_MODE != 0 { + if stat.Mask&linux.STATX_MODE != 0 || clearSGID { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 283b220bb..4f1ad0c88 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -266,6 +266,20 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } } + + // As with Linux, writing clears the setuid and setgid bits. + if n > 0 { + oldMode := atomic.LoadUint32(&d.mode) + // If setuid or setgid were set, update d.mode and propagate + // changes to the host. + if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { + atomic.StoreUint32(&d.mode, newMode) + if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { + return 0, offset, err + } + } + } + return n, offset + n, nil } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 84e37f793..46c500427 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -689,13 +689,9 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } return err } - creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ - Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - }, + Stat: parent.newChildOwnerStat(opts.Mode, rp.Credentials()), }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) @@ -750,11 +746,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } creds := rp.Credentials() if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ - Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - }, + Stat: parent.newChildOwnerStat(opts.Mode, creds), }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) @@ -963,14 +955,11 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } + // Change the file's owner to the caller. We can't use upperFD.SetStat() // because it will pick up creds from ctx. if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ - Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: uint32(creds.EffectiveKUID), - GID: uint32(creds.EffectiveKGID), - }, + Stat: parent.newChildOwnerStat(opts.Mode, creds), }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index 58680bc80..454c20d4f 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -749,6 +749,27 @@ func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { ) } +// newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of +// children. +func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { + stat := linux.Statx{ + Mask: uint32(linux.STATX_UID | linux.STATX_GID), + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + } + // Set GID and possibly the SGID bit if the parent is an SGID directory. + d.copyMu.RLock() + defer d.copyMu.RUnlock() + if atomic.LoadUint32(&d.mode)&linux.ModeSetGID == linux.ModeSetGID { + stat.GID = atomic.LoadUint32(&d.gid) + if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { + stat.Mode = uint16(mode) | linux.ModeSetGID + stat.Mask |= linux.STATX_MODE + } + } + return stat +} + // fileDescription is embedded by overlay implementations of // vfs.FileDescriptionImpl. // diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 25c785fd4..d791c06db 100644 --- a/pkg/sentry/fsimpl/overlay/regular_file.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -205,6 +205,20 @@ func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) e if err := wrappedFD.SetStat(ctx, opts); err != nil { return err } + + // Changing owners may clear one or both of the setuid and setgid bits, + // so we may have to update opts before setting d.mode. + if opts.Stat.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + stat, err := wrappedFD.Stat(ctx, vfs.StatOptions{ + Mask: linux.STATX_MODE, + }) + if err != nil { + return err + } + opts.Stat.Mode = stat.Mode + opts.Stat.Mask |= linux.STATX_MODE + } + d.updateAfterSetStatLocked(&opts) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -295,7 +309,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } defer wrappedFD.DecRef(ctx) - return wrappedFD.PWrite(ctx, src, offset, opts) + n, err := wrappedFD.PWrite(ctx, src, offset, opts) + if err != nil { + return n, err + } + return fd.updateSetUserGroupIDs(ctx, wrappedFD, n) } // Write implements vfs.FileDescriptionImpl.Write. @@ -307,7 +325,28 @@ func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts if err != nil { return 0, err } - return wrappedFD.Write(ctx, src, opts) + n, err := wrappedFD.Write(ctx, src, opts) + if err != nil { + return n, err + } + return fd.updateSetUserGroupIDs(ctx, wrappedFD, n) +} + +func (fd *regularFileFD) updateSetUserGroupIDs(ctx context.Context, wrappedFD *vfs.FileDescription, written int64) (int64, error) { + // Writing can clear the setuid and/or setgid bits. We only have to + // check this if something was written and one of those bits was set. + dentry := fd.dentry() + if written == 0 || atomic.LoadUint32(&dentry.mode)&(linux.S_ISUID|linux.S_ISGID) == 0 { + return written, nil + } + stat, err := wrappedFD.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_MODE}) + if err != nil { + return written, err + } + dentry.copyMu.Lock() + defer dentry.copyMu.Unlock() + atomic.StoreUint32(&dentry.mode, uint32(stat.Mode)) + return written, nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 609ad3941..7aea3dcd8 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -51,14 +51,15 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro return err case linux.PTRACE_GETFPREGS: - _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + s := target.Arch().FloatingPointData() + _, err := target.Arch().FloatingPointData().PtraceGetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, - }) + }, len(*s)) return err case linux.PTRACE_SETREGS: @@ -73,14 +74,15 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro return err case linux.PTRACE_SETFPREGS: - _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + s := target.Arch().FloatingPointData() + _, err := s.PtraceSetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, - }) + }, len(*s)) return err default: diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 4f9e781af..03a76eb9b 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/arch/fpu", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", @@ -78,6 +79,7 @@ go_test( "//pkg/ring0", "//pkg/ring0/pagetables", "//pkg/sentry/arch", + "//pkg/sentry/arch/fpu", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/time", diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 308696efe..d761bbdee 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -73,7 +73,7 @@ func (c *vCPU) KernelSyscall() { // We only trigger a bluepill entry in the bluepill function, and can // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. - ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no. + ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. } @@ -92,7 +92,7 @@ func (c *vCPU) KernelException(vector ring0.Vector) { regs.Rip = 0 } // See above. - ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no. + ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. } @@ -124,5 +124,5 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { // Set the context pointer to the saved floating point state. This is // where the guest data has been serialized, the kernel will restore // from this new pointer value. - context.Fpstate = uint64(uintptrValue(&c.floatingPointState[0])) + context.Fpstate = uint64(uintptrValue(c.floatingPointState.BytePointer())) } diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index c317f1e99..578852c3f 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -92,7 +92,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { - fpsimd := fpsimdPtr(&c.floatingPointState[0]) + fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) context.Fpsimd64.Fpsr = fpsimd.Fpsr context.Fpsimd64.Fpcr = fpsimd.Fpcr context.Fpsimd64.Vregs = fpsimd.Vregs @@ -112,12 +112,12 @@ func (c *vCPU) KernelSyscall() { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(&c.floatingPointState[0]) + fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(&c.floatingPointState[0]) + ring0.SaveVRegs(c.floatingPointState.BytePointer()) } ring0.Halt() @@ -136,12 +136,12 @@ func (c *vCPU) KernelException(vector ring0.Vector) { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr(&c.floatingPointState[0]) + fpsimd := fpsimdPtr(c.floatingPointState.BytePointer()) fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs(&c.floatingPointState[0]) + ring0.SaveVRegs(c.floatingPointState.BytePointer()) } ring0.Halt() diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index 76fc594a0..e44e995a0 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -33,7 +33,7 @@ func TestSegments(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err == platform.ErrContextInterrupt { diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 6243b9a04..5bce16dde 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -25,13 +25,14 @@ import ( "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) -var dummyFPState = (*byte)(arch.NewFloatingPointData()) +var dummyFPState = fpu.NewState() type testHarness interface { Errorf(format string, args ...interface{}) @@ -159,7 +160,7 @@ func TestApplicationSyscall(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err == platform.ErrContextInterrupt { @@ -173,7 +174,7 @@ func TestApplicationSyscall(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { return true // Retry. @@ -190,7 +191,7 @@ func TestApplicationFault(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err == platform.ErrContextInterrupt { @@ -205,7 +206,7 @@ func TestApplicationFault(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { return true // Retry. @@ -223,7 +224,7 @@ func TestRegistersSyscall(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { continue // Retry. @@ -246,7 +247,7 @@ func TestRegistersFault(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err == platform.ErrContextInterrupt { @@ -272,7 +273,7 @@ func TestBounce(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) @@ -287,7 +288,7 @@ func TestBounce(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err != platform.ErrContextInterrupt { @@ -319,7 +320,7 @@ func TestBounceStress(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) @@ -340,7 +341,7 @@ func TestInvalidate(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { continue // Retry. @@ -355,7 +356,7 @@ func TestInvalidate(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, Flush: true, }, &si); err == platform.ErrContextInterrupt { @@ -379,7 +380,7 @@ func TestEmptyAddressSpace(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { return true // Retry. @@ -393,7 +394,7 @@ func TestEmptyAddressSpace(t *testing.T) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, FullRestore: true, }, &si); err == platform.ErrContextInterrupt { @@ -469,7 +470,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { a++ @@ -506,7 +507,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { var si arch.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, - FloatingPointState: dummyFPState, + FloatingPointState: &dummyFPState, PageTables: pt, }, &si); err == platform.ErrContextInterrupt { a++ diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 916903881..3af96c7e5 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" @@ -70,7 +71,7 @@ type vCPUArchState struct { // floatingPointState is the floating point state buffer used in guest // to host transitions. See usage in bluepill_amd64.go. - floatingPointState arch.FloatingPointData + floatingPointState fpu.State } const ( @@ -151,7 +152,7 @@ func (c *vCPU) initArchState() error { // This will be saved prior to leaving the guest, and we restore from // this always. We cannot use the pointer in the context alone because // we don't know how large the area there is in reality. - c.floatingPointState = arch.NewFloatingPointData() + c.floatingPointState = fpu.NewState() // Set the time offset to the host native time. return c.setSystemTime() @@ -307,12 +308,12 @@ func loadByte(ptr *byte) byte { // emulate instructions like xsave and xrstor. // //go:nosplit -func prefaultFloatingPointState(data arch.FloatingPointData) { - size := len(data) +func prefaultFloatingPointState(data *fpu.State) { + size := len(*data) for i := 0; i < size; i += usermem.PageSize { - loadByte(&(data)[i]) + loadByte(&(*data)[i]) } - loadByte(&(data)[size-1]) + loadByte(&(*data)[size-1]) } // SwitchToUser unpacks architectural-details. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 3d715e570..2edc9d1b2 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -32,7 +33,7 @@ type vCPUArchState struct { // floatingPointState is the floating point state buffer used in guest // to host transitions. See usage in bluepill_arm64.go. - floatingPointState arch.FloatingPointData + floatingPointState fpu.State } const ( diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 059aa43d0..e7d5f3193 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -150,7 +151,7 @@ func (c *vCPU) initArchState() error { c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) } - c.floatingPointState = arch.NewFloatingPointData() + c.floatingPointState = fpu.NewState() return c.setSystemTime() } diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index fc43cc3c0..47efde6a2 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/arch/fpu", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 6259350ec..01e73b019 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -20,6 +20,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/usermem" ) @@ -62,9 +63,9 @@ func (t *thread) setRegs(regs *arch.Registers) error { } // getFPRegs gets the floating-point data via the GETREGSET ptrace unix. -func (t *thread) getFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error { +func (t *thread) getFPRegs(fpState *fpu.State, fpLen uint64, useXsave bool) error { iovec := unix.Iovec{ - Base: (*byte)(&fpState[0]), + Base: fpState.BytePointer(), Len: fpLen, } _, _, errno := unix.RawSyscall6( @@ -81,9 +82,9 @@ func (t *thread) getFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsav } // setFPRegs sets the floating-point data via the SETREGSET ptrace unix. -func (t *thread) setFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error { +func (t *thread) setFPRegs(fpState *fpu.State, fpLen uint64, useXsave bool) error { iovec := unix.Iovec{ - Base: (*byte)(&fpState[0]), + Base: fpState.BytePointer(), Len: fpLen, } _, _, errno := unix.RawSyscall6( diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 5bd526b73..efec93f73 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -75,17 +75,25 @@ func handleIOError(ctx context.Context, partialResult bool, ioerr, intr error, o // errors, we may consume the error and return only the partial read/write. // // Returns false if error is unknown. -func handleIOErrorImpl(ctx context.Context, partialResult bool, err, intr error, op string) (bool, error) { - switch err { - case nil: +func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr error, op string) (bool, error) { + if errOrig == nil { // Typical successful syscall. return true, nil + } + + // Translate error, if possible, to consolidate errors from other packages + // into a smaller set of errors from syserror package. + translatedErr := errOrig + if errno, ok := syserror.TranslateError(errOrig); ok { + translatedErr = errno + } + switch translatedErr { case io.EOF: // EOF is always consumed. If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. return true, nil - case syserror.ErrExceedsFileSizeLimit: + case syserror.EFBIG: t := kernel.TaskFromContext(ctx) if t == nil { panic("I/O error should only occur from a context associated with a Task") @@ -98,7 +106,7 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, err, intr error, // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return true, syserror.EFBIG - case syserror.ErrInterrupted: + case syserror.EINTR: // The syscall was interrupted. Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). @@ -110,10 +118,10 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, err, intr error, if !partialResult { // Typical syscall error. - return true, err + return true, errOrig } - switch err { + switch translatedErr { case syserror.EINTR: // Syscall interrupted, but completed a partial // read/write. Like ErrWouldBlock, since we have a @@ -143,7 +151,7 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, err, intr error, // For TCP sendfile connections, we may have a reset or timeout. But we // should just return n as the result. return true, nil - case syserror.ErrWouldBlock: + case syserror.EWOULDBLOCK: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume @@ -151,7 +159,7 @@ func handleIOErrorImpl(ctx context.Context, partialResult bool, err, intr error, return true, nil } - switch err.(type) { + switch errOrig.(type) { case syserror.SyscallRestartErrno: // Identical to the EINTR case. return true, nil |