diff options
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go | 15 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go | 22 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64.go | 36 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 67 | ||||
-rw-r--r-- | runsc/boot/loader.go | 62 |
5 files changed, 160 insertions, 42 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 99f036bba..1b5d5f66e 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -75,6 +75,9 @@ type machine struct { // nextID is the next vCPU ID. nextID uint32 + + // machineArchState is the architecture-specific state. + machineArchState } const ( @@ -196,12 +199,7 @@ func newMachine(vm int) (*machine, error) { m.available.L = &m.mu // Pull the maximum vCPUs. - maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) - if errno != 0 { - m.maxVCPUs = _KVM_NR_VCPUS - } else { - m.maxVCPUs = int(maxVCPUs) - } + m.getMaxVCPU() log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) @@ -427,9 +425,8 @@ func (m *machine) Get() *vCPU { } } - // Create a new vCPU (maybe). - if int(m.nextID) < m.maxVCPUs { - c := m.newVCPU() + // Get a new vCPU (maybe). + if c := m.getNewVCPU(); c != nil { c.lock() m.vCPUsByTID[tid] = c m.mu.Unlock() diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index f727e61b0..9a2337654 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -63,6 +63,9 @@ func (m *machine) initArchState() error { return nil } +type machineArchState struct { +} + type vCPUArchState struct { // PCIDs is the set of PCIDs for this vCPU. // @@ -499,3 +502,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { physical) } } + +// getMaxVCPU get max vCPU number +func (m *machine) getMaxVCPU() { + maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) + if errno != 0 { + m.maxVCPUs = _KVM_NR_VCPUS + } else { + m.maxVCPUs = int(maxVCPUs) + } +} + +// getNewVCPU create a new vCPU (maybe) +func (m *machine) getNewVCPU() *vCPU { + if int(m.nextID) < m.maxVCPUs { + c := m.newVCPU() + return c + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index cd912f922..8926b1d9f 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -17,6 +17,10 @@ package kvm import ( + "runtime" + "sync/atomic" + + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" @@ -25,6 +29,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" ) +type machineArchState struct { + //initialvCPUs is the machine vCPUs which has initialized but not used + initialvCPUs map[int]*vCPU +} + type vCPUArchState struct { // PCIDs is the set of PCIDs for this vCPU. // @@ -182,3 +191,30 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType, return accessType, platform.ErrContextSignal } + +// getMaxVCPU get max vCPU number +func (m *machine) getMaxVCPU() { + rmaxVCPUs := runtime.NumCPU() + smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) + // compare the max vcpu number from runtime and syscall, use smaller one. + if errno != 0 { + m.maxVCPUs = rmaxVCPUs + } else { + if rmaxVCPUs < int(smaxVCPUs) { + m.maxVCPUs = rmaxVCPUs + } else { + m.maxVCPUs = int(smaxVCPUs) + } + } +} + +// getNewVCPU() scan for an available vCPU from initialvCPUs +func (m *machine) getNewVCPU() *vCPU { + for CID, c := range m.initialvCPUs { + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + delete(m.initialvCPUs, CID) + return c + } + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 634e55ec0..92edc992b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" + ktime "gvisor.dev/gvisor/pkg/sentry/time" ) type kvmVcpuInit struct { @@ -47,6 +48,19 @@ func (m *machine) initArchState() error { uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno)) } + + // Initialize all vCPUs on ARM64, while this does not happen on x86_64. + // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms. + // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time. + // For more detail, please refer to https://github.com/google/gvisor/issues/5739 + m.initialvCPUs = make(map[int]*vCPU) + m.mu.Lock() + for int(m.nextID) < m.maxVCPUs-1 { + c := m.newVCPU() + c.state = 0 + m.initialvCPUs[c.id] = c + } + m.mu.Unlock() return nil } @@ -174,9 +188,58 @@ func (c *vCPU) setTSC(value uint64) error { return nil } +// getTSC gets the counter Physical Counter minus Virtual Offset. +func (c *vCPU) getTSC() error { + var ( + reg kvmOneReg + data uint64 + ) + + reg.addr = uint64(reflect.ValueOf(&data).Pointer()) + reg.id = _KVM_ARM64_REGS_TIMER_CNT + + if err := c.getOneRegister(®); err != nil { + return err + } + + return nil +} + // setSystemTime sets the vCPU to the system time. func (c *vCPU) setSystemTime() error { - return c.setSystemTimeLegacy() + const minIterations = 10 + minimum := uint64(0) + for iter := 0; ; iter++ { + // Use get the TSC to an estimate of where it will be + // on the host during a "fast" system call iteration. + // replace getTSC to another setOneRegister syscall can get more accurate value? + start := uint64(ktime.Rdtsc()) + if err := c.getTSC(); err != nil { + return err + } + // See if this is our new minimum call time. Note that this + // serves two functions: one, we make sure that we are + // accurately predicting the offset we need to set. Second, we + // don't want to do the final set on a slow call, which could + // produce a really bad result. + end := uint64(ktime.Rdtsc()) + if end < start { + continue // Totally bogus: unstable TSC? + } + current := end - start + if current < minimum || iter == 0 { + minimum = current // Set our new minimum. + } + // Is this past minIterations and within ~10% of minimum? + upperThreshold := (((minimum << 3) + minimum) >> 3) + if iter >= minIterations && (current <= upperThreshold || minimum < 50) { + // Try to set the TSC + if err := c.setTSC(end + (minimum / 2)); err != nil { + return err + } + return nil + } + } } //go:nosplit @@ -203,7 +266,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error { uintptr(c.fd), _KVM_GET_ONE_REG, uintptr(unsafe.Pointer(reg))); errno != 0 { - return fmt.Errorf("error setting one register: %v", errno) + return fmt.Errorf("error getting one register: %v", errno) } return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 25f06165f..10f2d3d35 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -230,6 +230,33 @@ func New(args Args) (*Loader, error) { vfs2.Override() } + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + info := containerInfo{} + newfd := startingStdioFD + + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) + } + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) + } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) + newfd++ + } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } + // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -349,6 +376,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } + info.procArgs = procArgs if err := initCompatLogs(args.UserLogFD); err != nil { return nil, fmt.Errorf("initializing compat logs: %v", err) @@ -359,6 +387,9 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + info.conf = args.Conf + info.spec = args.Spec + if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) @@ -373,37 +404,6 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } - info := containerInfo{ - conf: args.Conf, - spec: args.Spec, - procArgs: procArgs, - } - - // Make host FDs stable between invocations. Host FDs must map to the exact - // same number when the sandbox is restored. Otherwise the wrong FD will be - // used. - newfd := startingStdioFD - for _, stdioFD := range args.StdioFDs { - // Check that newfd is unused to avoid clobbering over it. - if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { - if err != nil { - return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) - } - return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) - } - - err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdios failed: %w", err) - } - info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) - _ = unix.Close(stdioFD) - newfd++ - } - for _, goferFD := range args.GoferFDs { - info.goferFDs = append(info.goferFDs, fd.New(goferFD)) - } - eid := execID{cid: args.ID} l := &Loader{ k: k, |