author    | howard zhang <howard.zhang@arm.com> | 2021-03-29 13:54:21 +0800
committer | Howard Zhang <howard.zhang@arm.com> | 2021-05-07 16:42:58 +0800
commit    | 0bff4afd0fa8ac02dd27a4ba66a217d92e0020cf (patch)
tree      | 9f5c5a17ef4fccf9a8830f0a19c54f1ca1aaa4b3
parent    | 8a2f7e716dcc62f04d2808e8ade34941c94fc956 (diff)
Init all vCPUs when initializing machine on ARM64
This patch solves a problem where the vCPU timer gets messed up when
vCPUs are added dynamically on ARM64. For detailed information, please
refer to:
https://github.com/google/gvisor/issues/5739
There is no impact on x86; the main changes for ARM64 are:
1. Create maxVCPUs vCPUs during machine initialization.
2. Keep the gVisor vCPU count in sync with the host CPU count by using
the smaller of runtime.NumCPU and KVM_CAP_MAX_VCPUS as maxVCPUs.
3. Put unused vCPUs into the architecture-specific map initialvCPUs.
4. When the machine needs to bind a new vCPU to a tid, it picks one
from the initialvCPUs map rather than creating a new one.
5. Change the setSystemTime function. As the vCPU count grows, the cost
of setTSC (a syscall that sets cntvoff) increases linearly from around
300 ns to 100,000 ns, which prevents setSystemTimeLegacy from obtaining
a correct offset value (the arithmetic behind the new acceptance test
is sketched right after this message).
6. Initialize StdioFDs and goferFDs before the platform to avoid
StdioFDs conflicting with vCPU FDs.
Signed-off-by: howard zhang <howard.zhang@arm.com>
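
Editor's note: the acceptance test introduced in change 5 reduces to simple integer
arithmetic. The new setSystemTime loop keeps the smallest observed round-trip cost of
reading the counter, commits setTSC only once a sample lands within roughly 12.5% of
that minimum (the "~10%" mentioned in the code comment, computed as
((minimum << 3) + minimum) >> 3, i.e. minimum * 9 / 8), and then adds half of the
minimum as an estimate of the one-way syscall latency. The standalone Go sketch below
illustrates just that arithmetic; the sample values are invented for illustration, and
the real implementation is in machine_arm64_unsafe.go further down.

package main

import "fmt"

// acceptSample mirrors the acceptance test in the new setSystemTime loop:
// a round-trip sample is usable once it is within ~12.5% of the smallest
// round-trip seen so far, or when that minimum is already tiny.
func acceptSample(current, minimum uint64) bool {
    // ((minimum << 3) + minimum) >> 3 == minimum * 9 / 8, i.e. minimum + 12.5%.
    upperThreshold := ((minimum << 3) + minimum) >> 3
    return current <= upperThreshold || minimum < 50
}

func main() {
    // Invented round-trip costs (in counter ticks), for illustration only.
    minimum := uint64(320)
    for _, sample := range []uint64{900, 410, 330} {
        fmt.Printf("sample=%d accepted=%v\n", sample, acceptSample(sample, minimum))
    }
    // Once a sample is accepted, the patch sets the timer to end + minimum/2,
    // using half of the best round-trip as the estimated one-way latency.
    fmt.Println("one-way latency estimate:", minimum/2, "ticks")
}

Taking the minimum rather than an average keeps one slow, preempted iteration from
polluting the final offset, which is why the final set is only attempted on a "fast"
sample.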
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go              | 15
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go        | 22
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64.go        | 36
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 67
-rw-r--r-- | runsc/boot/loader.go                            | 62
5 files changed, 160 insertions, 42 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index b3d4188a3..6d90eaefa 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -72,6 +72,9 @@ type machine struct {
 
     // nextID is the next vCPU ID.
     nextID uint32
+
+    // machineArchState is the architecture-specific state.
+    machineArchState
 }
 
 const (
@@ -193,12 +196,7 @@ func newMachine(vm int) (*machine, error) {
     m.available.L = &m.mu
 
     // Pull the maximum vCPUs.
-    maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-    if errno != 0 {
-        m.maxVCPUs = _KVM_NR_VCPUS
-    } else {
-        m.maxVCPUs = int(maxVCPUs)
-    }
+    m.getMaxVCPU()
     log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
     m.vCPUsByTID = make(map[uint64]*vCPU)
     m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
@@ -419,9 +417,8 @@ func (m *machine) Get() *vCPU {
         }
     }
 
-    // Create a new vCPU (maybe).
-    if int(m.nextID) < m.maxVCPUs {
-        c := m.newVCPU()
+    // Get a new vCPU (maybe).
+    if c := m.getNewVCPU(); c != nil {
         c.lock()
         m.vCPUsByTID[tid] = c
         m.mu.Unlock()
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index e8e209249..d1b2c9c92 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -63,6 +63,9 @@ func (m *machine) initArchState() error {
     return nil
 }
 
+type machineArchState struct {
+}
+
 type vCPUArchState struct {
     // PCIDs is the set of PCIDs for this vCPU.
     //
@@ -490,3 +493,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
             physical)
     }
 }
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+    maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+    if errno != 0 {
+        m.maxVCPUs = _KVM_NR_VCPUS
+    } else {
+        m.maxVCPUs = int(maxVCPUs)
+    }
+}
+
+// getNewVCPU create a new vCPU (maybe)
+func (m *machine) getNewVCPU() *vCPU {
+    if int(m.nextID) < m.maxVCPUs {
+        c := m.newVCPU()
+        return c
+    }
+    return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 03e84d804..242bee833 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -17,6 +17,10 @@ package kvm
 
 import (
+    "runtime"
+    "sync/atomic"
+
+    "golang.org/x/sys/unix"
     "gvisor.dev/gvisor/pkg/hostarch"
     "gvisor.dev/gvisor/pkg/ring0"
     "gvisor.dev/gvisor/pkg/ring0/pagetables"
@@ -25,6 +29,11 @@ import (
     "gvisor.dev/gvisor/pkg/sentry/platform"
 )
 
+type machineArchState struct {
+    //initialvCPUs is the machine vCPUs which has initialized but not used
+    initialvCPUs map[int]*vCPU
+}
+
 type vCPUArchState struct {
     // PCIDs is the set of PCIDs for this vCPU.
     //
@@ -182,3 +191,30 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType,
 
     return accessType, platform.ErrContextSignal
 }
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+    rmaxVCPUs := runtime.NumCPU()
+    smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+    // compare the max vcpu number from runtime and syscall, use smaller one.
+    if errno != 0 {
+        m.maxVCPUs = rmaxVCPUs
+    } else {
+        if rmaxVCPUs < int(smaxVCPUs) {
+            m.maxVCPUs = rmaxVCPUs
+        } else {
+            m.maxVCPUs = int(smaxVCPUs)
+        }
+    }
+}
+
+// getNewVCPU() scan for an available vCPU from initialvCPUs
+func (m *machine) getNewVCPU() *vCPU {
+    for CID, c := range m.initialvCPUs {
+        if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+            delete(m.initialvCPUs, CID)
+            return c
+        }
+    }
+    return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 634e55ec0..1dd184586 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,6 +29,7 @@ import (
     "gvisor.dev/gvisor/pkg/sentry/arch"
     "gvisor.dev/gvisor/pkg/sentry/arch/fpu"
     "gvisor.dev/gvisor/pkg/sentry/platform"
+    ktime "gvisor.dev/gvisor/pkg/sentry/time"
 )
 
 type kvmVcpuInit struct {
@@ -47,6 +48,19 @@ func (m *machine) initArchState() error {
         uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
         panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
     }
+
+    // Initialize all vCPUs on ARM64, while this does not happen on x86_64.
+    // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms.
+    // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time.
+    // For more detail, please refer to https://github.com/google/gvisor/issues/5739
+    m.initialvCPUs = make(map[int]*vCPU)
+    m.mu.Lock()
+    for int(m.nextID) < m.maxVCPUs-1 {
+        c := m.newVCPU()
+        c.state = 0
+        m.initialvCPUs[c.id] = c
+    }
+    m.mu.Unlock()
     return nil
 }
 
@@ -174,9 +188,58 @@ func (c *vCPU) setTSC(value uint64) error {
     return nil
 }
+
+// getTSC gets the counter Physical Counter minus Virtual Offset.
+func (c *vCPU) getTSC() error {
+    var (
+        reg  kvmOneReg
+        data uint64
+    )
+
+    reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+    reg.id = _KVM_ARM64_REGS_TIMER_CNT
+
+    if err := c.getOneRegister(&reg); err != nil {
+        return err
+    }
+
+    return nil
+}
 
 // setSystemTime sets the vCPU to the system time.
 func (c *vCPU) setSystemTime() error {
-    return c.setSystemTimeLegacy()
+    const minIterations = 10
+    minimum := uint64(0)
+    for iter := 0; ; iter++ {
+        // Use get the TSC to an estimate of where it will be
+        // on the host during a "fast" system call iteration.
+        // replace getTSC to another setOneRegister syscall can get more accurate value?
+        start := uint64(ktime.Rdtsc())
+        if err := c.getTSC(); err != nil {
+            return err
+        }
+        // See if this is our new minimum call time. Note that this
+        // serves two functions: one, we make sure that we are
+        // accurately predicting the offset we need to set. Second, we
+        // don't want to do the final set on a slow call, which could
+        // produce a really bad result.
+        end := uint64(ktime.Rdtsc())
+        if end < start {
+            continue // Totally bogus: unstable TSC?
+        }
+        current := end - start
+        if current < minimum || iter == 0 {
+            minimum = current // Set our new minimum.
+        }
+        // Is this past minIterations and within ~10% of minimum?
+        upperThreshold := (((minimum << 3) + minimum) >> 3)
+        if iter >= minIterations && (current <= upperThreshold || minimum < 50) {
+            // Try to set the TSC
+            if err := c.setTSC(end + (minimum / 2)); err != nil {
+                return err
+            }
+            return nil
+        }
+    }
 }
 
 //go:nosplit
@@ -203,7 +266,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
         uintptr(c.fd),
         _KVM_GET_ONE_REG,
         uintptr(unsafe.Pointer(reg))); errno != 0 {
-        return fmt.Errorf("error setting one register: %v", errno)
+        return fmt.Errorf("error getting one register: %v", errno)
     }
     return nil
 }
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 774621970..290a706d2 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -226,6 +226,33 @@ func New(args Args) (*Loader, error) {
         vfs2.Override()
     }
 
+    // Make host FDs stable between invocations. Host FDs must map to the exact
+    // same number when the sandbox is restored. Otherwise the wrong FD will be
+    // used.
+    info := containerInfo{}
+    newfd := startingStdioFD
+
+    for _, stdioFD := range args.StdioFDs {
+        // Check that newfd is unused to avoid clobbering over it.
+        if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+            if err != nil {
+                return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+            }
+            return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+        }
+
+        err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
+        if err != nil {
+            return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
+        }
+        info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+        _ = unix.Close(stdioFD)
+        newfd++
+    }
+    for _, goferFD := range args.GoferFDs {
+        info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+    }
+
     // Create kernel and platform.
     p, err := createPlatform(args.Conf, args.Device)
     if err != nil {
@@ -345,6 +372,7 @@ func New(args Args) (*Loader, error) {
     if err != nil {
         return nil, fmt.Errorf("creating init process for root container: %v", err)
     }
+    info.procArgs = procArgs
 
     if err := initCompatLogs(args.UserLogFD); err != nil {
         return nil, fmt.Errorf("initializing compat logs: %v", err)
@@ -355,6 +383,9 @@ func New(args Args) (*Loader, error) {
         return nil, fmt.Errorf("creating pod mount hints: %v", err)
     }
 
+    info.conf = args.Conf
+    info.spec = args.Spec
+
     if kernel.VFS2Enabled {
         // Set up host mount that will be used for imported fds.
         hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
@@ -369,37 +400,6 @@ func New(args Args) (*Loader, error) {
         k.SetHostMount(hostMount)
     }
 
-    info := containerInfo{
-        conf:     args.Conf,
-        spec:     args.Spec,
-        procArgs: procArgs,
-    }
-
-    // Make host FDs stable between invocations. Host FDs must map to the exact
-    // same number when the sandbox is restored. Otherwise the wrong FD will be
-    // used.
-    newfd := startingStdioFD
-    for _, stdioFD := range args.StdioFDs {
-        // Check that newfd is unused to avoid clobbering over it.
-        if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
-            if err != nil {
-                return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
-            }
-            return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
-        }
-
-        err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
-        if err != nil {
-            return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
-        }
-        info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
-        _ = unix.Close(stdioFD)
-        newfd++
-    }
-    for _, goferFD := range args.GoferFDs {
-        info.goferFDs = append(info.goferFDs, fd.New(goferFD))
-    }
-
     eid := execID{cid: args.ID}
     l := &Loader{
         k: k,