5 files changed, 160 insertions, 42 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index b3d4188a3..6d90eaefa 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -72,6 +72,9 @@ type machine struct {
 
 	// nextID is the next vCPU ID.
 	nextID uint32
+
+	// machineArchState is the architecture-specific state.
+	machineArchState
 }
 
 const (
@@ -193,12 +196,7 @@ func newMachine(vm int) (*machine, error) {
 	m.available.L = &m.mu
 
 	// Pull the maximum vCPUs.
-	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	if errno != 0 {
-		m.maxVCPUs = _KVM_NR_VCPUS
-	} else {
-		m.maxVCPUs = int(maxVCPUs)
-	}
+	m.getMaxVCPU()
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
 	m.vCPUsByTID = make(map[uint64]*vCPU)
 	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
@@ -419,9 +417,8 @@ func (m *machine) Get() *vCPU {
 			}
 		}
 
-		// Create a new vCPU (maybe).
-		if int(m.nextID) < m.maxVCPUs {
-			c := m.newVCPU()
+		// Get a new vCPU (maybe).
+		if c := m.getNewVCPU(); c != nil {
 			c.lock()
 			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index e8e209249..d1b2c9c92 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -63,6 +63,9 @@ func (m *machine) initArchState() error {
 	return nil
 }
 
+type machineArchState struct {
+}
+
 type vCPUArchState struct {
 	// PCIDs is the set of PCIDs for this vCPU.
 	//
@@ -490,3 +493,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
 			physical)
 	}
 }
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+	maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+	if errno != 0 {
+		m.maxVCPUs = _KVM_NR_VCPUS
+	} else {
+		m.maxVCPUs = int(maxVCPUs)
+	}
+}
+
+// getNewVCPU create a new vCPU (maybe)
+func (m *machine) getNewVCPU() *vCPU {
+	if int(m.nextID) < m.maxVCPUs {
+		c := m.newVCPU()
+		return c
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 03e84d804..242bee833 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -17,6 +17,10 @@
 package kvm
 
 import (
+	"runtime"
+	"sync/atomic"
+
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/hostarch"
 	"gvisor.dev/gvisor/pkg/ring0"
 	"gvisor.dev/gvisor/pkg/ring0/pagetables"
@@ -25,6 +29,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 )
 
+type machineArchState struct {
+	//initialvCPUs is the machine vCPUs which has initialized but not used
+	initialvCPUs map[int]*vCPU
+}
+
 type vCPUArchState struct {
 	// PCIDs is the set of PCIDs for this vCPU.
 	//
@@ -182,3 +191,30 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType,
 
 	return accessType, platform.ErrContextSignal
 }
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+	rmaxVCPUs := runtime.NumCPU()
+	smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+	// compare the max vcpu number from runtime and syscall, use smaller one.
+	if errno != 0 {
+		m.maxVCPUs = rmaxVCPUs
+	} else {
+		if rmaxVCPUs < int(smaxVCPUs) {
+			m.maxVCPUs = rmaxVCPUs
+		} else {
+			m.maxVCPUs = int(smaxVCPUs)
+		}
+	}
+}
+
+// getNewVCPU() scan for an available vCPU from initialvCPUs
+func (m *machine) getNewVCPU() *vCPU {
+	for CID, c := range m.initialvCPUs {
+		if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+			delete(m.initialvCPUs, CID)
+			return c
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 634e55ec0..1dd184586 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	ktime "gvisor.dev/gvisor/pkg/sentry/time"
 )
 
 type kvmVcpuInit struct {
@@ -47,6 +48,19 @@ func (m *machine) initArchState() error {
 		uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
 		panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
 	}
+
+	// Initialize all vCPUs on ARM64, while this does not happen on x86_64.
+	// The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms.
+	// If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time.
+	// For more detail, please refer to https://github.com/google/gvisor/issues/5739
+	m.initialvCPUs = make(map[int]*vCPU)
+	m.mu.Lock()
+	for int(m.nextID) < m.maxVCPUs-1 {
+		c := m.newVCPU()
+		c.state = 0
+		m.initialvCPUs[c.id] = c
+	}
+	m.mu.Unlock()
 	return nil
 }
 
@@ -174,9 +188,58 @@ func (c *vCPU) setTSC(value uint64) error {
 	return nil
 }
 
+// getTSC gets the counter Physical Counter minus Virtual Offset.
+func (c *vCPU) getTSC() error {
+	var (
+		reg  kvmOneReg
+		data uint64
+	)
+
+	reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+	reg.id = _KVM_ARM64_REGS_TIMER_CNT
+
+	if err := c.getOneRegister(&reg); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // setSystemTime sets the vCPU to the system time.
 func (c *vCPU) setSystemTime() error {
-	return c.setSystemTimeLegacy()
+	const minIterations = 10
+	minimum := uint64(0)
+	for iter := 0; ; iter++ {
+		// Use get the TSC to an estimate of where it will be
+		// on the host during a "fast" system call iteration.
+		// replace getTSC to another setOneRegister syscall can get more accurate value?
+		start := uint64(ktime.Rdtsc())
+		if err := c.getTSC(); err != nil {
+			return err
+		}
+		// See if this is our new minimum call time. Note that this
+		// serves two functions: one, we make sure that we are
+		// accurately predicting the offset we need to set. Second, we
+		// don't want to do the final set on a slow call, which could
+		// produce a really bad result.
+		end := uint64(ktime.Rdtsc())
+		if end < start {
+			continue // Totally bogus: unstable TSC?
+		}
+		current := end - start
+		if current < minimum || iter == 0 {
+			minimum = current // Set our new minimum.
+		}
+		// Is this past minIterations and within ~10% of minimum?
+		upperThreshold := (((minimum << 3) + minimum) >> 3)
+		if iter >= minIterations && ( current <= upperThreshold || minimum < 50 ) {
+			// Try to set the TSC
+			if err := c.setTSC(end + (minimum / 2)); err != nil {
+				return err
+			}
+			return nil
+		}
+	}
 }
 
 //go:nosplit
@@ -203,7 +266,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
 		uintptr(c.fd),
 		_KVM_GET_ONE_REG,
 		uintptr(unsafe.Pointer(reg))); errno != 0 {
-		return fmt.Errorf("error setting one register: %v", errno)
+		return fmt.Errorf("error getting one register: %v", errno)
 	}
 	return nil
 }
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 774621970..290a706d2 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -226,6 +226,33 @@ func New(args Args) (*Loader, error) {
 		vfs2.Override()
 	}
 
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	info := containerInfo{}
+	newfd := startingStdioFD
+
+	for _, stdioFD := range args.StdioFDs {
+		// Check that newfd is unused to avoid clobbering over it.
+		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+			if err != nil {
+				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+			}
+			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+		}
+
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
+		}
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
+		newfd++
+	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
+
 	// Create kernel and platform.
 	p, err := createPlatform(args.Conf, args.Device)
 	if err != nil {
@@ -345,6 +372,7 @@ func New(args Args) (*Loader, error) {
 	if err != nil {
 		return nil, fmt.Errorf("creating init process for root container: %v", err)
 	}
+	info.procArgs = procArgs
 
 	if err := initCompatLogs(args.UserLogFD); err != nil {
 		return nil, fmt.Errorf("initializing compat logs: %v", err)
@@ -355,6 +383,9 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	info.conf = args.Conf
+	info.spec = args.Spec
+
 	if kernel.VFS2Enabled {
 		// Set up host mount that will be used for imported fds.
 		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
@@ -369,37 +400,6 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
-	info := containerInfo{
-		conf:     args.Conf,
-		spec:     args.Spec,
-		procArgs: procArgs,
-	}
-
-	// Make host FDs stable between invocations. Host FDs must map to the exact
-	// same number when the sandbox is restored. Otherwise the wrong FD will be
-	// used.
-	newfd := startingStdioFD
-	for _, stdioFD := range args.StdioFDs {
-		// Check that newfd is unused to avoid clobbering over it.
-		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
-			if err != nil {
-				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
-			}
-			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
-		}
-
-		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
-		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
-		}
-		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
-		_ = unix.Close(stdioFD)
-		newfd++
-	}
-	for _, goferFD := range args.GoferFDs {
-		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
-	}
-
 	eid := execID{cid: args.ID}
 	l := &Loader{
 		k:          k,