summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/platform/kvm/machine.go15
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go22
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go36
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go67
-rw-r--r--runsc/boot/loader.go62
5 files changed, 160 insertions, 42 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index b3d4188a3..6d90eaefa 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -72,6 +72,9 @@ type machine struct {
// nextID is the next vCPU ID.
nextID uint32
+
+ // machineArchState is the architecture-specific state.
+ machineArchState
}
const (
@@ -193,12 +196,7 @@ func newMachine(vm int) (*machine, error) {
m.available.L = &m.mu
// Pull the maximum vCPUs.
- maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
- if errno != 0 {
- m.maxVCPUs = _KVM_NR_VCPUS
- } else {
- m.maxVCPUs = int(maxVCPUs)
- }
+ m.getMaxVCPU()
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
@@ -419,9 +417,8 @@ func (m *machine) Get() *vCPU {
}
}
- // Create a new vCPU (maybe).
- if int(m.nextID) < m.maxVCPUs {
- c := m.newVCPU()
+ // Get a new vCPU (maybe).
+ if c := m.getNewVCPU(); c != nil {
c.lock()
m.vCPUsByTID[tid] = c
m.mu.Unlock()
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index e8e209249..d1b2c9c92 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -63,6 +63,9 @@ func (m *machine) initArchState() error {
return nil
}
+type machineArchState struct {
+}
+
type vCPUArchState struct {
// PCIDs is the set of PCIDs for this vCPU.
//
@@ -490,3 +493,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
physical)
}
}
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+ maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ if errno != 0 {
+ m.maxVCPUs = _KVM_NR_VCPUS
+ } else {
+ m.maxVCPUs = int(maxVCPUs)
+ }
+}
+
+// getNewVCPU create a new vCPU (maybe)
+func (m *machine) getNewVCPU() *vCPU {
+ if int(m.nextID) < m.maxVCPUs {
+ c := m.newVCPU()
+ return c
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 03e84d804..242bee833 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -17,6 +17,10 @@
package kvm
import (
+ "runtime"
+ "sync/atomic"
+
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/ring0"
"gvisor.dev/gvisor/pkg/ring0/pagetables"
@@ -25,6 +29,11 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
)
+type machineArchState struct {
+ //initialvCPUs is the machine vCPUs which has initialized but not used
+ initialvCPUs map[int]*vCPU
+}
+
type vCPUArchState struct {
// PCIDs is the set of PCIDs for this vCPU.
//
@@ -182,3 +191,30 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType,
return accessType, platform.ErrContextSignal
}
+
+// getMaxVCPU get max vCPU number
+func (m *machine) getMaxVCPU() {
+ rmaxVCPUs := runtime.NumCPU()
+ smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ // compare the max vcpu number from runtime and syscall, use smaller one.
+ if errno != 0 {
+ m.maxVCPUs = rmaxVCPUs
+ } else {
+ if rmaxVCPUs < int(smaxVCPUs) {
+ m.maxVCPUs = rmaxVCPUs
+ } else {
+ m.maxVCPUs = int(smaxVCPUs)
+ }
+ }
+}
+
+// getNewVCPU() scan for an available vCPU from initialvCPUs
+func (m *machine) getNewVCPU() *vCPU {
+ for CID, c := range m.initialvCPUs {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ delete(m.initialvCPUs, CID)
+ return c
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 634e55ec0..1dd184586 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -29,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
)
type kvmVcpuInit struct {
@@ -47,6 +48,19 @@ func (m *machine) initArchState() error {
uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
}
+
+ // Initialize all vCPUs on ARM64, while this does not happen on x86_64.
+ // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms.
+ // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time.
+ // For more detail, please refer to https://github.com/google/gvisor/issues/5739
+ m.initialvCPUs = make(map[int]*vCPU)
+ m.mu.Lock()
+ for int(m.nextID) < m.maxVCPUs-1 {
+ c := m.newVCPU()
+ c.state = 0
+ m.initialvCPUs[c.id] = c
+ }
+ m.mu.Unlock()
return nil
}
@@ -174,9 +188,58 @@ func (c *vCPU) setTSC(value uint64) error {
return nil
}
+// getTSC gets the counter Physical Counter minus Virtual Offset.
+func (c *vCPU) getTSC() error {
+ var (
+ reg kvmOneReg
+ data uint64
+ )
+
+ reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+ reg.id = _KVM_ARM64_REGS_TIMER_CNT
+
+ if err := c.getOneRegister(&reg); err != nil {
+ return err
+ }
+
+ return nil
+}
+
// setSystemTime sets the vCPU to the system time.
func (c *vCPU) setSystemTime() error {
- return c.setSystemTimeLegacy()
+ const minIterations = 10
+ minimum := uint64(0)
+ for iter := 0; ; iter++ {
+ // Use get the TSC to an estimate of where it will be
+ // on the host during a "fast" system call iteration.
+ // replace getTSC to another setOneRegister syscall can get more accurate value?
+ start := uint64(ktime.Rdtsc())
+ if err := c.getTSC(); err != nil {
+ return err
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: one, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result.
+ end := uint64(ktime.Rdtsc())
+ if end < start {
+ continue // Totally bogus: unstable TSC?
+ }
+ current := end - start
+ if current < minimum || iter == 0 {
+ minimum = current // Set our new minimum.
+ }
+ // Is this past minIterations and within ~10% of minimum?
+ upperThreshold := (((minimum << 3) + minimum) >> 3)
+ if iter >= minIterations && ( current <= upperThreshold || minimum < 50 ) {
+ // Try to set the TSC
+ if err := c.setTSC(end + (minimum / 2)); err != nil {
+ return err
+ }
+ return nil
+ }
+ }
}
//go:nosplit
@@ -203,7 +266,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
uintptr(c.fd),
_KVM_GET_ONE_REG,
uintptr(unsafe.Pointer(reg))); errno != 0 {
- return fmt.Errorf("error setting one register: %v", errno)
+ return fmt.Errorf("error getting one register: %v", errno)
}
return nil
}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 774621970..290a706d2 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -226,6 +226,33 @@ func New(args Args) (*Loader, error) {
vfs2.Override()
}
+ // Make host FDs stable between invocations. Host FDs must map to the exact
+ // same number when the sandbox is restored. Otherwise the wrong FD will be
+ // used.
+ info := containerInfo{}
+ newfd := startingStdioFD
+
+ for _, stdioFD := range args.StdioFDs {
+ // Check that newfd is unused to avoid clobbering over it.
+ if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+ if err != nil {
+ return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+ }
+ return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+ }
+
+ err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
+ if err != nil {
+ return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
+ }
+ info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+ _ = unix.Close(stdioFD)
+ newfd++
+ }
+ for _, goferFD := range args.GoferFDs {
+ info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+ }
+
// Create kernel and platform.
p, err := createPlatform(args.Conf, args.Device)
if err != nil {
@@ -345,6 +372,7 @@ func New(args Args) (*Loader, error) {
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
+ info.procArgs = procArgs
if err := initCompatLogs(args.UserLogFD); err != nil {
return nil, fmt.Errorf("initializing compat logs: %v", err)
@@ -355,6 +383,9 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("creating pod mount hints: %v", err)
}
+ info.conf = args.Conf
+ info.spec = args.Spec
+
if kernel.VFS2Enabled {
// Set up host mount that will be used for imported fds.
hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
@@ -369,37 +400,6 @@ func New(args Args) (*Loader, error) {
k.SetHostMount(hostMount)
}
- info := containerInfo{
- conf: args.Conf,
- spec: args.Spec,
- procArgs: procArgs,
- }
-
- // Make host FDs stable between invocations. Host FDs must map to the exact
- // same number when the sandbox is restored. Otherwise the wrong FD will be
- // used.
- newfd := startingStdioFD
- for _, stdioFD := range args.StdioFDs {
- // Check that newfd is unused to avoid clobbering over it.
- if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
- if err != nil {
- return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
- }
- return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
- }
-
- err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
- if err != nil {
- return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
- }
- info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
- _ = unix.Close(stdioFD)
- newfd++
- }
- for _, goferFD := range args.GoferFDs {
- info.goferFDs = append(info.goferFDs, fd.New(goferFD))
- }
-
eid := execID{cid: args.ID}
l := &Loader{
k: k,