arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 115 +++++++++++++++----------------- 2 files changed, 54 insertions(+), 63 deletions(-)
From: Carsten Stollmaier <stollmc@amazon.com>
On vcpu_run, before entering the guest, the update of the steal time
information causes a page-fault if the page is not present. In our
scenario, this gets handled by do_user_addr_fault and subsequently by
handle_userfault, since we have the region registered to it.
handle_userfault uses TASK_INTERRUPTIBLE, so it is interruptible by
signals. do_user_addr_fault then busy-retries it if the pending signal
is non-fatal. This leads to contention of the mmap_lock.
This patch replaces the use of gfn_to_hva_cache with gfn_to_pfn_cache,
as gfn_to_pfn_cache ensures page presence for the memory access,
preventing the contention of the mmap_lock.
As an added bonus, this removes the last open-coded assembler access
to userspace from arch/x86/kvm/x86.c.
Signed-off-by: Carsten Stollmaier <stollmc@amazon.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/x86.c | 115 +++++++++++++++-----------------
2 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff07c45e3c73..5eb38976af58 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -958,7 +958,7 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_hva_cache cache;
+ struct gfn_to_pfn_cache cache;
} st;
u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a03530795707..7cbef6dc3216 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3745,10 +3745,8 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests);
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
u64 steal;
u32 version;
@@ -3763,42 +3761,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ read_lock(&gpc->lock);
+ while (!kvm_gpc_check(gpc, sizeof(*st))) {
/* We rely on the fact that it fits in a single page. */
BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ read_unlock(&gpc->lock);
+
+ if (kvm_gpc_refresh(gpc, sizeof(*st)))
return;
+
+ read_lock(&gpc->lock);
}
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
- u8 st_preempted = 0;
- int err = -EFAULT;
-
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- asm volatile("1: xchgb %0, %2\n"
- "xor %1, %1\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "+q" (st_preempted),
- "+&r" (err),
- "+m" (st->preempted));
- if (err)
- goto out;
-
- user_access_end();
+ u8 st_preempted = xchg(&st->preempted, 0);
vcpu->arch.st.preempted = 0;
@@ -3806,39 +3788,32 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
-
- if (!user_access_begin(st, sizeof(*st)))
- goto dirty;
} else {
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- unsafe_put_user(0, &st->preempted, out);
+ st->preempted = 0;
vcpu->arch.st.preempted = 0;
}
- unsafe_get_user(version, &st->version, out);
+ version = st->version;
if (version & 1)
version += 1; /* first time write, random junk */
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
smp_wmb();
- unsafe_get_user(steal, &st->steal, out);
+ steal = st->steal;
steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- unsafe_put_user(steal, &st->steal, out);
+ st->steal = steal;
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
- out:
- user_access_end();
- dirty:
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ read_unlock(&gpc->lock);
}
/*
@@ -4173,8 +4148,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.st.msr_val = data;
- if (!(data & KVM_MSR_ENABLED))
- break;
+ if (data & KVM_MSR_ENABLED) {
+ kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED,
+ sizeof(struct kvm_steal_time));
+ } else {
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ }
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
@@ -5237,11 +5216,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
static const u8 preempted = KVM_VCPU_PREEMPTED;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ unsigned long flags;
/*
* The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5266,20 +5244,28 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (unlikely(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(*st)))
+ goto out_unlock_gpc;
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
- vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ st->preempted = preempted;
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ kvm_gpc_mark_dirty_in_slot(gpc);
+
+out_unlock_gpc:
+ read_unlock_irqrestore(&gpc->lock, flags);
+}
+
+static void kvm_steal_time_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ vcpu->arch.st.preempted = 0;
+ vcpu->arch.st.msr_val = 0;
+ vcpu->arch.st.last_steal = 0;
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -12744,6 +12730,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.st.cache, vcpu->kvm);
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
else
@@ -12851,6 +12839,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_clear_async_pf_completion_queue(vcpu);
kvm_mmu_unload(vcpu);
+ kvm_steal_time_reset(vcpu);
+
kvmclock_reset(vcpu);
for_each_possible_cpu(cpu)
@@ -12971,7 +12961,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_en_val = 0;
vcpu->arch.apf.msr_int_val = 0;
- vcpu->arch.st.msr_val = 0;
+
+ kvm_steal_time_reset(vcpu);
kvmclock_reset(vcpu);
--
2.51.0
With some assistance from an AI review bot (well, more than "some").
On Wed, Mar 11, 2026, David Woodhouse wrote:
> @@ -3806,39 +3788,32 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> st_preempted & KVM_VCPU_FLUSH_TLB);
> if (st_preempted & KVM_VCPU_FLUSH_TLB)
> kvm_vcpu_flush_tlb_guest(vcpu);
> -
> - if (!user_access_begin(st, sizeof(*st)))
> - goto dirty;
> } else {
> - if (!user_access_begin(st, sizeof(*st)))
> - return;
> -
> - unsafe_put_user(0, &st->preempted, out);
> + st->preempted = 0;
These should all be WRITE_ONCE(), correct?
> vcpu->arch.st.preempted = 0;
> }
>
> - unsafe_get_user(version, &st->version, out);
> + version = st->version;
And then READ_ONCE()?
> if (version & 1)
> version += 1; /* first time write, random junk */
>
> version += 1;
> - unsafe_put_user(version, &st->version, out);
> + st->version = version;
>
> smp_wmb();
>
> - unsafe_get_user(steal, &st->steal, out);
> + steal = st->steal;
> steal += current->sched_info.run_delay -
> vcpu->arch.st.last_steal;
> vcpu->arch.st.last_steal = current->sched_info.run_delay;
> - unsafe_put_user(steal, &st->steal, out);
> + st->steal = steal;
>
> version += 1;
> - unsafe_put_user(version, &st->version, out);
> + st->version = version;
And then here, doesn't there need to be an smp_wmb() before incrementing the
version again? Because I believe making this all vanilla C means the compiler
can reorder those two. Per the friendly bot:
The previous code used unsafe_put_user(), which inherently acts as a
compiler barrier due to its internal inline assembly
> +
> + kvm_gpc_mark_dirty_in_slot(gpc);
>
> - out:
> - user_access_end();
> - dirty:
> - mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
> + read_unlock(&gpc->lock);
> }
>
> /*
> @@ -4173,8 +4148,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>
> vcpu->arch.st.msr_val = data;
>
> - if (!(data & KVM_MSR_ENABLED))
> - break;
> + if (data & KVM_MSR_ENABLED) {
Curly braces aren't required.
> + kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED,
> + sizeof(struct kvm_steal_time));
> + } else {
> + kvm_gpc_deactivate(&vcpu->arch.st.cache);
> + }
...
> @@ -5266,20 +5244,28 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
> if (unlikely(current->mm != vcpu->kvm->mm))
> return;
>
> - slots = kvm_memslots(vcpu->kvm);
> -
> - if (unlikely(slots->generation != ghc->generation ||
> - gpa != ghc->gpa ||
> - kvm_is_error_hva(ghc->hva) || !ghc->memslot))
> - return;
> + read_lock_irqsave(&gpc->lock, flags);
I'm pretty sure this is going to make PROVE_LOCKING unhappy due to PREEMPT_RT
making rwlock_t sleepable (when called from kvm_sched_out()). I've been content
to ignore the kvm_xen_set_evtchn_fast() warning[*] because I can't imagine anyone
is crazy enough to emulate Xen with an RT kernel, but I do know there are RT users
that run VMs, and so this path would be more than just a PROVE_LOCKING issue.
If we want to push the gpc stuff broadly, we need a solution to that (though I'm
still not 100% convinced using a gpc here is a net positive).
[*] https://lore.kernel.org/all/673f4bbc.050a0220.3c9d61.0174.GAE@google.com
On Thu, 2026-03-12 at 17:17 -0700, Sean Christopherson wrote:
> I've been content to ignore the kvm_xen_set_evtchn_fast() warning[*]
> because I can't imagine anyone is crazy enough to emulate Xen with an
> RT kernel [...]
> [*] https://lore.kernel.org/all/673f4bbc.050a0220.3c9d61.0174.GAE@google.com

Completely unrelated to this patch: one side effect of this lockdep
warning is that it suppresses all further lockdep warnings. In our TDX
CI testing we found that a useful lockdep warning was suppressed because
of this one. We have since disabled CONFIG_KVM_XEN in our CI. :-)
On Thu, 2026-03-12 at 17:17 -0700, Sean Christopherson wrote:
>
> I'm pretty sure this is going to make PROVE_LOCKING unhappy due to PREEMPT_RT
> making rwlock_t sleepable (when called from kvm_sched_out()). I've been content
> to ignore the kvm_xen_set_evtchn_fast() warning[*] because I can't imagine anyone
> is crazy enough to emulate Xen with an RT kernel, but I do know there are RT users
> that run VMs, and so this path would be more than just a PROVE_LOCKING issue.
>
> If we want to push the gpc stuff broadly, we need a solution to that (though I'm
> still not 100% convinced using a gpc here is a net positive).
>
> [*] https://lore.kernel.org/all/673f4bbc.050a0220.3c9d61.0174.GAE@google.com
I set up a test case, and you're quite right:
[ 478.182893] [ BUG: Invalid wait context ]
[ 478.182895] 7.0.0-rc3+ #2 Tainted: G OE
[ 478.182896] -----------------------------
[ 478.182897] steal_time/5227 is trying to lock:
[ 478.182898] ff2c7aa6637b8cf8 (&gpc->lock){++++}-{3:3}, at: kvm_arch_vcpu_put+0xe8/0x240 [kvm]
[ 478.182982] other info that might help us debug this:
[ 478.182982] context-{5:5}
[ 478.182983] 3 locks held by steal_time/5227:
[ 478.182984] #0: ff2c7aa6637b80a0 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0xe4/0x950 [kvm]
[ 478.183026] #1: ff2c7ac4fe0367a0 (&rq->__lock){-...}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
[ 478.183034] #2: ff2c7aa67e96a4e0 (&kvm->srcu){.+.+}-{0:0}, at: kvm_arch_vcpu_put+0x7d/0x240 [kvm]
It appears to be fixable just by making it use read_trylock(). In the
PREEMPT_RT case that appears to go through rt_read_trylock() →
rwbase_read_trylock(), which does an atomic cmpxchg on the reader count
and returns immediately if it can't get it. And, crucially, doesn't
seem to whine about it.
There's absolutely no need for the irqsave in this case; this lock is
never obtained from interrupt context anyway.
I think that just using read_trylock() will work for the
kvm_xen_set_evtchn_fast() case too; will look at that in the morning.
@@ -5244,20 +5244,33 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (unlikely(current->mm != vcpu->kvm->mm))
return;
- read_lock_irqsave(&gpc->lock, flags);
+ /*
+ * Use a trylock as this is called from the scheduler path (via
+ * kvm_sched_out), where rwlock_t is not safe on PREEMPT_RT (it
+ * becomes sleepable). Setting preempted is best-effort anyway;
+ * the old HVA-based code used copy_to_user_nofault() which could
+ * also silently fail.
+ *
+ * Since we only trylock and bail on failure, there is no risk of
+ * deadlock with an interrupt handler, so no need to disable
+ * interrupts.
+ */
+ if (!read_trylock(&gpc->lock))
+ return;
+
if (!kvm_gpc_check(gpc, sizeof(*st)))
goto out_unlock_gpc;
On Wed, 2026-03-11 at 12:49 +0100, David Woodhouse wrote:
> From: Carsten Stollmaier <stollmc@amazon.com>
>
> On vcpu_run, before entering the guest, the update of the steal time
> information causes a page-fault if the page is not present. In our
> scenario, this gets handled by do_user_addr_fault and successively
> handle_userfault since we have the region registered to that.
>
> handle_userfault uses TASK_INTERRUPTIBLE, so it is interruptible by
> signals. do_user_addr_fault then busy-retries it if the pending signal
> is non-fatal. This leads to contention of the mmap_lock.
>
> This patch replaces the use of gfn_to_hva_cache with gfn_to_pfn_cache,
> as gfn_to_pfn_cache ensures page presence for the memory access,
> preventing the contention of the mmap_lock.
>
> As an added bonus, this removes the last open-coded assembler access
> to userspace from arch/x86/kvm/x86.c.
>
> Signed-off-by: Carsten Stollmaier <stollmc@amazon.com>
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
I've just spotted this is mostly reverting my commit 7e2175ebd695
("KVM: x86: Fix recording of guest steal time / preempted status") from
2021.
The steal time *used* to use a gfn_to_pfn_cache, back in the days when
the gfn_to_pfn_cache was entirely hosed before I ripped it out and
completely reimplemented it.
© 2016 - 2026 Red Hat, Inc.