[PATCH v3 20/25] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest

Sean Christopherson posted 25 patches 3 months, 3 weeks ago
There is a newer version of this series
[PATCH v3 20/25] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest
Posted by Sean Christopherson 3 months, 3 weeks ago
Add a macro to handle kicking vCPUs out of the guest and retrying
SEAMCALLs on -EBUSY instead of providing small helpers to be used by each
SEAMCALL.  Wrapping the SEAMCALLs in a macro makes it a little harder to
tease out which SEAMCALL is being made, but significantly reduces the
amount of copy+paste code and makes it all but impossible to leave an
elevated wait_for_sept_zap.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 72 ++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index f6782b0ffa98..2e2dab89c98f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -294,25 +294,24 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
-static void tdx_no_vcpus_enter_start(struct kvm *kvm)
-{
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
-	lockdep_assert_held_write(&kvm->mmu_lock);
-
-	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
-
-	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
-}
-
-static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
-{
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
-	lockdep_assert_held_write(&kvm->mmu_lock);
-
-	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
-}
+#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
+({										\
+	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
+	u64 __err;								\
+										\
+	lockdep_assert_held_write(&kvm->mmu_lock);				\
+										\
+	__err = tdh_func(args);							\
+	if (unlikely(tdx_operand_busy(__err))) {				\
+		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
+		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
+										\
+		__err = tdh_func(args);						\
+										\
+		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
+	}									\
+	__err;									\
+})
 
 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
 static int __tdx_reclaim_page(struct page *page)
@@ -1711,14 +1710,7 @@ static void tdx_track(struct kvm *kvm)
 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
 		return;
 
-	err = tdh_mem_track(&kvm_tdx->td);
-	if (unlikely(tdx_operand_busy(err))) {
-		/* After no vCPUs enter, the second retry is expected to succeed */
-		tdx_no_vcpus_enter_start(kvm);
-		err = tdh_mem_track(&kvm_tdx->td);
-		tdx_no_vcpus_enter_stop(kvm);
-	}
-
+	err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
 	TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
 
 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
@@ -1770,14 +1762,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
 		return;
 
-	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-	if (unlikely(tdx_operand_busy(err))) {
-		/* After no vCPUs enter, the second retry is expected to succeed */
-		tdx_no_vcpus_enter_start(kvm);
-		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-		tdx_no_vcpus_enter_stop(kvm);
-	}
-
+	err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+			      tdx_level, &entry, &level_state);
 	if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
 		return;
 
@@ -1792,20 +1778,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	 * with other vcpu sept operation.
 	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
 	 */
-	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
-				  &level_state);
-
-	if (unlikely(tdx_operand_busy(err))) {
-		/*
-		 * The second retry is expected to succeed after kicking off all
-		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
-		 */
-		tdx_no_vcpus_enter_start(kvm);
-		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
-					  &level_state);
-		tdx_no_vcpus_enter_stop(kvm);
-	}
-
+	err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+			      tdx_level, &entry, &level_state);
 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
 		return;
 
-- 
2.51.0.858.gf9c4a03a3a-goog
Re: [PATCH v3 20/25] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest
Posted by Huang, Kai 3 months, 2 weeks ago
On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> Add a macro to handle kicking vCPUs out of the guest and retrying
> SEAMCALLs on -EBUSY instead of providing small helpers to be used by each
> SEAMCALL.  Wrapping the SEAMCALLs in a macro makes it a little harder to
> tease out which SEAMCALL is being made, but significantly reduces the
> amount of copy+paste code and makes it all but impossible to leave an
> elevated wait_for_sept_zap.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/vmx/tdx.c | 72 ++++++++++++++----------------------------
>  1 file changed, 23 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index f6782b0ffa98..2e2dab89c98f 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -294,25 +294,24 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
>  	vcpu->cpu = -1;
>  }
>  
> -static void tdx_no_vcpus_enter_start(struct kvm *kvm)
> -{
> -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> -
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> -
> -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
> -
> -	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
> -}
> -
> -static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
> -{
> -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> -
> -	lockdep_assert_held_write(&kvm->mmu_lock);
> -
> -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
> -}
> +#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
> +({										\
> +	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
> +	u64 __err;								\
> +										\
> +	lockdep_assert_held_write(&kvm->mmu_lock);				\
> +										\
> +	__err = tdh_func(args);							\
> +	if (unlikely(tdx_operand_busy(__err))) {				\
> +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
> +		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
> +										\
> +		__err = tdh_func(args);						\
> +										\
> +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
> +	}									\
> +	__err;									\
> +})

The comment which says "the second retry should succeed" is lost, could we
add it to tdh_do_no_vcpus()?

Also, perhaps we can just TDX_BUG_ON() inside tdh_do_no_vcpus() when the
second call of tdh_func() fails?
Re: [PATCH v3 20/25] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest
Posted by Sean Christopherson 3 months, 1 week ago
On Fri, Oct 24, 2025, Kai Huang wrote:
> On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> > Add a macro to handle kicking vCPUs out of the guest and retrying
> > SEAMCALLs on -EBUSY instead of providing small helpers to be used by each
> > SEAMCALL.  Wrapping the SEAMCALLs in a macro makes it a little harder to
> > tease out which SEAMCALL is being made, but significantly reduces the
> > amount of copy+paste code and makes it all but impossible to leave an
> > elevated wait_for_sept_zap.
> > 
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  arch/x86/kvm/vmx/tdx.c | 72 ++++++++++++++----------------------------
> >  1 file changed, 23 insertions(+), 49 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index f6782b0ffa98..2e2dab89c98f 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -294,25 +294,24 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> >  	vcpu->cpu = -1;
> >  }
> >  
> > -static void tdx_no_vcpus_enter_start(struct kvm *kvm)
> > -{
> > -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > -
> > -	lockdep_assert_held_write(&kvm->mmu_lock);
> > -
> > -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
> > -
> > -	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
> > -}
> > -
> > -static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
> > -{
> > -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > -
> > -	lockdep_assert_held_write(&kvm->mmu_lock);
> > -
> > -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
> > -}
> > +#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
> > +({										\
> > +	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
> > +	u64 __err;								\
> > +										\
> > +	lockdep_assert_held_write(&kvm->mmu_lock);				\
> > +										\
> > +	__err = tdh_func(args);							\
> > +	if (unlikely(tdx_operand_busy(__err))) {				\
> > +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
> > +		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
> > +										\
> > +		__err = tdh_func(args);						\
> > +										\
> > +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
> > +	}									\
> > +	__err;									\
> > +})
> 
> The comment which says "the second retry should succeed" is lost, could we
> add it to tdh_do_no_vcpus()?

+1, definitely needs a comment.

/*
 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
 * retry (if necessary) after forcing vCPUs to exit and wait for the operation
 * to complete.  All flows that remove/block S-EPT entries run with mmu_lock
 * held for write, i.e. are mutually exlusive with each other, but they aren't
 * mutually exclusive with vCPUs running (because that would be overkill), and
 * so can fail with "operand busy" if a vCPU acquires a required lock in the
 * TDX-Module.
 *
 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
 */
 
> Also, perhaps we can just TDX_BUG_ON() inside tdh_do_no_vcpus() when the
> second call of tdh_func() fails?

Heh, this also caught my eye when typing up the comment.  Unfortunately, I don't
think it's worth doing the TDX_BUG_ON() inside the macro as that would require
plumbing in the UPPERCASE name, and doesn't work well with the variadic arguments,
e.g. TRACK wants TDX_BUG_ON(), but REMOVE and BLOCK want TDX_BUG_ON_2().

Given that REMOVE and BLOCK need to check the return value, getting the TDX_BUG_ON()
call into the macro wouldn't buy that much.
Re: [PATCH v3 20/25] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest
Posted by Huang, Kai 3 months, 1 week ago
On Mon, 2025-10-27 at 12:20 -0700, Sean Christopherson wrote:
> On Fri, Oct 24, 2025, Kai Huang wrote:
> > On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> > > Add a macro to handle kicking vCPUs out of the guest and retrying
> > > SEAMCALLs on -EBUSY instead of providing small helpers to be used by each
> > > SEAMCALL.  Wrapping the SEAMCALLs in a macro makes it a little harder to
> > > tease out which SEAMCALL is being made, but significantly reduces the
> > > amount of copy+paste code and makes it all but impossible to leave an
> > > elevated wait_for_sept_zap.
> > > 
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > > ---
> > >  arch/x86/kvm/vmx/tdx.c | 72 ++++++++++++++----------------------------
> > >  1 file changed, 23 insertions(+), 49 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index f6782b0ffa98..2e2dab89c98f 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -294,25 +294,24 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> > >  	vcpu->cpu = -1;
> > >  }
> > >  
> > > -static void tdx_no_vcpus_enter_start(struct kvm *kvm)
> > > -{
> > > -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > -
> > > -	lockdep_assert_held_write(&kvm->mmu_lock);
> > > -
> > > -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
> > > -
> > > -	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
> > > -}
> > > -
> > > -static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
> > > -{
> > > -	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > -
> > > -	lockdep_assert_held_write(&kvm->mmu_lock);
> > > -
> > > -	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
> > > -}
> > > +#define tdh_do_no_vcpus(tdh_func, kvm, args...)					\
> > > +({										\
> > > +	struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);				\
> > > +	u64 __err;								\
> > > +										\
> > > +	lockdep_assert_held_write(&kvm->mmu_lock);				\
> > > +										\
> > > +	__err = tdh_func(args);							\
> > > +	if (unlikely(tdx_operand_busy(__err))) {				\
> > > +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);			\
> > > +		kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);	\
> > > +										\
> > > +		__err = tdh_func(args);						\
> > > +										\
> > > +		WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);		\
> > > +	}									\
> > > +	__err;									\
> > > +})
> > 
> > The comment which says "the second retry should succeed" is lost, could we
> > add it to tdh_do_no_vcpus()?
> 
> +1, definitely needs a comment.
> 
> /*
>  * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
>  * retry (if necessary) after forcing vCPUs to exit and wait for the operation
>  * to complete.  All flows that remove/block S-EPT entries run with mmu_lock
>  * held for write, i.e. are mutually exlusive with each other, but they aren't
>  * mutually exclusive with vCPUs running (because that would be overkill), and
>  * so can fail with "operand busy" if a vCPU acquires a required lock in the
>  * TDX-Module.

LGTM.  Nit: is it more clear to just say they can contend with TDX guest?

   All flows that ..., but they aren't mutually exclusive with vCPUs
   running, i.e., they can also contend with TDCALL from guest and fail with
   "operand busy".

>  *
>  * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
>  */
>  
> > Also, perhaps we can just TDX_BUG_ON() inside tdh_do_no_vcpus() when the
> > second call of tdh_func() fails?
> 
> Heh, this also caught my eye when typing up the comment.  Unfortunately, I don't
> think it's worth doing the TDX_BUG_ON() inside the macro as that would require
> plumbing in the UPPERCASE name, and doesn't work well with the variadic arguments,
> e.g. TRACK wants TDX_BUG_ON(), but REMOVE and BLOCK want TDX_BUG_ON_2().
> 
> Given that REMOVE and BLOCK need to check the return value, getting the TDX_BUG_ON()
> call into the macro wouldn't buy that much.

Agreed.  Seems it sounds nice in concept but not in practice. :-)