[v3] KVM: x86/mmu: TDX post-populate cleanups

[PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Sean Christopherson 3 months, 3 weeks ago

Acquire kvm->lock, kvm->slots_lock, and all vcpu->mutex locks when
servicing ioctls that (a) transition the TD to a new state, i.e. when
doing INIT or FINALIZE or (b) are only valid if the TD is in a specific
state, i.e. when initializing a vCPU or memory region.  Acquiring "all"
the locks fixes several KVM_BUG_ON() situations where a SEAMCALL can fail
due to racing actions, e.g. if tdh_vp_create() contends with either
tdh_mr_extend() or tdh_mr_finalize().

For all intents and purposes, the paths in question are fully serialized,
i.e. there's no reason to try and allow anything remotely interesting to
happen.  Smack 'em with a big hammer instead of trying to be "nice".

Acquire kvm->lock to prevent VM-wide things from happening, slots_lock to
prevent kvm_mmu_zap_all_fast(), and _all_ vCPU mutexes to prevent vCPUs
from interefering.  Use the recently-renamed kvm_arch_vcpu_unlocked_ioctl()
to service the vCPU-scoped ioctls to avoid a lock inversion problem, e.g.
due to taking vcpu->mutex outside kvm->lock.

See also commit ecf371f8b02d ("KVM: SVM: Reject SEV{-ES} intra host
migration if vCPU creation is in-flight"), which fixed a similar bug with
SEV intra-host migration where an in-flight vCPU creation could race with
a VM-wide state transition.

Define a fancy new CLASS to handle the lock+check => unlock logic with
guard()-like syntax:

        CLASS(tdx_vm_state_guard, guard)(kvm);
        if (IS_ERR(guard))
                return PTR_ERR(guard);

to simplify juggling the many locks.

Note!  Take kvm->slots_lock *after* all vcpu->mutex locks, as per KVM's
soon-to-be-documented lock ordering rules[1].

Link: https://lore.kernel.org/all/20251016235538.171962-1-seanjc@google.com [1]
Reported-by: Yan Zhao <yan.y.zhao@intel.com>
Closes: https://lore.kernel.org/all/aLFiPq1smdzN3Ary@yzhao56-desk.sh.intel.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 63 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 84b5fe654c99..d6541b08423f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2632,6 +2632,46 @@ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
 	return -EIO;
 }
 
+typedef void *tdx_vm_state_guard_t;
+
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+	int r;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+		r = -EBUSY;
+		goto out_err;
+	}
+
+	r = kvm_lock_all_vcpus(kvm);
+	if (r)
+		goto out_err;
+
+	/*
+	 * Note the unintuitive ordering!  vcpu->mutex must be taken outside
+	 * kvm->slots_lock!
+	 */
+	mutex_lock(&kvm->slots_lock);
+	return kvm;
+
+out_err:
+	mutex_unlock(&kvm->lock);
+	return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+	mutex_unlock(&kvm->slots_lock);
+	kvm_unlock_all_vcpus(kvm);
+	mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+	     if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+	     tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 {
 	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
@@ -2644,6 +2684,10 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
 	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
 
+	CLASS(tdx_vm_state_guard, guard)(kvm);
+	if (IS_ERR(guard))
+		return PTR_ERR(guard);
+
 	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
 		return -EINVAL;
 
@@ -2743,7 +2787,9 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
-	guard(mutex)(&kvm->slots_lock);
+	CLASS(tdx_vm_state_guard, guard)(kvm);
+	if (IS_ERR(guard))
+		return PTR_ERR(guard);
 
 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
 		return -EINVAL;
@@ -2781,8 +2827,6 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	if (r)
 		return r;
 
-	guard(mutex)(&kvm->lock);
-
 	switch (tdx_cmd.id) {
 	case KVM_TDX_CAPABILITIES:
 		r = tdx_get_capabilities(&tdx_cmd);
@@ -3090,8 +3134,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
 	if (tdx->state != VCPU_TD_STATE_INITIALIZED)
 		return -EINVAL;
 
-	guard(mutex)(&kvm->slots_lock);
-
 	/* Once TD is finalized, the initial guest memory is fixed. */
 	if (kvm_tdx->state == TD_STATE_RUNNABLE)
 		return -EINVAL;
@@ -3147,7 +3189,8 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
 
 int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 {
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	struct kvm_tdx_cmd cmd;
 	int r;
 
@@ -3155,12 +3198,13 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 	if (r)
 		return r;
 
+	CLASS(tdx_vm_state_guard, guard)(kvm);
+	if (IS_ERR(guard))
+		return PTR_ERR(guard);
+
 	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
 		return -EINVAL;
 
-	if (mutex_lock_killable(&vcpu->mutex))
-		return -EINTR;
-
 	vcpu_load(vcpu);
 
 	switch (cmd.id) {
@@ -3177,7 +3221,6 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 
 	vcpu_put(vcpu);
 
-	mutex_unlock(&vcpu->mutex);
 	return r;
 }
 
-- 
2.51.0.858.gf9c4a03a3a-goog

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Huang, Kai 3 months, 1 week ago

On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> @@ -2781,8 +2827,6 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	if (r)
>  		return r;
>  
> -	guard(mutex)(&kvm->lock);
> -
>  	switch (tdx_cmd.id) {
>  	case KVM_TDX_CAPABILITIES:
>  		r = tdx_get_capabilities(&tdx_cmd);

IIRC, this patch removes grabbing the kvm->lock in tdx_vm_ioctl() but only
adds the "big hammer" to tdx_td_init() and tdx_td_finalize(), so the
tdx_get_capabilities() lost holding the kvm->lock.

As replied earlier, I think we can just hold the "big hammer" in
tdx_vm_ioctl()?

One thing is when tdx_vm_ioctl() is called, the TD may not have any vCPU
(e.g., for tdx_td_init()).  This means the "big hammer" will hold kvm-
>slots_lock w/o holding any lock of vCPU.  But IIUC this should be OK.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Sean Christopherson 3 months, 1 week ago

On Tue, Oct 28, 2025, Kai Huang wrote:
> On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> > @@ -2781,8 +2827,6 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> >  	if (r)
> >  		return r;
> >  
> > -	guard(mutex)(&kvm->lock);
> > -
> >  	switch (tdx_cmd.id) {
> >  	case KVM_TDX_CAPABILITIES:
> >  		r = tdx_get_capabilities(&tdx_cmd);
> 
> IIRC, this patch removes grabbing the kvm->lock in tdx_vm_ioctl() but only
> adds the "big hammer" to tdx_td_init() and tdx_td_finalize(), so the
> tdx_get_capabilities() lost holding the kvm->lock.

Ooh, yeah, nice catch, that is indeed silly and unnecessary churn.

> As replied earlier, I think we can just hold the "big hammer" in
> tdx_vm_ioctl()?

Actually, I think we can have our cake and eat it too.  With this slotted in as
a prep patch, the big hammer can land directly in tdx_vm_ioctl(), without any
change in functionality for KVM_TDX_CAPABILITIES.

--
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 27 Oct 2025 17:32:34 -0700
Subject: [PATCH] KVM: TDX: Don't copy "cmd" back to userspace for
 KVM_TDX_CAPABILITIES

Don't copy the kvm_tdx_cmd structure back to userspace when handling
KVM_TDX_CAPABILITIES, as tdx_get_capabilities() doesn't modify hw_error or
any other fields.

Opportunistically hoist the call to tdx_get_capabilities() outside of the
kvm->lock critical section, as getting the capabilities doesn't touch the
VM in any way, e.g. doesn't even take @kvm.

Suggested-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/tdx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1642da9c1fa9..43c0c3f6a8c0 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2807,12 +2807,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	if (r)
 		return r;
 
+	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+		return tdx_get_capabilities(&tdx_cmd);
+
 	guard(mutex)(&kvm->lock);
 
 	switch (tdx_cmd.id) {
-	case KVM_TDX_CAPABILITIES:
-		r = tdx_get_capabilities(&tdx_cmd);
-		break;
 	case KVM_TDX_INIT_VM:
 		r = tdx_td_init(kvm, &tdx_cmd);
 		break;

base-commit: 672537233b8da2c29dca7154bf3a3211af7f6128
--

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Huang, Kai 3 months, 1 week ago

On Mon, 2025-10-27 at 17:37 -0700, Sean Christopherson wrote:
> On Tue, Oct 28, 2025, Kai Huang wrote:
> > On Thu, 2025-10-16 at 17:32 -0700, Sean Christopherson wrote:
> > > @@ -2781,8 +2827,6 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> > >  	if (r)
> > >  		return r;
> > >  
> > > -	guard(mutex)(&kvm->lock);
> > > -
> > >  	switch (tdx_cmd.id) {
> > >  	case KVM_TDX_CAPABILITIES:
> > >  		r = tdx_get_capabilities(&tdx_cmd);
> > 
> > IIRC, this patch removes grabbing the kvm->lock in tdx_vm_ioctl() but only
> > adds the "big hammer" to tdx_td_init() and tdx_td_finalize(), so the
> > tdx_get_capabilities() lost holding the kvm->lock.
> 
> Ooh, yeah, nice catch, that is indeed silly and unnecessary churn.
> 
> > As replied earlier, I think we can just hold the "big hammer" in
> > tdx_vm_ioctl()?
> 
> Actually, I think we can have our cake and eat it too.  With this slotted in as
> a prep patch, the big hammer can land directly in tdx_vm_ioctl(), without any
> change in functionality for KVM_TDX_CAPABILITIES.
> 
> --
> From: Sean Christopherson <seanjc@google.com>
> Date: Mon, 27 Oct 2025 17:32:34 -0700
> Subject: [PATCH] KVM: TDX: Don't copy "cmd" back to userspace for
>  KVM_TDX_CAPABILITIES
> 
> Don't copy the kvm_tdx_cmd structure back to userspace when handling
> KVM_TDX_CAPABILITIES, as tdx_get_capabilities() doesn't modify hw_error or
> any other fields.
> 
> Opportunistically hoist the call to tdx_get_capabilities() outside of the
> kvm->lock critical section, as getting the capabilities doesn't touch the
> VM in any way, e.g. doesn't even take @kvm.
> 
> Suggested-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/vmx/tdx.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 1642da9c1fa9..43c0c3f6a8c0 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2807,12 +2807,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	if (r)
>  		return r;
>  
> +	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
> +		return tdx_get_capabilities(&tdx_cmd);
> +

OK fine to me. :-)

Maybe add a comment saying tdx_get_capabilities() doesn't copy any data 
back to kvm_tdx_cmd structure, and doesn't need to take @kvm?  This gives
people some notice if they want to change in the future (not sure whether
any change will happen though).

>  	guard(mutex)(&kvm->lock);
>  
>  	switch (tdx_cmd.id) {
> -	case KVM_TDX_CAPABILITIES:
> -		r = tdx_get_capabilities(&tdx_cmd);
> -		break;
>  	case KVM_TDX_INIT_VM:
>  		r = tdx_td_init(kvm, &tdx_cmd);
>  		break;
> 
> base-commit: 672537233b8da2c29dca7154bf3a3211af7f6128
> --

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Huang, Kai 3 months, 2 weeks ago

>  
> +typedef void *tdx_vm_state_guard_t;
> +
> +static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
> +{
> +	int r;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
> +		r = -EBUSY;
> +		goto out_err;
> +	}
> +
> +	r = kvm_lock_all_vcpus(kvm);
> +	if (r)
> +		goto out_err;
> +
> +	/*
> +	 * Note the unintuitive ordering!  vcpu->mutex must be taken outside
> +	 * kvm->slots_lock!
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +	return kvm;
> +
> +out_err:
> +	mutex_unlock(&kvm->lock);
> +	return ERR_PTR(r);
> +}
> +
> +static void tdx_release_vm_state_locks(struct kvm *kvm)
> +{
> +	mutex_unlock(&kvm->slots_lock);
> +	kvm_unlock_all_vcpus(kvm);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
> +	     if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
> +	     tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
> +
>  static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  {
>  	struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
> @@ -2644,6 +2684,10 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
>  	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
>  
> +	CLASS(tdx_vm_state_guard, guard)(kvm);
> +	if (IS_ERR(guard))
> +		return PTR_ERR(guard);
> +
>  	if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
>  		return -EINVAL;
>  
> @@ -2743,7 +2787,9 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  {
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  
> -	guard(mutex)(&kvm->slots_lock);
> +	CLASS(tdx_vm_state_guard, guard)(kvm);
> +	if (IS_ERR(guard))
> +		return PTR_ERR(guard);
>  

Since you are changing both tdx_td_init() and tdx_td_finalize(), maybe
just changing tdx_vm_ioctl() instead (like tdx_vcpu_unlocked_ioctl())?  
This is not required for tdx_get_capabilities() but there's no harm to do
it too AFACIT.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Yan Zhao 3 months, 2 weeks ago

On Thu, Oct 16, 2025 at 05:32:42PM -0700, Sean Christopherson wrote:
> Acquire kvm->lock, kvm->slots_lock, and all vcpu->mutex locks when
> servicing ioctls that (a) transition the TD to a new state, i.e. when
> doing INIT or FINALIZE or (b) are only valid if the TD is in a specific
> state, i.e. when initializing a vCPU or memory region.  Acquiring "all"
> the locks fixes several KVM_BUG_ON() situations where a SEAMCALL can fail
> due to racing actions, e.g. if tdh_vp_create() contends with either
> tdh_mr_extend() or tdh_mr_finalize().
> 
> For all intents and purposes, the paths in question are fully serialized,
> i.e. there's no reason to try and allow anything remotely interesting to
> happen.  Smack 'em with a big hammer instead of trying to be "nice".
> 
> Acquire kvm->lock to prevent VM-wide things from happening, slots_lock to
> prevent kvm_mmu_zap_all_fast(), and _all_ vCPU mutexes to prevent vCPUs
slots_lock to prevent kvm_mmu_zap_memslot()?
kvm_mmu_zap_all_fast() does not operate on the mirror root.

We may have missed a zap in the guest_memfd punch hole path:

The SEAMCALLs tdh_mem_range_block(), tdh_mem_track() tdh_mem_page_remove()
in the guest_memfd punch hole path are only protected by the filemap invaliate
lock and mmu_lock, so they could contend with v1 version of tdh_vp_init().
(I'm writing a selftest to verify this, haven't been able to reproduce
tdh_vp_init(v1) returning BUSY yet. However, this race condition should be
theoretically possible.)

Resources              SHARED  users              EXCLUSIVE users
------------------------------------------------------------------------
(1) TDR                tdh_mng_rdwr               tdh_mng_create
                       tdh_vp_create              tdh_mng_add_cx
                       tdh_vp_addcx               tdh_mng_init
                       tdh_vp_init(v0)            tdh_mng_vpflushdone
                       tdh_vp_enter               tdh_mng_key_config
                       tdh_vp_flush               tdh_mng_key_freeid
                       tdh_vp_rd_wr               tdh_mr_extend
                       tdh_mem_sept_add           tdh_mr_finalize
                       tdh_mem_sept_remove        tdh_vp_init(v1)
                       tdh_mem_page_aug           tdh_mem_page_add
                       tdh_mem_page_remove
                       tdh_mem_range_block
                       tdh_mem_track
                       tdh_mem_range_unblock
                       tdh_phymem_page_reclaim

Do you think we can acquire the mmu_lock for cmd KVM_TDX_INIT_VCPU?

> @@ -3155,12 +3198,13 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
>  	if (r)
>  		return r;
>  
> +	CLASS(tdx_vm_state_guard, guard)(kvm);
Should we move the guard to inside each cmd? Then there's no need to acquire the
locks in the default cases.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Sean Christopherson 3 months, 2 weeks ago

On Fri, Oct 24, 2025, Yan Zhao wrote:
> On Thu, Oct 16, 2025 at 05:32:42PM -0700, Sean Christopherson wrote:
> > Acquire kvm->lock, kvm->slots_lock, and all vcpu->mutex locks when
> > servicing ioctls that (a) transition the TD to a new state, i.e. when
> > doing INIT or FINALIZE or (b) are only valid if the TD is in a specific
> > state, i.e. when initializing a vCPU or memory region.  Acquiring "all"
> > the locks fixes several KVM_BUG_ON() situations where a SEAMCALL can fail
> > due to racing actions, e.g. if tdh_vp_create() contends with either
> > tdh_mr_extend() or tdh_mr_finalize().
> > 
> > For all intents and purposes, the paths in question are fully serialized,
> > i.e. there's no reason to try and allow anything remotely interesting to
> > happen.  Smack 'em with a big hammer instead of trying to be "nice".
> > 
> > Acquire kvm->lock to prevent VM-wide things from happening, slots_lock to
> > prevent kvm_mmu_zap_all_fast(), and _all_ vCPU mutexes to prevent vCPUs
> slots_lock to prevent kvm_mmu_zap_memslot()?
> kvm_mmu_zap_all_fast() does not operate on the mirror root.

Oh, right.

> We may have missed a zap in the guest_memfd punch hole path:
> 
> The SEAMCALLs tdh_mem_range_block(), tdh_mem_track() tdh_mem_page_remove()
> in the guest_memfd punch hole path are only protected by the filemap invaliate
> lock and mmu_lock, so they could contend with v1 version of tdh_vp_init().
>
> (I'm writing a selftest to verify this, haven't been able to reproduce
> tdh_vp_init(v1) returning BUSY yet. However, this race condition should be
> theoretically possible.)
> 
> Resources              SHARED  users              EXCLUSIVE users
> ------------------------------------------------------------------------
> (1) TDR                tdh_mng_rdwr               tdh_mng_create
>                        tdh_vp_create              tdh_mng_add_cx
>                        tdh_vp_addcx               tdh_mng_init
>                        tdh_vp_init(v0)            tdh_mng_vpflushdone
>                        tdh_vp_enter               tdh_mng_key_config
>                        tdh_vp_flush               tdh_mng_key_freeid
>                        tdh_vp_rd_wr               tdh_mr_extend
>                        tdh_mem_sept_add           tdh_mr_finalize
>                        tdh_mem_sept_remove        tdh_vp_init(v1)
>                        tdh_mem_page_aug           tdh_mem_page_add
>                        tdh_mem_page_remove
>                        tdh_mem_range_block
>                        tdh_mem_track
>                        tdh_mem_range_unblock
>                        tdh_phymem_page_reclaim
> 
> Do you think we can acquire the mmu_lock for cmd KVM_TDX_INIT_VCPU?

Ugh, I'd rather not?  Refresh me, what's the story with "v1"?  Are we now on v2?
If this is effectively limited to deprecated TDX modules, my vote would be to
ignore the flaw and avoid even more complexity in KVM.

> > @@ -3155,12 +3198,13 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> >  	if (r)
> >  		return r;
> >  
> > +	CLASS(tdx_vm_state_guard, guard)(kvm);
> Should we move the guard to inside each cmd? Then there's no need to acquire the
> locks in the default cases. 

No, I don't think it's a good tradeoff.  We'd also need to move vcpu_{load,put}()
into the cmd helpers, and theoretically slowing down a bad ioctl invocation due
to taking extra locks is a complete non-issue.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Yan Zhao 3 months, 2 weeks ago

On Fri, Oct 24, 2025 at 09:57:20AM -0700, Sean Christopherson wrote:
> On Fri, Oct 24, 2025, Yan Zhao wrote:
> > On Thu, Oct 16, 2025 at 05:32:42PM -0700, Sean Christopherson wrote:
> > We may have missed a zap in the guest_memfd punch hole path:
> > 
> > The SEAMCALLs tdh_mem_range_block(), tdh_mem_track() tdh_mem_page_remove()
> > in the guest_memfd punch hole path are only protected by the filemap invaliate
> > lock and mmu_lock, so they could contend with v1 version of tdh_vp_init().
> >
> > (I'm writing a selftest to verify this, haven't been able to reproduce
> > tdh_vp_init(v1) returning BUSY yet. However, this race condition should be
> > theoretically possible.)
> > 
> > Resources              SHARED  users              EXCLUSIVE users
> > ------------------------------------------------------------------------
> > (1) TDR                tdh_mng_rdwr               tdh_mng_create
> >                        tdh_vp_create              tdh_mng_add_cx
> >                        tdh_vp_addcx               tdh_mng_init
> >                        tdh_vp_init(v0)            tdh_mng_vpflushdone
> >                        tdh_vp_enter               tdh_mng_key_config
> >                        tdh_vp_flush               tdh_mng_key_freeid
> >                        tdh_vp_rd_wr               tdh_mr_extend
> >                        tdh_mem_sept_add           tdh_mr_finalize
> >                        tdh_mem_sept_remove        tdh_vp_init(v1)
> >                        tdh_mem_page_aug           tdh_mem_page_add
> >                        tdh_mem_page_remove
> >                        tdh_mem_range_block
> >                        tdh_mem_track
> >                        tdh_mem_range_unblock
> >                        tdh_phymem_page_reclaim
> > 
> > Do you think we can acquire the mmu_lock for cmd KVM_TDX_INIT_VCPU?
> 
> Ugh, I'd rather not?  Refresh me, what's the story with "v1"?  Are we now on v2?
No... We are now on v1.
As in [1], I found that TDX module changed SEAMCALL TDH_VP_INIT behavior to
require exclusive lock on resource TDR when leaf_opcode.version > 0.

Therefore, we moved KVM_TDX_INIT_VCPU to tdx_vcpu_unlocked_ioctl() in patch 22.

[1] https://lore.kernel.org/all/aLa34QCJCXGLk%2Ffl@yzhao56-desk.sh.intel.com/

> If this is effectively limited to deprecated TDX modules, my vote would be to
> ignore the flaw and avoid even more complexity in KVM.
Conversely, the new TDX module has this flaw...

> > > @@ -3155,12 +3198,13 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> > >  	if (r)
> > >  		return r;
> > >  
> > > +	CLASS(tdx_vm_state_guard, guard)(kvm);
> > Should we move the guard to inside each cmd? Then there's no need to acquire the
> > locks in the default cases. 
> 
> No, I don't think it's a good tradeoff.  We'd also need to move vcpu_{load,put}()
> into the cmd helpers, and theoretically slowing down a bad ioctl invocation due
> to taking extra locks is a complete non-issue.
Fair enough.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Edgecombe, Rick P 3 months, 1 week ago

On Mon, 2025-10-27 at 17:26 +0800, Yan Zhao wrote:
> > Ugh, I'd rather not?  Refresh me, what's the story with "v1"?  Are we now on
> > v2?
> No... We are now on v1.
> As in [1], I found that TDX module changed SEAMCALL TDH_VP_INIT behavior to
> require exclusive lock on resource TDR when leaf_opcode.version > 0.
> 
> Therefore, we moved KVM_TDX_INIT_VCPU to tdx_vcpu_unlocked_ioctl() in patch
> 22.
> 
> [1] https://lore.kernel.org/all/aLa34QCJCXGLk%2Ffl@yzhao56-desk.sh.intel.com/

Looking at the PDF docs, TDR exclusive locking in version == 1 is called out at
least back to the oldest ABI docs I have (March 2024). Not sure about the
assertion that the behavior changed, but if indeed this was documented, it's a
little bit our bad. We might consider being flexible around calling it a TDX ABI
break?

Sean, can you elaborate why taking mmu_lock is objectionable here, though?

Note, myself (and I think Yan?) determined the locking by examining TDX module
source. For myself, it's possible I misread the locking originally.

Also, I'm not sure about switching gears at this point, but it makes me wonder
about the previously discussed option of trying to just duplicate the TDX locks
on the kernel side. Or perhaps make some kind of debug time lockdep type thing
to document/check the assumptions in the kernel. Something along the lines of
this patch, but to map the TDX locks to KVM locks or something. As we add more
things (DPAMT, etc), it doesn't seem like the TDX locking is getting tamer...

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Yan Zhao 3 months, 1 week ago

On Tue, Oct 28, 2025 at 01:46:03AM +0800, Edgecombe, Rick P wrote:
> On Mon, 2025-10-27 at 17:26 +0800, Yan Zhao wrote:
> > > Ugh, I'd rather not?  Refresh me, what's the story with "v1"?  Are we now on
> > > v2?
> > No... We are now on v1.
> > As in [1], I found that TDX module changed SEAMCALL TDH_VP_INIT behavior to
> > require exclusive lock on resource TDR when leaf_opcode.version > 0.
> > 
> > Therefore, we moved KVM_TDX_INIT_VCPU to tdx_vcpu_unlocked_ioctl() in patch
> > 22.
> > 
> > [1] https://lore.kernel.org/all/aLa34QCJCXGLk%2Ffl@yzhao56-desk.sh.intel.com/
> 
> Looking at the PDF docs, TDR exclusive locking in version == 1 is called out at
> least back to the oldest ABI docs I have (March 2024). Not sure about the
> assertion that the behavior changed, but if indeed this was documented, it's a
> little bit our bad. We might consider being flexible around calling it a TDX ABI
> break?
I apologize that the ABI documentation had already called this out earlier in
March 2024.

I determined the locking behavior by reading the TDX module's source code,
specifically, from public repo https://github.com/intel/tdx-module.git, branch
tdx_1.5.

I checked my git snapshot of that branch, and I think it's because back to my
checking time, branch tdx_1.5 was pointing to TDX_1.5.01, which did not include
the code for version 1.

commit 72d8cffb214b74ae94d75afce36423020f74b47c (HEAD -> tdx_1.5, tag: TDX_1.5.01)
Author: mvainer <michael1.vainer@intel.com>
Date:   Thu Feb 22 15:36:58 2024 +0200

    TDX 1.5.01

    Signed-off-by: mvainer <michael1.vainer@intel.com>



In that repository, the latest tdx_1.5 branch points to tag TDX_1.5.16.
The exclusive TDR lock in TDH.VP.INIT was introduced starting from TDX 1.5.05.


commit 147ba2973479be63147048954f77a0d7248fcc35
Author: Vainer, Michael1 <michael1.vainer@intel.com>
Date:   Mon Aug 11 11:53:07 2025 +0300

    TDX 1.5.05

    Signed-off-by: Vainer, Michael1 <michael1.vainer@intel.com>

diff --git a/src/vmm_dispatcher/api_calls/tdh_vp_init.c b/src/vmm_dispatcher/api_calls/tdh_vp_init.c
index ccb6b9e..ee51a18 100644
--- a/src/vmm_dispatcher/api_calls/tdh_vp_init.c
+++ b/src/vmm_dispatcher/api_calls/tdh_vp_init.c
@@ -114,6 +114,17 @@ api_error_type tdh_vp_init(uint64_t target_tdvpr_pa, uint64_t td_vcpu_rcx)

     api_error_type        return_val = UNINITIALIZE_ERROR;

+    tdx_leaf_and_version_t leaf_opcode;
+    leaf_opcode.raw = get_local_data()->vmm_regs.rax;
+
+    uint64_t x2apic_id = get_local_data()->vmm_regs.r8;
+
+    // TDH.VP.INIT supports version 1.  Other version checks are done by the SEAMCALL dispatcher.
+    if (leaf_opcode.version > 1)
+    {
+        return_val = api_error_with_operand_id(TDX_OPERAND_INVALID, OPERAND_ID_RAX);
+        goto EXIT;
+    }

     // Check and lock the parent TDVPR page
     return_val = check_and_lock_explicit_4k_private_hpa(tdvpr_pa,
@@ -129,11 +140,13 @@ api_error_type tdh_vp_init(uint64_t target_tdvpr_pa, uint64_t td_vcpu_rcx)
         goto EXIT;
     }

+    lock_type_t tdr_lock_type = (leaf_opcode.version > 0) ? TDX_LOCK_EXCLUSIVE : TDX_LOCK_SHARED;
+
     // Lock and map the TDR page
     return_val = lock_and_map_implicit_tdr(get_pamt_entry_owner(tdvpr_pamt_entry_ptr),
                                            OPERAND_ID_TDR,
                                            TDX_RANGE_RO,
-                                           TDX_LOCK_SHARED,
+                                           tdr_lock_type,
                                            &tdr_pamt_entry_ptr,
                                            &tdr_locked_flag,
                                            &tdr_ptr);
@@ -190,6 +203,32 @@ api_error_type tdh_vp_init(uint64_t target_tdvpr_pa, uint64_t td_vcpu_rcx)
     }
     tdvps_ptr->management.vcpu_index = vcpu_index;

+    if (leaf_opcode.version == 0)
+    {
+        // No x2APIC ID was provided
+        tdcs_ptr->executions_ctl_fields.topology_enum_configured = false;
+    }
+    else
+    {
+        // Check and save the configured x2APIC ID.  Upper 32 bits must be 0.
+        if (x2apic_id > 0xFFFFFFFF)
+        {
+            (void)_lock_xadd_32b(&tdcs_ptr->management_fields.num_vcpus, (uint32_t)-1);
+            return_val = api_error_with_operand_id(TDX_OPERAND_INVALID, OPERAND_ID_R8);
+            goto EXIT;
+        }
+
+        for (uint32_t i = 0; i < vcpu_index; i++)
+        {
+            if ((uint32_t)x2apic_id == tdcs_ptr->x2apic_ids[i])
+            {
+                return_val = api_error_with_operand_id(TDX_X2APIC_ID_NOT_UNIQUE, tdcs_ptr->x2apic_ids[i]);
+                goto EXIT;
+            }
+        }
+
+        tdcs_ptr->x2apic_ids[vcpu_index] = (uint32_t)x2apic_id;
+    }

     // We read TSC below.  Compare IA32_TSC_ADJUST to the value sampled on TDHSYSINIT
     // to make sure the host VMM doesn't play any trick on us. */
@@ -282,7 +321,7 @@ EXIT:
     }
     if (tdr_locked_flag)
     {
-        pamt_implicit_release_lock(tdr_pamt_entry_ptr, TDX_LOCK_SHARED);
+        pamt_implicit_release_lock(tdr_pamt_entry_ptr, tdr_lock_type);
         free_la(tdr_ptr);
     }
     if (tdvpr_locked_flag)

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Edgecombe, Rick P 3 months, 1 week ago

On Tue, 2025-10-28 at 09:37 +0800, Yan Zhao wrote:
> I checked my git snapshot of that branch, and I think it's because back to my
> checking time, branch tdx_1.5 was pointing to TDX_1.5.01, which did not include
> the code for version 1.

Ah, that explains it. I've been looking more at the code for this kind of info
too. I guess we should cross check the docs more.

Re: [PATCH v3 24/25] KVM: TDX: Guard VM state transitions with "all" the locks

Posted by Sean Christopherson 3 months, 1 week ago

On Mon, Oct 27, 2025, Rick P Edgecombe wrote:
> On Mon, 2025-10-27 at 17:26 +0800, Yan Zhao wrote:
> > > Ugh, I'd rather not?  Refresh me, what's the story with "v1"?  Are we now on
> > > v2?
> > No... We are now on v1.
> > As in [1], I found that TDX module changed SEAMCALL TDH_VP_INIT behavior to
> > require exclusive lock on resource TDR when leaf_opcode.version > 0.
> > 
> > Therefore, we moved KVM_TDX_INIT_VCPU to tdx_vcpu_unlocked_ioctl() in patch
> > 22.
> > 
> > [1] https://lore.kernel.org/all/aLa34QCJCXGLk%2Ffl@yzhao56-desk.sh.intel.com/
> 
> Looking at the PDF docs, TDR exclusive locking in version == 1 is called out at
> least back to the oldest ABI docs I have (March 2024). Not sure about the
> assertion that the behavior changed, but if indeed this was documented, it's a
> little bit our bad. We might consider being flexible around calling it a TDX ABI
> break?
> 
> Sean, can you elaborate why taking mmu_lock is objectionable here, though?

It's not, I was just hoping we could avoid yet more complexity.

Assuming we do indeed need to take mmu_lock, can you send a patch that applies
on top?  I'm not planning on sending any of this to stable@, so I don't see any
reason to try and juggle patches around.

> Note, myself (and I think Yan?) determined the locking by examining TDX module
> source. For myself, it's possible I misread the locking originally.
> 
> Also, I'm not sure about switching gears at this point, but it makes me wonder
> about the previously discussed option of trying to just duplicate the TDX locks
> on the kernel side.

Please no.  At best that will yield a pile of effectively useless code.  At worst,
it will make us lazy and lead to real bugs because we don't propery guard the *KVM*
flows that need exclusivity relative to what is going on in the TDX-Module.

> Or perhaps make some kind of debug time lockdep type thing to document/check
> the assumptions in the kernel. Something along the lines of this patch, but
> to map the TDX locks to KVM locks or something. As we add more things (DPAMT,
> etc), it doesn't seem like the TDX locking is getting tamer...

Hmm, I like the idea, but actually getting meaningful coverage could be quite
difficult.

[PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Rick Edgecombe 3 months, 1 week ago

Take MMU lock around tdh_vp_init() in KVM_TDX_INIT_VCPU to prevent
meeting contention during retries in some no-fail MMU paths.

The TDX module takes various try-locks internally, which can cause
SEAMCALLs to return an error code when contention is met. Dealing with
an error in some of the MMU paths that make SEAMCALLs is not straight
forward, so KVM takes steps to ensure that these will meet no contention
during a single BUSY error retry. The whole scheme relies on KVM to take
appropriate steps to avoid making any SEAMCALLs that could contend while
the retry is happening.

Unfortunately, there is a case where contention could be met if userspace
does something unusual. Specifically, hole punching a gmem fd while
initializing the TD vCPU. The impact would be triggering a KVM_BUG_ON().

The resource being contended is called the "TDR resource" in TDX docs 
parlance. The tdh_vp_init() can take this resource as exclusive if the 
'version' passed is 1, which happens to be version the kernel passes. The 
various MMU operations (tdh_mem_range_block(), tdh_mem_track() and 
tdh_mem_page_remove()) take it as shared.

There isn't a KVM lock that maps conceptually and in a lock order friendly 
way to the TDR lock. So to minimize infrastructure, just take MMU lock 
around tdh_vp_init(). This makes the operations we care about mutually 
exclusive. Since the other operations are under a write mmu_lock, the code 
could just take the lock for read, however this is weirdly inverted from 
the actual underlying resource being contended. Since this is covering an 
edge case that shouldn't be hit in normal usage, be a little less weird 
and take the mmu_lock for write around the call.

Fixes: 02ab57707bdb ("KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table")
Reported-by: Yan Zhao <yan.y.zhao@intel.com>
Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
Hi,

It was indeed awkward, as Sean must have sniffed. But seems ok enough to
close the issue.

Yan, can you give it a look?

Posted here, but applies on top of this series.

Thanks,

Rick
---
 arch/x86/kvm/vmx/tdx.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index daec88d4b88d..8bf5d2624152 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2938,9 +2938,18 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
 		}
 	}
 
-	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
-	if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
-		return -EIO;
+	/*
+	 * tdh_vp_init() can take a exclusive lock of the TDR resource inside
+	 * the TDX module. This resource is also taken as shared in several
+	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention.
+	 * A read lock here would be enough to exclude the contention, but take
+	 * a write lock to avoid the weird inversion.
+	 */
+	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+			return -EIO;
+	}
 
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-- 
2.51.1

Re: [PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Sean Christopherson 2 months, 3 weeks ago

On Mon, Oct 27, 2025, Rick Edgecombe wrote:
> Take MMU lock around tdh_vp_init() in KVM_TDX_INIT_VCPU to prevent
> meeting contention during retries in some no-fail MMU paths.
> 
> The TDX module takes various try-locks internally, which can cause
> SEAMCALLs to return an error code when contention is met. Dealing with
> an error in some of the MMU paths that make SEAMCALLs is not straight
> forward, so KVM takes steps to ensure that these will meet no contention
> during a single BUSY error retry. The whole scheme relies on KVM to take
> appropriate steps to avoid making any SEAMCALLs that could contend while
> the retry is happening.
> 
> Unfortunately, there is a case where contention could be met if userspace
> does something unusual. Specifically, hole punching a gmem fd while
> initializing the TD vCPU. The impact would be triggering a KVM_BUG_ON().
> 
> The resource being contended is called the "TDR resource" in TDX docs 
> parlance. The tdh_vp_init() can take this resource as exclusive if the 
> 'version' passed is 1, which happens to be version the kernel passes. The 
> various MMU operations (tdh_mem_range_block(), tdh_mem_track() and 
> tdh_mem_page_remove()) take it as shared.
> 
> There isn't a KVM lock that maps conceptually and in a lock order friendly 
> way to the TDR lock. So to minimize infrastructure, just take MMU lock 
> around tdh_vp_init(). This makes the operations we care about mutually 
> exclusive. Since the other operations are under a write mmu_lock, the code 
> could just take the lock for read, however this is weirdly inverted from 
> the actual underlying resource being contended. Since this is covering an 
> edge case that shouldn't be hit in normal usage, be a little less weird 
> and take the mmu_lock for write around the call.
> 
> Fixes: 02ab57707bdb ("KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table")
> Reported-by: Yan Zhao <yan.y.zhao@intel.com>
> Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> Hi,
> 
> It was indeed awkward, as Sean must have sniffed. But seems ok enough to
> close the issue.
> 
> Yan, can you give it a look?
> 
> Posted here, but applies on top of this series.

In the future, please don't post in-reply-to, as it mucks up my b4 workflow.

Applied to kvm-x86 tdx, with a more verbose comment as suggested by Binbin.

[1/1] KVM: TDX: Take MMU lock around tdh_vp_init()
      https://github.com/kvm-x86/linux/commit/9a89894f30d5

Re: [PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Edgecombe, Rick P 2 months, 3 weeks ago

On Tue, 2025-11-18 at 15:31 -0800, Sean Christopherson wrote:
> In the future, please don't post in-reply-to, as it mucks up my b4 workflow.

Sure thing, sorry.

Re: [PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Edgecombe, Rick P 2 months, 3 weeks ago

On Tue, 2025-11-18 at 15:31 -0800, Sean Christopherson wrote:
> In the future, please don't post in-reply-to, as it mucks up my b4 workflow.

Sure thing, sorry.

Re: [PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Binbin Wu 3 months, 1 week ago


On 10/28/2025 8:28 AM, Rick Edgecombe wrote:
> Take MMU lock around tdh_vp_init() in KVM_TDX_INIT_VCPU to prevent
> meeting contention during retries in some no-fail MMU paths.
>
> The TDX module takes various try-locks internally, which can cause
> SEAMCALLs to return an error code when contention is met. Dealing with
> an error in some of the MMU paths that make SEAMCALLs is not straight
> forward, so KVM takes steps to ensure that these will meet no contention
> during a single BUSY error retry. The whole scheme relies on KVM to take
> appropriate steps to avoid making any SEAMCALLs that could contend while
> the retry is happening.
>
> Unfortunately, there is a case where contention could be met if userspace
> does something unusual. Specifically, hole punching a gmem fd while
> initializing the TD vCPU. The impact would be triggering a KVM_BUG_ON().
>
> The resource being contended is called the "TDR resource" in TDX docs
> parlance. The tdh_vp_init() can take this resource as exclusive if the
> 'version' passed is 1, which happens to be version the kernel passes. The
> various MMU operations (tdh_mem_range_block(), tdh_mem_track() and
> tdh_mem_page_remove()) take it as shared.
>
> There isn't a KVM lock that maps conceptually and in a lock order friendly
> way to the TDR lock. So to minimize infrastructure, just take MMU lock
> around tdh_vp_init(). This makes the operations we care about mutually
> exclusive. Since the other operations are under a write mmu_lock, the code
> could just take the lock for read, however this is weirdly inverted from
> the actual underlying resource being contended. Since this is covering an
> edge case that shouldn't be hit in normal usage, be a little less weird
> and take the mmu_lock for write around the call.
>
> Fixes: 02ab57707bdb ("KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table")
> Reported-by: Yan Zhao <yan.y.zhao@intel.com>
> Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> Hi,
>
> It was indeed awkward, as Sean must have sniffed. But seems ok enough to
> close the issue.
>
> Yan, can you give it a look?
>
> Posted here, but applies on top of this series.
>
> Thanks,
>
> Rick
> ---
>   arch/x86/kvm/vmx/tdx.c | 15 ++++++++++++---
>   1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index daec88d4b88d..8bf5d2624152 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2938,9 +2938,18 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
>   		}
>   	}
>   
> -	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
> -	if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
> -		return -EIO;
> +	/*
> +	 * tdh_vp_init() can take a exclusive lock of the TDR resource inside
                                   ^
                                   an

> +	 * the TDX module. This resource is also taken as shared in several
> +	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention.
> +	 * A read lock here would be enough to exclude the contention, but take
> +	 * a write lock to avoid the weird inversion.
Can we also add the description that the lock is trying to prevent an edge case
as in the change log if not too wordy?

> +	 */
> +	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
> +		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
> +		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
> +			return -EIO;
> +	}
>   
>   	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>

Re: [PATCH] KVM: TDX: Take MMU lock around tdh_vp_init()

Posted by Yan Zhao 3 months, 1 week ago

On Mon, Oct 27, 2025 at 05:28:24PM -0700, Rick Edgecombe wrote:
> Take MMU lock around tdh_vp_init() in KVM_TDX_INIT_VCPU to prevent
> meeting contention during retries in some no-fail MMU paths.
> 
> The TDX module takes various try-locks internally, which can cause
> SEAMCALLs to return an error code when contention is met. Dealing with
> an error in some of the MMU paths that make SEAMCALLs is not straight
> forward, so KVM takes steps to ensure that these will meet no contention
> during a single BUSY error retry. The whole scheme relies on KVM to take
> appropriate steps to avoid making any SEAMCALLs that could contend while
> the retry is happening.
> 
> Unfortunately, there is a case where contention could be met if userspace
> does something unusual. Specifically, hole punching a gmem fd while
> initializing the TD vCPU. The impact would be triggering a KVM_BUG_ON().
> 
> The resource being contended is called the "TDR resource" in TDX docs 
> parlance. The tdh_vp_init() can take this resource as exclusive if the 
> 'version' passed is 1, which happens to be version the kernel passes. The 
> various MMU operations (tdh_mem_range_block(), tdh_mem_track() and 
> tdh_mem_page_remove()) take it as shared.
> 
> There isn't a KVM lock that maps conceptually and in a lock order friendly 
> way to the TDR lock. So to minimize infrastructure, just take MMU lock 
> around tdh_vp_init(). This makes the operations we care about mutually 
> exclusive. Since the other operations are under a write mmu_lock, the code 
> could just take the lock for read, however this is weirdly inverted from 
> the actual underlying resource being contended. Since this is covering an 
> edge case that shouldn't be hit in normal usage, be a little less weird 
> and take the mmu_lock for write around the call.
> 
> Fixes: 02ab57707bdb ("KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table")
> Reported-by: Yan Zhao <yan.y.zhao@intel.com>
> Suggested-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> Hi,
> 
> It was indeed awkward, as Sean must have sniffed. But seems ok enough to
> close the issue.
> 
> Yan, can you give it a look?
It passed my local tests. LGTM. Thanks!

> Posted here, but applies on top of this series.
> 
> Thanks,
> 
> Rick
> ---
>  arch/x86/kvm/vmx/tdx.c | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index daec88d4b88d..8bf5d2624152 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2938,9 +2938,18 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
>  		}
>  	}
>  
> -	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
> -	if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
> -		return -EIO;
> +	/*
> +	 * tdh_vp_init() can take a exclusive lock of the TDR resource inside
> +	 * the TDX module. This resource is also taken as shared in several
> +	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention.
> +	 * A read lock here would be enough to exclude the contention, but take
> +	 * a write lock to avoid the weird inversion.
> +	 */
> +	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
> +		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
> +		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
> +			return -EIO;
> +	}
>  
>  	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>  
> -- 
> 2.51.1
>