[PATCH 09/67] KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure

Posted by Sean Christopherson 8 months, 2 weeks ago
Track the IRTEs that are posting to an SVM vCPU via the associated irqfd
structure and GSI routing instead of dynamically allocating a separate
data structure.  In addition to eliminating an atomic allocation, this
will allow hoisting much of the IRTE update logic to common x86.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c   | 49 ++++++++++++++++-----------------------
 include/linux/kvm_irqfd.h |  3 +++
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 04dfd898ea8d..967618ba743a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -774,27 +774,30 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
 	return ret;
 }
 
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+static void svm_ir_list_del(struct vcpu_svm *svm,
+			    struct kvm_kernel_irqfd *irqfd,
+			    struct amd_iommu_pi_data *pi)
 {
 	unsigned long flags;
-	struct amd_svm_iommu_ir *cur;
+	struct kvm_kernel_irqfd *cur;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
-	list_for_each_entry(cur, &svm->ir_list, node) {
-		if (cur->data != pi->ir_data)
+	list_for_each_entry(cur, &svm->ir_list, vcpu_list) {
+		if (cur->irq_bypass_data != pi->ir_data)
 			continue;
-		list_del(&cur->node);
-		kfree(cur);
+		if (WARN_ON_ONCE(cur != irqfd))
+			continue;
+		list_del(&irqfd->vcpu_list);
 		break;
 	}
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+static int svm_ir_list_add(struct vcpu_svm *svm,
+			   struct kvm_kernel_irqfd *irqfd,
+			   struct amd_iommu_pi_data *pi)
 {
-	int ret = 0;
 	unsigned long flags;
-	struct amd_svm_iommu_ir *ir;
 	u64 entry;
 
 	if (WARN_ON_ONCE(!pi->ir_data))
@@ -811,25 +814,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
 		struct vcpu_svm *prev_svm;
 
-		if (!prev_vcpu) {
-			ret = -EINVAL;
-			goto out;
-		}
+		if (!prev_vcpu)
+			return -EINVAL;
 
 		prev_svm = to_svm(prev_vcpu);
-		svm_ir_list_del(prev_svm, pi);
+		svm_ir_list_del(prev_svm, irqfd, pi);
 	}
 
-	/**
-	 * Allocating new amd_iommu_pi_data, which will get
-	 * add to the per-vcpu ir_list.
-	 */
-	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
-	if (!ir) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ir->data = pi->ir_data;
+	irqfd->irq_bypass_data = pi->ir_data;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
@@ -844,10 +836,9 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
 				    true, pi->ir_data);
 
-	list_add(&ir->node, &svm->ir_list);
+	list_add(&irqfd->vcpu_list, &svm->ir_list);
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
-	return ret;
+	return 0;
 }
 
 /*
@@ -951,7 +942,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 			 * scheduling information in IOMMU irte.
 			 */
 			if (!ret && pi.is_guest_mode)
-				svm_ir_list_add(svm, &pi);
+				svm_ir_list_add(svm, irqfd, &pi);
 		}
 
 		if (!ret && svm) {
@@ -991,7 +982,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 
 			vcpu = kvm_get_vcpu_by_id(kvm, id);
 			if (vcpu)
-				svm_ir_list_del(to_svm(vcpu), &pi);
+				svm_ir_list_del(to_svm(vcpu), irqfd, &pi);
 		}
 	} else {
 		ret = 0;
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 8ad43692e3bb..6510a48e62aa 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -59,6 +59,9 @@ struct kvm_kernel_irqfd {
 	struct work_struct shutdown;
 	struct irq_bypass_consumer consumer;
 	struct irq_bypass_producer *producer;
+
+	struct list_head vcpu_list;
+	void *irq_bypass_data;
 };
 
 #endif /* __LINUX_KVM_IRQFD_H */
-- 
2.49.0.504.g3bcea36a83-goog
Re: [PATCH 09/67] KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure
Posted by Arun Kodilkar, Sairaj 8 months, 1 week ago
On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> Track the IRTEs that are posting to an SVM vCPU via the associated irqfd
> structure and GSI routing instead of dynamically allocating a separate
> data structure.  In addition to eliminating an atomic allocation, this
> will allow hoisting much of the IRTE update logic to common x86.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/svm/avic.c   | 49 ++++++++++++++++-----------------------
>   include/linux/kvm_irqfd.h |  3 +++
>   2 files changed, 23 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 04dfd898ea8d..967618ba743a 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -774,27 +774,30 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
>   	return ret;
>   }
>   
> -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
> +static void svm_ir_list_del(struct vcpu_svm *svm,
> +			    struct kvm_kernel_irqfd *irqfd,
> +			    struct amd_iommu_pi_data *pi)
>   {
>   	unsigned long flags;
> -	struct amd_svm_iommu_ir *cur;
> +	struct kvm_kernel_irqfd *cur;
>   
>   	spin_lock_irqsave(&svm->ir_list_lock, flags);
> -	list_for_each_entry(cur, &svm->ir_list, node) {
> -		if (cur->data != pi->ir_data)
> +	list_for_each_entry(cur, &svm->ir_list, vcpu_list) {
> +		if (cur->irq_bypass_data != pi->ir_data)
>   			continue;
> -		list_del(&cur->node);
> -		kfree(cur);
> +		if (WARN_ON_ONCE(cur != irqfd))
> +			continue;
> +		list_del(&irqfd->vcpu_list);
>   		break;
>   	}
>   	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
>   }
>   
> -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
> +static int svm_ir_list_add(struct vcpu_svm *svm,
> +			   struct kvm_kernel_irqfd *irqfd,
> +			   struct amd_iommu_pi_data *pi)
>   {
> -	int ret = 0;
>   	unsigned long flags;
> -	struct amd_svm_iommu_ir *ir;
>   	u64 entry;
>   
>   	if (WARN_ON_ONCE(!pi->ir_data))
> @@ -811,25 +814,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
>   		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
>   		struct vcpu_svm *prev_svm;
>   
> -		if (!prev_vcpu) {
> -			ret = -EINVAL;
> -			goto out;
> -		}
> +		if (!prev_vcpu)
> +			return -EINVAL;
>   
>   		prev_svm = to_svm(prev_vcpu);
> -		svm_ir_list_del(prev_svm, pi);
> +		svm_ir_list_del(prev_svm, irqfd, pi);
>   	}
>   
> -	/**
> -	 * Allocating new amd_iommu_pi_data, which will get
> -	 * add to the per-vcpu ir_list.
> -	 */
> -	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
> -	if (!ir) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -	ir->data = pi->ir_data;
> +	irqfd->irq_bypass_data = pi->ir_data;
>   
>   	spin_lock_irqsave(&svm->ir_list_lock, flags);
>   
> @@ -844,10 +836,9 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
>   		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
>   				    true, pi->ir_data);
>   
> -	list_add(&ir->node, &svm->ir_list);
> +	list_add(&irqfd->vcpu_list, &svm->ir_list);
>   	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
> -out:
> -	return ret;
> +	return 0;
>   }
>   
>   /*
> @@ -951,7 +942,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
>   			 * scheduling information in IOMMU irte.
>   			 */
>   			if (!ret && pi.is_guest_mode)
> -				svm_ir_list_add(svm, &pi);
> +				svm_ir_list_add(svm, irqfd, &pi);
>   		}
>   
>   		if (!ret && svm) {
> @@ -991,7 +982,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
>   
>   			vcpu = kvm_get_vcpu_by_id(kvm, id);
>   			if (vcpu)
> -				svm_ir_list_del(to_svm(vcpu), &pi);
> +				svm_ir_list_del(to_svm(vcpu), irqfd, &pi);
>   		}
>   	} else {
>   		ret = 0;
> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> index 8ad43692e3bb..6510a48e62aa 100644
> --- a/include/linux/kvm_irqfd.h
> +++ b/include/linux/kvm_irqfd.h
> @@ -59,6 +59,9 @@ struct kvm_kernel_irqfd {
>   	struct work_struct shutdown;
>   	struct irq_bypass_consumer consumer;
>   	struct irq_bypass_producer *producer;
> +
> +	struct list_head vcpu_list;
> +	void *irq_bypass_data;
>   };
>   
>   #endif /* __LINUX_KVM_IRQFD_H */

Hi Sean,
You missed updating the functions avic_set_pi_irte_mode() and
avic_update_iommu_vcpu_affinity(), which iterate over the ir_list.
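
Both would need to switch to the new entry type, e.g. the affinity walk
would become something along these lines (untested sketch, only using the
fields this patch adds and assuming the helper's existing cpu/r parameters):

	int ret;
	struct kvm_kernel_irqfd *irqfd;

	/* Walk the irqfds posting to this vCPU, not amd_svm_iommu_ir entries. */
	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
		ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data);
		if (ret)
			return ret;
	}
	return 0;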

Regards
Sairaj Kodilkar
Re: [PATCH 09/67] KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure
Posted by Sean Christopherson 8 months, 1 week ago
On Fri, Apr 11, 2025, Arun Kodilkar, Sairaj wrote:
> On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> > diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> > index 8ad43692e3bb..6510a48e62aa 100644
> > --- a/include/linux/kvm_irqfd.h
> > +++ b/include/linux/kvm_irqfd.h
> > @@ -59,6 +59,9 @@ struct kvm_kernel_irqfd {
> >   	struct work_struct shutdown;
> >   	struct irq_bypass_consumer consumer;
> >   	struct irq_bypass_producer *producer;
> > +
> > +	struct list_head vcpu_list;
> > +	void *irq_bypass_data;
> >   };
> >   #endif /* __LINUX_KVM_IRQFD_H */
> 
> Hi Sean,
> You missed updating the functions avic_set_pi_irte_mode() and
> avic_update_iommu_vcpu_affinity(), which iterate over the ir_list.

Well bugger, I did indeed.  And now I'm questioning my (hacky) testing, as I don't
see how avic_update_iommu_vcpu_affinity() survived.

Oh, wow.  This is disgustingly hilarious.  By dumb luck, the offset of the data
pointer relative to the list_head structure is the same in amd_svm_iommu_ir and
kvm_kernel_irqfd.  And the usage in avic_set_pi_irte_mode() and
avic_update_iommu_vcpu_affinity() only ever touches the data, not "svm".

So even though the structure is completely wrong, the math works out and
avic_set_pi_irte_mode() and avic_update_iommu_vcpu_affinity() unknowingly pass in
irq_bypass_data, and all is well.

struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
	struct vcpu_svm *svm;
};


struct kvm_kernel_irqfd {
	...

	struct kvm_vcpu *irq_bypass_vcpu;
	struct list_head vcpu_list;
	void *irq_bypass_data;
};
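
In offsetof() terms (purely illustrative, not something to actually add):

	/*
	 * The stale list_for_each_entry(ir, &svm->ir_list, node) treats the
	 * address of irqfd->vcpu_list as a struct amd_svm_iommu_ir (node is
	 * at offset 0), and ir->data then reads sizeof(struct list_head)
	 * bytes past the node, which lands exactly on irq_bypass_data.
	 */
	BUILD_BUG_ON(offsetof(struct amd_svm_iommu_ir, data) -
		     offsetof(struct amd_svm_iommu_ir, node) !=
		     offsetof(struct kvm_kernel_irqfd, irq_bypass_data) -
		     offsetof(struct kvm_kernel_irqfd, vcpu_list));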

Great catch!  And thanks for the reviews!