[PATCH 08/67] KVM: x86: Pass new routing entries and irqfd when updating IRTEs

Sean Christopherson posted 67 patches 8 months, 2 weeks ago
There is a newer version of this series
[PATCH 08/67] KVM: x86: Pass new routing entries and irqfd when updating IRTEs
Posted by Sean Christopherson 8 months, 2 weeks ago
When updating IRTEs in response to a GSI routing or IRQ bypass change,
pass the new/current routing information along with the associated irqfd.
This will allow KVM x86 to harden, simplify, and deduplicate its code.

Since adding/removing a bypass producer is now conveniently protected with
irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(),
use the routing information cached in the irqfd instead of looking up
the information in the current GSI routing tables.

Opportunistically convert an existing printk() to pr_info() and put its
string onto a single line (old code that strictly adhered to 80 chars).

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++--
 arch/x86/kvm/svm/avic.c         | 18 +++++++----------
 arch/x86/kvm/svm/svm.h          |  5 +++--
 arch/x86/kvm/vmx/posted_intr.c  | 19 ++++++++---------
 arch/x86/kvm/vmx/posted_intr.h  |  8 ++++++--
 arch/x86/kvm/x86.c              | 36 ++++++++++++++++++---------------
 include/linux/kvm_host.h        |  7 +++++--
 virt/kvm/eventfd.c              | 11 +++++-----
 8 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e8be274c089..54f3cf73329b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -294,6 +294,7 @@ enum x86_intercept_stage;
  */
 #define KVM_APIC_PV_EOI_PENDING	1
 
+struct kvm_kernel_irqfd;
 struct kvm_kernel_irq_routing_entry;
 
 /*
@@ -1828,8 +1829,9 @@ struct kvm_x86_ops {
 	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
 	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
 
-	int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
-			      uint32_t guest_irq, bool set);
+	int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			      unsigned int host_irq, uint32_t guest_irq,
+			      struct kvm_kernel_irq_routing_entry *new);
 	void (*pi_start_assignment)(struct kvm *kvm);
 	void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1708ea55125a..04dfd898ea8d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
 #include <linux/hashtable.h>
 #include <linux/amd-iommu.h>
 #include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
 
 #include <asm/irq_remapping.h>
 
@@ -885,21 +886,14 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 	return 0;
 }
 
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-			uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			unsigned int host_irq, uint32_t guest_irq,
+			struct kvm_kernel_irq_routing_entry *new)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
 	bool enable_remapped_mode = true;
+	bool set = !!new;
 	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
@@ -925,6 +919,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		if (e->type != KVM_IRQ_ROUTING_MSI)
 			continue;
 
+		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
+
 		/**
 		 * Here, we setup with legacy mode in the following cases:
 		 * 1. When cannot target interrupt to a specific vcpu.
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..294d5594c724 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -731,8 +731,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-			uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			unsigned int host_irq, uint32_t guest_irq,
+			struct kvm_kernel_irq_routing_entry *new);
 void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 78ba3d638fe8..1b6b655a2b8a 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
 
 #include <asm/irq_remapping.h>
 #include <asm/cpu.h>
@@ -259,17 +260,9 @@ void vmx_pi_start_assignment(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
 }
 
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_kernel_irq_routing_entry *new)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
@@ -277,6 +270,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
+	bool set = !!new;
 	int idx, ret = 0;
 
 	if (!vmx_can_use_vtd_pi(kvm))
@@ -294,6 +288,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 		if (e->type != KVM_IRQ_ROUTING_MSI)
 			continue;
+
+		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
+
 		/*
 		 * VT-d PI cannot support posting multicast/broadcast
 		 * interrupts to a vCPU, we still use interrupt remapping
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index ad9116a99bcc..a586d6aaf862 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,6 +3,9 @@
 #define __KVM_X86_VMX_POSTED_INTR_H
 
 #include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
 #include <asm/posted_intr.h>
 
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -10,8 +13,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
 void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_kernel_irq_routing_entry *new);
 void vmx_pi_start_assignment(struct kvm *kvm);
 
 static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dcc173852dc5..23376fcd928c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13570,31 +13570,31 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 	struct kvm *kvm = irqfd->kvm;
-	int ret;
+	int ret = 0;
 
 	kvm_arch_start_assignment(irqfd->kvm);
 
 	spin_lock_irq(&kvm->irqfds.lock);
 	irqfd->producer = prod;
 
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 1);
-	if (ret)
-		kvm_arch_end_assignment(irqfd->kvm);
-
+	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
+						   irqfd->gsi, &irqfd->irq_entry);
+		if (ret)
+			kvm_arch_end_assignment(irqfd->kvm);
+	}
 	spin_unlock_irq(&kvm->irqfds.lock);
 
-
 	return ret;
 }
 
 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 				      struct irq_bypass_producer *prod)
 {
-	int ret;
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 	struct kvm *kvm = irqfd->kvm;
+	int ret;
 
 	WARN_ON(irqfd->producer != prod);
 
@@ -13607,11 +13607,13 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	spin_lock_irq(&kvm->irqfds.lock);
 	irqfd->producer = NULL;
 
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 0);
-	if (ret)
-		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
-		       " fails: %d\n", irqfd->consumer.token, ret);
+	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
+						   irqfd->gsi, NULL);
+		if (ret)
+			pr_info("irq bypass consumer (token %p) unregistration fails: %d\n",
+				irqfd->consumer.token, ret);
+	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
 
@@ -13619,10 +13621,12 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	kvm_arch_end_assignment(irqfd->kvm);
 }
 
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				   uint32_t guest_irq, bool set)
+int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				  struct kvm_kernel_irq_routing_entry *old,
+				  struct kvm_kernel_irq_routing_entry *new)
 {
-	return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
+	return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq,
+					    irqfd->gsi, new);
 }
 
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5438a1b446a6..2d9f3aeb766a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2383,6 +2383,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+struct kvm_kernel_irqfd;
+
 bool kvm_arch_has_irq_bypass(void);
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
@@ -2390,8 +2392,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set);
+int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				  struct kvm_kernel_irq_routing_entry *old,
+				  struct kvm_kernel_irq_routing_entry *new);
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
 				  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 249ba5b72e9b..ad71e3e4d1c3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
 {
 }
 
-int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
-				struct kvm *kvm, unsigned int host_irq,
-				uint32_t guest_irq, bool set)
+int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+					 struct kvm_kernel_irq_routing_entry *old,
+					 struct kvm_kernel_irq_routing_entry *new)
 {
 	return 0;
 }
@@ -619,9 +619,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
 		if (irqfd->producer &&
 		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
-			int ret = kvm_arch_update_irqfd_routing(
-					irqfd->kvm, irqfd->producer->irq,
-					irqfd->gsi, 1);
+			int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
+
 			WARN_ON(ret);
 		}
 #endif
-- 
2.49.0.504.g3bcea36a83-goog
Re: [PATCH 08/67] KVM: x86: Pass new routing entries and irqfd when updating IRTEs
Posted by Arun Kodilkar, Sairaj 8 months, 1 week ago
On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> When updating IRTEs in response to a GSI routing or IRQ bypass change,
> pass the new/current routing information along with the associated irqfd.
> This will allow KVM x86 to harden, simplify, and deduplicate its code.
> 
> Since adding/removing a bypass producer is now conveniently protected with
> irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(),
> use the routing information cached in the irqfd instead of looking up
> the information in the current GSI routing tables.
> 
> Opportunistically convert an existing printk() to pr_info() and put its
> string onto a single line (old code that strictly adhered to 80 chars).
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h |  6 ++++--
>   arch/x86/kvm/svm/avic.c         | 18 +++++++----------
>   arch/x86/kvm/svm/svm.h          |  5 +++--
>   arch/x86/kvm/vmx/posted_intr.c  | 19 ++++++++---------
>   arch/x86/kvm/vmx/posted_intr.h  |  8 ++++++--
>   arch/x86/kvm/x86.c              | 36 ++++++++++++++++++---------------
>   include/linux/kvm_host.h        |  7 +++++--
>   virt/kvm/eventfd.c              | 11 +++++-----
>   8 files changed, 58 insertions(+), 52 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6e8be274c089..54f3cf73329b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -294,6 +294,7 @@ enum x86_intercept_stage;
>    */
>   #define KVM_APIC_PV_EOI_PENDING	1
>   
> +struct kvm_kernel_irqfd;
>   struct kvm_kernel_irq_routing_entry;
>   
>   /*
> @@ -1828,8 +1829,9 @@ struct kvm_x86_ops {
>   	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
>   	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
>   
> -	int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
> -			      uint32_t guest_irq, bool set);
> +	int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> +			      unsigned int host_irq, uint32_t guest_irq,
> +			      struct kvm_kernel_irq_routing_entry *new);
>   	void (*pi_start_assignment)(struct kvm *kvm);
>   	void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
>   	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 1708ea55125a..04dfd898ea8d 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -18,6 +18,7 @@
>   #include <linux/hashtable.h>
>   #include <linux/amd-iommu.h>
>   #include <linux/kvm_host.h>
> +#include <linux/kvm_irqfd.h>
>   
>   #include <asm/irq_remapping.h>
>   
> @@ -885,21 +886,14 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
>   	return 0;
>   }
>   
> -/*
> - * avic_pi_update_irte - set IRTE for Posted-Interrupts
> - *
> - * @kvm: kvm
> - * @host_irq: host irq of the interrupt
> - * @guest_irq: gsi of the interrupt
> - * @set: set or unset PI
> - * returns 0 on success, < 0 on failure
> - */
> -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> -			uint32_t guest_irq, bool set)
> +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> +			unsigned int host_irq, uint32_t guest_irq,
> +			struct kvm_kernel_irq_routing_entry *new)
>   {
>   	struct kvm_kernel_irq_routing_entry *e;
>   	struct kvm_irq_routing_table *irq_rt;
>   	bool enable_remapped_mode = true;
> +	bool set = !!new;
>   	int idx, ret = 0;
>   
>   	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
> @@ -925,6 +919,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   		if (e->type != KVM_IRQ_ROUTING_MSI)
>   			continue;
>   
> +		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
> +
> 

Hi Sean,

In kvm_irq_routing_update() function, its possible that there are
multiple entries in the `kvm_irq_routing_table`, and `irqfd_update()`
ends up setting up the new entry type to 0 instead of copying the entry.

if (n_entries == 1)
     irqfd->irq_entry = *e;
else
     irqfd->irq_entry.type = 0;

Since irqfd_update() did not copy the entry to irqfd->entries, the "new" 
will not match entry "e" obtained from irq_rt, which can trigger a false 
WARN_ON.

Let me know if I am missing something here.

Regards
Sairaj Kodilkar

>  		/**
>   		 * Here, we setup with legacy mode in the following cases:
>   		 * 1. When cannot target interrupt to a specific vcpu.
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index d4490eaed55d..294d5594c724 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -731,8 +731,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
>   void avic_vcpu_put(struct kvm_vcpu *vcpu);
>   void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
>   void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
> -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> -			uint32_t guest_irq, bool set);
> +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> +			unsigned int host_irq, uint32_t guest_irq,
> +			struct kvm_kernel_irq_routing_entry *new);
>   void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
>   void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
>   void avic_ring_doorbell(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> index 78ba3d638fe8..1b6b655a2b8a 100644
> --- a/arch/x86/kvm/vmx/posted_intr.c
> +++ b/arch/x86/kvm/vmx/posted_intr.c
> @@ -2,6 +2,7 @@
>   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>   
>   #include <linux/kvm_host.h>
> +#include <linux/kvm_irqfd.h>
>   
>   #include <asm/irq_remapping.h>
>   #include <asm/cpu.h>
> @@ -259,17 +260,9 @@ void vmx_pi_start_assignment(struct kvm *kvm)
>   	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
>   }
>   
> -/*
> - * vmx_pi_update_irte - set IRTE for Posted-Interrupts
> - *
> - * @kvm: kvm
> - * @host_irq: host irq of the interrupt
> - * @guest_irq: gsi of the interrupt
> - * @set: set or unset PI
> - * returns 0 on success, < 0 on failure
> - */
> -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> -		       uint32_t guest_irq, bool set)
> +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> +		       unsigned int host_irq, uint32_t guest_irq,
> +		       struct kvm_kernel_irq_routing_entry *new)
>   {
>   	struct kvm_kernel_irq_routing_entry *e;
>   	struct kvm_irq_routing_table *irq_rt;
> @@ -277,6 +270,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   	struct kvm_lapic_irq irq;
>   	struct kvm_vcpu *vcpu;
>   	struct vcpu_data vcpu_info;
> +	bool set = !!new;
>   	int idx, ret = 0;
>   
>   	if (!vmx_can_use_vtd_pi(kvm))
> @@ -294,6 +288,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>   	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
>   		if (e->type != KVM_IRQ_ROUTING_MSI)
>   			continue;
> +
> +		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
> +
>   		/*
>   		 * VT-d PI cannot support posting multicast/broadcast
>   		 * interrupts to a vCPU, we still use interrupt remapping
> diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
> index ad9116a99bcc..a586d6aaf862 100644
> --- a/arch/x86/kvm/vmx/posted_intr.h
> +++ b/arch/x86/kvm/vmx/posted_intr.h
> @@ -3,6 +3,9 @@
>   #define __KVM_X86_VMX_POSTED_INTR_H
>   
>   #include <linux/bitmap.h>
> +#include <linux/find.h>
> +#include <linux/kvm_host.h>
> +
>   #include <asm/posted_intr.h>
>   
>   void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
> @@ -10,8 +13,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
>   void pi_wakeup_handler(void);
>   void __init pi_init_cpu(int cpu);
>   bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
> -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> -		       uint32_t guest_irq, bool set);
> +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> +		       unsigned int host_irq, uint32_t guest_irq,
> +		       struct kvm_kernel_irq_routing_entry *new);
>   void vmx_pi_start_assignment(struct kvm *kvm);
>   
>   static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index dcc173852dc5..23376fcd928c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13570,31 +13570,31 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
>   	struct kvm_kernel_irqfd *irqfd =
>   		container_of(cons, struct kvm_kernel_irqfd, consumer);
>   	struct kvm *kvm = irqfd->kvm;
> -	int ret;
> +	int ret = 0;
>   
>   	kvm_arch_start_assignment(irqfd->kvm);
>   
>   	spin_lock_irq(&kvm->irqfds.lock);
>   	irqfd->producer = prod;
>   
> -	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
> -					   prod->irq, irqfd->gsi, 1);
> -	if (ret)
> -		kvm_arch_end_assignment(irqfd->kvm);
> -
> +	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
> +		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
> +						   irqfd->gsi, &irqfd->irq_entry);
> +		if (ret)
> +			kvm_arch_end_assignment(irqfd->kvm);
> +	}
>   	spin_unlock_irq(&kvm->irqfds.lock);
>   
> -
>   	return ret;
>   }
>   
>   void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
>   				      struct irq_bypass_producer *prod)
>   {
> -	int ret;
>   	struct kvm_kernel_irqfd *irqfd =
>   		container_of(cons, struct kvm_kernel_irqfd, consumer);
>   	struct kvm *kvm = irqfd->kvm;
> +	int ret;
>   
>   	WARN_ON(irqfd->producer != prod);
>   
> @@ -13607,11 +13607,13 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
>   	spin_lock_irq(&kvm->irqfds.lock);
>   	irqfd->producer = NULL;
>   
> -	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
> -					   prod->irq, irqfd->gsi, 0);
> -	if (ret)
> -		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
> -		       " fails: %d\n", irqfd->consumer.token, ret);
> +	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
> +		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
> +						   irqfd->gsi, NULL);
> +		if (ret)
> +			pr_info("irq bypass consumer (token %p) unregistration fails: %d\n",
> +				irqfd->consumer.token, ret);
> +	}
>   
>   	spin_unlock_irq(&kvm->irqfds.lock);
>   
> @@ -13619,10 +13621,12 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
>   	kvm_arch_end_assignment(irqfd->kvm);
>   }
>   
> -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
> -				   uint32_t guest_irq, bool set)
> +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
> +				  struct kvm_kernel_irq_routing_entry *old,
> +				  struct kvm_kernel_irq_routing_entry *new)
>   {
> -	return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
> +	return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq,
> +					    irqfd->gsi, new);
>   }
>   
>   bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5438a1b446a6..2d9f3aeb766a 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2383,6 +2383,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
>   struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
>   
>   #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
> +struct kvm_kernel_irqfd;
> +
>   bool kvm_arch_has_irq_bypass(void);
>   int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
>   			   struct irq_bypass_producer *);
> @@ -2390,8 +2392,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
>   			   struct irq_bypass_producer *);
>   void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
>   void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
> -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
> -				  uint32_t guest_irq, bool set);
> +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
> +				  struct kvm_kernel_irq_routing_entry *old,
> +				  struct kvm_kernel_irq_routing_entry *new);
>   bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
>   				  struct kvm_kernel_irq_routing_entry *);
>   #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 249ba5b72e9b..ad71e3e4d1c3 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
>   {
>   }
>   
> -int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
> -				struct kvm *kvm, unsigned int host_irq,
> -				uint32_t guest_irq, bool set)
> +int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
> +					 struct kvm_kernel_irq_routing_entry *old,
> +					 struct kvm_kernel_irq_routing_entry *new)
>   {
>   	return 0;
>   }
> @@ -619,9 +619,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
>   #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
>   		if (irqfd->producer &&
>   		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
> -			int ret = kvm_arch_update_irqfd_routing(
> -					irqfd->kvm, irqfd->producer->irq,
> -					irqfd->gsi, 1);
> +			int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
> +
>   			WARN_ON(ret);
>   		}
>   #endif
Re: [PATCH 08/67] KVM: x86: Pass new routing entries and irqfd when updating IRTEs
Posted by Sean Christopherson 8 months, 1 week ago
On Fri, Apr 11, 2025, Arun Kodilkar, Sairaj wrote:
> On 4/5/2025 1:08 AM, Sean Christopherson wrote:
> > +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
> > +			unsigned int host_irq, uint32_t guest_irq,
> > +			struct kvm_kernel_irq_routing_entry *new)
> >   {
> >   	struct kvm_kernel_irq_routing_entry *e;
> >   	struct kvm_irq_routing_table *irq_rt;
> >   	bool enable_remapped_mode = true;
> > +	bool set = !!new;
> >   	int idx, ret = 0;
> >   	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
> > @@ -925,6 +919,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
> >   		if (e->type != KVM_IRQ_ROUTING_MSI)
> >   			continue;
> > +		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
> > +
> > 
> 
> Hi Sean,
> 
> In kvm_irq_routing_update() function, its possible that there are
> multiple entries in the `kvm_irq_routing_table`,

Not if one of them is an MSI.  In setup_routing_entry():

	/*
	 * Do not allow GSI to be mapped to the same irqchip more than once.
	 * Allow only one to one mapping between GSI and non-irqchip routing.
	 */
	hlist_for_each_entry(ei, &rt->map[gsi], link)
		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
			return -EINVAL;

> and `irqfd_update()` ends up setting up the new entry type to 0 instead of
> copying the entry.
> 
> if (n_entries == 1)
>     irqfd->irq_entry = *e;
> else
>     irqfd->irq_entry.type = 0;
> 
> Since irqfd_update() did not copy the entry to irqfd->entries, the "new"
> will not match entry "e" obtained from irq_rt, which can trigger a false
> WARN_ON.

And since there can only be one MSI, if there are multiple routing entries, then
the WARN won't be reached thanks to the continue that's just above:

		if (e->type != KVM_IRQ_ROUTING_MSI)
			continue;
Re: [PATCH 08/67] KVM: x86: Pass new routing entries and irqfd when updating IRTEs
Posted by Sairaj Kodilkar 8 months, 1 week ago

On 4/11/2025 7:31 PM, Sean Christopherson wrote:
> On Fri, Apr 11, 2025, Arun Kodilkar, Sairaj wrote:
>> On 4/5/2025 1:08 AM, Sean Christopherson wrote:
>>> +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
>>> +			unsigned int host_irq, uint32_t guest_irq,
>>> +			struct kvm_kernel_irq_routing_entry *new)
>>>    {
>>>    	struct kvm_kernel_irq_routing_entry *e;
>>>    	struct kvm_irq_routing_table *irq_rt;
>>>    	bool enable_remapped_mode = true;
>>> +	bool set = !!new;
>>>    	int idx, ret = 0;
>>>    	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
>>> @@ -925,6 +919,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
>>>    		if (e->type != KVM_IRQ_ROUTING_MSI)
>>>    			continue;
>>> +		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
>>> +
>>>
>>
>> Hi Sean,
>>
>> In kvm_irq_routing_update() function, its possible that there are
>> multiple entries in the `kvm_irq_routing_table`,
> 
> Not if one of them is an MSI.  In setup_routing_entry():
> 
> 	/*
> 	 * Do not allow GSI to be mapped to the same irqchip more than once.
> 	 * Allow only one to one mapping between GSI and non-irqchip routing.
> 	 */
> 	hlist_for_each_entry(ei, &rt->map[gsi], link)
> 		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
> 		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
> 		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
> 			return -EINVAL;
> 
>> and `irqfd_update()` ends up setting up the new entry type to 0 instead of
>> copying the entry.
>>
>> if (n_entries == 1)
>>      irqfd->irq_entry = *e;
>> else
>>      irqfd->irq_entry.type = 0;
>>
>> Since irqfd_update() did not copy the entry to irqfd->entries, the "new"
>> will not match entry "e" obtained from irq_rt, which can trigger a false
>> WARN_ON.
> 
> And since there can only be one MSI, if there are multiple routing entries, then
> the WARN won't be reached thanks to the continue that's just above:
> 
> 		if (e->type != KVM_IRQ_ROUTING_MSI)
> 			continue;

Thanks.. I understand it now. I did not see complete code hence the 
confusion. Sorry about that.

Regards
Sairaj kodilkar