[PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts

Sean Christopherson posted 67 patches 8 months, 2 weeks ago
Posted by Sean Christopherson 8 months, 2 weeks ago
Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
not an IRTE is configured to generate GA log interrupts.  KVM only needs a
notification if the target vCPU is blocking, so the vCPU can be awakened.
If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
task is scheduled back in, i.e. KVM doesn't need a notification.

Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
from the KVM changes insofar as possible.

Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
so that it matches amd_iommu_activate_guest_mode().

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/irq_remapping.h |  1 +
 arch/x86/kvm/svm/avic.c              | 10 ++++++----
 drivers/iommu/amd/iommu.c            | 17 ++++++++++-------
 include/linux/amd-iommu.h            |  9 ++++-----
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 4c75a17632f6..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -36,6 +36,7 @@ struct amd_iommu_pi_data {
 	u32 ga_tag;
 	u32 vector;		/* Guest vector of the interrupt */
 	int cpu;
+	bool ga_log_intr;
 	bool is_guest_mode;
 	void *ir_data;
 };
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index c896f00f901c..1466e66cca6c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -794,10 +794,12 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
 		 */
 		entry = svm->avic_physical_id_entry;
-		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
 			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-		else
+		} else {
 			pi_data.cpu = -1;
+			pi_data.ga_log_intr = true;
+		}
 
 		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
 		if (ret)
@@ -837,9 +839,9 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
 
 	list_for_each_entry(ir, &svm->ir_list, node) {
 		if (!toggle_avic)
-			WARN_ON_ONCE(amd_iommu_update_ga(cpu, ir->data));
+			WARN_ON_ONCE(amd_iommu_update_ga(ir->data, cpu, true));
 		else if (cpu >= 0)
-			WARN_ON_ONCE(amd_iommu_activate_guest_mode(ir->data, cpu));
+			WARN_ON_ONCE(amd_iommu_activate_guest_mode(ir->data, cpu, true));
 		else
 			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(ir->data));
 	}
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2e016b98fa1b..27b03e718980 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3775,7 +3775,8 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
 	.deactivate = irq_remapping_deactivate,
 };
 
-static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
+static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
+				  bool ga_log_intr)
 {
 	if (cpu >= 0) {
 		entry->lo.fields_vapic.destination =
@@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
 		entry->hi.fields.destination =
 					APICID_TO_IRTE_DEST_HI(cpu);
 		entry->lo.fields_vapic.is_run = true;
+		entry->lo.fields_vapic.ga_log_intr = false;
 	} else {
 		entry->lo.fields_vapic.is_run = false;
+		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
 	}
 }
 
-int amd_iommu_update_ga(int cpu, void *data)
+int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -3802,14 +3805,14 @@ int amd_iommu_update_ga(int cpu, void *data)
 	if (!ir_data->iommu)
 		return -ENODEV;
 
-	__amd_iommu_update_ga(entry, cpu);
+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
 
 	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 				ir_data->irq_2_irte.index, entry);
 }
 EXPORT_SYMBOL(amd_iommu_update_ga);
 
-int amd_iommu_activate_guest_mode(void *data, int cpu)
+int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -3828,12 +3831,11 @@ int amd_iommu_activate_guest_mode(void *data, int cpu)
 
 	entry->lo.fields_vapic.valid       = valid;
 	entry->lo.fields_vapic.guest_mode  = 1;
-	entry->lo.fields_vapic.ga_log_intr = 1;
 	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
 	entry->hi.fields.vector            = ir_data->ga_vector;
 	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
 
-	__amd_iommu_update_ga(entry, cpu);
+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
 
 	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 			      ir_data->irq_2_irte.index, entry);
@@ -3904,7 +3906,8 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 		ir_data->ga_vector = pi_data->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
 		if (pi_data->is_guest_mode)
-			ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu);
+			ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
+							    pi_data->ga_log_intr);
 		else
 			ret = amd_iommu_deactivate_guest_mode(ir_data);
 	} else {
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index c9f2df0c4596..8cced632ecd0 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -30,9 +30,8 @@ static inline void amd_iommu_detect(void) { }
 /* IOMMU AVIC Function */
 extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
 
-extern int amd_iommu_update_ga(int cpu, void *data);
-
-extern int amd_iommu_activate_guest_mode(void *data, int cpu);
+extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr);
+extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr);
 extern int amd_iommu_deactivate_guest_mode(void *data);
 
 #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@@ -43,12 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 	return 0;
 }
 
-static inline int amd_iommu_update_ga(int cpu, void *data)
+static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
 
-static inline int amd_iommu_activate_guest_mode(void *data, int cpu)
+static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
-- 
2.49.0.504.g3bcea36a83-goog
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Joao Martins 8 months, 1 week ago
On 04/04/2025 20:39, Sean Christopherson wrote:
> Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
> not an IRTE is configured to generate GA log interrupts.  KVM only needs a
> notification if the target vCPU is blocking, so the vCPU can be awakened.
> If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
> set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
> task is scheduled back in, i.e. KVM doesn't need a notification.
> 
> Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
> from the KVM changes insofar as possible.
> 
> Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
> so that the match amd_iommu_activate_guest_mode().

Unfortunately I think this patch and the next one might be riding on the
assumption that amd_iommu_update_ga() is always cheap :( -- see below.

I didn't spot anything else flawed in the series though, just this one. I would
suggest holding off on this and the next one, while progressing with the rest of
the series.

> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 2e016b98fa1b..27b03e718980 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
> +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
> +				  bool ga_log_intr)
>  {
>  	if (cpu >= 0) {
>  		entry->lo.fields_vapic.destination =
> @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>  		entry->hi.fields.destination =
>  					APICID_TO_IRTE_DEST_HI(cpu);
>  		entry->lo.fields_vapic.is_run = true;
> +		entry->lo.fields_vapic.ga_log_intr = false;
>  	} else {
>  		entry->lo.fields_vapic.is_run = false;
> +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
>  	}
>  }
>

isRun, Destination and GATag are not cached. Quoting the update from a few years
back (page 93 of IOMMU spec dated Feb 2025):

| When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
| IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
| are not cached by the IOMMU. Modifications to these fields do not require an
| invalidation of the Interrupt Remapping Table.

This is the reason we were able to get rid of the IOMMU invalidation in
amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
Besides the lock contention that was observed at the time, we were seeing stalls
in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.

Now this change above is incorrect as is. To make it correct, you will need to
XOR with the previous content of IRTE::ga_log_intr, and if it changed, re-add
an invalidation command via iommu_flush_irt_and_complete(). The latter is what
I am worried will reintroduce the above problems :(

The invalidation command (which has a completion barrier to serialize
invalidation execution) takes some time in h/w, and will make all your vcpus
contend on the irq table lock (as is). Even assuming you somehow move the
invalidation outside the lock, you will contend on the iommu lock (for the
command queue), or in the best case, assuming no locks (which I am not sure is
possible), you will need to wait for the command to complete before you can
progress forward with entering/exiting.

Unless the GALogIntr bit is somehow also not cached, in which case the
invalidation command wouldn't be needed (which would be good news!). Adding
Suravee/Vasant here.

It's a nice trick how you leverage this in SVM, but do you have measurements
that corroborate its introduction? How many unnecessary GALog entries were you
able to avoid with this trick, say on a workload that exercises this (like a
netperf 1-byte RR test that sleeps and wakes up a lot)?

I should also mention that there's a different mechanism, an alternative to the
GA log (on Genoa or more recent), which is GAPPI support, whereby an entry is
generated under a rarer condition. Quoting excerpts below:

| This mode is enabled by setting MMIO Offset 0018h[GAPPIEn]=1. Under this
| mode, guest interrupts (IRTE[GuestMode]=1) update the vAPIC backing page IRR
| status as normal.

| In GAPPI mode, a GAPPI interrupt is generated if all of the guest IRR bits
| were previously clear prior to the last IRR update. This indicates the new
| interrupt is the first pending interrupt to the
| vCPU. The GAPPI interrupt is used to signal Hypervisor software to schedule
| one or more vCPUs to execute pending interrupts.

| Implementations may not be able to perfectly determine if all of the IRR bits
| were previously clear prior to updating the vAPIC backing page to set IRR.
| Spurious interrupts may be generated as a
| result. Software must be designed to handle this possibility

Page 99, "2.2.5.5 Guest APIC Physical Processor Interrupt",
https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Vasant Hegde 8 months ago
Hi Joao, Sean,


On 4/9/2025 5:26 PM, Joao Martins wrote:
> On 04/04/2025 20:39, Sean Christopherson wrote:
>> Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
>> not an IRTE is configured to generate GA log interrupts.  KVM only needs a
>> notification if the target vCPU is blocking, so the vCPU can be awakened.
>> If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
>> set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
>> task is scheduled back in, i.e. KVM doesn't need a notification.
>>
>> Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
>> from the KVM changes insofar as possible.
>>
>> Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
>> so that the match amd_iommu_activate_guest_mode().
> 
> Unfortunately I think this patch and the next one might be riding on the
> assumption that amd_iommu_update_ga() is always cheap :( -- see below.
> 
> I didn't spot anything else flawed in the series though, just this one. I would
> suggest holding off on this and the next one, while progressing with the rest of
> the series.
> 
>> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
>> index 2e016b98fa1b..27b03e718980 100644
>> --- a/drivers/iommu/amd/iommu.c
>> +++ b/drivers/iommu/amd/iommu.c
>> -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>> +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
>> +				  bool ga_log_intr)
>>  {
>>  	if (cpu >= 0) {
>>  		entry->lo.fields_vapic.destination =
>> @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>>  		entry->hi.fields.destination =
>>  					APICID_TO_IRTE_DEST_HI(cpu);
>>  		entry->lo.fields_vapic.is_run = true;
>> +		entry->lo.fields_vapic.ga_log_intr = false;
>>  	} else {
>>  		entry->lo.fields_vapic.is_run = false;
>> +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
>>  	}
>>  }
>>
> 
> isRun, Destination and GATag are not cached. Quoting the update from a few years
> back (page 93 of IOMMU spec dated Feb 2025):
> 
> | When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
> | IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
> | are not cached by the IOMMU. Modifications to these fields do not require an
> | invalidation of the Interrupt Remapping Table.
> 
> This is the reason we were able to get rid of the IOMMU invalidation in
> amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
> Besides the lock contention that was observed at the time, we were seeing stalls
> in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.
> 
> Now this change above is incorrect as is and to make it correct: you will need
> xor with the previous content of the IRTE::ga_log_intr and then if it changes
> then you re-add back an invalidation command via
> iommu_flush_irt_and_complete()). The latter is what I am worried will
> reintroduce these above problem :(
> 
> The invalidation command (which has a completion barrier to serialize
> invalidation execution) takes some time in h/w, and will make all your vcpus
> content on the irq table lock (as is). Even assuming you somehow move the
> invalidation outside the lock, you will content on the iommu lock (for the
> command queue) or best case assuming no locks (which I am not sure it is
> possible) you will need to wait for the command to complete until you can
> progress forward with entering/exiting.
> 
> Unless the GALogIntr bit is somehow also not cached too which wouldn't need the
> invalidation command (which would be good news!). Adding Suravee/Vasant here.

I have checked with HW architects. It's not cached, so we don't need an
invalidation after updating the GALogIntr field in the IRTE.


-Vasant
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Sean Christopherson 8 months ago
On Fri, Apr 18, 2025, Vasant Hegde wrote:
> On 4/9/2025 5:26 PM, Joao Martins wrote:
> > On 04/04/2025 20:39, Sean Christopherson wrote:
> >> Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
> >> not an IRTE is configured to generate GA log interrupts.  KVM only needs a
> >> notification if the target vCPU is blocking, so the vCPU can be awakened.
> >> If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
> >> set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
> >> task is scheduled back in, i.e. KVM doesn't need a notification.
> >>
> >> Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
> >> from the KVM changes insofar as possible.
> >>
> >> Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
> >> so that the match amd_iommu_activate_guest_mode().
> > 
> > Unfortunately I think this patch and the next one might be riding on the
> > assumption that amd_iommu_update_ga() is always cheap :( -- see below.
> > 
> > I didn't spot anything else flawed in the series though, just this one. I would
> > suggest holding off on this and the next one, while progressing with the rest of
> > the series.
> > 
> >> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> >> index 2e016b98fa1b..27b03e718980 100644
> >> --- a/drivers/iommu/amd/iommu.c
> >> +++ b/drivers/iommu/amd/iommu.c
> >> -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
> >> +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
> >> +				  bool ga_log_intr)
> >>  {
> >>  	if (cpu >= 0) {
> >>  		entry->lo.fields_vapic.destination =
> >> @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
> >>  		entry->hi.fields.destination =
> >>  					APICID_TO_IRTE_DEST_HI(cpu);
> >>  		entry->lo.fields_vapic.is_run = true;
> >> +		entry->lo.fields_vapic.ga_log_intr = false;
> >>  	} else {
> >>  		entry->lo.fields_vapic.is_run = false;
> >> +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
> >>  	}
> >>  }
> >>
> > 
> > isRun, Destination and GATag are not cached. Quoting the update from a few years
> > back (page 93 of IOMMU spec dated Feb 2025):
> > 
> > | When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
> > | IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
> > | are not cached by the IOMMU. Modifications to these fields do not require an
> > | invalidation of the Interrupt Remapping Table.
> > 
> > This is the reason we were able to get rid of the IOMMU invalidation in
> > amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
> > Besides the lock contention that was observed at the time, we were seeing stalls
> > in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.
> > 
> > Now this change above is incorrect as is and to make it correct: you will need
> > xor with the previous content of the IRTE::ga_log_intr and then if it changes
> > then you re-add back an invalidation command via
> > iommu_flush_irt_and_complete()). The latter is what I am worried will
> > reintroduce these above problem :(
> > 
> > The invalidation command (which has a completion barrier to serialize
> > invalidation execution) takes some time in h/w, and will make all your vcpus
> > content on the irq table lock (as is). Even assuming you somehow move the
> > invalidation outside the lock, you will content on the iommu lock (for the
> > command queue) or best case assuming no locks (which I am not sure it is
> > possible) you will need to wait for the command to complete until you can
> > progress forward with entering/exiting.
> > 
> > Unless the GALogIntr bit is somehow also not cached too which wouldn't need the
> > invalidation command (which would be good news!). Adding Suravee/Vasant here.
> 
> I have checked with HW architects. Its not cached. So we don't need invalidation
> after updating GALogIntr field in IRTE.

Woot!  Better to be lucky than good :-)
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Joao Martins 7 months, 3 weeks ago
On 18/04/2025 19:48, Sean Christopherson wrote:
> On Fri, Apr 18, 2025, Vasant Hegde wrote:
>> On 4/9/2025 5:26 PM, Joao Martins wrote:
>>> On 04/04/2025 20:39, Sean Christopherson wrote:
>>>> Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
>>>> not an IRTE is configured to generate GA log interrupts.  KVM only needs a
>>>> notification if the target vCPU is blocking, so the vCPU can be awakened.
>>>> If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
>>>> set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
>>>> task is scheduled back in, i.e. KVM doesn't need a notification.
>>>>
>>>> Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
>>>> from the KVM changes insofar as possible.
>>>>
>>>> Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
>>>> so that the match amd_iommu_activate_guest_mode().
>>>
>>> Unfortunately I think this patch and the next one might be riding on the
>>> assumption that amd_iommu_update_ga() is always cheap :( -- see below.
>>>
>>> I didn't spot anything else flawed in the series though, just this one. I would
>>> suggest holding off on this and the next one, while progressing with the rest of
>>> the series.
>>>
>>>> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
>>>> index 2e016b98fa1b..27b03e718980 100644
>>>> --- a/drivers/iommu/amd/iommu.c
>>>> +++ b/drivers/iommu/amd/iommu.c
>>>> -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>>>> +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
>>>> +				  bool ga_log_intr)
>>>>  {
>>>>  	if (cpu >= 0) {
>>>>  		entry->lo.fields_vapic.destination =
>>>> @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>>>>  		entry->hi.fields.destination =
>>>>  					APICID_TO_IRTE_DEST_HI(cpu);
>>>>  		entry->lo.fields_vapic.is_run = true;
>>>> +		entry->lo.fields_vapic.ga_log_intr = false;
>>>>  	} else {
>>>>  		entry->lo.fields_vapic.is_run = false;
>>>> +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
>>>>  	}
>>>>  }
>>>>
>>>
>>> isRun, Destination and GATag are not cached. Quoting the update from a few years
>>> back (page 93 of IOMMU spec dated Feb 2025):
>>>
>>> | When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
>>> | IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
>>> | are not cached by the IOMMU. Modifications to these fields do not require an
>>> | invalidation of the Interrupt Remapping Table.
>>>
>>> This is the reason we were able to get rid of the IOMMU invalidation in
>>> amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
>>> Besides the lock contention that was observed at the time, we were seeing stalls
>>> in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.
>>>
>>> Now this change above is incorrect as is and to make it correct: you will need
>>> xor with the previous content of the IRTE::ga_log_intr and then if it changes
>>> then you re-add back an invalidation command via
>>> iommu_flush_irt_and_complete()). The latter is what I am worried will
>>> reintroduce these above problem :(
>>>
>>> The invalidation command (which has a completion barrier to serialize
>>> invalidation execution) takes some time in h/w, and will make all your vcpus
>>> content on the irq table lock (as is). Even assuming you somehow move the
>>> invalidation outside the lock, you will content on the iommu lock (for the
>>> command queue) or best case assuming no locks (which I am not sure it is
>>> possible) you will need to wait for the command to complete until you can
>>> progress forward with entering/exiting.
>>>
>>> Unless the GALogIntr bit is somehow also not cached too which wouldn't need the
>>> invalidation command (which would be good news!). Adding Suravee/Vasant here.
>>
>> I have checked with HW architects. Its not cached. So we don't need invalidation
>> after updating GALogIntr field in IRTE.
> 
> Woot!  Better to be lucky than good :-)

Probably worth using this thread's Message ID as a Link: tag while the IOMMU
manual isn't yet up to date with this information. That usually takes a while
to be formalized.
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Sean Christopherson 8 months, 1 week ago
On Wed, Apr 09, 2025, Joao Martins wrote:
> On 04/04/2025 20:39, Sean Christopherson wrote:
> I would suggest holding off on this and the next one, while progressing with
> the rest of the series.

Agreed, though I think there's a "pure win" alternative that can be safely
implemented (but it definitely should be done separately).

If HLT-exiting is disabled for the VM, and the VM doesn't have access to the
various paravirtual features that can put it into a synthetic HLT state (PV async
#PF and/or Xen support), then I'm pretty sure GALogIntr can be disabled entirely,
i.e. disabled during the initial irq_set_vcpu_affinity() and never enabled.  KVM
doesn't emulate HLT via its full emulator for AMD (just non-unrestricted Intel
guests), so I'm pretty sure there would be no need for KVM to ever wake a vCPU in
response to a device interrupt.

> > diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> > index 2e016b98fa1b..27b03e718980 100644
> > --- a/drivers/iommu/amd/iommu.c
> > +++ b/drivers/iommu/amd/iommu.c
> > -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
> > +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
> > +				  bool ga_log_intr)
> >  {
> >  	if (cpu >= 0) {
> >  		entry->lo.fields_vapic.destination =
> > @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
> >  		entry->hi.fields.destination =
> >  					APICID_TO_IRTE_DEST_HI(cpu);
> >  		entry->lo.fields_vapic.is_run = true;
> > +		entry->lo.fields_vapic.ga_log_intr = false;
> >  	} else {
> >  		entry->lo.fields_vapic.is_run = false;
> > +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
> >  	}
> >  }
> >
> 
> isRun, Destination and GATag are not cached. Quoting the update from a few years
> back (page 93 of IOMMU spec dated Feb 2025):
> 
> | When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
> | IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
> | are not cached by the IOMMU. Modifications to these fields do not require an
> | invalidation of the Interrupt Remapping Table.

Ooh, that's super helpful info.  Any objections to me adding verbose comments to
explain the effective rules for amd_iommu_update_ga()?

> This is the reason we were able to get rid of the IOMMU invalidation in
> amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
> Besides the lock contention that was observed at the time, we were seeing stalls
> in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.
> 
> Now this change above is incorrect as is and to make it correct: you will need
> xor with the previous content of the IRTE::ga_log_intr and then if it changes
> then you re-add back an invalidation command via
> iommu_flush_irt_and_complete()). The latter is what I am worried will
> reintroduce these above problem :(

Ya, the need to flush definitely changes things.

> The invalidation command (which has a completion barrier to serialize
> invalidation execution) takes some time in h/w, and will make all your vcpus
> content on the irq table lock (as is). Even assuming you somehow move the
> invalidation outside the lock, you will content on the iommu lock (for the
> command queue) or best case assuming no locks (which I am not sure it is
> possible) you will need to wait for the command to complete until you can
> progress forward with entering/exiting.
> 
> Unless the GALogIntr bit is somehow also not cached too which wouldn't need the
> invalidation command (which would be good news!). Adding Suravee/Vasant here.
> 
> It's a nice trick how you would leverage this in SVM, but do you have
> measurements that corroborate its introduction? How many unnecessary GALog
> entries were you able to avoid with this trick say on a workload that would
> exercise this (like netperf 1byte RR test that sleeps and wakes up a lot) ?

I didn't do any measurements; I assumed the opportunistic toggling of GALogIntr
would be "free".

There might be optimizations that could be done, but I think the better solution
is to simply disable GALogIntr when it's not needed.  That'd limit the benefits
to select setups, but trying to optimize IRQ bypass for VMs that are CPU-overcommitted,
i.e. can't do native HLT, seems rather pointless.

> I should also mention that there's a different logic that is alternative to
> GALog (in Genoa or more recent), which is GAPPI support whereby an entry is
> generated but a more rare condition. Quoting the an excerpts below:
> 
> | This mode is enabled by setting MMIO Offset 0018h[GAPPIEn]=1. Under this
> | mode, guest interrupts (IRTE[GuestMode]=1) update the vAPIC backing page IRR
> | status as normal.
> 
> | In GAPPI mode, a GAPPI interrupt is generated if all of the guest IRR bits
> | were previously clear prior to the last IRR update. This indicates the new
> | interrupt is the first pending interrupt to the
> | vCPU. The GAPPI interrupt is used to signal Hypervisor software to schedule
> | one or more vCPUs to execute pending interrupts.
> 
> | Implementations may not be able to perfectly determine if all of the IRR bits
> | were previously clear prior to updating the vAPIC backing page to set IRR.
> | Spurious interrupts may be generated as a
> | result. Software must be designed to handle this possibility

I saw this as well.  I'm curious if enabling GAPPI mode affects IOMMU interrupt
delivery latency/throughput due to having to scrape the entire IRR.

> Page 99, "2.2.5.5 Guest APIC Physical Processor Interrupt",
> https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Joao Martins 8 months, 1 week ago
On 10/04/2025 16:45, Sean Christopherson wrote:
> On Wed, Apr 09, 2025, Joao Martins wrote:
>> On 04/04/2025 20:39, Sean Christopherson wrote:
>> I would suggest holding off on this and the next one, while progressing with
>> the rest of the series.
> 
> Agreed, though I think there's a "pure win" alternative that can be safely
> implemented (but it definitely should be done separately).
> 
> If HLT-exiting is disabled for the VM, and the VM doesn't have access to the
> various paravirtual features that can put it into a synthetic HLT state (PV async
> #PF and/or Xen support), then I'm pretty sure GALogIntr can be disabled entirely,
> i.e. disabled during the initial irq_set_vcpu_affinity() and never enabled.  KVM
> doesn't emulate HLT via its full emulator for AMD (just non-unrestricted Intel
> guests), so I'm pretty sure there would be no need for KVM to ever wake a vCPU in
> response to a device interrupt.
> 

IRQ affinity changes already rewrite a significant portion of the IRTE, and
that path is already a slow path that performs an invalidation, so doing this
via irq_set_vcpu_affinity() is definitely safe.

But even with HLT exits disabled, there's still preemption, though? I guess
that's rarer if it's conditional on whether HLT exiting is enabled and whether
there's only a single task running.

>>> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
>>> index 2e016b98fa1b..27b03e718980 100644
>>> --- a/drivers/iommu/amd/iommu.c
>>> +++ b/drivers/iommu/amd/iommu.c
>>> -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>>> +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
>>> +				  bool ga_log_intr)
>>>  {
>>>  	if (cpu >= 0) {
>>>  		entry->lo.fields_vapic.destination =
>>> @@ -3783,12 +3784,14 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
>>>  		entry->hi.fields.destination =
>>>  					APICID_TO_IRTE_DEST_HI(cpu);
>>>  		entry->lo.fields_vapic.is_run = true;
>>> +		entry->lo.fields_vapic.ga_log_intr = false;
>>>  	} else {
>>>  		entry->lo.fields_vapic.is_run = false;
>>> +		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
>>>  	}
>>>  }
>>>
>>
>> isRun, Destination and GATag are not cached. Quoting the update from a few years
>> back (page 93 of IOMMU spec dated Feb 2025):
>>
>> | When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and
>> | IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag],
>> | are not cached by the IOMMU. Modifications to these fields do not require an
>> | invalidation of the Interrupt Remapping Table.
> 
> Ooh, that's super helpful info.  Any objections to me adding verbose comments to
> explain the effective rules for amd_iommu_update_ga()?
> 
That's a great addition; it should have been there from the beginning, when we
added the caching behavior of guest-mode IRTEs into the mix.

>> This is the reason we were able to get rid of the IOMMU invalidation in
>> amd_iommu_update_ga() ... which sped up vmexit/vmenter flow with iommu avic.
>> Besides the lock contention that was observed at the time, we were seeing stalls
>> in this path with enough vCPUs IIRC; CCing Alejandro to keep me honest.
>>
>> Now, this change above is incorrect as is.  To make it correct, you would
>> need to XOR against the previous content of IRTE::ga_log_intr, and if it
>> changed, re-add an invalidation command via iommu_flush_irt_and_complete().
>> The latter is what I am worried will reintroduce the problems above :(
> 
> Ya, the need to flush definitely changes things.
> 
>> The invalidation command (which has a completion barrier to serialize
>> invalidation execution) takes some time in h/w, and will make all your vCPUs
>> contend on the irq table lock (as is).  Even assuming you somehow move the
>> invalidation outside the lock, you will contend on the iommu lock (for the
>> command queue), or, best case, assuming no locks (which I am not sure is
>> possible), you will need to wait for the command to complete before you can
>> progress with entering/exiting.
>>
>> Unless the GALogIntr bit is somehow also not cached too which wouldn't need the
>> invalidation command (which would be good news!). Adding Suravee/Vasant here.
>>
>> It's a nice trick how you would leverage this in SVM, but do you have
>> measurements that corroborate its introduction?  How many unnecessary GALog
>> entries were you able to avoid, say, on a workload that would exercise this
>> (like a netperf 1-byte RR test that sleeps and wakes up a lot)?
> 
> I didn't do any measurements; I assumed the opportunistic toggling of GALogIntr
> would be "free".
> 
> There might be optimizations that could be done, but I think the better solution
> is to simply disable GALogIntr when it's not needed.  That'd limit the benefits
> to select setups, but trying to optimize IRQ bypass for VMs that are CPU-overcommitted,
> i.e. can't do native HLT, seems rather pointless.
> 
/me nods
Re: [PATCH 64/67] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts
Posted by Sean Christopherson 8 months, 1 week ago
On Thu, Apr 10, 2025, Joao Martins wrote:
> On 10/04/2025 16:45, Sean Christopherson wrote:
> > On Wed, Apr 09, 2025, Joao Martins wrote:
> >> On 04/04/2025 20:39, Sean Christopherson wrote:
> >> I would suggest holding off on this and the next one, while progressing with
> >> the rest of the series.
> > 
> > Agreed, though I think there's a "pure win" alternative that can be safely
> > implemented (but it definitely should be done separately).
> > 
> > If HLT-exiting is disabled for the VM, and the VM doesn't have access to the
> > various paravirtual features that can put it into a synthetic HLT state (PV async
> > #PF and/or Xen support), then I'm pretty sure GALogIntr can be disabled entirely,
> > i.e. disabled during the initial irq_set_vcpu_affinity() and never enabled.  KVM
> > doesn't emulate HLT via its full emulator for AMD (just non-unrestricted Intel
> > guests), so I'm pretty sure there would be no need for KVM to ever wake a vCPU in
> > response to a device interrupt.
> > 
> 
> IRQ affinity changes already rewrite a significant portion of the IRTE, and
> that path is already a slowpath that performs an invalidation, so doing this
> via irq_set_vcpu_affinity() is definitely safe.
> 
> But even with HLT exits disabled, there's still preemption though?

Even with involuntary preemption (which would be nonsensical to pair with HLT
passthrough), KVM doesn't rely on the GALogIntr to schedule in the vCPU task.

The _only_ use of the notification is to wake the task and make it runnable.  If
the vCPU task is already runnable, when and where the task is run is fully
controlled by the scheduler (and/or userspace).

> But I guess that's rarer if it's conditional on HLT exiting being enabled,
> and on whether there's only a single task running.