KVM allows VMMs to specify the maximum possible APIC ID for a virtual
machine through KVM_CAP_MAX_VCPU_ID capability so as to limit data
structures related to APIC/x2APIC. Utilize the same to set the AVIC
physical max index in the VMCB, similar to VMX. This helps hardware
limit the number of entries to be scanned in the physical APIC ID table
speeding up IPI broadcasts for virtual machines with smaller number of
vcpus.
The minimum allocation required for the Physical APIC ID table is one 4k
page supporting up to 512 entries. With AVIC support for 4096 vcpus
though, it is sufficient to only allocate memory to accommodate the
AVIC physical max index that will be programmed into the VMCB. Limit
memory allocated for the Physical APIC ID table accordingly.
Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
---
arch/x86/kvm/svm/avic.c | 53 ++++++++++++++++++++++++++++++-----------
arch/x86/kvm/svm/svm.c | 6 +++++
arch/x86/kvm/svm/svm.h | 1 +
3 files changed, 46 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1fb322d2ac18..dac4a6648919 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -85,6 +85,17 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
+static inline u32 avic_get_max_physical_id(struct kvm *kvm, bool is_x2apic)
+{
+ u32 avic_max_physical_id = is_x2apic ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
+
+ /*
+ * Assume vcpu_id is the same as APIC ID. Per KVM_CAP_MAX_VCPU_ID, max_vcpu_ids
+ * represents the max APIC ID for this vm, rather than the max vcpus.
+ */
+ return min(kvm->arch.max_vcpu_ids - 1, avic_max_physical_id);
+}
+
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
- vmcb->control.avic_physical_id |= x2avic_max_physical_id;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
@@ -114,7 +125,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/* For xAVIC and hybrid-xAVIC modes */
- vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, false);
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
@@ -174,6 +185,12 @@ int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
+static inline int avic_get_physical_id_table_order(struct kvm *kvm)
+{
+ /* Limit to the maximum physical ID supported in x2avic mode */
+ return get_order((avic_get_max_physical_id(kvm, true) + 1) * sizeof(u64));
+}
+
void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -186,7 +203,7 @@ void avic_vm_destroy(struct kvm *kvm)
__free_page(kvm_svm->avic_logical_id_table_page);
if (kvm_svm->avic_physical_id_table_page)
__free_pages(kvm_svm->avic_physical_id_table_page,
- get_order(sizeof(u64) * (x2avic_max_physical_id + 1)));
+ avic_get_physical_id_table_order(kvm));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -199,22 +216,12 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
struct page *l_page;
- u32 vm_id, entries;
+ u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table */
- entries = x2avic_max_physical_id + 1;
- p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- get_order(sizeof(u64) * entries));
- if (!p_page)
- goto free_avic;
-
- kvm_svm->avic_physical_id_table_page = p_page;
-
/* Allocating logical APIC ID table (4KB) */
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
@@ -265,6 +272,24 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
+int avic_alloc_physical_id_table(struct kvm *kvm)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct page *p_page;
+
+ if (kvm_svm->avic_physical_id_table_page || !enable_apicv || !irqchip_in_kernel(kvm))
+ return 0;
+
+ p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ avic_get_physical_id_table_order(kvm));
+ if (!p_page)
+ return -ENOMEM;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+
+ return 0;
+}
+
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
unsigned int index)
{
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b8aa0f36850f..3cb23298cdc3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1423,6 +1423,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
+static int svm_vcpu_precreate(struct kvm *kvm)
+{
+ return avic_alloc_physical_id_table(kvm);
+}
+
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
@@ -5007,6 +5012,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
+ .vcpu_precreate = svm_vcpu_precreate,
.vcpu_create = svm_vcpu_create,
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5b159f017055..b4670afe0034 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -694,6 +694,7 @@ bool avic_hardware_setup(void);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
+int avic_alloc_physical_id_table(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
--
2.48.1
On Thu, Feb 20, 2025, Naveen N Rao (AMD) wrote:
> KVM allows VMMs to specify the maximum possible APIC ID for a virtual
> machine through KVM_CAP_MAX_VCPU_ID capability so as to limit data
> structures related to APIC/x2APIC. Utilize the same to set the AVIC
> physical max index in the VMCB, similar to VMX. This helps hardware
> limit the number of entries to be scanned in the physical APIC ID table
> speeding up IPI broadcasts for virtual machines with smaller number of
> vcpus.
>
> The minimum allocation required for the Physical APIC ID table is one 4k
> page supporting up to 512 entries. With AVIC support for 4096 vcpus
> though, it is sufficient to only allocate memory to accommodate the
> AVIC physical max index that will be programmed into the VMCB. Limit
> memory allocated for the Physical APIC ID table accordingly.
Can you flip the order of the patches? This seems like an easy "win" for
performance, and so I can see people wanting to backport this to random kernels
even if they don't care about running 4k vCPUs.
Speaking of which, is there a measurable performance win?
> Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
> ---
> arch/x86/kvm/svm/avic.c | 53 ++++++++++++++++++++++++++++++-----------
> arch/x86/kvm/svm/svm.c | 6 +++++
> arch/x86/kvm/svm/svm.h | 1 +
> 3 files changed, 46 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 1fb322d2ac18..dac4a6648919 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -85,6 +85,17 @@ struct amd_svm_iommu_ir {
> void *data; /* Storing pointer to struct amd_ir_data */
> };
>
> +static inline u32 avic_get_max_physical_id(struct kvm *kvm, bool is_x2apic)
Formletter incoming...
Do not use "inline" for functions that are visible only to the local compilation
unit. "inline" is just a hint, and modern compilers are smart enough to inline
functions when appropriate without a hint.
A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@google.com
> +{
> + u32 avic_max_physical_id = is_x2apic ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
Don't use a super long local variable. For a helper like this, it's unnecessary,
e.g. if the reader can't understand what arch_max or max_id is, then spelling it
out entirely probably won't help them.
And practically, there's a danger to using long names like this: you're much more
likely to unintentionally "shadow" a global variable. Functionally, it won't be
a problem, but it can create confusion. E.g. if we ever added a global
avic_max_physical_id, then this code would get rather confusing.
> +
> + /*
> + * Assume vcpu_id is the same as APIC ID. Per KVM_CAP_MAX_VCPU_ID, max_vcpu_ids
> + * represents the max APIC ID for this vm, rather than the max vcpus.
> + */
> + return min(kvm->arch.max_vcpu_ids - 1, avic_max_physical_id);
> +}
> +
> static void avic_activate_vmcb(struct vcpu_svm *svm)
> {
> struct vmcb *vmcb = svm->vmcb01.ptr;
> @@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> */
> if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
> vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> - vmcb->control.avic_physical_id |= x2avic_max_physical_id;
> + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
Don't pass hardcoded booleans when it is at all possible to do something else.
For this case, I would either do:
static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
{
u32 arch_max;
if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic))
arch_max = x2avic_max_physical_id;
else
arch_max = AVIC_MAX_PHYSICAL_ID;
return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max);
}
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
* (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
* AVIC in hybrid mode activates only the doorbell mechanism.
*/
if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
}
or
static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu, u32 arch_max)
{
return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max);
}
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 max_id;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
* (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
* AVIC in hybrid mode activates only the doorbell mechanism.
*/
if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
max_id = avic_get_max_physical_id(vcpu, x2avic_max_physical_id);
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
max_id = avic_get_max_physical_id(vcpu, AVIC_MAX_PHYSICAL_ID);
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.avic_physical_id |= max_id;
}
I don't think I have a preference between the two?
> /* Disabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, false);
> } else {
> @@ -114,7 +125,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
>
> /* For xAVIC and hybrid-xAVIC modes */
> - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
> + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, false);
> /* Enabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, true);
> }
> @@ -174,6 +185,12 @@ int avic_ga_log_notifier(u32 ga_tag)
> return 0;
> }
>
> +static inline int avic_get_physical_id_table_order(struct kvm *kvm)
Heh, we got there eventually ;-)
> +{
> + /* Limit to the maximum physical ID supported in x2avic mode */
> + return get_order((avic_get_max_physical_id(kvm, true) + 1) * sizeof(u64));
> +}
> +
> void avic_vm_destroy(struct kvm *kvm)
> {
> unsigned long flags;
> @@ -186,7 +203,7 @@ void avic_vm_destroy(struct kvm *kvm)
> __free_page(kvm_svm->avic_logical_id_table_page);
> if (kvm_svm->avic_physical_id_table_page)
> __free_pages(kvm_svm->avic_physical_id_table_page,
> - get_order(sizeof(u64) * (x2avic_max_physical_id + 1)));
> + avic_get_physical_id_table_order(kvm));
>
> spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
> hash_del(&kvm_svm->hnode);
> @@ -199,22 +216,12 @@ int avic_vm_init(struct kvm *kvm)
> int err = -ENOMEM;
> struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
> struct kvm_svm *k2;
> - struct page *p_page;
> struct page *l_page;
> - u32 vm_id, entries;
> + u32 vm_id;
>
> if (!enable_apicv)
> return 0;
>
> - /* Allocating physical APIC ID table */
> - entries = x2avic_max_physical_id + 1;
> - p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
> - get_order(sizeof(u64) * entries));
> - if (!p_page)
> - goto free_avic;
> -
> - kvm_svm->avic_physical_id_table_page = p_page;
> -
> /* Allocating logical APIC ID table (4KB) */
> l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> if (!l_page)
> @@ -265,6 +272,24 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
> avic_deactivate_vmcb(svm);
> }
>
> +int avic_alloc_physical_id_table(struct kvm *kvm)
> +{
> + struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
> + struct page *p_page;
> +
> + if (kvm_svm->avic_physical_id_table_page || !enable_apicv || !irqchip_in_kernel(kvm))
> + return 0;
> +
> + p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
> + avic_get_physical_id_table_order(kvm));
> + if (!p_page)
> + return -ENOMEM;
> +
> + kvm_svm->avic_physical_id_table_page = p_page;
> +
> + return 0;
> +}
> +
> static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
> unsigned int index)
> {
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b8aa0f36850f..3cb23298cdc3 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1423,6 +1423,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
> svm->vmcb = target_vmcb->ptr;
> }
>
> +static int svm_vcpu_precreate(struct kvm *kvm)
> +{
> + return avic_alloc_physical_id_table(kvm);
Why is allocation being moved to svm_vcpu_precreate()?
On Mon, Jun 23, 2025 at 05:38:23PM -0700, Sean Christopherson wrote:
> On Thu, Feb 20, 2025, Naveen N Rao (AMD) wrote:
> > KVM allows VMMs to specify the maximum possible APIC ID for a virtual
> > machine through KVM_CAP_MAX_VCPU_ID capability so as to limit data
> > structures related to APIC/x2APIC. Utilize the same to set the AVIC
> > physical max index in the VMCB, similar to VMX. This helps hardware
> > limit the number of entries to be scanned in the physical APIC ID table
> > speeding up IPI broadcasts for virtual machines with smaller number of
> > vcpus.
> >
> > The minimum allocation required for the Physical APIC ID table is one 4k
> > page supporting up to 512 entries. With AVIC support for 4096 vcpus
> > though, it is sufficient to only allocate memory to accommodate the
> > AVIC physical max index that will be programmed into the VMCB. Limit
> > memory allocated for the Physical APIC ID table accordingly.
>
> Can you flip the order of the patches? This seems like an easy "win" for
> performance, and so I can see people wanting to backport this to random kernels
> even if they don't care about running 4k vCPUs.
>
> Speaking of which, is there a measurable performance win?
That was my first thought. But for VMs up to 512 vCPUs, I didn't see much
of a performance difference with broadcast IPIs at all. But, I guess it
shouldn't hurt, so I will prep a smaller patch that can go before the 4k
vCPU support patch.
With 4k vCPU support enabled, yes, this makes a lot of difference. IIRC,
ipi-bench for broadcast IPIs went from ~10-15 seconds down to 3 seconds.
>
> > Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
> > ---
> > arch/x86/kvm/svm/avic.c | 53 ++++++++++++++++++++++++++++++-----------
> > arch/x86/kvm/svm/svm.c | 6 +++++
> > arch/x86/kvm/svm/svm.h | 1 +
> > 3 files changed, 46 insertions(+), 14 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> > index 1fb322d2ac18..dac4a6648919 100644
> > --- a/arch/x86/kvm/svm/avic.c
> > +++ b/arch/x86/kvm/svm/avic.c
> > @@ -85,6 +85,17 @@ struct amd_svm_iommu_ir {
> > void *data; /* Storing pointer to struct amd_ir_data */
> > };
> >
> > +static inline u32 avic_get_max_physical_id(struct kvm *kvm, bool is_x2apic)
>
> Formletter incoming...
>
> Do not use "inline" for functions that are visible only to the local compilation
> unit. "inline" is just a hint, and modern compilers are smart enough to inline
> functions when appropriate without a hint.
>
> A longer explanation/rant here: https://lore.kernel.org/all/ZAdfX+S323JVWNZC@google.com
Ack.
>
> > +{
> > + u32 avic_max_physical_id = is_x2apic ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
>
> Don't use a super long local variable. For a helper like this, it's unnecessary,
> e.g. if the reader can't understand what arch_max or max_id is, then spelling it
> out entirely probably won't help them.
>
> And practically, there's a danger to using long names like this: you're much more
> likely to unintentionally "shadow" a global variable. Functionally, it won't be
> a problem, but it can create confusion. E.g. if we ever added a global
> avic_max_physical_id, then this code would get rather confusing.
Sure, makes sense.
>
> > +
> > + /*
> > + * Assume vcpu_id is the same as APIC ID. Per KVM_CAP_MAX_VCPU_ID, max_vcpu_ids
> > + * represents the max APIC ID for this vm, rather than the max vcpus.
> > + */
> > + return min(kvm->arch.max_vcpu_ids - 1, avic_max_physical_id);
> > +}
> > +
> > static void avic_activate_vmcb(struct vcpu_svm *svm)
> > {
> > struct vmcb *vmcb = svm->vmcb01.ptr;
> > @@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> > */
> > if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
> > vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> > - vmcb->control.avic_physical_id |= x2avic_max_physical_id;
> > + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
>
> Don't pass hardcoded booleans when it is at all possible to do something else.
> For this case, I would either do:
>
> static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
> {
> u32 arch_max;
>
> if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic))
This won't work since we want to use this during vCPU init and at that
point, I don't think we can rely on the vCPU x2APIC mode to decide the
size of the AVIC physical ID table.
> arch_max = x2avic_max_physical_id;
> else
> arch_max = AVIC_MAX_PHYSICAL_ID;
>
> return min(kvm->arch.max_vcpu_ids - 1, arch_max);
> }
>
<snip>
>
> static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu, u32 arch_max)
> {
> return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max);
> }
>
> static void avic_activate_vmcb(struct vcpu_svm *svm)
> {
> struct vmcb *vmcb = svm->vmcb01.ptr;
> struct kvm_vcpu *vcpu = &svm->vcpu;
> u32 max_id;
>
> vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
> vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
>
> /*
> * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
> * accesses, while interrupt injection to a running vCPU can be
> * achieved using AVIC doorbell. KVM disables the APIC access page
> * (deletes the memslot) if any vCPU has x2APIC enabled, thus
> enabling
> * AVIC in hybrid mode activates only the doorbell mechanism.
> */
> if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) {
> vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> max_id = avic_get_max_physical_id(vcpu, x2avic_max_physical_id);
>
> /* Disabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, false);
> } else {
> max_id = avic_get_max_physical_id(vcpu,
> AVIC_MAX_PHYSICAL_ID);
> /*
> * Flush the TLB, the guest may have inserted a non-APIC
> * mapping into the TLB while AVIC was disabled.
> */
> kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
>
> /* Enabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, true);
> }
>
> vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
> vmcb->control.avic_physical_id |= max_id;
> }
>
>
> I don't think I have a preference between the two?
I'm thinking of limiting the helper to just x2AVIC mode, since the
x1AVIC use is limited to a single place. Let me see what I can come up
with.
> > +static int svm_vcpu_precreate(struct kvm *kvm)
> > +{
> > + return avic_alloc_physical_id_table(kvm);
>
> Why is allocation being moved to svm_vcpu_precreate()?
This is because we want KVM_CAP_MAX_VCPU_ID to have been invoked by the
VMM, and that is guaranteed to be set by the time the first vCPU is
created. We restrict the AVIC physical ID table based on the maximum
number of vCPUs set by the VMM.
This mirrors how Intel VMX uses this capability. I will call this out
explicitly in the commit log.
Thanks,
Naveen
On Fri, Jul 18, 2025, Naveen N Rao wrote:
> On Mon, Jun 23, 2025 at 05:38:23PM -0700, Sean Christopherson wrote:
> > > @@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> > > */
> > > if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
> > > vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> > > - vmcb->control.avic_physical_id |= x2avic_max_physical_id;
> > > + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
> >
> > Don't pass hardcoded booleans when it is at all possible to do something else.
> > For this case, I would either do:
> >
> > static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
> > {
> > u32 arch_max;
> >
> > if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic))
>
> This won't work since we want to use this during vCPU init and at that
> point, I don't think we can rely on the vCPU x2APIC mode to decide the
> size of the AVIC physical ID table.
Ah, I missed that this is used by avic_get_physical_id_table_order(). How about
this?
static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
u32 arch_max;
if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
arch_max = x2avic_max_physical_id;
else
arch_max = AVIC_MAX_PHYSICAL_ID;
return min(kvm->arch.max_vcpu_ids - 1, arch_max);
}
static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
{
return __avic_get_max_physical_id(vcpu->kvm, vcpu);
}
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
* (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
* AVIC in hybrid mode activates only the doorbell mechanism.
*/
if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
/* Enabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, true);
}
}
static int avic_get_physical_id_table_order(struct kvm *kvm)
{
/* Limit to the maximum physical ID supported in x2avic mode */
return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
}
> > > +static int svm_vcpu_precreate(struct kvm *kvm)
> > > +{
> > > + return avic_alloc_physical_id_table(kvm);
> >
> > Why is allocation being moved to svm_vcpu_precreate()?
>
> This is because we want KVM_CAP_MAX_VCPU_ID to have been invoked by the
> VMM, and that is guaranteed to be set by the time the first vCPU is
> created. We restrict the AVIC physical ID table based on the maximum
> number of vCPUs set by the VMM.
>
> This mirrors how Intel VMX uses this capability. I will call this out
> explicitly in the commit log.
Do it as a separate patch. Then you pretty much *have* to write a changelog,
and the changes that are specific to 4k support are even more isolated.
On Fri, Jul 18, 2025 at 08:17:02AM -0700, Sean Christopherson wrote:
> On Fri, Jul 18, 2025, Naveen N Rao wrote:
> > On Mon, Jun 23, 2025 at 05:38:23PM -0700, Sean Christopherson wrote:
> > > > @@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> > > > */
> > > > if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
> > > > vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> > > > - vmcb->control.avic_physical_id |= x2avic_max_physical_id;
> > > > + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
> > >
> > > Don't pass hardcoded booleans when it is at all possible to do something else.
> > > For this case, I would either do:
> > >
> > > static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
> > > {
> > > u32 arch_max;
> > >
> > > if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic))
> >
> > This won't work since we want to use this during vCPU init and at that
> > point, I don't think we can rely on the vCPU x2APIC mode to decide the
> > size of the AVIC physical ID table.
>
> Ah, I missed that this is used by avic_get_physical_id_table_order(). How about
> this?
>
> static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
> {
> u32 arch_max;
>
> if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
> arch_max = x2avic_max_physical_id;
> else
> arch_max = AVIC_MAX_PHYSICAL_ID;
>
> return min(kvm->arch.max_vcpu_ids - 1, arch_max);
> }
>
> static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
> {
> return __avic_get_max_physical_id(vcpu->kvm, vcpu);
> }
>
> static void avic_activate_vmcb(struct vcpu_svm *svm)
> {
> struct vmcb *vmcb = svm->vmcb01.ptr;
> struct kvm_vcpu *vcpu = &svm->vcpu;
>
> vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
>
> vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
> vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
>
> vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
>
> /*
> * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
> * accesses, while interrupt injection to a running vCPU can be
> * achieved using AVIC doorbell. KVM disables the APIC access page
> * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
> * AVIC in hybrid mode activates only the doorbell mechanism.
> */
> if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) {
> vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> /* Disabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, false);
> } else {
> /*
> * Flush the TLB, the guest may have inserted a non-APIC
> * mapping into the TLB while AVIC was disabled.
> */
> kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
>
> /* Enabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, true);
> }
> }
>
> static int avic_get_physical_id_table_order(struct kvm *kvm)
> {
> /* Limit to the maximum physical ID supported in x2avic mode */
> return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
> }
This LGTM, I will incorporate this.
>
> > > > +static int svm_vcpu_precreate(struct kvm *kvm)
> > > > +{
> > > > + return avic_alloc_physical_id_table(kvm);
> > >
> > > Why is allocation being moved to svm_vcpu_precreate()?
> >
> > This is because we want KVM_CAP_MAX_VCPU_ID to have been invoked by the
> > VMM, and that is guaranteed to be set by the time the first vCPU is
> > created. We restrict the AVIC physical ID table based on the maximum
> > number of vCPUs set by the VMM.
> >
> > This mirrors how Intel VMX uses this capability. I will call this out
> > explicitly in the commit log.
>
> Do it as a separate patch. Then you pretty much *have* to write a changelog,
> and the changes that are specific to 4k support are even more isolated.
Sure.
Thanks,
Naveen
On 2/20/2025 8:38 AM, Naveen N Rao (AMD) wrote:
> KVM allows VMMs to specify the maximum possible APIC ID for a virtual
> machine through KVM_CAP_MAX_VCPU_ID capability so as to limit data
> structures related to APIC/x2APIC. Utilize the same to set the AVIC
> physical max index in the VMCB, similar to VMX. This helps hardware
> limit the number of entries to be scanned in the physical APIC ID table
> speeding up IPI broadcasts for virtual machines with smaller number of
> vcpus.
>
> The minimum allocation required for the Physical APIC ID table is one 4k
> page supporting up to 512 entries. With AVIC support for 4096 vcpus
> though, it is sufficient to only allocate memory to accommodate the
> AVIC physical max index that will be programmed into the VMCB. Limit
> memory allocated for the Physical APIC ID table accordingly.
>
> Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
Looks good to me. Liked the optimal solution.
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
> ---
> arch/x86/kvm/svm/avic.c | 53 ++++++++++++++++++++++++++++++-----------
> arch/x86/kvm/svm/svm.c | 6 +++++
> arch/x86/kvm/svm/svm.h | 1 +
> 3 files changed, 46 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> index 1fb322d2ac18..dac4a6648919 100644
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
> @@ -85,6 +85,17 @@ struct amd_svm_iommu_ir {
> void *data; /* Storing pointer to struct amd_ir_data */
> };
>
> +static inline u32 avic_get_max_physical_id(struct kvm *kvm, bool is_x2apic)
> +{
> + u32 avic_max_physical_id = is_x2apic ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
> +
> + /*
> + * Assume vcpu_id is the same as APIC ID. Per KVM_CAP_MAX_VCPU_ID, max_vcpu_ids
> + * represents the max APIC ID for this vm, rather than the max vcpus.
> + */
> + return min(kvm->arch.max_vcpu_ids - 1, avic_max_physical_id);
> +}
> +
> static void avic_activate_vmcb(struct vcpu_svm *svm)
> {
> struct vmcb *vmcb = svm->vmcb01.ptr;
> @@ -103,7 +114,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> */
> if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
> vmcb->control.int_ctl |= X2APIC_MODE_MASK;
> - vmcb->control.avic_physical_id |= x2avic_max_physical_id;
> + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, true);
> /* Disabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, false);
> } else {
> @@ -114,7 +125,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
> kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
>
> /* For xAVIC and hybrid-xAVIC modes */
> - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
> + vmcb->control.avic_physical_id |= avic_get_max_physical_id(svm->vcpu.kvm, false);
> /* Enabling MSR intercept for x2APIC registers */
> svm_set_x2apic_msr_interception(svm, true);
> }
> @@ -174,6 +185,12 @@ int avic_ga_log_notifier(u32 ga_tag)
> return 0;
> }
>
> +static inline int avic_get_physical_id_table_order(struct kvm *kvm)
> +{
> + /* Limit to the maximum physical ID supported in x2avic mode */
> + return get_order((avic_get_max_physical_id(kvm, true) + 1) * sizeof(u64));
> +}
> +
> void avic_vm_destroy(struct kvm *kvm)
> {
> unsigned long flags;
> @@ -186,7 +203,7 @@ void avic_vm_destroy(struct kvm *kvm)
> __free_page(kvm_svm->avic_logical_id_table_page);
> if (kvm_svm->avic_physical_id_table_page)
> __free_pages(kvm_svm->avic_physical_id_table_page,
> - get_order(sizeof(u64) * (x2avic_max_physical_id + 1)));
> + avic_get_physical_id_table_order(kvm));
>
> spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
> hash_del(&kvm_svm->hnode);
> @@ -199,22 +216,12 @@ int avic_vm_init(struct kvm *kvm)
> int err = -ENOMEM;
> struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
> struct kvm_svm *k2;
> - struct page *p_page;
> struct page *l_page;
> - u32 vm_id, entries;
> + u32 vm_id;
>
> if (!enable_apicv)
> return 0;
>
> - /* Allocating physical APIC ID table */
> - entries = x2avic_max_physical_id + 1;
> - p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
> - get_order(sizeof(u64) * entries));
> - if (!p_page)
> - goto free_avic;
> -
> - kvm_svm->avic_physical_id_table_page = p_page;
> -
> /* Allocating logical APIC ID table (4KB) */
> l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> if (!l_page)
> @@ -265,6 +272,24 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
> avic_deactivate_vmcb(svm);
> }
>
> +int avic_alloc_physical_id_table(struct kvm *kvm)
> +{
> + struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
> + struct page *p_page;
> +
> + if (kvm_svm->avic_physical_id_table_page || !enable_apicv || !irqchip_in_kernel(kvm))
> + return 0;
> +
> + p_page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
> + avic_get_physical_id_table_order(kvm));
> + if (!p_page)
> + return -ENOMEM;
> +
> + kvm_svm->avic_physical_id_table_page = p_page;
> +
> + return 0;
> +}
> +
> static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
> unsigned int index)
> {
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b8aa0f36850f..3cb23298cdc3 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1423,6 +1423,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
> svm->vmcb = target_vmcb->ptr;
> }
>
> +static int svm_vcpu_precreate(struct kvm *kvm)
> +{
> + return avic_alloc_physical_id_table(kvm);
> +}
> +
> static int svm_vcpu_create(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm;
> @@ -5007,6 +5012,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
> .has_emulated_msr = svm_has_emulated_msr,
>
> + .vcpu_precreate = svm_vcpu_precreate,
> .vcpu_create = svm_vcpu_create,
> .vcpu_free = svm_vcpu_free,
> .vcpu_reset = svm_vcpu_reset,
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 5b159f017055..b4670afe0034 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -694,6 +694,7 @@ bool avic_hardware_setup(void);
> int avic_ga_log_notifier(u32 ga_tag);
> void avic_vm_destroy(struct kvm *kvm);
> int avic_vm_init(struct kvm *kvm);
> +int avic_alloc_physical_id_table(struct kvm *kvm);
> void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
> int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
> int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
© 2016 - 2025 Red Hat, Inc.