[v1] AMD: Add Secure AVIC Guest Support

[RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 4 months ago

From: Kishon Vijay Abraham I <kvijayab@amd.com>

With Secure AVIC, the APIC backing page is owned and managed by guest.
Allocate APIC backing page for all guest CPUs. In addition, add a
setup() APIC callback. This callback is used by Secure AVIC driver to
initialize APIC backing page area for each CPU.

Allocate APIC backing page memory area in chunks of 2M, so that
backing page memory is mapped using full huge pages. Without this,
if there are private to shared page state conversions for any
non-backing-page allocation which is part of the same huge page as the
one containing a backing page, hypervisor splits the huge page into 4K
pages. Splitting of APIC backing page area into individual 4K pages can
result in performance impact, due to TLB pressure.

Secure AVIC requires that vCPU's APIC backing page's NPT entry is always
present while that vCPU is running. If APIC backing page's NPT entry is
not present, a VMEXIT_BUSY is returned on VMRUN and the vCPU cannot
be resumed after that point. To handle this, invoke sev_notify_savic_gpa()
in Secure AVIC driver's setup() callback. This triggers SVM_VMGEXIT_SECURE_
AVIC_GPA exit for the hypervisor to note GPA of the vCPU's APIC
backing page. Hypervisor uses this information to ensure that the APIC
backing page is mapped in NPT before invoking VMRUN.

Signed-off-by: Kishon Vijay Abraham I <kvijayab@amd.com>
Co-developed-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
---

GHCB spec update for SVM_VMGEXIT_SECURE_AVIC_GPA NAE event is
part of the draft spec:

 https://lore.kernel.org/linux-coco/3453675d-ca29-4715-9c17-10b56b3af17e@amd.com/T/#u

 arch/x86/coco/sev/core.c            | 22 +++++++++++++++++
 arch/x86/include/asm/apic.h         |  1 +
 arch/x86/include/asm/sev.h          |  2 ++
 arch/x86/include/uapi/asm/svm.h     |  1 +
 arch/x86/kernel/apic/apic.c         |  2 ++
 arch/x86/kernel/apic/x2apic_savic.c | 38 +++++++++++++++++++++++++++++
 6 files changed, 66 insertions(+)

diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index de1df0cb45da..93470538af5e 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1367,6 +1367,28 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 	return ret;
 }
 
+enum es_result sev_notify_savic_gpa(u64 gpa)
+{
+	struct ghcb_state state;
+	struct es_em_ctxt ctxt;
+	unsigned long flags;
+	struct ghcb *ghcb;
+	int ret = 0;
+
+	local_irq_save(flags);
+
+	ghcb = __sev_get_ghcb(&state);
+
+	vc_ghcb_invalidate(ghcb);
+
+	ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SECURE_AVIC_GPA, gpa, 0);
+
+	__sev_put_ghcb(&state);
+
+	local_irq_restore(flags);
+	return ret;
+}
+
 static void snp_register_per_cpu_ghcb(void)
 {
 	struct sev_es_runtime_data *data;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 9327eb00e96d..ca682c1e8748 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -302,6 +302,7 @@ struct apic {
 
 	/* Probe, setup and smpboot functions */
 	int	(*probe)(void);
+	void	(*setup)(void);
 	int	(*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
 
 	void	(*init_apic_ldr)(void);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 79bbe2be900e..e84fc7fcc32a 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -399,6 +399,7 @@ u64 snp_get_unsupported_features(u64 status);
 u64 sev_get_status(void);
 void sev_show_status(void);
 void snp_update_svsm_ca(void);
+enum es_result sev_notify_savic_gpa(u64 gpa);
 
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
@@ -435,6 +436,7 @@ static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
 static inline u64 sev_get_status(void) { return 0; }
 static inline void sev_show_status(void) { }
 static inline void snp_update_svsm_ca(void) { }
+static inline enum es_result sev_notify_savic_gpa(u64 gpa) { return ES_UNSUPPORTED; }
 
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 1814b413fd57..0f21cea6d21c 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -116,6 +116,7 @@
 #define SVM_VMGEXIT_AP_CREATE			1
 #define SVM_VMGEXIT_AP_DESTROY			2
 #define SVM_VMGEXIT_SNP_RUN_VMPL		0x80000018
+#define SVM_VMGEXIT_SECURE_AVIC_GPA		0x8000001a
 #define SVM_VMGEXIT_HV_FEATURES			0x8000fffd
 #define SVM_VMGEXIT_TERM_REQUEST		0x8000fffe
 #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code)	\
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 373638691cd4..b47d1dc854c3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1499,6 +1499,8 @@ static void setup_local_APIC(void)
 		return;
 	}
 
+	if (apic->setup)
+		apic->setup();
 	/*
 	 * If this comes from kexec/kcrash the APIC might be enabled in
 	 * SPIV. Soft disable it before doing further initialization.
diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c
index 97dac09a7f42..d903c35b8b64 100644
--- a/arch/x86/kernel/apic/x2apic_savic.c
+++ b/arch/x86/kernel/apic/x2apic_savic.c
@@ -9,12 +9,16 @@
 
 #include <linux/cpumask.h>
 #include <linux/cc_platform.h>
+#include <linux/percpu-defs.h>
 
 #include <asm/apic.h>
 #include <asm/sev.h>
 
 #include "local.h"
 
+static DEFINE_PER_CPU(void *, apic_backing_page);
+static DEFINE_PER_CPU(bool, savic_setup_done);
+
 static int x2apic_savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC);
@@ -61,8 +65,30 @@ static void x2apic_savic_send_IPI_mask_allbutself(const struct cpumask *mask, in
 	__send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
 }
 
+static void x2apic_savic_setup(void)
+{
+	void *backing_page;
+	enum es_result ret;
+	unsigned long gpa;
+
+	if (this_cpu_read(savic_setup_done))
+		return;
+
+	backing_page = this_cpu_read(apic_backing_page);
+	gpa = __pa(backing_page);
+	ret = sev_notify_savic_gpa(gpa);
+	if (ret != ES_OK)
+		snp_abort();
+	this_cpu_write(savic_setup_done, true);
+}
+
 static int x2apic_savic_probe(void)
 {
+	void *backing_pages;
+	unsigned int cpu;
+	size_t sz;
+	int i;
+
 	if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
 		return 0;
 
@@ -71,6 +97,17 @@ static int x2apic_savic_probe(void)
 		snp_abort();
 	}
 
+	sz = ALIGN(num_possible_cpus() * SZ_4K, SZ_2M);
+	backing_pages = kzalloc(sz, GFP_ATOMIC);
+	if (!backing_pages)
+		snp_abort();
+
+	i = 0;
+	for_each_possible_cpu(cpu) {
+		per_cpu(apic_backing_page, cpu) = backing_pages + i * SZ_4K;
+		i++;
+	}
+
 	pr_info("Secure AVIC Enabled\n");
 
 	return 1;
@@ -81,6 +118,7 @@ static struct apic apic_x2apic_savic __ro_after_init = {
 	.name				= "secure avic x2apic",
 	.probe				= x2apic_savic_probe,
 	.acpi_madt_oem_check		= x2apic_savic_acpi_madt_oem_check,
+	.setup				= x2apic_savic_setup,
 
 	.dest_mode_logical		= false,
 
-- 
2.34.1

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Borislav Petkov 1 year, 3 months ago

On Fri, Sep 13, 2024 at 05:06:53PM +0530, Neeraj Upadhyay wrote:
> @@ -61,8 +65,30 @@ static void x2apic_savic_send_IPI_mask_allbutself(const struct cpumask *mask, in
>  	__send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
>  }
>  
> +static void x2apic_savic_setup(void)
> +{
> +	void *backing_page;
> +	enum es_result ret;
> +	unsigned long gpa;
> +
> +	if (this_cpu_read(savic_setup_done))

I'm sure you can get rid of that silly bool. Like check the apic_backing_page
pointer instead and use that pointer to verify whether the per-CPU setup has
been done successfully.

> +		return;
> +
> +	backing_page = this_cpu_read(apic_backing_page);
> +	gpa = __pa(backing_page);
> +	ret = sev_notify_savic_gpa(gpa);
> +	if (ret != ES_OK)
> +		snp_abort();
> +	this_cpu_write(savic_setup_done, true);
> +}

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 3 months ago

On 10/23/2024 10:06 PM, Borislav Petkov wrote:
> On Fri, Sep 13, 2024 at 05:06:53PM +0530, Neeraj Upadhyay wrote:
>> @@ -61,8 +65,30 @@ static void x2apic_savic_send_IPI_mask_allbutself(const struct cpumask *mask, in
>>  	__send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
>>  }
>>  
>> +static void x2apic_savic_setup(void)
>> +{
>> +	void *backing_page;
>> +	enum es_result ret;
>> +	unsigned long gpa;
>> +
>> +	if (this_cpu_read(savic_setup_done))
> 
> I'm sure you can get rid of that silly bool. Like check the apic_backing_page
> pointer instead and use that pointer to verify whether the per-CPU setup has
> been done successfully.
> 

Ok agree. In this patch version, APIC backing page allocation for all CPUs is done in
x2apic_savic_probe(). This is done to group the allocation together, so that these
backing pages are mapped using single 2M NPT and RMP entry.
I will move the APIC backing page setup to per-CPU setup (x2apic_savic_setup()) and
use that pointer to do the check.

- Neeraj

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Dave Hansen 1 year, 4 months ago

On 9/13/24 04:36, Neeraj Upadhyay wrote:
> +	sz = ALIGN(num_possible_cpus() * SZ_4K, SZ_2M);
> +	backing_pages = kzalloc(sz, GFP_ATOMIC);
> +	if (!backing_pages)
> +		snp_abort();

Is this in an atomic context?  If not, why the GFP_ATOMIC?

Second, this looks to be allocating a potentially large physically
contiguous chunk of memory, then handing it out 4k at a time.  The loop is:

	buf = alloc(NR_CPUS * PAGE_SIZE);
	for (i = 0; i < NR_CPUS; i++)
		foo[i] = buf + i * PAGE_SIZE;

but could be:

	for (i = 0; i < NR_CPUS; i++)
		foo[i] = alloc(PAGE_SIZE);

right?

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 4 months ago

On 10/9/2024 8:57 PM, Dave Hansen wrote:
> On 9/13/24 04:36, Neeraj Upadhyay wrote:
>> +	sz = ALIGN(num_possible_cpus() * SZ_4K, SZ_2M);
>> +	backing_pages = kzalloc(sz, GFP_ATOMIC);
>> +	if (!backing_pages)
>> +		snp_abort();
> 
> Is this in an atomic context?  If not, why the GFP_ATOMIC?
> 

No. I think GFP_ATOMIC is not required. I will change it to GFP_KERNEL.

> Second, this looks to be allocating a potentially large physically
> contiguous chunk of memory, then handing it out 4k at a time.  The loop is:
> 
> 	buf = alloc(NR_CPUS * PAGE_SIZE);
> 	for (i = 0; i < NR_CPUS; i++)
> 		foo[i] = buf + i * PAGE_SIZE;
> 
> but could be:
> 
> 	for (i = 0; i < NR_CPUS; i++)
> 		foo[i] = alloc(PAGE_SIZE);
> 
> right?

Single contiguous allocation is done here to avoid TLB impact due to backing page
accesses (e.g. sending ipi requires writing to target CPU's backing page).
I can change it to allocation in chunks of size 2M instead of one big allocation.
Is that fine? Also, as described in commit message, reserving entire 2M chunk
for backing pages also prevents splitting of NPT entries into individual 4K entries.
This can happen if part of a 2M page is not allocated for backing pages by guest
and page state change (from private to shared) is done for that part.

- Neeraj

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Dave Hansen 1 year, 4 months ago

On 10/9/24 09:31, Neeraj Upadhyay wrote:
>> Second, this looks to be allocating a potentially large physically
>> contiguous chunk of memory, then handing it out 4k at a time.  The loop is:
>>
>> 	buf = alloc(NR_CPUS * PAGE_SIZE);
>> 	for (i = 0; i < NR_CPUS; i++)
>> 		foo[i] = buf + i * PAGE_SIZE;
>>
>> but could be:
>>
>> 	for (i = 0; i < NR_CPUS; i++)
>> 		foo[i] = alloc(PAGE_SIZE);
>>
>> right?
> 
> Single contiguous allocation is done here to avoid TLB impact due to backing page
> accesses (e.g. sending ipi requires writing to target CPU's backing page).
> I can change it to allocation in chunks of size 2M instead of one big allocation.
> Is that fine? Also, as described in commit message, reserving entire 2M chunk
> for backing pages also prevents splitting of NPT entries into individual 4K entries.
> This can happen if part of a 2M page is not allocated for backing pages by guest
> and page state change (from private to shared) is done for that part.

Ick.

First, this needs to be thoroughly commented, not in the changelogs.

Second, this is premature optimization at its finest.  Just imagine if
_every_ site that needed 16k or 32k of shared memory decided to allocate
a 2M chunk for this _and_ used it sparsely.  What's the average number
of vCPUs in a guest.  4?  8?

The absolute minimum that we can do here is some stupid infrastructure
that you call for allocating shared pages, or for things that _will_ be
converted to shared so they get packed.

But hacking uncommented 2M allocations into every site seems like
insanity to me.

IMNHO, you can either invest the time to put the infrastructure in place
and get 2M pages, or you can live with the suboptimal performance of 4k.

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 4 months ago


On 10/9/2024 10:33 PM, Dave Hansen wrote:
> On 10/9/24 09:31, Neeraj Upadhyay wrote:
>>> Second, this looks to be allocating a potentially large physically
>>> contiguous chunk of memory, then handing it out 4k at a time.  The loop is:
>>>
>>> 	buf = alloc(NR_CPUS * PAGE_SIZE);
>>> 	for (i = 0; i < NR_CPUS; i++)
>>> 		foo[i] = buf + i * PAGE_SIZE;
>>>
>>> but could be:
>>>
>>> 	for (i = 0; i < NR_CPUS; i++)
>>> 		foo[i] = alloc(PAGE_SIZE);
>>>
>>> right?
>>
>> Single contiguous allocation is done here to avoid TLB impact due to backing page
>> accesses (e.g. sending ipi requires writing to target CPU's backing page).
>> I can change it to allocation in chunks of size 2M instead of one big allocation.
>> Is that fine? Also, as described in commit message, reserving entire 2M chunk
>> for backing pages also prevents splitting of NPT entries into individual 4K entries.
>> This can happen if part of a 2M page is not allocated for backing pages by guest
>> and page state change (from private to shared) is done for that part.
> 
> Ick.
> 
> First, this needs to be thoroughly commented, not in the changelogs.
> 

Ok.

> Second, this is premature optimization at its finest.  Just imagine if
> _every_ site that needed 16k or 32k of shared memory decided to allocate
> a 2M chunk for this _and_ used it sparsely.  What's the average number
> of vCPUs in a guest.  4?  8?
> 

Got it.

> The absolute minimum that we can do here is some stupid infrastructure
> that you call for allocating shared pages, or for things that _will_ be
> converted to shared so they get packed.
> 
> But hacking uncommented 2M allocations into every site seems like
> insanity to me.
> 
> IMNHO, you can either invest the time to put the infrastructure in place
> and get 2M pages, or you can live with the suboptimal performance of 4k.

I will start with 4K. For later, I will get the performance numbers to propose
a change in allocation scheme  - for ex, allocating a bigger contiguous
batch from the total allocation required for backing pages (num_possible_cpus() * 4K)
without doing 2M reservation.


- Neeraj

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Borislav Petkov 1 year, 3 months ago

On Wed, Oct 09, 2024 at 11:22:58PM +0530, Neeraj Upadhyay wrote:
> I will start with 4K. For later, I will get the performance numbers to propose
> a change in allocation scheme  - for ex, allocating a bigger contiguous
> batch from the total allocation required for backing pages (num_possible_cpus() * 4K)
> without doing 2M reservation.

Why does performance matter here if you're going to allocate simply a 4K page
per vCPU and set them all up in the APIC setup path? And then you can do the
page conversion to guest-owned as part of the guest vCPU init path?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 3 months ago


On 10/23/2024 10:00 PM, Borislav Petkov wrote:
> On Wed, Oct 09, 2024 at 11:22:58PM +0530, Neeraj Upadhyay wrote:
>> I will start with 4K. For later, I will get the performance numbers to propose
>> a change in allocation scheme  - for ex, allocating a bigger contiguous
>> batch from the total allocation required for backing pages (num_possible_cpus() * 4K)
>> without doing 2M reservation.
> 
> Why does performance matter here if you're going to allocate simply a 4K page
> per vCPU and set them all up in the APIC setup path? And then you can do the
> page conversion to guest-owned as part of the guest vCPU init path?
> 
Please let me know if I didn't understand your questions correctly. The performance
concerns here are w.r.t. these backing page allocations being part of a single
hugepage.

Grouping of allocation together allows these pages to be part of the same 2M NPT
and RMP table entry, which can provide better performance compared to having
separate 4K entries for each backing page. For example, to send IPI to target CPUs,
->send_IPI callback (executing on source CPU) in Secure AVIC driver writes to the
backing page of target CPU. Having these backing pages as part of the single
2M entry could provide better caching of the translation and require single entry
in TLB at the source CPU.



- Neeraj

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Borislav Petkov 1 year, 3 months ago

On Thu, Oct 24, 2024 at 09:31:01AM +0530, Neeraj Upadhyay wrote:
> Please let me know if I didn't understand your questions correctly. The performance
> concerns here are w.r.t. these backing page allocations being part of a single
> hugepage.
> 
> Grouping of allocation together allows these pages to be part of the same 2M NPT
> and RMP table entry, which can provide better performance compared to having
> separate 4K entries for each backing page. For example, to send IPI to target CPUs,
> ->send_IPI callback (executing on source CPU) in Secure AVIC driver writes to the
> backing page of target CPU. Having these backing pages as part of the single
> 2M entry could provide better caching of the translation and require single entry
> in TLB at the source CPU.

Lemme see if I understand you correctly: you want a single 2M page to contain
*all* backing pages so that when the HV wants to send IPIs etc, the first vCPU
will load the page translation into the TLB and the following ones will have
it already?

Versus having separate 4K pages which would mean that everytime a vCPU's backing
page is needed, every vCPU would have to do a TLB walk and pull it in so that
the mapping's there?

Am I close?

If so, what's the problem with loading that backing page each time you VMRUN
the vCPU?

IOW, how noticeable would that be?

And what guarantees that the 2M page containing the backing pages would always
remain in the TLB?

Hmmm.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Neeraj Upadhyay 1 year, 3 months ago

On 10/24/2024 5:19 PM, Borislav Petkov wrote:
> On Thu, Oct 24, 2024 at 09:31:01AM +0530, Neeraj Upadhyay wrote:
>> Please let me know if I didn't understand your questions correctly. The performance
>> concerns here are w.r.t. these backing page allocations being part of a single
>> hugepage.
>>
>> Grouping of allocation together allows these pages to be part of the same 2M NPT
>> and RMP table entry, which can provide better performance compared to having
>> separate 4K entries for each backing page. For example, to send IPI to target CPUs,
>> ->send_IPI callback (executing on source CPU) in Secure AVIC driver writes to the
>> backing page of target CPU. Having these backing pages as part of the single
>> 2M entry could provide better caching of the translation and require single entry
>> in TLB at the source CPU.
> 
> Lemme see if I understand you correctly: you want a single 2M page to contain
> *all* backing pages so that when the HV wants to send IPIs etc, the first vCPU

With Secure AVIC enabled, source vCPU directly writes to the Interrupt Request Register
(IRR) offset in the target CPU's backing page. So, the IPI is directly requested in
target vCPU's backing page by source vCPU context and not by HV.

> will load the page translation into the TLB and the following ones will have
> it already?
> 

Yes, but the following ones will be already present in source vCPU's CPU TLB.

> Versus having separate 4K pages which would mean that everytime a vCPU's backing
> page is needed, every vCPU would have to do a TLB walk and pull it in so that
> the mapping's there?
> 

The walk is done by source CPU here, as it is the one which is writing to the
backing page of target vCPUs.

> Am I close?
>

I have clarified some parts above. Basically, source vCPU is directly writing to
remote backing pages.

> If so, what's the problem with loading that backing page each time you VMRUN
> the vCPU?
> 

As I clarified above, it's the source vCPU which need to load each backing page.

> IOW, how noticeable would that be?
> 

I don't have the data at this point. That is the reason I will send this contiguous
allocation as a separate patch (if required) when I can get data on some workloads
which are impacted by this.

> And what guarantees that the 2M page containing the backing pages would always
> remain in the TLB?
> 

For smp_call_function_many(), where a source CPU sends IPI to multiple CPUs,
source CPU writes to backing pages of different target CPUs within this function.
So, accesses have temporal locality. For other use cases, I need to enable
perf with Secure AVIC to collect the TLB misses on a IPI benchmark and get
back with the numbers.

- Neeraj

> Hmmm.
>

Re: [RFC 02/14] x86/apic: Initialize Secure AVIC APIC backing page

Posted by Borislav Petkov 1 year, 3 months ago

On Thu, Oct 24, 2024 at 06:01:16PM +0530, Neeraj Upadhyay wrote:
> With Secure AVIC enabled, source vCPU directly writes to the Interrupt
> Request Register (IRR) offset in the target CPU's backing page. So, the IPI
> is directly requested in target vCPU's backing page by source vCPU context
> and not by HV.

So the source vCPU will fault in the target vCPU's backing page if it is not
there anymore. And if it is part of a 2M translation, the likelihood that it
is there is higher.

> As I clarified above, it's the source vCPU which need to load each backing
> page.

So if we have 4K backing pages, the source vCPU will fault-in the target's
respective backing page into its TLB and send the IPI. And if it is an IPI to
multiple vCPUs, then it will have to fault in each vCPU's backing page in
succession.

However, when the target vCPU gets to VMRUN, the backing page will have to be
faulted in into the target vCPU's TLB too.

And this is the same with a 2M backing page - the target vCPUs will have to
fault that 2M page translation too.

But then if the target vCPU wants to send IPIs itself, the 2M backing pages
will be there already. Hmmm.

> I don't have the data at this point. That is the reason I will send this
> contiguous allocation as a separate patch (if required) when I can get data
> on some workloads which are impacted by this.

Yes, that would clarify whether something more involved than simply using 4K
pages is needed.

> For smp_call_function_many(), where a source CPU sends IPI to multiple CPUs,
> source CPU writes to backing pages of different target CPUs within this function.
> So, accesses have temporal locality. For other use cases, I need to enable
> perf with Secure AVIC to collect the TLB misses on a IPI benchmark and get
> back with the numbers.

Right, I can see some TLB walks getting avoided if you have a single 2M page
but without actually measuring it, I don't know. If I had to venture a guess,
it probably won't show any difference but who knows...

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette