[PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support

Tom Lendacky posted 2 patches 3 years, 8 months ago
There is a newer version of this series
[PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago
In advance of providing support for unaccepted memory, switch from using
kmalloc() for allocating the Page State Change (PSC) structure to using a
static structure. This is needed to avoid a possible recursive call into
set_pages_state() if the kmalloc() call requires (more) memory to be
accepted, which would result in a hang.

Page state changes occur whenever DMA memory is allocated or memory needs
to be shared with the hypervisor (kvmclock, attestation reports, etc.).
Since most page state changes occur early in boot and are limited in
number, a single static PSC structure is used and protected by a spin
lock with interrupts disabled.

Even with interrupts disabled, an NMI can be raised while performing
memory acceptance. The NMI could then cause further memory acceptance to
be performed. To prevent a deadlock, use the MSR protocol if executing in
an NMI context.

Since the set_pages_state() path is the only path into vmgexit_psc(),
rename vmgexit_psc() to __vmgexit_psc() and remove the calls to disable
interrupts which are now performed by set_pages_state().

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/kernel/sev.c | 55 +++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c05f0124c410..84d94fd2ec53 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -66,6 +66,9 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
  */
 static struct ghcb *boot_ghcb __section(".data");
 
+/* Flag to indicate when the first per-CPU GHCB is registered */
+static bool ghcb_percpu_ready __section(".data");
+
 /* Bitmap of SEV features supported by the hypervisor */
 static u64 sev_hv_features __ro_after_init;
 
@@ -122,6 +125,15 @@ struct sev_config {
 
 static struct sev_config sev_cfg __read_mostly;
 
+/*
+ * Page State Change structure for use when accepting memory or when changing
+ * page state. Use is protected by a spinlock with interrupts disabled, but an
+ * NMI could still be raised, so check if running in an NMI an use the MSR
+ * protocol in these cases.
+ */
+static struct snp_psc_desc psc_desc;
+static DEFINE_SPINLOCK(psc_desc_lock);
+
 static __always_inline bool on_vc_stack(struct pt_regs *regs)
 {
 	unsigned long sp = regs->sp;
@@ -660,7 +672,7 @@ static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool valid
 	}
 }
 
-static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
+static void early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
 {
 	unsigned long paddr_end;
 	u64 val;
@@ -742,26 +754,17 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op
 		WARN(1, "invalid memory op %d\n", op);
 }
 
-static int vmgexit_psc(struct snp_psc_desc *desc)
+static int __vmgexit_psc(struct snp_psc_desc *desc)
 {
 	int cur_entry, end_entry, ret = 0;
 	struct snp_psc_desc *data;
 	struct ghcb_state state;
 	struct es_em_ctxt ctxt;
-	unsigned long flags;
 	struct ghcb *ghcb;
 
-	/*
-	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
-	 * a per-CPU GHCB.
-	 */
-	local_irq_save(flags);
-
 	ghcb = __sev_get_ghcb(&state);
-	if (!ghcb) {
-		ret = 1;
-		goto out_unlock;
-	}
+	if (!ghcb)
+		return 1;
 
 	/* Copy the input desc into GHCB shared buffer */
 	data = (struct snp_psc_desc *)ghcb->shared_buffer;
@@ -820,9 +823,6 @@ static int vmgexit_psc(struct snp_psc_desc *desc)
 out:
 	__sev_put_ghcb(&state);
 
-out_unlock:
-	local_irq_restore(flags);
-
 	return ret;
 }
 
@@ -861,18 +861,25 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
 		i++;
 	}
 
-	if (vmgexit_psc(data))
+	if (__vmgexit_psc(data))
 		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
 }
 
 static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
 {
 	unsigned long vaddr_end, next_vaddr;
-	struct snp_psc_desc *desc;
+	unsigned long flags;
 
-	desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
-	if (!desc)
-		panic("SNP: failed to allocate memory for PSC descriptor\n");
+	/*
+	 * Use the MSR protocol when either:
+	 *   - executing in an NMI to avoid any possibility of a deadlock
+	 *   - per-CPU GHCBs are not yet registered, since __vmgexit_psc()
+	 *     uses the per-CPU GHCB.
+	 */
+	if (in_nmi() || !ghcb_percpu_ready)
+		return early_set_pages_state(__pa(vaddr), npages, op);
+
+	spin_lock_irqsave(&psc_desc_lock, flags);
 
 	vaddr = vaddr & PAGE_MASK;
 	vaddr_end = vaddr + (npages << PAGE_SHIFT);
@@ -882,12 +889,12 @@ static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
 		next_vaddr = min_t(unsigned long, vaddr_end,
 				   (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
 
-		__set_pages_state(desc, vaddr, next_vaddr, op);
+		__set_pages_state(&psc_desc, vaddr, next_vaddr, op);
 
 		vaddr = next_vaddr;
 	}
 
-	kfree(desc);
+	spin_unlock_irqrestore(&psc_desc_lock, flags);
 }
 
 void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
@@ -1254,6 +1261,8 @@ void setup_ghcb(void)
 		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 			snp_register_per_cpu_ghcb();
 
+		ghcb_percpu_ready = true;
+
 		return;
 	}
 
-- 
2.36.1
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago
On 8/3/22 13:11, Tom Lendacky wrote:

Of course, I'll fix the subject if submitting this for real... ugh.

Thanks,
Tom

> In advance of providing support for unaccepted memory, switch from using
> kmalloc() for allocating the Page State Change (PSC) structure to using a
> static structure. This is needed to avoid a possible recursive call into
> set_pages_state() if the kmalloc() call requires (more) memory to be
> accepted, which would result in a hang.
> 
> Page state changes occur whenever DMA memory is allocated or memory needs
> to be shared with the hypervisor (kvmclock, attestation reports, etc.).
> Since most page state changes occur early in boot and are limited in
> number, a single static PSC structure is used and protected by a spin
> lock with interrupts disabled.
> 
> Even with interrupts disabled, an NMI can be raised while performing
> memory acceptance. The NMI could then cause further memory acceptance to
> be performed. To prevent a deadlock, use the MSR protocol if executing in
> an NMI context.
> 
> Since the set_pages_state() path is the only path into vmgexit_psc(),
> rename vmgexit_psc() to __vmgexit_psc() and remove the calls to disable
> interrupts which are now performed by set_pages_state().
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---
>   arch/x86/kernel/sev.c | 55 +++++++++++++++++++++++++------------------
>   1 file changed, 32 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index c05f0124c410..84d94fd2ec53 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -66,6 +66,9 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
>    */
>   static struct ghcb *boot_ghcb __section(".data");
>   
> +/* Flag to indicate when the first per-CPU GHCB is registered */
> +static bool ghcb_percpu_ready __section(".data");
> +
>   /* Bitmap of SEV features supported by the hypervisor */
>   static u64 sev_hv_features __ro_after_init;
>   
> @@ -122,6 +125,15 @@ struct sev_config {
>   
>   static struct sev_config sev_cfg __read_mostly;
>   
> +/*
> + * Page State Change structure for use when accepting memory or when changing
> + * page state. Use is protected by a spinlock with interrupts disabled, but an
> + * NMI could still be raised, so check if running in an NMI and use the MSR
> + * protocol in these cases.
> +static struct snp_psc_desc psc_desc;
> +static DEFINE_SPINLOCK(psc_desc_lock);
> +
>   static __always_inline bool on_vc_stack(struct pt_regs *regs)
>   {
>   	unsigned long sp = regs->sp;
> @@ -660,7 +672,7 @@ static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool valid
>   	}
>   }
>   
> -static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
> +static void early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
>   {
>   	unsigned long paddr_end;
>   	u64 val;
> @@ -742,26 +754,17 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op
>   		WARN(1, "invalid memory op %d\n", op);
>   }
>   
> -static int vmgexit_psc(struct snp_psc_desc *desc)
> +static int __vmgexit_psc(struct snp_psc_desc *desc)
>   {
>   	int cur_entry, end_entry, ret = 0;
>   	struct snp_psc_desc *data;
>   	struct ghcb_state state;
>   	struct es_em_ctxt ctxt;
> -	unsigned long flags;
>   	struct ghcb *ghcb;
>   
> -	/*
> -	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
> -	 * a per-CPU GHCB.
> -	 */
> -	local_irq_save(flags);
> -
>   	ghcb = __sev_get_ghcb(&state);
> -	if (!ghcb) {
> -		ret = 1;
> -		goto out_unlock;
> -	}
> +	if (!ghcb)
> +		return 1;
>   
>   	/* Copy the input desc into GHCB shared buffer */
>   	data = (struct snp_psc_desc *)ghcb->shared_buffer;
> @@ -820,9 +823,6 @@ static int vmgexit_psc(struct snp_psc_desc *desc)
>   out:
>   	__sev_put_ghcb(&state);
>   
> -out_unlock:
> -	local_irq_restore(flags);
> -
>   	return ret;
>   }
>   
> @@ -861,18 +861,25 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
>   		i++;
>   	}
>   
> -	if (vmgexit_psc(data))
> +	if (__vmgexit_psc(data))
>   		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
>   }
>   
>   static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
>   {
>   	unsigned long vaddr_end, next_vaddr;
> -	struct snp_psc_desc *desc;
> +	unsigned long flags;
>   
> -	desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
> -	if (!desc)
> -		panic("SNP: failed to allocate memory for PSC descriptor\n");
> +	/*
> +	 * Use the MSR protocol when either:
> +	 *   - executing in an NMI to avoid any possibility of a deadlock
> +	 *   - per-CPU GHCBs are not yet registered, since __vmgexit_psc()
> +	 *     uses the per-CPU GHCB.
> +	 */
> +	if (in_nmi() || !ghcb_percpu_ready)
> +		return early_set_pages_state(__pa(vaddr), npages, op);
> +
> +	spin_lock_irqsave(&psc_desc_lock, flags);
>   
>   	vaddr = vaddr & PAGE_MASK;
>   	vaddr_end = vaddr + (npages << PAGE_SHIFT);
> @@ -882,12 +889,12 @@ static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
>   		next_vaddr = min_t(unsigned long, vaddr_end,
>   				   (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
>   
> -		__set_pages_state(desc, vaddr, next_vaddr, op);
> +		__set_pages_state(&psc_desc, vaddr, next_vaddr, op);
>   
>   		vaddr = next_vaddr;
>   	}
>   
> -	kfree(desc);
> +	spin_unlock_irqrestore(&psc_desc_lock, flags);
>   }
>   
>   void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
> @@ -1254,6 +1261,8 @@ void setup_ghcb(void)
>   		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
>   			snp_register_per_cpu_ghcb();
>   
> +		ghcb_percpu_ready = true;
> +
>   		return;
>   	}
>
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Dave Hansen 3 years, 8 months ago
On 8/3/22 11:11, Tom Lendacky wrote:
> +	/*
> +	 * Use the MSR protocol when either:
> +	 *   - executing in an NMI to avoid any possibility of a deadlock
> +	 *   - per-CPU GHCBs are not yet registered, since __vmgexit_psc()
> +	 *     uses the per-CPU GHCB.
> +	 */
> +	if (in_nmi() || !ghcb_percpu_ready)
> +		return early_set_pages_state(__pa(vaddr), npages, op);
> +
> +	spin_lock_irqsave(&psc_desc_lock, flags);

Would it be simpler to just do a spin_trylock_irqsave()?  You fall back
to early_set_pages_state() whenever you can't acquire the lock.

That avoids even having to know what the situations are where you
_might_ recurse.  If it recurses, the trylock will just naturally fail.
 You simply can't have bugs where the "(in_nmi() || !ghcb_percpu_ready)"
conditional was wrong.
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago

On 8/3/22 13:17, Dave Hansen wrote:
> On 8/3/22 11:11, Tom Lendacky wrote:
>> +	/*
>> +	 * Use the MSR protocol when either:
>> +	 *   - executing in an NMI to avoid any possibility of a deadlock
>> +	 *   - per-CPU GHCBs are not yet registered, since __vmgexit_psc()
>> +	 *     uses the per-CPU GHCB.
>> +	 */
>> +	if (in_nmi() || !ghcb_percpu_ready)
>> +		return early_set_pages_state(__pa(vaddr), npages, op);
>> +
>> +	spin_lock_irqsave(&psc_desc_lock, flags);
> 
> Would it be simpler to just do a spin_trylock_irqsave()?  You fall back
> to early_set_pages_state() whenever you can't acquire the lock.

I was looking at that and can definitely go that route if this approach is 
preferred.

Thanks,
Tom

> 
> That avoids even having to know what the situations are where you
> _might_ recurse.  If it recurses, the trylock will just naturally fail.
>   You simply can't have bugs where the "(in_nmi() || !ghcb_percpu_ready)"
> conditional was wrong.
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Dave Hansen 3 years, 8 months ago
On 8/3/22 11:21, Tom Lendacky wrote:
>> Would it be simpler to just do a spin_trylock_irqsave()?  You fall back
>> to early_set_pages_state() whenever you can't acquire the lock.
> 
> I was looking at that and can definitely go that route if this approach
> is preferred.

I prefer it for sure.

This whole iteration does look good to me versus the per-cpu version, so
I say go ahead with doing this for v2 once you wait a bit for any more
feedback.
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago
On 8/3/22 13:24, Dave Hansen wrote:
> On 8/3/22 11:21, Tom Lendacky wrote:
>>> Would it be simpler to just do a spin_trylock_irqsave()?  You fall back
>>> to early_set_pages_state() whenever you can't acquire the lock.
>>
>> I was looking at that and can definitely go that route if this approach
>> is preferred.
> 
> I prefer it for sure.
> 
> This whole iteration does look good to me versus the per-cpu version, so
> I say go ahead with doing this for v2 once you wait a bit for any more
> feedback.

I'm still concerned about the whole spinlock and performance. What if I 
reduce the number of entries in the PSC structure to, say, 64, which 
reduces the size of the struct to 520 bytes. Any issue if that is put on 
the stack, instead? It definitely makes things less complicated and feels 
like a good compromise on the size vs the number of PSC VMGEXIT requests.

Thanks,
Tom
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Dave Hansen 3 years, 8 months ago
On 8/3/22 14:03, Tom Lendacky wrote:
>> This whole iteration does look good to me versus the per-cpu version, so
>> I say go ahead with doing this for v2 once you wait a bit for any more
>> feedback.
> 
> I'm still concerned about the whole spinlock and performance. What if I
> reduce the number of entries in the PSC structure to, say, 64, which
> reduces the size of the struct to 520 bytes. Any issue if that is put on
> the stack, instead? It definitely makes things less complicated and
> feels like a good compromise on the size vs the number of PSC VMGEXIT
> requests.

That would be fine too.

But, I doubt there will be any real performance issues coming out of
this.  As bad as this MSR thing is, I suspect it's not half as
disastrous as the global spinlock in Kirill's patches.

Also, private<->shared page conversions are *NOT* common from what I can
tell.  There are a few pages converted at boot, but most of the
guest<->host communications are through the swiotlb pages which are static.

Are there other things that SEV uses this structure for that I'm missing?
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago
On 8/3/22 16:18, Dave Hansen wrote:
> On 8/3/22 14:03, Tom Lendacky wrote:
>>> This whole iteration does look good to me versus the per-cpu version, so
>>> I say go ahead with doing this for v2 once you wait a bit for any more
>>> feedback.
>>
>> I'm still concerned about the whole spinlock and performance. What if I
>> reduce the number of entries in the PSC structure to, say, 64, which
>> reduces the size of the struct to 520 bytes. Any issue if that is put on
>> the stack, instead? It definitely makes things less complicated and
>> feels like a good compromise on the size vs the number of PSC VMGEXIT
>> requests.
> 
> That would be fine too.

Ok.

> 
> But, I doubt there will be any real performance issues coming out of
> this.  As bad as this MSR thing is, I suspect it's not half as
> disastrous as the global spinlock in Kirill's patches.
> 
> Also, private<->shared page conversions are *NOT* common from what I can
> tell.  There are a few pages converted at boot, but most of the
> guest<->host communications are through the swiotlb pages which are static.

Generally, that's true. But, e.g., a dma_alloc_coherent() actually doesn't 
go through SWIOTLB, but instead allocates the pages and makes them shared, 
which results in a page state change. The NVMe driver was calling that API 
a lot. In this case, though, the NVMe driver was running in IRQ context 
and set_memory_decrypted() could sleep, so an unencrypted DMA memory pool 
was created to work around the sleeping issue and reduce the page state 
changes. It's just things like that, that make me wary.

Thanks,
Tom

> 
> Are there other things that SEV uses this structure for that I'm missing?
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Dave Hansen 3 years, 8 months ago
On 8/3/22 14:34, Tom Lendacky wrote:
>> Also, private<->shared page conversions are *NOT* common from what I can
>> tell.  There are a few pages converted at boot, but most of the
>> guest<->host communications are through the swiotlb pages which are
>> static.
> 
> Generally, that's true. But, e.g., a dma_alloc_coherent() actually
> doesn't go through SWIOTLB, but instead allocates the pages and makes
> them shared, which results in a page state change. The NVMe driver was
> calling that API a lot. In this case, though, the NVMe driver was
> running in IRQ context and set_memory_decrypted() could sleep, so an
> unencrypted DMA memory pool was created to work around the sleeping
> issue and reduce the page state changes. It's just things like that,
> that make me wary.

Interesting.  Is that a real passthrough NVMe device or the hypervisor
presenting a virtual one that just happens to use the NVMe driver?

I'm pretty sure the TDX folks have been banking on having very few page
state changes.  But, part of that at least is their expectation of
relying heavily on virtio.

I wonder if their expectations are accurate, or if once TDX gets out
into the real world if their hopes will be dashed.
Re: [PATCH v1.1 1/2] x86/sev: Use per-CPU PSC structure in prep for unaccepted memory support
Posted by Tom Lendacky 3 years, 8 months ago
On 8/3/22 16:48, Dave Hansen wrote:
> On 8/3/22 14:34, Tom Lendacky wrote:
>>> Also, private<->shared page conversions are *NOT* common from what I can
>>> tell.  There are a few pages converted at boot, but most of the
>>> guest<->host communications are through the swiotlb pages which are
>>> static.
>>
>> Generally, that's true. But, e.g., a dma_alloc_coherent() actually
>> doesn't go through SWIOTLB, but instead allocates the pages and makes
>> them shared, which results in a page state change. The NVMe driver was
>> calling that API a lot. In this case, though, the NVMe driver was
>> running in IRQ context and set_memory_decrypted() could sleep, so an
>> unencrypted DMA memory pool was created to work around the sleeping
>> issue and reduce the page state changes. It's just things like that,
>> that make me wary.
> 
> Interesting.  Is that a real passthrough NVMe device or the hypervisor
> presenting a virtual one that just happens to use the NVMe driver?

Hmmm... not sure, possibly the latter. I just knew that whatever it was, 
the NVMe driver was being used.

Thanks,
Tom

> 
> I'm pretty sure the TDX folks have been banking on having very few page
> state changes.  But, part of that at least is their expectation of
> relying heavily on virtio.
> 
> I wonder if their expectations are accurate, or if once TDX gets out
> into the real world if their hopes will be dashed.