arch/x86/boot/compressed/sev.c | 4 ++-- arch/x86/kernel/sev-shared.c | 4 ++-- arch/x86/kernel/sev.c | 22 ++++++++++++++-------- 3 files changed, 18 insertions(+), 12 deletions(-)
In the early #VC handler before start_kernel,
boot_cpu_data.x86_virt_bits is not yet initialized.
copy_from_kernel_nofault references this variable, so it cannot be
called. Instead, simply use memcpy.
Usage of this uninitialized variable is currently causing boot failures
in the latest version of Ubuntu 22.04 in the GCP project, but in general
reading the uninitialized variable is undefined behavior. If the
variable happens to be 0, UB also happens due to a bit shift by 64.
Fixes: 1aa9aa8ee517 ("x86/sev-es: Setup GHCB-based boot #VC handler")
Tested-by: Jacob Xu <jacobhxu@google.com>
Signed-off-by: Adam Dunlap <acdunlap@google.com>
---
arch/x86/boot/compressed/sev.c | 4 ++--
arch/x86/kernel/sev-shared.c | 4 ++--
arch/x86/kernel/sev.c | 22 ++++++++++++++--------
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..0829ae00a885 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -73,7 +73,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val)
boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt, bool is_early)
{
char buffer[MAX_INSN_SIZE];
int ret;
@@ -290,7 +290,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code, true);
if (result != ES_OK)
goto finish;
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 2eabccde94fb..616be2e9a663 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -176,7 +176,7 @@ static bool vc_decoding_needed(unsigned long exit_code)
static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
struct pt_regs *regs,
- unsigned long exit_code)
+ unsigned long exit_code, bool is_early)
{
enum es_result ret = ES_OK;
@@ -184,7 +184,7 @@ static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
ctxt->regs = regs;
if (vc_decoding_needed(exit_code))
- ret = vc_decode_insn(ctxt);
+ ret = vc_decode_insn(ctxt, is_early);
return ret;
}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 1ee7bed453de..93f117e5cddf 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -270,9 +270,15 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val)
}
static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
- unsigned char *buffer)
+ unsigned char *buffer, bool is_early)
{
- return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+ if (is_early) {
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+ return 0;
+ } else {
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip,
+ MAX_INSN_SIZE);
+ }
}
static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
@@ -304,12 +310,12 @@ static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
return ES_DECODE_FAILED;
}
-static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt, bool is_early)
{
char buffer[MAX_INSN_SIZE];
int res, ret;
- res = vc_fetch_insn_kernel(ctxt, buffer);
+ res = vc_fetch_insn_kernel(ctxt, buffer, is_early);
if (res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR;
@@ -324,12 +330,12 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
return ES_OK;
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt, bool is_early)
{
if (user_mode(ctxt->regs))
return __vc_decode_user_insn(ctxt);
else
- return __vc_decode_kern_insn(ctxt);
+ return __vc_decode_kern_insn(ctxt, is_early);
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
@@ -1829,7 +1835,7 @@ static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_co
ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, error_code);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code, false);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
@@ -1969,7 +1975,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code, true);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
--
2.42.0.283.g2d96d420d3-goog
On 9/6/23 15:45, Adam Dunlap wrote:
> static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
> - unsigned char *buffer)
> + unsigned char *buffer, bool is_early)
> {
> - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
> + if (is_early) {
> + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
> + return 0;
> + } else {
> + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip,
> + MAX_INSN_SIZE);
> + }
> }
This isn't the normal way we do these kinds of things.
If we go with this solution, the next guy who tries
copy_from_kernel_nofault() will hit the same issue, and start plumbing
their own 'is_early' through _their_ little chunk of arch/x86.
Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to
override the troublesome implementation. In this case, it would make a
lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in
the first place.
> Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to > override the troublesome implementation. In this case, it would make a > lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in > the first place. Thanks for the comment. I realize this patch is doing something a bit misleading here. In this case, "early" does not refer to the compressed kernel, but actually the regular kernel but in the stage with this early #VC handler vc_boot_ghcb (instead of the usual vc_raw_handle_exception). This #VC handler triggers for the first time on a cpuid instruction in secondary_startup_64, but boot_cpu_data.x86_virt_bits is not initialized until setup_arch inside of start_kernel, which is at the end of secondary_startup_64. The compressed kernel already has its own implementation of vc_decode_insn which simply calls memcpy instead of copy_from_kernel_nofault, so the point of this patch is to do the same thing for the early stages of the post-decompression kernel. So unless I'm misunderstanding, adding a version of copy_from_kernel_nofault to the compressed directory wouldn't solve the problem since the problem does not occur during the compressed stage.
On 9/6/23 16:25, Adam Dunlap wrote: >> Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to >> override the troublesome implementation. In this case, it would make a >> lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in >> the first place. > Thanks for the comment. I realize this patch is doing something a bit misleading > here. In this case, "early" does not refer to the compressed kernel, but > actually the regular kernel but in the stage with this early #VC handler > vc_boot_ghcb (instead of the usual vc_raw_handle_exception). This #VC handler > triggers for the first time on a cpuid instruction in secondary_startup_64, but > boot_cpu_data.x86_virt_bits is not initialized until setup_arch inside of > start_kernel, which is at the end of secondary_startup_64. How about something like the attached patch? It avoids passing around 'is_early' everywhere, which I'm sure we'll get wrong at some point. If we get it wrong, we lose *ALL* the checking that copy_from_kernel*() does in addition to the canonical checks. The attached patch at least preserves the userspace address checks. This also makes me wonder how much other code is called via the early exception handlers that's subtly broken. I scanned a function or two deep and the instruction decoding was the most guilty looking thing. But a closer look would be appreciated. Also, what's the root cause here? What's causing the early exception? It is some silly CPUID leaf? Should we be more careful to just avoid these exceptions?
On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: > How about something like the attached patch? I think that's a much better idea, but unfortunately we can't rely on boot_cpu_data being 0'd out. While it is a static global variable that C says should be 0'd, the early interrupt happens before .bss is cleared (Note if it happens to be 0, then the __is_canonical_address check succeeds anyway -- the boot failure only happens when that variable happens to be other random values). If there's another check we could do, I'd agree that'd end up being a much better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid that is manually (i.e. not part of the regular .bss clearing) set to false and is only set to true after early_identify_cpu is finished. Or the simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. > But a closer look would be appreciated. Turning on CONFIG_UBSAN catches the UB that happens when x86_virt_bits is 0 since that causes a bit shift by 64. Unfortunately, the ubsan reporting mechanism also doesn't play well with being called so early into boot, so the messages end up corrupted. But the fact that it only reports one error means that (at least in our tests), it only happens once. Of course, there could be other interrupts or other configurations where other problems happen. > Also, what's the root cause here? What's causing the early exception? > It is some silly CPUID leaf? Should we be more careful to just avoid > these exceptions? Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check if nx is implemented */. It appears to be pretty important. Potentially we could paravirtualize the CPUID directly (i.e. mess with the GHCB and make the vmgexit) instead of taking the #VC, but I'm not sure that's a great idea. 
There's a couple of other CPUIDs that are called in early_identify_cpu between when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it may be set to 57 if 5 level paging is enabled), which could potentially cause spurious failures in those later calls.
On 9/7/23 11:03, Adam Dunlap wrote: > On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: >> How about something like the attached patch? > > I think that's a much better idea, but unfortunately we can't rely on > boot_cpu_data being 0'd out. While it is a static global variable that C says > should be 0'd, the early interrupt happens before .bss is cleared (Note if it > happens to be 0, then the __is_canonical_address check succeeds anyway -- the > boot failure only happens when that variable happens to be other random values). > If there's another check we could do, I'd agree that'd end up being a much > better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid > that is manually (i.e. not part of the regular .bss clearing) set to false and > is only set to true after early_identify_cpu is finished. Or the > simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. Gah, what a mess. So the guilty CPUID in question is this: > /* Setup and Load IDT */ > call early_setup_idt > > /* Check if nx is implemented */ > movl $0x80000001, %eax > cpuid > movl %edx,%edi which is _barely_ after we have an IDT with which to generate exceptions. What happens before this? This isn't the first CPUID invocation. Does this one just happen to #VC and all the others before don't? In any case, the most straightforward way out of this mess is to just move boot_cpu_data out of .bss and explicitly initialize it along with some documentation explaining the situation. >> Also, what's the root cause here? What's causing the early exception? >> It is some silly CPUID leaf? Should we be more careful to just avoid >> these exceptions? > > Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check > if nx is implemented */. It appears to be pretty important. Potentially we could > paravirtualize the CPUID direclty (i.e. 
mess with the GHCB and make the vmgexit) > instead of taking the #VC, but I'm not sure that's a great idea. What TDX does here is actually pretty nice. It doesn't generate a #VE for these. But seriously, is it even *possible* to spin up a SEV-SNP VM that doesn't have NX? > There's a couple of other CPUIDs that are called in early_identify_cpu between > when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it > may be set to 57 if 5 level paging is enabled), which could potentially cause > spurious failures in those later calls. That should be easy enough to fix. These things where we initialize a value and then write over it are always fragile. Let's just make one function that does it right and does it once. Totally untested patch attached.
On 9/7/23 14:12, Dave Hansen wrote: > On 9/7/23 11:03, Adam Dunlap wrote: >> On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: >>> How about something like the attached patch? >> >> I think that's a much better idea, but unfortunately we can't rely on >> boot_cpu_data being 0'd out. While it is a static global variable that C says >> should be 0'd, the early interrupt happens before .bss is cleared (Note if it >> happens to be 0, then the __is_canonical_address check succeeds anyway -- the >> boot failure only happens when that variable happens to be other random values). >> If there's another check we could do, I'd agree that'd end up being a much >> better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid >> that is manually (i.e. not part of the regular .bss clearing) set to false and >> is only set to true after early_identify_cpu is finished. Or the >> simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. > > Gah, what a mess. So the guilty CPUID in question is this: > >> /* Setup and Load IDT */ >> call early_setup_idt >> >> /* Check if nx is implemented */ >> movl $0x80000001, %eax >> cpuid >> movl %edx,%edi > > which is _barely_ after we have an IDT with which to generate > exceptions. What happens before this? This isn't the first CPUID > invocation. Does this one just happen to #VC and all the others before > don't? > > In any case, the most straightforward way out of this mess is to just > move boot_cpu_data out of .bss and explicitly initialize it along with > some documentation explaining the situation. Agreed, putting this in the .data section will ensure it is zero from the start. > >>> Also, what's the root cause here? What's causing the early exception? >>> It is some silly CPUID leaf? Should we be more careful to just avoid >>> these exceptions? >> >> Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check >> if nx is implemented */. 
It appears to be pretty important. Potentially we could >> paravirtualize the CPUID direclty (i.e. mess with the GHCB and make the vmgexit) >> instead of taking the #VC, but I'm not sure that's a great idea. > > What TDX does here is actually pretty nice. It doesn't generate a #VE > for these. > > But seriously, is it even *possible* to spin up a SEV-SNP VM what > doesn't have NX? It is a common path, so while an SEV guest would have NX support, you would first have to determine that it is an SEV guest. That would take issuing a CPUID instruction in order to determine if a particular MSR can be read... Ultimately, we could probably pass the encryption mask from the decompressor to the kernel and avoid some of the checks during early boot of the kernel proper. Is it possible to boot an x86 kernel without going through the decompressor? Thanks, Tom > >> There's a couple of other CPUIDs that are called in early_identify_cpu between >> when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it >> may be set to 57 if 5 level paging is enabled), which could potentially cause >> spurious failures in those later calls. > > That should be easy enough to fix. > > These things where we initialize a value and then write over it are > always fragile. Let's just make one function that does it right and > does it once. > > Totally untested patch attached.
On 9/8/23 06:13, Tom Lendacky wrote: > On 9/7/23 14:12, Dave Hansen wrote: >> But seriously, is it even *possible* to spin up a SEV-SNP VM what >> doesn't have NX? > > It is a common path, so while an SEV guest would have NX support, you > would first have to determine that it is an SEV guest. That would take > issuing a CPUID instruction in order to determine if a particular MSR > can be read... I was thinking more along the lines of telling folks that if they want to turn SEV-SNP support on, they also have to give up on running on a !NX system. That would be a _bit_ nicer than just refusing to boot on all !NX systems. > Ultimately, we could probably pass the encryption mask from the > decompressor to the kernel and avoid some of the checks during early > boot of the kernel proper. Is it possible to boot an x86 kernel without > going through the decompressor? I think it's possible, but it's very unusual.
On Fri, Sep 08, 2023 at 07:48:58AM -0700, Dave Hansen wrote: > On 9/8/23 06:13, Tom Lendacky wrote: > > On 9/7/23 14:12, Dave Hansen wrote: > >> But seriously, is it even *possible* to spin up a SEV-SNP VM what > >> doesn't have NX? > > > > It is a common path, so while an SEV guest would have NX support, you > > would first have to determine that it is an SEV guest. That would take > > issuing a CPUID instruction in order to determine if a particular MSR > > can be read... > > I was thinking more along the lines of telling folks that if they want > to turn SEV-SNP support on, they also have to give up on running on a > !NX system. That would be a _bit_ nicer than just refusing to boot on > all !NX systems. I think there were a very limited number of !NX P4 class x86_64 systems. I vote in favour of dropping them into the museum bucket, they get to run an old kernel. Andrew and I looked at this a few months ago, but I forgot all details again :/
On Thu, Sep 7, 2023 at 12:12 PM Dave Hansen <dave.hansen@intel.com> wrote: > What happens before this? This isn't the first CPUID > invocation. Does this one just happen to #VC and all the others before > don't? I hadn't noticed this before, but there is an even earlier interrupt handler vc_no_ghcb which only supports cpuid. Potentially this version could work until boot_cpu_data is set up, but I wasn't able to get it working myself. > In any case, the most straightforward way out of this mess is to just > move boot_cpu_data out of .bss and explicitly initialize it along with > some documentation explaining the situation. That seems totally reasonable. I tried applying the two patches that you sent plus boot_cpu_data.x86_virt_bits = 0; in early_setup_idt(), and that fixes the problems that we can see. Do you want me to send out a new patch with these changes together?
On 9/7/23 15:30, Adam Dunlap wrote: >> In any case, the most straightforward way out of this mess is to >> just move boot_cpu_data out of .bss and explicitly initialize it >> along with some documentation explaining the situation. > That seems totally reasonable. I tried applying the two patches that > you sent plus boot_cpu_data.x86_virt_bits = 0; in early_setup_idt(), Let's just initialize boot_cpu_data to zero and get it out of .bss entirely. > and that fixes the problems that we can see. Do you want me to send > out a new patch with these changes together? Well, ideally broken out in a couple of patches, but yeah, please resend also with some revised changelogs based on what you've learned.
© 2016 - 2025 Red Hat, Inc.