arch/x86/boot/compressed/sev.c | 4 ++-- arch/x86/kernel/sev-shared.c | 4 ++-- arch/x86/kernel/sev.c | 22 ++++++++++++++-------- 3 files changed, 18 insertions(+), 12 deletions(-)
In the early #VC handler before start_kernel,
boot_cpu_data.x86_virt_bits is not yet initialized.
copy_from_kernel_nofault references this variable, so it cannot be
called. Instead, simply use memcpy.
Usage of this uninitialized variable is currently causing boot failures
in the latest version of Ubuntu 22.04 in the GCP project, but in general
reading the uninitialized variable is undefined behavior. If the
variable happens to be 0, UB also happens due to a bit shift by 64.
Fixes: 1aa9aa8ee517 ("x86/sev-es: Setup GHCB-based boot #VC handler")
Tested-by: Jacob Xu <jacobhxu@google.com>
Signed-off-by: Adam Dunlap <acdunlap@google.com>
---
arch/x86/boot/compressed/sev.c | 4 ++--
arch/x86/kernel/sev-shared.c | 4 ++--
arch/x86/kernel/sev.c | 22 ++++++++++++++--------
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 09dc8c187b3c..0829ae00a885 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -73,7 +73,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val)
boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt, bool is_early)
{
char buffer[MAX_INSN_SIZE];
int ret;
@@ -290,7 +290,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code, true);
if (result != ES_OK)
goto finish;
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 2eabccde94fb..616be2e9a663 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -176,7 +176,7 @@ static bool vc_decoding_needed(unsigned long exit_code)
static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
struct pt_regs *regs,
- unsigned long exit_code)
+ unsigned long exit_code, bool is_early)
{
enum es_result ret = ES_OK;
@@ -184,7 +184,7 @@ static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
ctxt->regs = regs;
if (vc_decoding_needed(exit_code))
- ret = vc_decode_insn(ctxt);
+ ret = vc_decode_insn(ctxt, is_early);
return ret;
}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 1ee7bed453de..93f117e5cddf 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -270,9 +270,15 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val)
}
static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
- unsigned char *buffer)
+ unsigned char *buffer, bool is_early)
{
- return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+ if (is_early) {
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+ return 0;
+ } else {
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip,
+ MAX_INSN_SIZE);
+ }
}
static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
@@ -304,12 +310,12 @@ static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
return ES_DECODE_FAILED;
}
-static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt, bool is_early)
{
char buffer[MAX_INSN_SIZE];
int res, ret;
- res = vc_fetch_insn_kernel(ctxt, buffer);
+ res = vc_fetch_insn_kernel(ctxt, buffer, is_early);
if (res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR;
@@ -324,12 +330,12 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
return ES_OK;
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt, bool is_early)
{
if (user_mode(ctxt->regs))
return __vc_decode_user_insn(ctxt);
else
- return __vc_decode_kern_insn(ctxt);
+ return __vc_decode_kern_insn(ctxt, is_early);
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
@@ -1829,7 +1835,7 @@ static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_co
ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, error_code);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code, false);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
@@ -1969,7 +1975,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code, true);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
--
2.42.0.283.g2d96d420d3-goog
On 9/6/23 15:45, Adam Dunlap wrote:
> static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
> - unsigned char *buffer)
> + unsigned char *buffer, bool is_early)
> {
> - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
> + if (is_early) {
> + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
> + return 0;
> + } else {
> + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip,
> + MAX_INSN_SIZE);
> + }
> }
This isn't the normal way we do these kinds of things.
If we go with this solution, the next guy who tries
copy_from_kernel_nofault() will hit the same issue, and start plumbing
their own 'is_early' through _their_ little chunk of arch/x86.
Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to
override the troublesome implementation. In this case, it would make a
lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in
the first place.
> Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to > override the troublesome implementation. In this case, it would make a > lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in > the first place. Thanks for the comment. I realize this patch is doing something a bit misleading here. In this case, "early" does not refer to the compressed kernel, but actually the regular kernel but in the stage with this early #VC handler vc_boot_ghcb (instead of the usual vc_raw_handle_exception). This #VC handler triggers for the first time on a cpuid instruction in secondary_startup_64, but boot_cpu_data.x86_virt_bits is not initialized until setup_arch inside of start_kernel, which is at the end of secondary_startup_64. The compressed kernel already has its own implementation of vc_decode_insn which simply calls memcpy instead of copy_from_kernel_nofault, so the point of this patch is to do the same thing for the early stages of the post-decompression kernel. So unless I'm misunderstanding, adding a version of copy_from_kernel_nofault to the compressed directory wouldn't solve the problem since the problem does not occur during the compressed stage.
On 9/6/23 16:25, Adam Dunlap wrote: >> Usually, we'll add some gunk in arch/x86/boot/compressed/misc.h to >> override the troublesome implementation. In this case, it would make a >> lot of sense to somehow avoid touching boot_cpu_data.x86_virt_bits in >> the first place. > Thanks for the comment. I realize this patch is doing something a bit misleading > here. In this case, "early" does not refer to the compressed kernel, but > actually the regular kernel but in the stage with this early #VC handler > vc_boot_ghcb (instead of the usual vc_raw_handle_exception). This #VC handler > triggers for the first time on a cpuid instruction in secondary_startup_64, but > boot_cpu_data.x86_virt_bits is not initialized until setup_arch inside of > start_kernel, which is at the end of secondary_startup_64. How about something like the attached patch? It avoids passing around 'is_early' everywhere, which I'm sure we'll get wrong at some point. If we get it wrong, we lose *ALL* the checking that copy_from_kernel*() does in addition to the canonical checks. The attached patch at least preserves the userspace address checks. This also makes me wonder how much other code is called via the early exception handlers that's subtly broken. I scanned a function or two deep and the instruction decoding was the most guilty looking thing. But a closer look would be appreciated. Also, what's the root cause here? What's causing the early exception? It is some silly CPUID leaf? Should we be more careful to just avoid these exceptions?
On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: > How about something like the attached patch? I think that's a much better idea, but unfortunately we can't rely on boot_cpu_data being 0'd out. While it is a static global variable that C says should be 0'd, the early interrupt happens before .bss is cleared (Note if it happens to be 0, then the __is_canonical_address check succeeds anyway -- the boot failure only happens when that variable happens to be other random values). If there's another check we could do, I'd agree that'd end up being a much better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid that is manually (i.e. not part of the regular .bss clearing) set to false and is only set to true after early_identify_cpu is finished. Or the simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. > But a closer look would be appreciated. Turning on CONFIG_UBSAN catches the UB that happens when x86_virt_bits is 0 since that causes a bit shift by 64. Unfortunately, the ubsan reporting mechanism also doesn't play well with being called so early into boot, so the messages end up corrupted. But the fact that it only reports one error means that (at least in our tests), it only happens once. Of course, there could be other interrupts or other configurations where other problems happen. > Also, what's the root cause here? What's causing the early exception? > It is some silly CPUID leaf? Should we be more careful to just avoid > these exceptions? Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check if nx is implemented */. It appears to be pretty important. Potentially we could paravirtualize the CPUID directly (i.e. mess with the GHCB and make the vmgexit) instead of taking the #VC, but I'm not sure that's a great idea. 
There's a couple of other CPUIDs that are called in early_identify_cpu between when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it may be set to 57 if 5 level paging is enabled), which could potentially cause spurious failures in those later calls.
On 9/7/23 11:03, Adam Dunlap wrote: > On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: >> How about something like the attached patch? > > I think that's a much better idea, but unfortunately we can't rely on > boot_cpu_data being 0'd out. While it is a static global variable that C says > should be 0'd, the early interrupt happens before .bss is cleared (Note if it > happens to be 0, then the __is_canonical_address check succeeds anyway -- the > boot failure only happens when that variable happens to be other random values). > If there's another check we could do, I'd agree that'd end up being a much > better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid > that is manually (i.e. not part of the regular .bss clearing) set to false and > is only set to true after early_identify_cpu is finished. Or the > simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. Gah, what a mess. So the guilty CPUID in question is this: > /* Setup and Load IDT */ > call early_setup_idt > > /* Check if nx is implemented */ > movl $0x80000001, %eax > cpuid > movl %edx,%edi which is _barely_ after we have an IDT with which to generate exceptions. What happens before this? This isn't the first CPUID invocation. Does this one just happen to #VC and all the others before don't? In any case, the most straightforward way out of this mess is to just move boot_cpu_data out of .bss and explicitly initialize it along with some documentation explaining the situation. >> Also, what's the root cause here? What's causing the early exception? >> It is some silly CPUID leaf? Should we be more careful to just avoid >> these exceptions? > > Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check > if nx is implemented */. It appears to be pretty important. Potentially we could > paravirtualize the CPUID direclty (i.e. 
mess with the GHCB and make the vmgexit) > instead of taking the #VC, but I'm not sure that's a great idea. What TDX does here is actually pretty nice. It doesn't generate a #VE for these. But seriously, is it even *possible* to spin up a SEV-SNP VM that doesn't have NX? > There's a couple of other CPUIDs that are called in early_identify_cpu between > when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it > may be set to 57 if 5 level paging is enabled), which could potentially cause > spurious failures in those later calls. That should be easy enough to fix. These things where we initialize a value and then write over it are always fragile. Let's just make one function that does it right and does it once. Totally untested patch attached.
On 9/7/23 14:12, Dave Hansen wrote: > On 9/7/23 11:03, Adam Dunlap wrote: >> On Thu, Sep 7, 2023 at 10:06 AM Dave Hansen <dave.hansen@intel.com> wrote: >>> How about something like the attached patch? >> >> I think that's a much better idea, but unfortunately we can't rely on >> boot_cpu_data being 0'd out. While it is a static global variable that C says >> should be 0'd, the early interrupt happens before .bss is cleared (Note if it >> happens to be 0, then the __is_canonical_address check succeeds anyway -- the >> boot failure only happens when that variable happens to be other random values). >> If there's another check we could do, I'd agree that'd end up being a much >> better patch. For example, maybe we could add a field to cpuinfo_x86 is_valid >> that is manually (i.e. not part of the regular .bss clearing) set to false and >> is only set to true after early_identify_cpu is finished. Or the >> simplest thing would be to just manually set x86_virt_bits to 0 somewhere super early. > > Gah, what a mess. So the guilty CPUID in question is this: > >> /* Setup and Load IDT */ >> call early_setup_idt >> >> /* Check if nx is implemented */ >> movl $0x80000001, %eax >> cpuid >> movl %edx,%edi > > which is _barely_ after we have an IDT with which to generate > exceptions. What happens before this? This isn't the first CPUID > invocation. Does this one just happen to #VC and all the others before > don't? > > In any case, the most straightforward way out of this mess is to just > move boot_cpu_data out of .bss and explicitly initialize it along with > some documentation explaining the situation. Agreed, putting this in the .data section will ensure it is zero from the start. > >>> Also, what's the root cause here? What's causing the early exception? >>> It is some silly CPUID leaf? Should we be more careful to just avoid >>> these exceptions? >> >> Yes, it is a CPUID instruction in secondary_startup_64 with the comment /* Check >> if nx is implemented */. 
It appears to be pretty important. Potentially we could >> paravirtualize the CPUID direclty (i.e. mess with the GHCB and make the vmgexit) >> instead of taking the #VC, but I'm not sure that's a great idea. > > What TDX does here is actually pretty nice. It doesn't generate a #VE > for these. > > But seriously, is it even *possible* to spin up a SEV-SNP VM what > doesn't have NX? It is a common path, so while an SEV guest would have NX support, you would first have to determine that it is an SEV guest. That would take issuing a CPUID instruction in order to determine if a particular MSR can be read... Ultimately, we could probably pass the encryption mask from the decompressor to the kernel and avoid some of the checks during early boot of the kernel proper. Is it possible to boot an x86 kernel without going through the decompressor? Thanks, Tom > >> There's a couple of other CPUIDs that are called in early_identify_cpu between >> when x86_virt_bits is set to 48 and when it is set to its real value (IIUC, it >> may be set to 57 if 5 level paging is enabled), which could potentially cause >> spurious failures in those later calls. > > That should be easy enough to fix. > > These things where we initialize a value and then write over it are > always fragile. Let's just make one function that does it right and > does it once. > > Totally untested patch attached.
On 9/8/23 06:13, Tom Lendacky wrote: > On 9/7/23 14:12, Dave Hansen wrote: >> But seriously, is it even *possible* to spin up a SEV-SNP VM what >> doesn't have NX? > > It is a common path, so while an SEV guest would have NX support, you > would first have to determine that it is an SEV guest. That would take > issuing a CPUID instruction in order to determine if a particular MSR > can be read... I was thinking more along the lines of telling folks that if they want to turn SEV-SNP support on, they also have to give up on running on a !NX system. That would be a _bit_ nicer than just refusing to boot on all !NX systems. > Ultimately, we could probably pass the encryption mask from the > decompressor to the kernel and avoid some of the checks during early > boot of the kernel proper. Is it possible to boot an x86 kernel without > going through the decompressor? I think it's possible, but it's very unusual.
On Fri, Sep 08, 2023 at 07:48:58AM -0700, Dave Hansen wrote: > On 9/8/23 06:13, Tom Lendacky wrote: > > On 9/7/23 14:12, Dave Hansen wrote: > >> But seriously, is it even *possible* to spin up a SEV-SNP VM what > >> doesn't have NX? > > > > It is a common path, so while an SEV guest would have NX support, you > > would first have to determine that it is an SEV guest. That would take > > issuing a CPUID instruction in order to determine if a particular MSR > > can be read... > > I was thinking more along the lines of telling folks that if they want > to turn SEV-SNP support on, they also have to give up on running on a > !NX system. That would be a _bit_ nicer than just refusing to boot on > all !NX systems. I think there were a very limited number of !NX P4 class x86_64 systems. I vote in favour of dropping them into the museum bucket, they get to run an old kernel. Andrew and I looked at this a few months ago, but I forgot all details again :/
On Thu, Sep 7, 2023 at 12:12 PM Dave Hansen <dave.hansen@intel.com> wrote: > What happens before this? This isn't the first CPUID > invocation. Does this one just happen to #VC and all the others before > don't? I hadn't noticed this before, but there is an even earlier interrupt handler vc_no_ghcb which only supports cpuid. Potentially this version could work until boot_cpu_data is set up, but I wasn't able to get it working myself. > In any case, the most straightforward way out of this mess is to just > move boot_cpu_data out of .bss and explicitly initialize it along with > some documentation explaining the situation. That seems totally reasonable. I tried applying the two patches that you sent plus boot_cpu_data.x86_virt_bits = 0; in early_setup_idt(), and that fixes the problems that we can see. Do you want me to send out a new patch with these changes together?
On 9/7/23 15:30, Adam Dunlap wrote: >> In any case, the most straightforward way out of this mess is to >> just move boot_cpu_data out of .bss and explicitly initialize it >> along with some documentation explaining the situation. > That seems totally reasonable. I tried applying the two patches that > you sent plus boot_cpu_data.x86_virt_bits = 0; in early_setup_idt(), Let's just initialize boot_cpu_data to zero and get it out of .bss entirely. > and that fixes the problems that we can see. Do you want me to send > out a new patch with these changes together? Well, ideally broken out in a couple of patches, but yeah, please resend also with some revised changelogs based on what you've learned.
© 2016 - 2025 Red Hat, Inc.