AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
This is done by allocating pages normally from the buddy allocator and
then converting them to shared using set_memory_decrypted().
On kexec, the second kernel is unaware of which memory has been
converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
memory as private is fatal.
Therefore, the memory state must be reset to its original state before
starting the new kernel with kexec.
The process of converting shared memory back to private occurs in two
steps:
- enc_kexec_stop_conversion() stops new conversions.
- enc_kexec_unshare_mem() unshares all existing shared memory, reverting
it back to private.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>x
Reviewed-by: Kai Huang <kai.huang@intel.com>
Tested-by: Tao Liu <ltao@redhat.com>
---
arch/x86/include/asm/x86_init.h | 2 ++
arch/x86/kernel/crash.c | 6 ++++++
arch/x86/kernel/reboot.c | 12 ++++++++++++
arch/x86/kernel/x86_init.c | 4 ++++
4 files changed, 24 insertions(+)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 28ac3cb9b987..c731e6bc4343 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -155,6 +155,8 @@ struct x86_guest {
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_stop_conversion)(bool crash);
+ void (*enc_kexec_unshare_mem)(void);
};
/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e74d0c4286c1..7a1560d7e62d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ x86_platform.guest.enc_kexec_stop_conversion(true);
+ x86_platform.guest.enc_kexec_unshare_mem();
+ }
+
crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..1ec478f40963 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_stop_conversion() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kexec_in_progress)
+ x86_platform.guest.enc_kexec_stop_conversion(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kexec_in_progress)
+ x86_platform.guest.enc_kexec_unshare_mem();
}
static void __machine_emergency_restart(int emergency)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a7143bb7dd93..045ce1c70070 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,6 +138,8 @@ static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_stop_conversion_noop(bool crash) {}
+static void enc_kexec_unshare_mem_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_stop_conversion = enc_kexec_stop_conversion_noop,
+ .enc_kexec_unshare_mem = enc_kexec_unshare_mem_noop,
},
};
--
2.43.0
On Tue, Apr 09, 2024 at 02:30:01PM +0300, Kirill A. Shutemov wrote:
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index e74d0c4286c1..7a1560d7e62d 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -128,6 +128,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
> #ifdef CONFIG_HPET_TIMER
> hpet_disable();
> #endif
> +
> + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
> + x86_platform.guest.enc_kexec_stop_conversion(true);
> + x86_platform.guest.enc_kexec_unshare_mem();
> + }
This is not how this is done - the point of those function pointers is
to avoid random checks in the code but simply unconditionally call them.
The platform which needs something special to happen, assigns to them
its own function pointers and the rest assigns dummy stubs.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
This is done by allocating pages normally from the buddy allocator and
then converting them to shared using set_memory_decrypted().
On kexec, the second kernel is unaware of which memory has been
converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
memory as private is fatal.
Therefore, the memory state must be reset to its original state before
starting the new kernel with kexec.
The process of converting shared memory back to private occurs in two
steps:
- enc_kexec_stop_conversion() stops new conversions.
- enc_kexec_unshare_mem() unshares all existing shared memory, reverting
it back to private.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>x
Reviewed-by: Kai Huang <kai.huang@intel.com>
Tested-by: Tao Liu <ltao@redhat.com>
---
arch/x86/include/asm/x86_init.h | 2 ++
arch/x86/kernel/crash.c | 4 ++++
arch/x86/kernel/reboot.c | 12 ++++++++++++
arch/x86/kernel/x86_init.c | 4 ++++
4 files changed, 22 insertions(+)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 28ac3cb9b987..c731e6bc4343 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -155,6 +155,8 @@ struct x86_guest {
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_stop_conversion)(bool crash);
+ void (*enc_kexec_unshare_mem)(void);
};
/**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e74d0c4286c1..f1b261be78b4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ x86_platform.guest.enc_kexec_stop_conversion(true);
+ x86_platform.guest.enc_kexec_unshare_mem();
+
crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..c1920ec34f0c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_stop_conversion() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_stop_conversion(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_unshare_mem();
}
static void __machine_emergency_restart(int emergency)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a7143bb7dd93..045ce1c70070 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,6 +138,8 @@ static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_stop_conversion_noop(bool crash) {}
+static void enc_kexec_unshare_mem_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_stop_conversion = enc_kexec_stop_conversion_noop,
+ .enc_kexec_unshare_mem = enc_kexec_unshare_mem_noop,
},
};
--
2.43.0
On Sat, Apr 27, 2024 at 08:06:34PM +0300, Kirill A. Shutemov wrote:
> Subject: Re: [PATCHv10.1 09/18] x86/mm: Adding callbacks to prepare encrypted memory for kexec
s/Adding/Add/
> AMD SEV and Intel TDX guests allocate shared buffers for performing I/O.
> This is done by allocating pages normally from the buddy allocator and
> then converting them to shared using set_memory_decrypted().
>
> On kexec, the second kernel is unaware of which memory has been
> converted in this manner. It only sees E820_TYPE_RAM. Accessing shared
> memory as private is fatal.
>
> Therefore, the memory state must be reset to its original state before
> starting the new kernel with kexec.
>
> The process of converting shared memory back to private occurs in two
> steps:
>
> - enc_kexec_stop_conversion() stops new conversions.
>
> - enc_kexec_unshare_mem() unshares all existing shared memory, reverting
> it back to private.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>x
> Reviewed-by: Kai Huang <kai.huang@intel.com>
> Tested-by: Tao Liu <ltao@redhat.com>
> ---
> arch/x86/include/asm/x86_init.h | 2 ++
> arch/x86/kernel/crash.c | 4 ++++
> arch/x86/kernel/reboot.c | 12 ++++++++++++
> arch/x86/kernel/x86_init.c | 4 ++++
> 4 files changed, 22 insertions(+)
>
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 28ac3cb9b987..c731e6bc4343 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -155,6 +155,8 @@ struct x86_guest {
> int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
> bool (*enc_tlb_flush_required)(bool enc);
> bool (*enc_cache_flush_required)(void);
> + void (*enc_kexec_stop_conversion)(bool crash);
> + void (*enc_kexec_unshare_mem)(void);
This is all fine and dandy but those functions need documentation in the
kernel doc above it.
And if there's a "stop_conversion" function, then I'd expect there to be
a "start conversion" one.
I think you need to give those two better names to denote that they're
related, the order in which they should be called and why they're
separate.
> };
>
> /**
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index e74d0c4286c1..f1b261be78b4 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -128,6 +128,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
> #ifdef CONFIG_HPET_TIMER
> hpet_disable();
> #endif
> +
> + x86_platform.guest.enc_kexec_stop_conversion(true);
> + x86_platform.guest.enc_kexec_unshare_mem();
> +
You call them here back-to-back...
> crash_save_cpu(regs, safe_smp_processor_id());
> }
>
> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
> index f3130f762784..c1920ec34f0c 100644
> --- a/arch/x86/kernel/reboot.c
> +++ b/arch/x86/kernel/reboot.c
> @@ -12,6 +12,7 @@
> #include <linux/delay.h>
> #include <linux/objtool.h>
> #include <linux/pgtable.h>
> +#include <linux/kexec.h>
> #include <acpi/reboot.h>
> #include <asm/io.h>
> #include <asm/apic.h>
> @@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
>
> void native_machine_shutdown(void)
> {
> + /*
> + * Call enc_kexec_stop_conversion() while all CPUs are still active and
> + * interrupts are enabled. This will allow all in-flight memory
> + * conversions to finish cleanly.
> + */
> + if (kexec_in_progress)
> + x86_platform.guest.enc_kexec_stop_conversion(false);
> +
> /* Stop the cpus and apics */
> #ifdef CONFIG_X86_IO_APIC
> /*
> @@ -752,6 +761,9 @@ void native_machine_shutdown(void)
> #ifdef CONFIG_X86_64
> x86_platform.iommu_shutdown();
> #endif
> +
> + if (kexec_in_progress)
> + x86_platform.guest.enc_kexec_unshare_mem();
... but they're split here.
And I don't know why and nothing tells me...
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Thu, May 02, 2024 at 03:45:06PM +0200, Borislav Petkov wrote:
> > diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> > index e74d0c4286c1..f1b261be78b4 100644
> > --- a/arch/x86/kernel/crash.c
> > +++ b/arch/x86/kernel/crash.c
> > @@ -128,6 +128,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
> > #ifdef CONFIG_HPET_TIMER
> > hpet_disable();
> > #endif
> > +
> > + x86_platform.guest.enc_kexec_stop_conversion(true);
> > + x86_platform.guest.enc_kexec_unshare_mem();
> > +
>
> You call them here back-to-back...
>
> > crash_save_cpu(regs, safe_smp_processor_id());
> > }
> >
> > diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
> > index f3130f762784..c1920ec34f0c 100644
> > --- a/arch/x86/kernel/reboot.c
> > +++ b/arch/x86/kernel/reboot.c
> > @@ -12,6 +12,7 @@
> > #include <linux/delay.h>
> > #include <linux/objtool.h>
> > #include <linux/pgtable.h>
> > +#include <linux/kexec.h>
> > #include <acpi/reboot.h>
> > #include <asm/io.h>
> > #include <asm/apic.h>
> > @@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
> >
> > void native_machine_shutdown(void)
> > {
> > + /*
> > + * Call enc_kexec_stop_conversion() while all CPUs are still active and
> > + * interrupts are enabled. This will allow all in-flight memory
> > + * conversions to finish cleanly.
> > + */
> > + if (kexec_in_progress)
> > + x86_platform.guest.enc_kexec_stop_conversion(false);
> > +
> > /* Stop the cpus and apics */
> > #ifdef CONFIG_X86_IO_APIC
> > /*
> > @@ -752,6 +761,9 @@ void native_machine_shutdown(void)
> > #ifdef CONFIG_X86_64
> > x86_platform.iommu_shutdown();
> > #endif
> > +
> > + if (kexec_in_progress)
> > + x86_platform.guest.enc_kexec_unshare_mem();
>
> ... but they're split here.
>
> And I don't know why and nothing tells me...
I do. See comment just above enc_kexec_stop_conversion() call.
Do you want also comment for enc_kexec_unshare_mem() ?
--
Kiryl Shutsemau / Kirill A. Shutemov
On Mon, May 06, 2024 at 04:22:02PM +0300, Kirill A. Shutemov wrote:
> I do. See comment just above enc_kexec_stop_conversion() call.
If you mean this:
/*
* Call enc_kexec_stop_conversion() while all CPUs are still active and
* interrupts are enabled. This will allow all in-flight memory
* conversions to finish cleanly.
*/
if (kexec_in_progress)
x86_platform.guest.enc_kexec_stop_conversion(false);
then no, this is not enough.
I mean this:
/**
* struct x86_guest - Functions used by misc guest incarnations like SEV, TDX, etc.
*
* @enc_status_change_prepare Notify HV before the encryption status of a range is changed
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
* @enc_kexec_begin Begin the two-step process of stopping
* page conversion... <insert reason why it
* needs to happen this way, blabla>
* @enc_kexec_finish ...
*/
struct x86_guest {
int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
void (*enc_kexec_begin)(bool crash);
void (*enc_kexec_finish)(void);
And calling them a _begin and _finish makes a lot more sense to me:
_begin starts the kexec process for encrypted guests and _finish
finishes it.
Just from the names you now know what needs to happen and in which
order.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
© 2016 - 2026 Red Hat, Inc.