[v3] TDX host: kexec/kdump support

[PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Kai Huang 3 months, 2 weeks ago

On TDX platforms, during kexec, the kernel needs to make sure there's no
dirty cachelines of TDX private memory before booting to the new kernel
to avoid silent memory corruption to the new kernel.

During kexec, the kexec-ing CPU firstly invokes native_stop_other_cpus()
to stop all remote CPUs before booting to the new kernel.  The remote
CPUs will then execute stop_this_cpu() to stop themselves.

The kernel has a percpu boolean to indicate whether the cache of a CPU
may be in incoherent state.  In stop_this_cpu(), the kernel does WBINVD
if that percpu boolean is true.

TDX turns on that percpu boolean on a CPU when the kernel does SEAMCALL.
This makes sure the caches will be flushed during kexec.

However, the native_stop_other_cpus() and stop_this_cpu() have a "race"
which is extremely rare to happen but could cause system to hang.

Specifically, the native_stop_other_cpus() firstly sends normal reboot
IPI to remote CPUs and wait one second for them to stop.  If that times
out, native_stop_other_cpus() then sends NMIs to remote CPUs to stop
them.

The aforementioned race happens when NMIs are sent.  Doing WBINVD in
stop_this_cpu() makes each CPU take longer time to stop and increases
the chance of the race to happen.

Register reboot notifier in KVM to explicitly flush caches upon
receiving reboot notifier (e.g., during kexec) for TDX.  This moves the
WBINVD to an earlier stage than stop_this_cpus(), avoiding a possibly
lengthy operation at a time where it could cause this race.

Signed-off-by: Kai Huang <kai.huang@intel.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Farrah Chen <farrah.chen@intel.com>
---

v2 -> v3:
 - Update changelog to address Paolo's comments and Add Paolo's Ack:
   https://lore.kernel.org/lkml/3a7c0856-6e7b-4d3d-b966-6f17f1aca42e@redhat.com/

---
 arch/x86/include/asm/tdx.h  |  3 +++
 arch/x86/kvm/vmx/tdx.c      | 45 +++++++++++++++++++++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx.c |  9 ++++++++
 3 files changed, 57 insertions(+)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index d4c624c69d7f..e6b11982c6c6 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -221,6 +221,8 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
 u64 tdh_phymem_cache_wb(bool resume);
 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
+
+void tdx_cpu_flush_cache(void);
 #else
 static inline void tdx_init(void) { }
 static inline int tdx_cpu_enable(void) { return -ENODEV; }
@@ -228,6 +230,7 @@ static inline int tdx_enable(void)  { return -ENODEV; }
 static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
 static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
 static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
+static inline void tdx_cpu_flush_cache(void) { }
 #endif	/* CONFIG_INTEL_TDX_HOST */
 
 #endif /* !__ASSEMBLER__ */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1ad20c273f3b..c567a64a6cb0 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -5,7 +5,9 @@
 #include <asm/fpu/xcr.h>
 #include <linux/misc_cgroup.h>
 #include <linux/mmu_context.h>
+#include <linux/reboot.h>
 #include <asm/tdx.h>
+#include <asm/processor.h>
 #include "capabilities.h"
 #include "mmu.h"
 #include "x86_ops.h"
@@ -3347,6 +3349,33 @@ static int tdx_offline_cpu(unsigned int cpu)
 	return -EBUSY;
 }
 
+static void smp_func_cpu_flush_cache(void *unused)
+{
+	tdx_cpu_flush_cache();
+}
+
+static int tdx_reboot_notify(struct notifier_block *nb, unsigned long code,
+			     void *unused)
+{
+	/*
+	 * Flush cache for all CPUs upon the reboot notifier.  This
+	 * avoids having to do WBINVD in stop_this_cpu() during kexec.
+	 *
+	 * Kexec calls native_stop_other_cpus() to stop remote CPUs
+	 * before booting to new kernel, but that code has a "race"
+	 * when the normal REBOOT IPI timesout and NMIs are sent to
+	 * remote CPUs to stop them.  Doing WBINVD in stop_this_cpu()
+	 * could potentially increase the posibility of the "race".
+	 */
+	if (code == SYS_RESTART)
+		on_each_cpu(smp_func_cpu_flush_cache, NULL, 1);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block tdx_reboot_nb = {
+	.notifier_call = tdx_reboot_notify,
+};
+
 static void __do_tdx_cleanup(void)
 {
 	/*
@@ -3504,6 +3533,11 @@ void tdx_cleanup(void)
 {
 	if (enable_tdx) {
 		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+		/*
+		 * Ignore the return value.  See the comment in
+		 * tdx_bringup().
+		 */
+		unregister_reboot_notifier(&tdx_reboot_nb);
 		__tdx_cleanup();
 		kvm_disable_virtualization();
 	}
@@ -3587,6 +3621,17 @@ int __init tdx_bringup(void)
 		enable_tdx = 0;
 	}
 
+	if (enable_tdx)
+		/*
+		 * Ignore the return value.  @tdx_reboot_nb is used to flush
+		 * cache for all CPUs upon rebooting to avoid having to do
+		 * WBINVD in kexec while the kexec-ing CPU stops all remote
+		 * CPUs.  Failure to register isn't fatal, because if KVM
+		 * doesn't flush cache explicitly upon rebooting the kexec
+		 * will do it anyway.
+		 */
+		register_reboot_notifier(&tdx_reboot_nb);
+
 	return r;
 
 success_disable_tdx:
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..73425e9bee39 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1870,3 +1870,12 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+
+void tdx_cpu_flush_cache(void)
+{
+	lockdep_assert_preemption_disabled();
+
+	wbinvd();
+	this_cpu_write(cache_state_incoherent, false);
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache);
-- 
2.49.0

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Chao Gao 3 months, 1 week ago

>--- a/arch/x86/virt/vmx/tdx/tdx.c
>+++ b/arch/x86/virt/vmx/tdx/tdx.c
>@@ -1870,3 +1870,12 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
> 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
> }
> EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
>+
>+void tdx_cpu_flush_cache(void)
>+{
>+	lockdep_assert_preemption_disabled();
>+
>+	wbinvd();

Shouldn't you check the per-CPU variable first? so that WBINVD can be
skipped if there is nothing incoherent.

And reboot notifier looks the wrong place for WBINVD. Because SEAMCALLs
(see below) called after the reboot notifier will set the per-CPU variable
again. So in some cases, this patch will result in an *extra* WBINVD
instead of moving WBINVD to an earlier stage.

kernel_kexec()
  ->kernel_restart_prepare()
    ->blocking_notifier_call_chain() // reboot notifier
  ->syscore_shutdown()
    -> ...
      ->tdx_disable_virtualization_cpu()
        ->tdx_flush_vp()

>+	this_cpu_write(cache_state_incoherent, false);
>+}
>+EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache);

I wonder why we don't simply perform WBINVD in
vt_disable_virtualization_cpu() after VMXOFF, i.e.,

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..1ad3c28b8eff 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -19,6 +19,8 @@ static void vt_disable_virtualization_cpu(void)
	if (enable_tdx)
		tdx_disable_virtualization_cpu();
	vmx_disable_virtualization_cpu();
+	/* Explain why WBINVD is needed */
+	if (enable_tdx)
+		wbinvd();
 }
 
 static __init int vt_hardware_setup(void)

It can solve the cache line aliasing problem and is much simpler than
patches 1-2 and 6.

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Huang, Kai 3 months ago

On Wed, 2025-07-02 at 15:54 +0800, Chao Gao wrote:
> I wonder why we don't simply perform WBINVD in
> vt_disable_virtualization_cpu() after VMXOFF, i.e.,
> 
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index d1e02e567b57..1ad3c28b8eff 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -19,6 +19,8 @@ static void vt_disable_virtualization_cpu(void)
> 	if (enable_tdx)
> 		tdx_disable_virtualization_cpu();
> 	vmx_disable_virtualization_cpu();
> +	/* Explain why WBINVD is needed */
> +	if (enable_tdx)
> +		wbinvd();
>  }
>  
>  static __init int vt_hardware_setup(void)
> 
> It can solve the cache line aliasing problem and is much simpler than
> patches 1-2 and 6.

After more thinking, I think the percpu boolean isn't conflicting with
what you suggested above.  As Dave mentioned here[*], it can help catch
wbinvd() at kexec-ing time if something screwed up at earlier time:

  ...
  hopefully at point after you're sure no more TDCALLs are being made. If
  you screw it up, no biggie: the kexec-time one will make up for it,
  exposing TDX systems to the kexec timing bugs. But if the on_each_cpu()
  thing works in the common case, you get no additional bug exposure.

Btw, the reason that I wanted to do wbinvd() in rebooting notifier is it
suits a good place to reset TDX private memory managed by KVM on "partial
write machine check" erratum platforms (which isn't included in this patch
though).  We need to flush cache before resetting TDX private memory.

So while doing wbinvd() after vmx_disable_virtualization_cpu() sounds
promising for cache flush, it is not ideal for handling erratum.  Using
rebooting notifier makes such code self-contained in tdx.c in KVM.

[*]:
https://lore.kernel.org/lkml/31e17bc8-2e9e-4e93-a912-3d54826e59d0@intel.com/

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Huang, Kai 3 months, 1 week ago

On Wed, 2025-07-02 at 15:54 +0800, Chao Gao wrote:
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -1870,3 +1870,12 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
> > 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
> > }
> > EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
> > +
> > +void tdx_cpu_flush_cache(void)
> > +{
> > +	lockdep_assert_preemption_disabled();
> > +
> > +	wbinvd();
> 
> Shouldn't you check the per-CPU variable first? so that WBINVD can be
> skipped if there is nothing incoherent.

It is assumed the caller of this function knows that cache needs to be
flushed.  But I can do the additional check and skip the wbinvd().

> 
> And reboot notifier looks the wrong place for WBINVD. Because SEAMCALLs
> (see below) called after the reboot notifier will set the per-CPU variable
> again. So in some cases, this patch will result in an *extra* WBINVD
> instead of moving WBINVD to an earlier stage.

I agree.  Me and Rick had some discussion around here and this patch can
still bring optimization "in most cases", i.e., the real user of kexec
normally will just do the kexec when no TD is running.

To make it complete we should manually kill all TDs upon rebooting notifier.

I should call that out in the changelog though.

> 
> kernel_kexec()
>   ->kernel_restart_prepare()
>     ->blocking_notifier_call_chain() // reboot notifier
>   ->syscore_shutdown()
>     -> ...
>       ->tdx_disable_virtualization_cpu()
>         ->tdx_flush_vp()
> 
> > +	this_cpu_write(cache_state_incoherent, false);
> > +}
> > +EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache);
> 
> I wonder why we don't simply perform WBINVD in
> vt_disable_virtualization_cpu() after VMXOFF, i.e.,
> 
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index d1e02e567b57..1ad3c28b8eff 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -19,6 +19,8 @@ static void vt_disable_virtualization_cpu(void)
> 	if (enable_tdx)
> 		tdx_disable_virtualization_cpu();
> 	vmx_disable_virtualization_cpu();
> +	/* Explain why WBINVD is needed */
> +	if (enable_tdx)
> +		wbinvd();
>  }
> 
>  static __init int vt_hardware_setup(void)
> 
> It can solve the cache line aliasing problem and is much simpler than
> patches 1-2 and 6.

This sounds promising, but the emergency virtualization part needs similar
handling too.

Previously the VMXOFF in the emergency virtualization was explicitly done in
the core kernel, so for this approach we have to scatter checks for
different vendors at different places.  This wasn't nice.

Now the emergency virtualization disable itself is implemented in KVM too
(the core kernel only calls a function pointer), so I _think_ the situation
is better now, if we do wbinvd() in VMX disabling path.

It will get more complicated when other kernel components (VT-d) need to
invoke SEAMCALL, but at that time VMX code should have been moved to core
kernel, so doing WBINVD after VMXOFF sounds fine too.

One concern is this approach doesn't play quite nice with below pattern:

	cpu_enable_virtualization();
	SEAMCALL();
	cpu_disable_virtualization();

but hopefully VT-d code doesn't need to do like this.

The percpu boolean approach does have another advantage (although it's more
like theoretical issue), that it could be also used to cover other cases
that could also lead to cache being in incoherent:

https://lore.kernel.org/lkml/eb2e3b02-cf5e-4848-8f1d-9f3af8f9c96b@intel.com/

I'll think more on this.  I will be out for the rest of the week so I will
come back next week.

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Binbin Wu 3 months, 1 week ago


On 6/26/2025 6:48 PM, Kai Huang wrote:
> On TDX platforms, during kexec, the kernel needs to make sure there's no
> dirty cachelines of TDX private memory before booting to the new kernel
> to avoid silent memory corruption to the new kernel.
>
> During kexec, the kexec-ing CPU firstly invokes native_stop_other_cpus()
> to stop all remote CPUs before booting to the new kernel.  The remote
> CPUs will then execute stop_this_cpu() to stop themselves.
>
> The kernel has a percpu boolean to indicate whether the cache of a CPU
> may be in incoherent state.  In stop_this_cpu(), the kernel does WBINVD
> if that percpu boolean is true.
>
> TDX turns on that percpu boolean on a CPU when the kernel does SEAMCALL.
> This makes sure the caches will be flushed during kexec.
>
> However, the native_stop_other_cpus() and stop_this_cpu() have a "race"
> which is extremely rare to happen but could cause system to hang.
>
> Specifically, the native_stop_other_cpus() firstly sends normal reboot
> IPI to remote CPUs and wait one second for them to stop.  If that times
> out, native_stop_other_cpus() then sends NMIs to remote CPUs to stop
> them.
>
> The aforementioned race happens when NMIs are sent.  Doing WBINVD in
> stop_this_cpu() makes each CPU take longer time to stop and increases
> the chance of the race to happen.
>
> Register reboot notifier in KVM to explicitly flush caches upon
> receiving reboot notifier (e.g., during kexec) for TDX.  This moves the
> WBINVD to an earlier stage than stop_this_cpus(), avoiding a possibly
> lengthy operation at a time where it could cause this race.
Two nits below.

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>

>
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Tested-by: Farrah Chen <farrah.chen@intel.com>
> ---
>
> v2 -> v3:
>   - Update changelog to address Paolo's comments and Add Paolo's Ack:
>     https://lore.kernel.org/lkml/3a7c0856-6e7b-4d3d-b966-6f17f1aca42e@redhat.com/
>
> ---
>   arch/x86/include/asm/tdx.h  |  3 +++
>   arch/x86/kvm/vmx/tdx.c      | 45 +++++++++++++++++++++++++++++++++++++
>   arch/x86/virt/vmx/tdx/tdx.c |  9 ++++++++
>   3 files changed, 57 insertions(+)
>
[...]
> +
> +static int tdx_reboot_notify(struct notifier_block *nb, unsigned long code,
> +			     void *unused)
> +{
> +	/*
> +	 * Flush cache for all CPUs upon the reboot notifier.  This
> +	 * avoids having to do WBINVD in stop_this_cpu() during kexec.
> +	 *
> +	 * Kexec calls native_stop_other_cpus() to stop remote CPUs
> +	 * before booting to new kernel, but that code has a "race"
> +	 * when the normal REBOOT IPI timesout and NMIs are sent to

timesout should be times out or timeouts?

> +	 * remote CPUs to stop them.  Doing WBINVD in stop_this_cpu()
> +	 * could potentially increase the posibility of the "race".
s/posibility/possibility

> +	 */
> +	if (code == SYS_RESTART)
> +		on_each_cpu(smp_func_cpu_flush_cache, NULL, 1);
> +	return NOTIFY_DONE;
> +}
> +
>
[...]

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Huang, Kai 3 months, 1 week ago

> Two nits below.
> 
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> 
> > 
> > +	/*
> > +	 * Flush cache for all CPUs upon the reboot notifier.  This
> > +	 * avoids having to do WBINVD in stop_this_cpu() during kexec.
> > +	 *
> > +	 * Kexec calls native_stop_other_cpus() to stop remote CPUs
> > +	 * before booting to new kernel, but that code has a "race"
> > +	 * when the normal REBOOT IPI timesout and NMIs are sent to
> 
> timesout should be times out or timeouts?

I will use "times out".

> 
> > +	 * remote CPUs to stop them.  Doing WBINVD in stop_this_cpu()
> > +	 * could potentially increase the posibility of the "race".
> s/posibility/possibility
> 
> 

Oops I didn't check enough :-)

Will fix and thanks.

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Edgecombe, Rick P 3 months, 1 week ago

On Thu, 2025-06-26 at 22:48 +1200, Kai Huang wrote:
> On TDX platforms, during kexec, the kernel needs to make sure there's no
> dirty cachelines of TDX private memory before booting to the new kernel
> to avoid silent memory corruption to the new kernel.
> 
> During kexec, the kexec-ing CPU firstly invokes native_stop_other_cpus()
> to stop all remote CPUs before booting to the new kernel.  The remote
> CPUs will then execute stop_this_cpu() to stop themselves.
> 
> The kernel has a percpu boolean to indicate whether the cache of a CPU
> may be in incoherent state.  In stop_this_cpu(), the kernel does WBINVD
> if that percpu boolean is true.
> 
> TDX turns on that percpu boolean on a CPU when the kernel does SEAMCALL.
> This makes sure the caches will be flushed during kexec.
> 
> However, the native_stop_other_cpus() and stop_this_cpu() have a "race"
> which is extremely rare to happen but could cause system to hang.
> 
> Specifically, the native_stop_other_cpus() firstly sends normal reboot
> IPI to remote CPUs and wait one second for them to stop.  If that times
> out, native_stop_other_cpus() then sends NMIs to remote CPUs to stop
> them.
> 
> The aforementioned race happens when NMIs are sent.  Doing WBINVD in
> stop_this_cpu() makes each CPU take longer time to stop and increases
> the chance of the race to happen.
> 
> Register reboot notifier in KVM to explicitly flush caches upon
> receiving reboot notifier (e.g., during kexec) for TDX.  This moves the
> WBINVD to an earlier stage than stop_this_cpus(), avoiding a possibly
> lengthy operation at a time where it could cause this race.
> 

I think this will reduce the chance of the race, but it feels like the wrong way
to address the race. But I don't know how to properly fix it either. Maybe this
is just the nature of x86 NMIs, to have code like this.

Functionally it looks good, but a few nits to take or leave below.

> Signed-off-by: Kai Huang <kai.huang@intel.com>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Tested-by: Farrah Chen <farrah.chen@intel.com>
> ---
> 
> v2 -> v3:
>  - Update changelog to address Paolo's comments and Add Paolo's Ack:
>    https://lore.kernel.org/lkml/3a7c0856-6e7b-4d3d-b966-6f17f1aca42e@redhat.com/
> 
> ---
>  arch/x86/include/asm/tdx.h  |  3 +++
>  arch/x86/kvm/vmx/tdx.c      | 45 +++++++++++++++++++++++++++++++++++++
>  arch/x86/virt/vmx/tdx/tdx.c |  9 ++++++++
>  3 files changed, 57 insertions(+)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index d4c624c69d7f..e6b11982c6c6 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -221,6 +221,8 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
>  u64 tdh_phymem_cache_wb(bool resume);
>  u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
>  u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
> +

Nit: There is a new line here, but not below. I guess it's ok.

> +void tdx_cpu_flush_cache(void);
>  #else
>  static inline void tdx_init(void) { }
>  static inline int tdx_cpu_enable(void) { return -ENODEV; }
> @@ -228,6 +230,7 @@ static inline int tdx_enable(void)  { return -ENODEV; }
>  static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
>  static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
>  static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
> +static inline void tdx_cpu_flush_cache(void) { }
>  #endif	/* CONFIG_INTEL_TDX_HOST */
>  
>  #endif /* !__ASSEMBLER__ */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 1ad20c273f3b..c567a64a6cb0 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -5,7 +5,9 @@
>  #include <asm/fpu/xcr.h>
>  #include <linux/misc_cgroup.h>
>  #include <linux/mmu_context.h>
> +#include <linux/reboot.h>
>  #include <asm/tdx.h>
> +#include <asm/processor.h>
>  #include "capabilities.h"
>  #include "mmu.h"
>  #include "x86_ops.h"
> @@ -3347,6 +3349,33 @@ static int tdx_offline_cpu(unsigned int cpu)
>  	return -EBUSY;
>  }
>  
> +static void smp_func_cpu_flush_cache(void *unused)
> +{
> +	tdx_cpu_flush_cache();
> +}
> +
> +static int tdx_reboot_notify(struct notifier_block *nb, unsigned long code,
> +			     void *unused)
> +{
> +	/*
> +	 * Flush cache for all CPUs upon the reboot notifier.  This
> +	 * avoids having to do WBINVD in stop_this_cpu() during kexec.
> +	 *
> +	 * Kexec calls native_stop_other_cpus() to stop remote CPUs
> +	 * before booting to new kernel, but that code has a "race"
> +	 * when the normal REBOOT IPI timesout and NMIs are sent to
> +	 * remote CPUs to stop them.  Doing WBINVD in stop_this_cpu()
> +	 * could potentially increase the posibility of the "race".
> +	 */
> +	if (code == SYS_RESTART)
> +		on_each_cpu(smp_func_cpu_flush_cache, NULL, 1);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block tdx_reboot_nb = {
> +	.notifier_call = tdx_reboot_notify,
> +};
> +
>  static void __do_tdx_cleanup(void)
>  {
>  	/*
> @@ -3504,6 +3533,11 @@ void tdx_cleanup(void)
>  {
>  	if (enable_tdx) {
>  		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
> +		/*
> +		 * Ignore the return value.  See the comment in
> +		 * tdx_bringup().
> +		 */
> +		unregister_reboot_notifier(&tdx_reboot_nb);
>  		__tdx_cleanup();
>  		kvm_disable_virtualization();
>  	}
> @@ -3587,6 +3621,17 @@ int __init tdx_bringup(void)
>  		enable_tdx = 0;
>  	}
>  
> +	if (enable_tdx)
> +		/*
> +		 * Ignore the return value.  @tdx_reboot_nb is used to flush
> +		 * cache for all CPUs upon rebooting to avoid having to do
> +		 * WBINVD in kexec while the kexec-ing CPU stops all remote
> +		 * CPUs.  Failure to register isn't fatal, because if KVM
> +		 * doesn't flush cache explicitly upon rebooting the kexec
> +		 * will do it anyway.
> +		 */
> +		register_reboot_notifier(&tdx_reboot_nb);
> +

The comment should be inside a {}.

>  	return r;
>  
>  success_disable_tdx:
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index c7a9a087ccaf..73425e9bee39 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1870,3 +1870,12 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
>  	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
>  }
>  EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
> +
> +void tdx_cpu_flush_cache(void)
> +{
> +	lockdep_assert_preemption_disabled();
> +
> +	wbinvd();
> +	this_cpu_write(cache_state_incoherent, false);
> +}
> +EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache);

Does this need to be here? Why not in KVM?

Re: [PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier

Posted by Huang, Kai 3 months, 1 week ago

> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -221,6 +221,8 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
> >  u64 tdh_phymem_cache_wb(bool resume);
> >  u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
> >  u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
> > +
> 
> Nit: There is a new line here, but not below. I guess it's ok.

I will remove.


[...]

> > +	if (enable_tdx)
> > +		/*
> > +		 * Ignore the return value.  @tdx_reboot_nb is used to flush
> > +		 * cache for all CPUs upon rebooting to avoid having to do
> > +		 * WBINVD in kexec while the kexec-ing CPU stops all remote
> > +		 * CPUs.  Failure to register isn't fatal, because if KVM
> > +		 * doesn't flush cache explicitly upon rebooting the kexec
> > +		 * will do it anyway.
> > +		 */
> > +		register_reboot_notifier(&tdx_reboot_nb);
> > +
> 
> The comment should be inside a {}.

Will do.

> 
> >  	return r;
> >  
> >  success_disable_tdx:
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index c7a9a087ccaf..73425e9bee39 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -1870,3 +1870,12 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
> >  	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
> >  }
> >  EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
> > +
> > +void tdx_cpu_flush_cache(void)
> > +{
> > +	lockdep_assert_preemption_disabled();
> > +
> > +	wbinvd();
> > +	this_cpu_write(cache_state_incoherent, false);
> > +}
> > +EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache);
> 
> Does this need to be here? Why not in KVM?

Otherwise the 'cache_state_incoherent' variable needs to be exported.  It's
good to hide the details in TDX core code too.

[PATCH v3 1/6] x86/sme: Use percpu boolean to control wbinvd during kexec
[PATCH v3 2/6] x86/virt/tdx: Mark memory cache state incoherent when making SEAMCALL
[PATCH v3 3/6] x86/kexec: Disable kexec/kdump on platforms with TDX partial write erratum
[PATCH v3 4/6] x86/virt/tdx: Remove the !KEXEC_CORE dependency
[PATCH v3 5/6] x86/virt/tdx: Update the kexec section in the TDX documentation
[PATCH v3 6/6] KVM: TDX: Explicitly do WBINVD upon reboot notifier