[PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable

Brendan Jackman posted 1 patch 2 months ago
There is a newer version of this series
arch/x86/include/asm/hardirq.h  | 26 --------------------------
arch/x86/include/asm/idtentry.h |  1 +
arch/x86/include/asm/kvm_host.h | 21 ++++++++++++++++++---
arch/x86/kvm/mmu/mmu.c          |  2 +-
arch/x86/kvm/vmx/nested.c       |  2 +-
arch/x86/kvm/vmx/vmx.c          | 17 +++--------------
arch/x86/kvm/x86.c              | 12 +++++++++---
7 files changed, 33 insertions(+), 48 deletions(-)
[PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Brendan Jackman 2 months ago
Currently the tracking of the need to flush L1D for L1TF is tracked by
two bits: one per-CPU and one per-vCPU.

The per-vCPU bit is always set when the vCPU shows up on a core, so
there is no interesting state that's truly per-vCPU. Indeed, this is a
requirement, since L1D is a part of the physical CPU.

So simplify this by combining the two bits.

Since this requires a DECLARE_PER_CPU() which belongs in kvm_host.h,
also move the remaining helper definitions there to live next to the
declaration.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 arch/x86/include/asm/hardirq.h  | 26 --------------------------
 arch/x86/include/asm/idtentry.h |  1 +
 arch/x86/include/asm/kvm_host.h | 21 ++++++++++++++++++---
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/vmx/nested.c       |  2 +-
 arch/x86/kvm/vmx/vmx.c          | 17 +++--------------
 arch/x86/kvm/x86.c              | 12 +++++++++---
 7 files changed, 33 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index f00c09ffe6a95f07342bb0c6cea3769d71eecfa9..29d8fa43d4404add3b191821e42c3526b0f2c950 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -5,9 +5,6 @@
 #include <linux/threads.h>
 
 typedef struct {
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-	u8	     kvm_cpu_l1tf_flush_l1d;
-#endif
 	unsigned int __nmi_count;	/* arch dependent */
 #ifdef CONFIG_X86_LOCAL_APIC
 	unsigned int apic_timer_irqs;	/* arch dependent */
@@ -68,27 +65,4 @@ extern u64 arch_irq_stat(void);
 DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
 #define local_softirq_pending_ref       __softirq_pending
 
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-/*
- * This function is called from noinstr interrupt contexts
- * and must be inlined to not get instrumentation.
- */
-static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void)
-{
-	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
-}
-
-static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
-{
-	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
-}
-
-static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
-{
-	return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
-}
-#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
-static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
-#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
-
 #endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index a4ec27c6798875900cbdbba927918e70b900f63b..67fb1adadbb8ac2bd083ba6245de2e7d58b5b398 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -12,6 +12,7 @@
 #include <linux/hardirq.h>
 
 #include <asm/irq_stack.h>
+#include <asm/kvm_host.h>
 
 typedef void (*idtentry_t)(struct pt_regs *regs);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f3f07263a2ffffe670be2658eb9cb..d93c2b9dbfbc9824cce65256f606f32e41c93167 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1055,9 +1055,6 @@ struct kvm_vcpu_arch {
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
 
-	/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
-	bool l1tf_flush_l1d;
-
 	/* Host CPU on which VM-entry was most recently attempted */
 	int last_vmentry_cpu;
 
@@ -2476,4 +2473,22 @@ static inline bool kvm_arch_has_irq_bypass(void)
 	return enable_device_posted_irqs;
 }
 
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+
+DECLARE_PER_CPU(bool, l1tf_flush_l1d);
+
+/*
+ * This function is called from noinstr interrupt contexts
+ * and must be inlined to not get instrumentation.
+ */
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void)
+{
+	__this_cpu_write(l1tf_flush_l1d, true);
+}
+
+#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
+
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5e52c22f9517914307244ae868eea..8c0dce401a42d977756ca82d249bb33c858b9c9f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	 */
 	BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 	if (!flags) {
 		trace_kvm_page_fault(vcpu, fault_address, error_code);
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb7083b475de6d7d24bf9cb918050650..5035cfdc4e55365bfabf08c704b9bff5c06267a1 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3880,7 +3880,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		goto vmentry_failed;
 
 	/* Hide L1D cache contents from the nested guest.  */
-	vmx->vcpu.arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	/*
 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 546272a5d34da301710df1d89414f41fc9b24a1f..f982f6721dc3e0dfe046881c72732326e16fcfb3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6673,25 +6673,14 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 	 * 'always'
 	 */
 	if (static_branch_likely(&vmx_l1d_flush_cond)) {
-		bool flush_l1d;
-
 		/*
 		 * Clear the per-vcpu flush bit, it gets set again if the vCPU
 		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
 		 * exits to userspace, or if KVM reaches one of the unsafe
-		 * VMEXIT handlers, e.g. if KVM calls into the emulator.
+		 * VMEXIT handlers, e.g. if KVM calls into the emulator, or from the
+		 * interrupt handlers.
 		 */
-		flush_l1d = vcpu->arch.l1tf_flush_l1d;
-		vcpu->arch.l1tf_flush_l1d = false;
-
-		/*
-		 * Clear the per-cpu flush bit, it gets set again from
-		 * the interrupt handlers.
-		 */
-		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
-		kvm_clear_cpu_l1tf_flush_l1d();
-
-		if (!flush_l1d)
+		if (!this_cpu_xchg(l1tf_flush_l1d, false))
 			return;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b8138bd48572fd161eda73d2dbdc1dcd0bcbcac..766d61516602e0f4975930224fc57b5a511281e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -171,6 +171,12 @@ bool __read_mostly enable_vmware_backdoor = false;
 module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor);
 
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
+DEFINE_PER_CPU(bool, l1tf_flush_l1d);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(l1tf_flush_l1d);
+#endif
+
 /*
  * Flags to manipulate forced emulation behavior (any non-zero value will
  * enable forced emulation).
@@ -5190,7 +5196,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
 		pmu->need_cleanup = true;
@@ -8000,7 +8006,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 				unsigned int bytes, struct x86_exception *exception)
 {
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
@@ -9396,7 +9402,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		return handle_emulation_failure(vcpu, emulation_type);
 	}
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		kvm_clear_exception_queue(vcpu);

---
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
change-id: 20251013-b4-l1tf-percpu-793181fa5884

Best regards,
-- 
Brendan Jackman <jackmanb@google.com>
Re: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by kernel test robot 2 months ago
Hi Brendan,

kernel test robot noticed the following build errors:

[auto build test ERROR on 6b36119b94d0b2bb8cea9d512017efafd461d6ac]

url:    https://github.com/intel-lab-lkp/linux/commits/Brendan-Jackman/KVM-x86-Unify-L1TF-flushing-under-per-CPU-variable/20251013-235118
base:   6b36119b94d0b2bb8cea9d512017efafd461d6ac
patch link:    https://lore.kernel.org/r/20251013-b4-l1tf-percpu-v1-1-d65c5366ea1a%40google.com
patch subject: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251014/202510141438.OMSBOz6R-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251014/202510141438.OMSBOz6R-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510141438.OMSBOz6R-lkp@intel.com/

All errors (new ones prefixed by >>):

>> arch/x86/coco/tdx/tdx.c:471:12: error: conflicting types for 'read_msr'; have 'int(struct pt_regs *, struct ve_info *)'
     471 | static int read_msr(struct pt_regs *regs, struct ve_info *ve)
         |            ^~~~~~~~
   In file included from arch/x86/include/asm/idtentry.h:15,
                    from arch/x86/include/asm/traps.h:9,
                    from arch/x86/coco/tdx/tdx.c:20:
   arch/x86/include/asm/kvm_host.h:2320:29: note: previous definition of 'read_msr' with type 'long unsigned int(long unsigned int)'
    2320 | static inline unsigned long read_msr(unsigned long msr)
         |                             ^~~~~~~~


vim +471 arch/x86/coco/tdx/tdx.c

9f98a4f4e7216d Vishal Annapurve   2025-02-28  470  
cdd85786f4b3b9 Kirill A. Shutemov 2022-06-14 @471  static int read_msr(struct pt_regs *regs, struct ve_info *ve)
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  472  {
8a8544bde858e5 Kai Huang          2023-08-15  473  	struct tdx_module_args args = {
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  474  		.r10 = TDX_HYPERCALL_STANDARD,
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  475  		.r11 = hcall_func(EXIT_REASON_MSR_READ),
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  476  		.r12 = regs->cx,
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  477  	};
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  478  
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  479  	/*
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  480  	 * Emulate the MSR read via hypercall. More info about ABI
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  481  	 * can be found in TDX Guest-Host-Communication Interface
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  482  	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  483  	 */
c641cfb5c157b6 Kai Huang          2023-08-15  484  	if (__tdx_hypercall(&args))
cdd85786f4b3b9 Kirill A. Shutemov 2022-06-14  485  		return -EIO;
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  486  
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  487  	regs->ax = lower_32_bits(args.r11);
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  488  	regs->dx = upper_32_bits(args.r11);
cdd85786f4b3b9 Kirill A. Shutemov 2022-06-14  489  	return ve_instr_len(ve);
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  490  }
ae87f609cd5282 Kirill A. Shutemov 2022-04-06  491  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Brendan Jackman 2 months ago
On Tue Oct 14, 2025 at 7:24 AM UTC, kernel test robot wrote:
> Hi Brendan,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on 6b36119b94d0b2bb8cea9d512017efafd461d6ac]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Brendan-Jackman/KVM-x86-Unify-L1TF-flushing-under-per-CPU-variable/20251013-235118
> base:   6b36119b94d0b2bb8cea9d512017efafd461d6ac
> patch link:    https://lore.kernel.org/r/20251013-b4-l1tf-percpu-v1-1-d65c5366ea1a%40google.com
> patch subject: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
> config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251014/202510141438.OMSBOz6R-lkp@intel.com/config)
> compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251014/202510141438.OMSBOz6R-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202510141438.OMSBOz6R-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
>>> arch/x86/coco/tdx/tdx.c:471:12: error: conflicting types for 'read_msr'; have 'int(struct pt_regs *, struct ve_info *)'
>      471 | static int read_msr(struct pt_regs *regs, struct ve_info *ve)
>          |            ^~~~~~~~
>    In file included from arch/x86/include/asm/idtentry.h:15,
>                     from arch/x86/include/asm/traps.h:9,
>                     from arch/x86/coco/tdx/tdx.c:20:
>    arch/x86/include/asm/kvm_host.h:2320:29: note: previous definition of 'read_msr' with type 'long unsigned int(long unsigned int)'
>     2320 | static inline unsigned long read_msr(unsigned long msr)
>          |                             ^~~~~~~~

Yeah this is essentially just another symptom of the kvm_host.h abuse Sean
pointed out.
[syzbot ci] Re: KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by syzbot ci 2 months ago
syzbot ci has tested the following series

[v1] KVM: x86: Unify L1TF flushing under per-CPU variable
https://lore.kernel.org/all/20251013-b4-l1tf-percpu-v1-1-d65c5366ea1a@google.com
* [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable

and found the following issue:
BUG: using __this_cpu_write() in preemptible code in x86_emulate_instruction

Full report is available here:
https://ci.syzbot.org/series/c0a39c51-4121-4883-a21f-4277f63b3118

***

BUG: using __this_cpu_write() in preemptible code in x86_emulate_instruction

tree:      kvm-next
URL:       https://kernel.googlesource.com/pub/scm/virt/kvm/kvm/
base:      6b36119b94d0b2bb8cea9d512017efafd461d6ac
arch:      amd64
compiler:  Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config:    https://ci.syzbot.org/builds/a964957a-128c-43dd-b6df-7758575a9993/config
C repro:   https://ci.syzbot.org/findings/c0c645e8-abe2-437f-aa29-1adf339cb732/c_repro
syz repro: https://ci.syzbot.org/findings/c0c645e8-abe2-437f-aa29-1adf339cb732/syz_repro

BUG: using __this_cpu_write() in preemptible [00000000] code: syz.0.17/5962
caller is kvm_set_cpu_l1tf_flush_l1d arch/x86/include/asm/kvm_host.h:2486 [inline]
caller is x86_emulate_instruction+0x1a4/0x1f70 arch/x86/kvm/x86.c:9405
CPU: 1 UID: 0 PID: 5962 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 check_preemption_disabled+0x10a/0x120 lib/smp_processor_id.c:47
 kvm_set_cpu_l1tf_flush_l1d arch/x86/include/asm/kvm_host.h:2486 [inline]
 x86_emulate_instruction+0x1a4/0x1f70 arch/x86/kvm/x86.c:9405
 kvm_mmu_page_fault+0x91a/0xb70 arch/x86/kvm/mmu/mmu.c:6385
 __vmx_handle_exit arch/x86/kvm/vmx/vmx.c:6624 [inline]
 vmx_handle_exit+0x10a4/0x18c0 arch/x86/kvm/vmx/vmx.c:6641
 vcpu_enter_guest arch/x86/kvm/x86.c:11461 [inline]
 vcpu_run+0x4359/0x6fc0 arch/x86/kvm/x86.c:11619
 kvm_arch_vcpu_ioctl_run+0xfc9/0x1940 arch/x86/kvm/x86.c:11958
 kvm_vcpu_ioctl+0x95c/0xe90 virt/kvm/kvm_main.c:4476
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:598 [inline]
 __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:584
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f0f9698eec9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fff7c8600c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007f0f96be5fa0 RCX: 00007f0f9698eec9
RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005
RBP: 00007f0f96a11f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f0f96be5fa0 R14: 00007f0f96be5fa0 R15: 0000000000000003
 </TASK>


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
Re: [syzbot ci] Re: KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Brendan Jackman 2 months ago
On Tue Oct 14, 2025 at 6:13 AM UTC, syzbot ci wrote:
> BUG: using __this_cpu_write() in preemptible code in x86_emulate_instruction

Ah. And now I realise I never booted my debug config on an actual
Skylake host, I'd better do that, presumably running the KVM selftests
with DEBUG_PREEMPT etc would have been enough to catch this earlier.

Anyway, I guest we just want to use vcpu->arch.last_vmentry_cpu instead
of smp_processor_id()?
Re: [syzbot ci] Re: KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Brendan Jackman 2 months ago
On Tue Oct 14, 2025 at 8:57 AM UTC, Brendan Jackman wrote:
> On Tue Oct 14, 2025 at 6:13 AM UTC, syzbot ci wrote:
>> BUG: using __this_cpu_write() in preemptible code in x86_emulate_instruction
>
> Ah. And now I realise I never booted my debug config on an actual
> Skylake host, I'd better do that, presumably running the KVM selftests
> with DEBUG_PREEMPT etc would have been enough to catch this earlier.
>
> Anyway, I guest we just want to use vcpu->arch.last_vmentry_cpu instead
> of smp_processor_id()?

Just went to code it up and changed my mind about this. If the vCPU is
being migrated, it doesn't really matter which CPU stuff like
x86_emulate_instruction() sets the bit on since it's vcpu_load()'s job to
make sure it's set on the CPU that actually needs it. So I think instead
we just want raw_cpu_write() here, then there's no pointless remote
updates. The bit might get set on a CPU that doesn't end up needing it
for the current vCPU, but it was gonna get the bit set before it ran the
next vCPU anyway.
Re: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Sean Christopherson 2 months ago
On Mon, Oct 13, 2025, Brendan Jackman wrote:
> Currently the tracking of the need to flush L1D for L1TF is tracked by
> two bits: one per-CPU and one per-vCPU.
> 
> The per-vCPU bit is always set when the vCPU shows up on a core, so
> there is no interesting state that's truly per-vCPU. Indeed, this is a
> requirement, since L1D is a part of the physical CPU.
> 
> So simplify this by combining the two bits.
> 
> Since this requires a DECLARE_PER_CPU() which belongs in kvm_host.h,

No, it doesn't belong in kvm_host.h.

One of my biggest gripes with Google's prodkernel is that we only build with one
.config, and that breeds bad habits and some truly awful misconceptions about
kernel programming because engineers tend to treat that one .config as gospel.

Information *never* flows from a module to code that can _only_ be built-in, i.e.
to the so called "core kernel".  KVM x86 can be, and _usually_ is, built as a module,
kvm.ko.  Thus, KVM should *never* declare/provide symbols that are used by the
core kernel, because it simply can't work (without some abusrdly stupid logic)
when kvm.ko is built as a module:

  ld: vmlinux.o: in function `common_interrupt':
  arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2b56): undefined reference to `l1tf_flush_l1d'
  ld: vmlinux.o: in function `sysvec_x86_platform_ipi':
  arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2bf1): undefined reference to `l1tf_flush_l1d'
  ld: vmlinux.o: in function `sysvec_kvm_posted_intr_ipi':
  arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2c81): undefined reference to `l1tf_flush_l1d'
  ld: vmlinux.o: in function `sysvec_kvm_posted_intr_wakeup_ipi':
  arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2cd1): undefined reference to `l1tf_flush_l1d'
  ld: vmlinux.o: in function `sysvec_kvm_posted_intr_nested_ipi':
  arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2d61): undefined reference to `l1tf_flush_l1d'
  ld: vmlinux.o:arch/x86/include/asm/kvm_host.h:2486: more undefined references to `l1tf_flush_l1d' follow

Because prodkernel's .config forces CONFIG_KVM=y (for equally awful reasons),
Google engineers completely forget/miss that having information flow from kvm.ko
to vmlinux is broken (though I am convinced that a large percentage of engineers
that work (almost) exclusively on prodkernel simply have no clue about how kernel
modules work in the first place).

I am 100% in favor of dropping kvm_vcpu_arch.l1tf_flush_l1d, but the per-CPU flag
needs to stay in IRQ stats.  The alternative would be to have KVM (un)register a
pointer at module (un)load, but I don't see any point in doing so.  And _if_ we
wanted to go that route, it should be done in a separate patch.
Re: [PATCH] KVM: x86: Unify L1TF flushing under per-CPU variable
Posted by Brendan Jackman 2 months ago
On Mon Oct 13, 2025 at 4:31 PM UTC, Sean Christopherson wrote:
> On Mon, Oct 13, 2025, Brendan Jackman wrote:
>> Currently the tracking of the need to flush L1D for L1TF is tracked by
>> two bits: one per-CPU and one per-vCPU.
>> 
>> The per-vCPU bit is always set when the vCPU shows up on a core, so
>> there is no interesting state that's truly per-vCPU. Indeed, this is a
>> requirement, since L1D is a part of the physical CPU.
>> 
>> So simplify this by combining the two bits.
>> 
>> Since this requires a DECLARE_PER_CPU() which belongs in kvm_host.h,
>
> No, it doesn't belong in kvm_host.h.
>
> One of my biggest gripes with Google's prodkernel is that we only build with one
> .config, and that breeds bad habits and some truly awful misconceptions about
> kernel programming because engineers tend to treat that one .config as gospel.
>
> Information *never* flows from a module to code that can _only_ be built-in, i.e.
> to the so called "core kernel".  KVM x86 can be, and _usually_ is, built as a module,
> kvm.ko.  Thus, KVM should *never* declare/provide symbols that are used by the
> core kernel, because it simply can't work (without some abusrdly stupid logic)
> when kvm.ko is built as a module:
>
>   ld: vmlinux.o: in function `common_interrupt':
>   arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2b56): undefined reference to `l1tf_flush_l1d'
>   ld: vmlinux.o: in function `sysvec_x86_platform_ipi':
>   arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2bf1): undefined reference to `l1tf_flush_l1d'
>   ld: vmlinux.o: in function `sysvec_kvm_posted_intr_ipi':
>   arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2c81): undefined reference to `l1tf_flush_l1d'
>   ld: vmlinux.o: in function `sysvec_kvm_posted_intr_wakeup_ipi':
>   arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2cd1): undefined reference to `l1tf_flush_l1d'
>   ld: vmlinux.o: in function `sysvec_kvm_posted_intr_nested_ipi':
>   arch/x86/include/asm/kvm_host.h:2486:(.noinstr.text+0x2d61): undefined reference to `l1tf_flush_l1d'
>   ld: vmlinux.o:arch/x86/include/asm/kvm_host.h:2486: more undefined references to `l1tf_flush_l1d' follow
>
> Because prodkernel's .config forces CONFIG_KVM=y (for equally awful reasons),
> Google engineers completely forget/miss that having information flow from kvm.ko
> to vmlinux is broken (though I am convinced that a large percentage of engineers
> that work (almost) exclusively on prodkernel simply have no clue about how kernel
> modules work in the first place).
>
> I am 100% in favor of dropping kvm_vcpu_arch.l1tf_flush_l1d, but the per-CPU flag
> needs to stay in IRQ stats.  The alternative would be to have KVM (un)register a
> pointer at module (un)load, but I don't see any point in doing so.  And _if_ we
> wanted to go that route, it should be done in a separate patch.

Ack, thanks for the correction. I indeed thought wrongly of kvm_host.h
as the "builtin KVM stuff" place.

I also see from the commit message of
45b575c00d8e72d69d75dd8c112f044b7b01b069 that PeterZ suggested irq_stat
should be cache-hot.

Will wait a day or two in case anyone spots other issues.