[Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER

Dapeng Mi posted 19 patches 2 weeks, 2 days ago
Posted by Dapeng Mi 2 weeks, 2 days ago
From: Kan Liang <kan.liang@linux.intel.com>

While collecting XMM registers in a PEBS record has been supported since
Icelake, non-PEBS events have lacked this feature. By leveraging the
xsaves instruction, it is now possible to snapshot XMM registers for
non-PEBS events, completing the feature set.

To utilize the xsaves instruction, a 64-byte aligned buffer is required.
A per-CPU ext_regs_buf is added to store SIMD and other registers; the
buffer size is approximately 2K. The buffer is allocated with
kzalloc_node(), relying on kmalloc()'s natural alignment of power-of-2
sized allocations to satisfy the 64-byte alignment requirement.
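
For illustration only (not part of this patch): a minimal kernel-context
sketch of the alignment assumption, with a defensive check. The helper
name below is hypothetical.

	static struct xregs_state *alloc_ext_regs_buf(int cpu, unsigned int size)
	{
		/* kmalloc() naturally aligns power-of-2 sized allocations */
		struct xregs_state *buf = kzalloc_node(size, GFP_KERNEL,
						       cpu_to_node(cpu));

		/* xsaves requires a 64-byte aligned destination */
		if (buf && WARN_ON_ONCE(!IS_ALIGNED((unsigned long)buf, 64))) {
			kfree(buf);
			return NULL;
		}
		return buf;
	}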

XMM sampling support is extended to both REGS_USER and REGS_INTR.
For REGS_USER, perf_get_regs_user() returns the registers from
task_pt_regs(current), which is a pt_regs structure. These registers
must be copied into the user-space-specific x86_user_regs structure,
since the kernel may modify the pt_regs structure later.

For PEBS, XMM registers are retrieved from PEBS records.

In cases where userspace tasks are trapped within kernel mode (e.g.,
during a syscall) when an NMI arrives, pt_regs information can still be
retrieved from task_pt_regs(). However, capturing SIMD and other
xsave-based registers in this scenario is challenging. Therefore,
snapshots for these registers are omitted in such cases.

The reasons are:
- Profiling a userspace task that requires SIMD/eGPR registers typically
  involves NMIs hitting userspace, not kernel mode.
- Although it is possible to retrieve values when the TIF_NEED_FPU_LOAD
  flag is set, the complexity introduced to handle this uncommon case in
  the critical path is not justified.
- Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
  Some corner cases, such as an NMI occurring just after the flag switches
  but still in kernel mode, cannot be handled.

Future support for additional vector registers is anticipated.
An ext_regs_mask is added to track the supported vector register groups.
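
For illustration only (not part of this patch): a minimal user-space
sketch that requests XMM registers on interrupt samples. It assumes the
existing register encoding in arch/x86/include/uapi/asm/perf_regs.h
(XMM0..XMM15 occupy the u64 slots starting at PERF_REG_X86_XMM0) and the
precise_ip requirement still enforced by x86_pmu_hw_config() in this
patch.

	#include <linux/perf_event.h>
	#include <asm/perf_regs.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_xmm_sampling_event(pid_t pid, int cpu)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size		= sizeof(attr);
		attr.type		= PERF_TYPE_HARDWARE;
		attr.config		= PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period	= 100000;
		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR;
		/* all XMM slots: bits PERF_REG_X86_XMM0..63 */
		attr.sample_regs_intr	= ~0ULL << PERF_REG_X86_XMM0;
		attr.precise_ip		= 1;	/* still required for extended regs */
		attr.exclude_kernel	= 1;

		return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
	}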

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
 arch/x86/events/core.c            | 175 ++++++++++++++++++++++++++----
 arch/x86/events/intel/core.c      |  29 ++++-
 arch/x86/events/intel/ds.c        |  20 ++--
 arch/x86/events/perf_event.h      |  11 +-
 arch/x86/include/asm/fpu/xstate.h |   2 +
 arch/x86/include/asm/perf_event.h |   5 +-
 arch/x86/kernel/fpu/xstate.c      |   2 +-
 7 files changed, 212 insertions(+), 32 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index dcdd2c2d68ee..0d33668b1927 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -406,6 +406,62 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+static DEFINE_PER_CPU(struct xregs_state *, ext_regs_buf);
+
+static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
+{
+	struct xregs_state *xsave = per_cpu(ext_regs_buf, smp_processor_id());
+	u64 valid_mask = x86_pmu.ext_regs_mask & mask;
+
+	if (WARN_ON_ONCE(!xsave))
+		return;
+
+	xsaves_nmi(xsave, valid_mask);
+
+	/* Filtered by what XSAVE really gives */
+	valid_mask &= xsave->header.xfeatures;
+
+	if (valid_mask & XFEATURE_MASK_SSE)
+		perf_regs->xmm_space = xsave->i387.xmm_space;
+}
+
+static void release_ext_regs_buffers(void)
+{
+	int cpu;
+
+	if (!x86_pmu.ext_regs_mask)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		kfree(per_cpu(ext_regs_buf, cpu));
+		per_cpu(ext_regs_buf, cpu) = NULL;
+	}
+}
+
+static void reserve_ext_regs_buffers(void)
+{
+	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+	unsigned int size;
+	int cpu;
+
+	if (!x86_pmu.ext_regs_mask)
+		return;
+
+	size = xstate_calculate_size(x86_pmu.ext_regs_mask, compacted);
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(ext_regs_buf, cpu) = kzalloc_node(size, GFP_KERNEL,
+							  cpu_to_node(cpu));
+		if (!per_cpu(ext_regs_buf, cpu))
+			goto err;
+	}
+
+	return;
+
+err:
+	release_ext_regs_buffers();
+}
+
 int x86_reserve_hardware(void)
 {
 	int err = 0;
@@ -418,6 +474,7 @@ int x86_reserve_hardware(void)
 			} else {
 				reserve_ds_buffers();
 				reserve_lbr_buffers();
+				reserve_ext_regs_buffers();
 			}
 		}
 		if (!err)
@@ -434,6 +491,7 @@ void x86_release_hardware(void)
 		release_pmc_hardware();
 		release_ds_buffers();
 		release_lbr_buffers();
+		release_ext_regs_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -651,19 +709,17 @@ int x86_pmu_hw_config(struct perf_event *event)
 			return -EINVAL;
 	}
 
-	/* sample_regs_user never support XMM registers */
-	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
-		return -EINVAL;
-	/*
-	 * Besides the general purpose registers, XMM registers may
-	 * be collected in PEBS on some platforms, e.g. Icelake
-	 */
-	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
-		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
-			return -EINVAL;
-
-		if (!event->attr.precise_ip)
-			return -EINVAL;
+	if (event->attr.sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+		/*
+		 * Besides the general purpose registers, XMM registers may
+		 * be collected as well.
+		 */
+		if (event_has_extended_regs(event)) {
+			if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
+				return -EINVAL;
+			if (!event->attr.precise_ip)
+				return -EINVAL;
+		}
 	}
 
 	return x86_setup_perfctr(event);
@@ -1695,38 +1751,115 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	static_call_cond(x86_pmu_del)(event);
 }
 
-void x86_pmu_setup_regs_data(struct perf_event *event,
-			     struct perf_sample_data *data,
-			     struct pt_regs *regs)
+static DEFINE_PER_CPU(struct x86_perf_regs, x86_user_regs);
+
+static struct x86_perf_regs *
+x86_pmu_perf_get_regs_user(struct perf_sample_data *data,
+			   struct pt_regs *regs)
+{
+	struct x86_perf_regs *x86_regs_user = this_cpu_ptr(&x86_user_regs);
+	struct perf_regs regs_user;
+
+	perf_get_regs_user(&regs_user, regs);
+	data->regs_user.abi = regs_user.abi;
+	if (regs_user.regs) {
+		x86_regs_user->regs = *regs_user.regs;
+		data->regs_user.regs = &x86_regs_user->regs;
+	} else
+		data->regs_user.regs = NULL;
+	return x86_regs_user;
+}
+
+static bool x86_pmu_user_req_pt_regs_only(struct perf_event *event)
 {
-	u64 sample_type = event->attr.sample_type;
+	return !(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK);
+}
+
+inline void x86_pmu_clear_perf_regs(struct pt_regs *regs)
+{
+	struct x86_perf_regs *perf_regs = container_of(regs, struct x86_perf_regs, regs);
+
+	perf_regs->xmm_regs = NULL;
+}
+
+static void x86_pmu_setup_basic_regs_data(struct perf_event *event,
+					  struct perf_sample_data *data,
+					  struct pt_regs *regs)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u64 sample_type = attr->sample_type;
+	struct x86_perf_regs *perf_regs;
+
+	if (!(attr->sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)))
+		return;
 
 	if (sample_type & PERF_SAMPLE_REGS_USER) {
+		perf_regs = container_of(regs, struct x86_perf_regs, regs);
+
 		if (user_mode(regs)) {
 			data->regs_user.abi = perf_reg_abi(current);
 			data->regs_user.regs = regs;
-		} else if (!(current->flags & PF_KTHREAD)) {
-			perf_get_regs_user(&data->regs_user, regs);
+		} else if (!(current->flags & PF_KTHREAD) &&
+			   x86_pmu_user_req_pt_regs_only(event)) {
+			/*
+			 * There is no guarantee that the kernel never
+			 * touches registers outside of pt_regs,
+			 * especially as more and more registers
+			 * (e.g., SIMD, eGPR) are added, so the live
+			 * data cannot be used.
+			 * Dump the registers when only pt_regs are required.
+			 */
+			perf_regs = x86_pmu_perf_get_regs_user(data, regs);
 		} else {
 			data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
 			data->regs_user.regs = NULL;
 		}
 		data->dyn_size += sizeof(u64);
 		if (data->regs_user.regs)
-			data->dyn_size += hweight64(event->attr.sample_regs_user) * sizeof(u64);
+			data->dyn_size += hweight64(attr->sample_regs_user) * sizeof(u64);
 		data->sample_flags |= PERF_SAMPLE_REGS_USER;
 	}
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR) {
+		perf_regs = container_of(regs, struct x86_perf_regs, regs);
+
 		data->regs_intr.regs = regs;
 		data->regs_intr.abi = perf_reg_abi(current);
 		data->dyn_size += sizeof(u64);
 		if (data->regs_intr.regs)
-			data->dyn_size += hweight64(event->attr.sample_regs_intr) * sizeof(u64);
+			data->dyn_size += hweight64(attr->sample_regs_intr) * sizeof(u64);
 		data->sample_flags |= PERF_SAMPLE_REGS_INTR;
 	}
 }
 
+static void x86_pmu_sample_ext_regs(struct perf_event *event,
+				    struct pt_regs *regs,
+				    u64 ignore_mask)
+{
+	struct x86_perf_regs *perf_regs = container_of(regs, struct x86_perf_regs, regs);
+	u64 mask = 0;
+
+	if (event_has_extended_regs(event))
+		mask |= XFEATURE_MASK_SSE;
+
+	mask &= ~ignore_mask;
+	if (mask)
+		x86_pmu_get_ext_regs(perf_regs, mask);
+}
+
+void x86_pmu_setup_regs_data(struct perf_event *event,
+			     struct perf_sample_data *data,
+			     struct pt_regs *regs,
+			     u64 ignore_mask)
+{
+	x86_pmu_setup_basic_regs_data(event, data, regs);
+	/*
+	 * ignore_mask indicates the extended regs already sampled by
+	 * PEBS, which are unnecessary to sample again.
+	 */
+	x86_pmu_sample_ext_regs(event, regs, ignore_mask);
+}
+
 int x86_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct perf_sample_data data;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 81e6c8bcabde..b5c89e8eabb2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3410,6 +3410,9 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		if (has_branch_stack(event))
 			intel_pmu_lbr_save_brstack(&data, cpuc, event);
 
+		x86_pmu_clear_perf_regs(regs);
+		x86_pmu_setup_regs_data(event, &data, regs, 0);
+
 		perf_event_overflow(event, &data, regs);
 	}
 
@@ -5619,8 +5622,30 @@ static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
 	}
 }
 
-#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED))
+static void intel_extended_regs_init(struct pmu *pmu)
+{
+	/*
+	 * Extend the vector registers support to non-PEBS.
+	 * The feature is limited to newer Intel machines with
+	 * PEBS V4+ or archPerfmonExt (0x23) enabled for now.
+	 * In theory, the vector registers can be retrieved as
+	 * long as the CPU supports them. Support for older
+	 * generations may be added later if there is a
+	 * requirement.
+	 * Only support the extension when XSAVES is available.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_XSAVES))
+		return;
 
+	if (!boot_cpu_has(X86_FEATURE_XMM) ||
+	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+		return;
+
+	x86_pmu.ext_regs_mask |= XFEATURE_MASK_SSE;
+	x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+}
+
+#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED))
 static void update_pmu_cap(struct pmu *pmu)
 {
 	unsigned int eax, ebx, ecx, edx;
@@ -5682,6 +5707,8 @@ static void update_pmu_cap(struct pmu *pmu)
 		/* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
 		rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities);
 	}
+
+	intel_extended_regs_init(pmu);
 }
 
 static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c7351f476d8c..af462f69cd1c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1473,8 +1473,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 	if (gprs || (attr->precise_ip < 2) || tsx_weight)
 		pebs_data_cfg |= PEBS_DATACFG_GP;
 
-	if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
-	    (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
+	if (event_has_extended_regs(event))
 		pebs_data_cfg |= PEBS_DATACFG_XMMS;
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -2190,10 +2189,8 @@ static inline void __setup_pebs_gpr_group(struct perf_event *event,
 		regs->flags &= ~PERF_EFLAGS_EXACT;
 	}
 
-	if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+	if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
 		adaptive_pebs_save_regs(regs, gprs);
-		x86_pmu_setup_regs_data(event, data, regs);
-	}
 }
 
 static inline void __setup_pebs_meminfo_group(struct perf_event *event,
@@ -2251,6 +2248,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 	struct pebs_meminfo *meminfo = NULL;
 	struct pebs_gprs *gprs = NULL;
 	struct x86_perf_regs *perf_regs;
+	u64 ignore_mask = 0;
 	u64 format_group;
 	u16 retire;
 
@@ -2258,7 +2256,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 		return;
 
 	perf_regs = container_of(regs, struct x86_perf_regs, regs);
-	perf_regs->xmm_regs = NULL;
+	x86_pmu_clear_perf_regs(regs);
 
 	format_group = basic->format_group;
 
@@ -2305,6 +2303,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 	if (format_group & PEBS_DATACFG_XMMS) {
 		struct pebs_xmm *xmm = next_record;
 
+		ignore_mask |= XFEATURE_MASK_SSE;
 		next_record = xmm + 1;
 		perf_regs->xmm_regs = xmm->xmm;
 	}
@@ -2343,6 +2342,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 		next_record += nr * sizeof(u64);
 	}
 
+	x86_pmu_setup_regs_data(event, data, regs, ignore_mask);
+
 	WARN_ONCE(next_record != __pebs + basic->format_size,
 			"PEBS record size %u, expected %llu, config %llx\n",
 			basic->format_size,
@@ -2368,6 +2369,7 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 	struct arch_pebs_aux *meminfo = NULL;
 	struct arch_pebs_gprs *gprs = NULL;
 	struct x86_perf_regs *perf_regs;
+	u64 ignore_mask = 0;
 	void *next_record;
 	void *at = __pebs;
 
@@ -2375,7 +2377,7 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 		return;
 
 	perf_regs = container_of(regs, struct x86_perf_regs, regs);
-	perf_regs->xmm_regs = NULL;
+	x86_pmu_clear_perf_regs(regs);
 
 	__setup_perf_sample_data(event, iregs, data);
 
@@ -2430,6 +2432,7 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 
 		next_record += sizeof(struct arch_pebs_xer_header);
 
+		ignore_mask |= XFEATURE_MASK_SSE;
 		xmm = next_record;
 		perf_regs->xmm_regs = xmm->xmm;
 		next_record = xmm + 1;
@@ -2477,6 +2480,8 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
 		at = at + header->size;
 		goto again;
 	}
+
+	x86_pmu_setup_regs_data(event, data, regs, ignore_mask);
 }
 
 static inline void *
@@ -3137,6 +3142,7 @@ static void __init intel_ds_pebs_init(void)
 				x86_pmu.flags |= PMU_FL_PEBS_ALL;
 				x86_pmu.pebs_capable = ~0ULL;
 				pebs_qual = "-baseline";
+				x86_pmu.ext_regs_mask |= XFEATURE_MASK_SSE;
 				x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
 			} else {
 				/* Only basic record supported */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 80e52e937638..3c470d79aa65 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1009,6 +1009,12 @@ struct x86_pmu {
 	struct extra_reg *extra_regs;
 	unsigned int flags;
 
+	/*
+	 * Extended regs, e.g., vector registers
+	 * Utilize the same format as the XFEATURE_MASK_*
+	 */
+	u64		ext_regs_mask;
+
 	/*
 	 * Intel host/guest support (KVM)
 	 */
@@ -1294,9 +1300,12 @@ void x86_pmu_enable_event(struct perf_event *event);
 
 int x86_pmu_handle_irq(struct pt_regs *regs);
 
+void x86_pmu_clear_perf_regs(struct pt_regs *regs);
+
 void x86_pmu_setup_regs_data(struct perf_event *event,
 			     struct perf_sample_data *data,
-			     struct pt_regs *regs);
+			     struct pt_regs *regs,
+			     u64 ignore_mask);
 
 void x86_pmu_show_pmu_cap(struct pmu *pmu);
 
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 38fa8ff26559..19dec5f0b1c7 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -112,6 +112,8 @@ void xsaves(struct xregs_state *xsave, u64 mask);
 void xrstors(struct xregs_state *xsave, u64 mask);
 void xsaves_nmi(struct xregs_state *xsave, u64 mask);
 
+unsigned int xstate_calculate_size(u64 xfeatures, bool compacted);
+
 int xfd_enable_feature(u64 xfd_err);
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7276ba70c88a..3b368de9f803 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -704,7 +704,10 @@ extern void perf_events_lapic_init(void);
 struct pt_regs;
 struct x86_perf_regs {
 	struct pt_regs	regs;
-	u64		*xmm_regs;
+	union {
+		u64	*xmm_regs;
+		u32	*xmm_space;	/* for xsaves */
+	};
 };
 
 extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index e3b8afed8b2c..33142bccc075 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -586,7 +586,7 @@ static bool __init check_xstate_against_struct(int nr)
 	return true;
 }
 
-static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
 {
 	unsigned int topmost = fls64(xfeatures) -  1;
 	unsigned int offset, i;
-- 
2.34.1
Re: [Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER
Posted by Peter Zijlstra 2 weeks, 1 day ago
On Wed, Dec 03, 2025 at 02:54:47PM +0800, Dapeng Mi wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> While collecting XMM registers in a PEBS record has been supported since
> Icelake, non-PEBS events have lacked this feature. By leveraging the
> xsaves instruction, it is now possible to snapshot XMM registers for
> non-PEBS events, completing the feature set.
> 
> To utilize the xsaves instruction, a 64-byte aligned buffer is required.
> A per-CPU ext_regs_buf is added to store SIMD and other registers, with
> the buffer size being approximately 2K. The buffer is allocated using
> kzalloc_node(), ensuring natural alignment and 64-byte alignment for all
> kmalloc() allocations with powers of 2.
> 
> The XMM sampling support is extended for both REGS_USER and REGS_INTR.
> For REGS_USER, perf_get_regs_user() returns the registers from
> task_pt_regs(current), which is a pt_regs structure. It needs to be
> copied to user space secific x86_user_regs structure since kernel may
> modify pt_regs structure later.
> 
> For PEBS, XMM registers are retrieved from PEBS records.
> 
> In cases where userspace tasks are trapped within kernel mode (e.g.,
> during a syscall) when an NMI arrives, pt_regs information can still be
> retrieved from task_pt_regs(). However, capturing SIMD and other
> xsave-based registers in this scenario is challenging. Therefore,
> snapshots for these registers are omitted in such cases.
> 
> The reasons are:
> - Profiling a userspace task that requires SIMD/eGPR registers typically
>   involves NMIs hitting userspace, not kernel mode.
> - Although it is possible to retrieve values when the TIF_NEED_FPU_LOAD
>   flag is set, the complexity introduced to handle this uncommon case in
>   the critical path is not justified.
> - Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
>   Some corner cases, such as an NMI occurring just after the flag switches
>   but still in kernel mode, cannot be handled.

Urgh.. Dave, Thomas, is there any reason we could not set
TIF_NEED_FPU_LOAD *after* doing the XSAVE (clearing is already done
after restore).

That way, when an NMI sees TIF_NEED_FPU_LOAD it knows the task copy is
consistent.

I'm not at all sure this is complex, it just needs a little care.

And then there is the deferred thing, just like unwind, we can defer
REGS_USER/STACK_USER much the same, except someone went and built all
that deferred stuff with unwind all tangled into it :/
Re: [Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER
Posted by Dave Hansen 2 weeks, 1 day ago
On 12/4/25 07:17, Peter Zijlstra wrote:
>> - Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
>>   Some corner cases, such as an NMI occurring just after the flag switches
>>   but still in kernel mode, cannot be handled.
> Urgh.. Dave, Thomas, is there any reason we could not set
> TIF_NEED_FPU_LOAD *after* doing the XSAVE (clearing is already done
> after restore).
> 
> That way, when an NMI sees TIF_NEED_FPU_LOAD it knows the task copy is
> consistent.

Something like the attached patch?

I think that would be just fine. save_fpregs_to_fpstate() doesn't
actually change the need for TIF_NEED_FPU_LOAD, so I don't think the
ordering matters.
Re: [Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER
Posted by Peter Zijlstra 2 weeks ago
On Thu, Dec 04, 2025 at 10:59:15AM -0800, Dave Hansen wrote:
> On 12/4/25 07:17, Peter Zijlstra wrote:
> >> - Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
> >>   Some corner cases, such as an NMI occurring just after the flag switches
> >>   but still in kernel mode, cannot be handled.
> > Urgh.. Dave, Thomas, is there any reason we could not set
> > TIF_NEED_FPU_LOAD *after* doing the XSAVE (clearing is already done
> > after restore).
> > 
> > That way, when an NMI sees TIF_NEED_FPU_LOAD it knows the task copy is
> > consistent.
> 
> Something like the attached patch?
> 
> I think that would be just fine. save_fpregs_to_fpstate() doesn't
> actually change the need for TIF_NEED_FPU_LOAD, so I don't think the
> ordering matters.

Right, I missed this one. And yes, I couldn't find any site where this
ordering mattered either. It's all with interrupts disabled, so normally
it all goes together. Only the NMI could observe the difference.

> diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
> index 89004f4ca208..2d57a7bf5406 100644
> --- a/arch/x86/include/asm/fpu/sched.h
> +++ b/arch/x86/include/asm/fpu/sched.h
> @@ -36,8 +36,8 @@ static inline void switch_fpu(struct task_struct *old, int cpu)
>  	    !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) {
>  		struct fpu *old_fpu = x86_task_fpu(old);
>  
> -		set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD);
>  		save_fpregs_to_fpstate(old_fpu);
> +		set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD);
>  		/*
>  		 * The save operation preserved register state, so the
>  		 * fpu_fpregs_owner_ctx is still @old_fpu. Store the
Re: [Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER
Posted by Peter Zijlstra 2 weeks, 1 day ago
On Thu, Dec 04, 2025 at 04:17:35PM +0100, Peter Zijlstra wrote:
> On Wed, Dec 03, 2025 at 02:54:47PM +0800, Dapeng Mi wrote:
> > From: Kan Liang <kan.liang@linux.intel.com>
> > 
> > While collecting XMM registers in a PEBS record has been supported since
> > Icelake, non-PEBS events have lacked this feature. By leveraging the
> > xsaves instruction, it is now possible to snapshot XMM registers for
> > non-PEBS events, completing the feature set.
> > 
> > To utilize the xsaves instruction, a 64-byte aligned buffer is required.
> > A per-CPU ext_regs_buf is added to store SIMD and other registers, with
> > the buffer size being approximately 2K. The buffer is allocated using
> > kzalloc_node(), ensuring natural alignment and 64-byte alignment for all
> > kmalloc() allocations with powers of 2.
> > 
> > The XMM sampling support is extended for both REGS_USER and REGS_INTR.
> > For REGS_USER, perf_get_regs_user() returns the registers from
> > task_pt_regs(current), which is a pt_regs structure. It needs to be
> > copied to user space secific x86_user_regs structure since kernel may
> > modify pt_regs structure later.
> > 
> > For PEBS, XMM registers are retrieved from PEBS records.
> > 
> > In cases where userspace tasks are trapped within kernel mode (e.g.,
> > during a syscall) when an NMI arrives, pt_regs information can still be
> > retrieved from task_pt_regs(). However, capturing SIMD and other
> > xsave-based registers in this scenario is challenging. Therefore,
> > snapshots for these registers are omitted in such cases.
> > 
> > The reasons are:
> > - Profiling a userspace task that requires SIMD/eGPR registers typically
> >   involves NMIs hitting userspace, not kernel mode.
> > - Although it is possible to retrieve values when the TIF_NEED_FPU_LOAD
> >   flag is set, the complexity introduced to handle this uncommon case in
> >   the critical path is not justified.
> > - Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
> >   Some corner cases, such as an NMI occurring just after the flag switches
> >   but still in kernel mode, cannot be handled.
> 
> Urgh.. Dave, Thomas, is there any reason we could not set
> TIF_NEED_FPU_LOAD *after* doing the XSAVE (clearing is already done
> after restore).
> 
> That way, when an NMI sees TIF_NEED_FPU_LOAD it knows the task copy is
> consistent.
> 
> I'm not at all sure this is complex, it just needs a little care.
> 
> And then there is the deferred thing, just like unwind, we can defer
> REGS_USER/STACK_USER much the same, except someone went and built all
> that deferred stuff with unwind all tangled into it :/

With something like the below, the NMI could do something like:

	struct xregs_state *xr = NULL;

	/*
	 * fpu code does:
	 *  XSAVE
	 *  set_thread_flag(TIF_NEED_FPU_LOAD)
	 *  ...
	 *  XRSTOR
	 *  clear_thread_flag(TIF_NEED_FPU_LOAD)
	 * therefore, when TIF_NEED_FPU_LOAD, the task fpu state holds a
	 * whole copy.
	 */
	if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
		struct fpu *fpu = x86_task_fpu(current);
		/*
		 * If __task_fpstate is set, it holds the right pointer,
		 * otherwise fpstate will.
		 */
		struct fpstate *fps = READ_ONCE(fpu->__task_fpstate);
		if (!fps)
			fps = fpu->fpstate;
		xr = &fps->regs.xregs_state;
	} else {
		/* like fpu_sync_fpstate(), except NMI local */
		xsave_nmi(xr, mask);
	}

	// frob xr into perf data

Or did I miss something? I've not looked at this very long and the above
was very vague on the actual issues.


diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index da233f20ae6f..0f91a0d7e799 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -359,18 +359,22 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
 	struct fpstate *cur_fps = fpu->fpstate;
 
 	fpregs_lock();
-	if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+	if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) {
 		save_fpregs_to_fpstate(fpu);
+		set_thread_flag(TIF_NEED_FPU_LOAD);
+	}
 
 	/* Swap fpstate */
 	if (enter_guest) {
-		fpu->__task_fpstate = cur_fps;
+		WRITE_ONCE(fpu->__task_fpstate, cur_fps);
+		barrier();
 		fpu->fpstate = guest_fps;
 		guest_fps->in_use = true;
 	} else {
 		guest_fps->in_use = false;
 		fpu->fpstate = fpu->__task_fpstate;
-		fpu->__task_fpstate = NULL;
+		barrier();
+		WRITE_ONCE(fpu->__task_fpstate, NULL);
 	}
 
 	cur_fps = fpu->fpstate;
@@ -456,8 +460,8 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 
 	if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
 	    !test_thread_flag(TIF_NEED_FPU_LOAD)) {
-		set_thread_flag(TIF_NEED_FPU_LOAD);
 		save_fpregs_to_fpstate(x86_task_fpu(current));
+		set_thread_flag(TIF_NEED_FPU_LOAD);
 	}
 	__cpu_invalidate_fpregs_state();
Re: [Patch v5 06/19] perf/x86: Add support for XMM registers in non-PEBS and REGS_USER
Posted by Mi, Dapeng 2 weeks ago
On 12/4/2025 11:47 PM, Peter Zijlstra wrote:
> On Thu, Dec 04, 2025 at 04:17:35PM +0100, Peter Zijlstra wrote:
>> On Wed, Dec 03, 2025 at 02:54:47PM +0800, Dapeng Mi wrote:
>>> From: Kan Liang <kan.liang@linux.intel.com>
>>>
>>> While collecting XMM registers in a PEBS record has been supported since
>>> Icelake, non-PEBS events have lacked this feature. By leveraging the
>>> xsaves instruction, it is now possible to snapshot XMM registers for
>>> non-PEBS events, completing the feature set.
>>>
>>> To utilize the xsaves instruction, a 64-byte aligned buffer is required.
>>> A per-CPU ext_regs_buf is added to store SIMD and other registers, with
>>> the buffer size being approximately 2K. The buffer is allocated using
>>> kzalloc_node(), ensuring natural alignment and 64-byte alignment for all
>>> kmalloc() allocations with powers of 2.
>>>
>>> The XMM sampling support is extended for both REGS_USER and REGS_INTR.
>>> For REGS_USER, perf_get_regs_user() returns the registers from
>>> task_pt_regs(current), which is a pt_regs structure. It needs to be
>>> copied to user space secific x86_user_regs structure since kernel may
>>> modify pt_regs structure later.
>>>
>>> For PEBS, XMM registers are retrieved from PEBS records.
>>>
>>> In cases where userspace tasks are trapped within kernel mode (e.g.,
>>> during a syscall) when an NMI arrives, pt_regs information can still be
>>> retrieved from task_pt_regs(). However, capturing SIMD and other
>>> xsave-based registers in this scenario is challenging. Therefore,
>>> snapshots for these registers are omitted in such cases.
>>>
>>> The reasons are:
>>> - Profiling a userspace task that requires SIMD/eGPR registers typically
>>>   involves NMIs hitting userspace, not kernel mode.
>>> - Although it is possible to retrieve values when the TIF_NEED_FPU_LOAD
>>>   flag is set, the complexity introduced to handle this uncommon case in
>>>   the critical path is not justified.
>>> - Additionally, checking the TIF_NEED_FPU_LOAD flag alone is insufficient.
>>>   Some corner cases, such as an NMI occurring just after the flag switches
>>>   but still in kernel mode, cannot be handled.
>> Urgh.. Dave, Thomas, is there any reason we could not set
>> TIF_NEED_FPU_LOAD *after* doing the XSAVE (clearing is already done
>> after restore).
>>
>> That way, when an NMI sees TIF_NEED_FPU_LOAD it knows the task copy is
>> consistent.
>>
>> I'm not at all sure this is complex, it just needs a little care.
>>
>> And then there is the deferred thing, just like unwind, we can defer
>> REGS_USER/STACK_USER much the same, except someone went and built all
>> that deferred stuff with unwind all tangled into it :/
> With something like the below, the NMI could do something like:
>
> 	struct xregs_state *xr = NULL;
>
> 	/*
> 	 * fpu code does:
> 	 *  XSAVE
> 	 *  set_thread_flag(TIF_NEED_FPU_LOAD)
> 	 *  ...
> 	 *  XRSTOR
> 	 *  clear_thread_flag(TIF_NEED_FPU_LOAD)
> 	 * therefore, when TIF_NEED_FPU_LOAD, the task fpu state holds a
> 	 * whole copy.
> 	 */
> 	if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
> 		struct fpu *fpu = x86_task_fpu(current);
> 		/*
> 		 * If __task_fpstate is set, it holds the right pointer,
> 		 * otherwise fpstate will.
> 		 */
> 		struct fpstate *fps = READ_ONCE(fpu->__task_fpstate);
> 		if (!fps)
> 			fps = fpu->fpstate;
> 		xr = &fps->regs.xregs_state;
> 	} else {
> 		/* like fpu_sync_fpstate(), except NMI local */
> 		xsave_nmi(xr, mask);
> 	}
>
> 	// frob xr into perf data
>
> Or did I miss something? I've not looked at this very long and the above
> was very vague on the actual issues.
>
>
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index da233f20ae6f..0f91a0d7e799 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -359,18 +359,22 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
>  	struct fpstate *cur_fps = fpu->fpstate;
>  
>  	fpregs_lock();
> -	if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
> +	if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) {
>  		save_fpregs_to_fpstate(fpu);
> +		set_thread_flag(TIF_NEED_FPU_LOAD);
> +	}
>  
>  	/* Swap fpstate */
>  	if (enter_guest) {
> -		fpu->__task_fpstate = cur_fps;
> +		WRITE_ONCE(fpu->__task_fpstate, cur_fps);
> +		barrier();
>  		fpu->fpstate = guest_fps;
>  		guest_fps->in_use = true;
>  	} else {
>  		guest_fps->in_use = false;
>  		fpu->fpstate = fpu->__task_fpstate;
> -		fpu->__task_fpstate = NULL;
> +		barrier();
> +		WRITE_ONCE(fpu->__task_fpstate, NULL);
>  	}
>  
>  	cur_fps = fpu->fpstate;
> @@ -456,8 +460,8 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
>  
>  	if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
>  	    !test_thread_flag(TIF_NEED_FPU_LOAD)) {
> -		set_thread_flag(TIF_NEED_FPU_LOAD);
>  		save_fpregs_to_fpstate(x86_task_fpu(current));
> +		set_thread_flag(TIF_NEED_FPU_LOAD);
>  	}
>  	__cpu_invalidate_fpregs_state();
>  

Ok, I will incorporate these changes into the next version and add SIMD
register support to user-space register sampling as well.