[PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER

Posted by kan.liang@linux.intel.com 5 months, 3 weeks ago
From: Kan Liang <kan.liang@linux.intel.com>

Collecting the XMM registers in a PEBS record has been supported since
Icelake, but non-PEBS events don't support the feature. It's possible
to retrieve the XMM registers from XSAVE for non-PEBS events. Add it to
make the feature complete.

To utilize XSAVE, a 64-byte aligned buffer is required. Add a per-CPU
ext_regs_buf to store the vector registers. The size of the buffer is
~2K. kzalloc_node() is used because there is a _guarantee_ that all
kmalloc() allocations with power-of-two sizes are naturally aligned,
and therefore 64-byte aligned.
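
For illustration, a condensed sketch of the allocation contract (the
real code is reserve_ext_regs_buffers() below; the IS_ALIGNED() check
is only an illustrative assertion, not part of the patch):

	size = xstate_calculate_size(x86_pmu.ext_regs_mask, true);
	buf = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
	/* XSAVES requires a 64-byte aligned buffer; kmalloc's
	 * power-of-two buckets provide that alignment. */
	WARN_ON_ONCE(buf && !IS_ALIGNED((unsigned long)buf, 64));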

Extend the support to both REGS_USER and REGS_INTR. For REGS_USER,
perf_get_regs_user() returns the regs from task_pt_regs(current),
which is a plain struct pt_regs; copy it into the local struct
x86_perf_regs x86_user_regs so the extended registers can be attached.
For PEBS, the HW support is still preferred; the XMM registers should
be retrieved from the PEBS record itself.

More vector registers could be supported later. Add ext_regs_mask to
track the supported vector register groups.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/core.c            | 127 +++++++++++++++++++++++++-----
 arch/x86/events/intel/core.c      |  27 +++++++
 arch/x86/events/intel/ds.c        |  10 ++-
 arch/x86/events/perf_event.h      |   9 ++-
 arch/x86/include/asm/fpu/xstate.h |   2 +
 arch/x86/include/asm/perf_event.h |   5 +-
 arch/x86/kernel/fpu/xstate.c      |   2 +-
 7 files changed, 157 insertions(+), 25 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index c601ad761534..f27c58f4c815 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -406,6 +406,61 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+static DEFINE_PER_CPU(struct xregs_state *, ext_regs_buf);
+
+static void x86_pmu_get_ext_regs(struct x86_perf_regs *perf_regs, u64 mask)
+{
+	struct xregs_state *xsave = this_cpu_read(ext_regs_buf);
+	u64 valid_mask = x86_pmu.ext_regs_mask & mask;
+
+	if (WARN_ON_ONCE(!xsave))
+		return;
+
+	xsaves_nmi(xsave, valid_mask);
+
+	/* Filtered by what XSAVE really gives */
+	valid_mask &= xsave->header.xfeatures;
+
+	if (valid_mask & XFEATURE_MASK_SSE)
+		perf_regs->xmm_space = xsave->i387.xmm_space;
+}
+
+static void release_ext_regs_buffers(void)
+{
+	int cpu;
+
+	if (!x86_pmu.ext_regs_mask)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		kfree(per_cpu(ext_regs_buf, cpu));
+		per_cpu(ext_regs_buf, cpu) = NULL;
+	}
+}
+
+static void reserve_ext_regs_buffers(void)
+{
+	unsigned int size;
+	int cpu;
+
+	if (!x86_pmu.ext_regs_mask)
+		return;
+
+	size = xstate_calculate_size(x86_pmu.ext_regs_mask, true);
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(ext_regs_buf, cpu) = kzalloc_node(size, GFP_KERNEL,
+							  cpu_to_node(cpu));
+		if (!per_cpu(ext_regs_buf, cpu))
+			goto err;
+	}
+
+	return;
+
+err:
+	release_ext_regs_buffers();
+}
+
 int x86_reserve_hardware(void)
 {
 	int err = 0;
@@ -418,6 +473,7 @@ int x86_reserve_hardware(void)
 			} else {
 				reserve_ds_buffers();
 				reserve_lbr_buffers();
+				reserve_ext_regs_buffers();
 			}
 		}
 		if (!err)
@@ -434,6 +490,7 @@ void x86_release_hardware(void)
 		release_pmc_hardware();
 		release_ds_buffers();
 		release_lbr_buffers();
+		release_ext_regs_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -642,21 +699,18 @@ int x86_pmu_hw_config(struct perf_event *event)
 			return -EINVAL;
 	}
 
-	/* sample_regs_user never support XMM registers */
-	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
-		return -EINVAL;
-	/*
-	 * Besides the general purpose registers, XMM registers may
-	 * be collected in PEBS on some platforms, e.g. Icelake
-	 */
-	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
-		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
-			return -EINVAL;
-
-		if (!event->attr.precise_ip)
-			return -EINVAL;
+	if (event->attr.sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+		/*
+		 * Besides the general purpose registers, XMM registers may
+		 * be collected as well.
+		 */
+		if (event_has_extended_regs(event)) {
+			if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
+				return -EINVAL;
+			if (!(x86_pmu.ext_regs_mask & XFEATURE_MASK_SSE))
+				return -EINVAL;
+		}
 	}
-
 	return x86_setup_perfctr(event);
 }
 
@@ -1685,25 +1739,51 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	static_call_cond(x86_pmu_del)(event);
 }
 
+static DEFINE_PER_CPU(struct x86_perf_regs, x86_user_regs);
+
+static struct x86_perf_regs *
+x86_pmu_perf_get_regs_user(struct perf_sample_data *data,
+			   struct pt_regs *regs)
+{
+	struct x86_perf_regs *x86_regs_user = this_cpu_ptr(&x86_user_regs);
+	struct perf_regs regs_user;
+
+	perf_get_regs_user(&regs_user, regs);
+	data->regs_user.abi = regs_user.abi;
+	if (regs_user.regs) {
+		x86_regs_user->regs = *regs_user.regs;
+		data->regs_user.regs = &x86_regs_user->regs;
+	} else
+		data->regs_user.regs = NULL;
+	return x86_regs_user;
+}
+
 void x86_pmu_setup_regs_data(struct perf_event *event,
 			     struct perf_sample_data *data,
-			     struct pt_regs *regs)
+			     struct pt_regs *regs,
+			     u64 ignore_mask)
 {
-	u64 sample_type = event->attr.sample_type;
+	struct x86_perf_regs *perf_regs = container_of(regs, struct x86_perf_regs, regs);
+	struct perf_event_attr *attr = &event->attr;
+	u64 sample_type = attr->sample_type;
+	u64 mask = 0;
+
+	if (!(sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)))
+		return;
 
 	if (sample_type & PERF_SAMPLE_REGS_USER) {
 		if (user_mode(regs)) {
 			data->regs_user.abi = perf_reg_abi(current);
 			data->regs_user.regs = regs;
 		} else if (!(current->flags & PF_KTHREAD)) {
-			perf_get_regs_user(&data->regs_user, regs);
+			perf_regs = x86_pmu_perf_get_regs_user(data, regs);
 		} else {
 			data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
 			data->regs_user.regs = NULL;
 		}
 		data->dyn_size += sizeof(u64);
 		if (data->regs_user.regs)
-			data->dyn_size += hweight64(event->attr.sample_regs_user) * sizeof(u64);
+			data->dyn_size += hweight64(attr->sample_regs_user) * sizeof(u64);
 		data->sample_flags |= PERF_SAMPLE_REGS_USER;
 	}
 
@@ -1712,9 +1792,18 @@ void x86_pmu_setup_regs_data(struct perf_event *event,
 		data->regs_intr.abi = perf_reg_abi(current);
 		data->dyn_size += sizeof(u64);
 		if (data->regs_intr.regs)
-			data->dyn_size += hweight64(event->attr.sample_regs_intr) * sizeof(u64);
+			data->dyn_size += hweight64(attr->sample_regs_intr) * sizeof(u64);
 		data->sample_flags |= PERF_SAMPLE_REGS_INTR;
 	}
+
+	if (event_has_extended_regs(event)) {
+		perf_regs->xmm_regs = NULL;
+		mask |= XFEATURE_MASK_SSE;
+	}
+
+	mask &= ~ignore_mask;
+	if (mask)
+		x86_pmu_get_ext_regs(perf_regs, mask);
 }
 
 int x86_pmu_handle_irq(struct pt_regs *regs)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c2fb729c270e..bd16f91dea1c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3284,6 +3284,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		if (has_branch_stack(event))
 			intel_pmu_lbr_save_brstack(&data, cpuc, event);
 
+		x86_pmu_setup_regs_data(event, &data, regs, 0);
+
 		perf_event_overflow(event, &data, regs);
 	}
 
@@ -5272,6 +5274,29 @@ static inline bool intel_pmu_broken_perf_cap(void)
 	return false;
 }
 
+static void intel_extended_regs_init(struct pmu *pmu)
+{
+	/*
+	 * Extend vector register support to non-PEBS events.
+	 * The feature is limited to newer Intel machines with
+	 * PEBS V4+ or archPerfmonExt (0x23) for now. In theory,
+	 * the vector registers can be retrieved as long as the
+	 * CPU supports them. Support for older generations may
+	 * be added later if there is a requirement.
+	 * Only support the extension when XSAVES is available,
+	 * since the registers are snapshotted with XSAVES.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_XSAVES))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_XMM) ||
+	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+		return;
+
+	x86_pmu.ext_regs_mask |= XFEATURE_MASK_SSE;
+	x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+}
+
 static void update_pmu_cap(struct pmu *pmu)
 {
 	unsigned int cntr, fixed_cntr, ecx, edx;
@@ -5306,6 +5331,8 @@ static void update_pmu_cap(struct pmu *pmu)
 		/* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
 		rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities);
 	}
+
+	intel_extended_regs_init(pmu);
 }
 
 static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e67d8a03ddfe..9cdece014ac0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1415,8 +1415,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 	if (gprs || (attr->precise_ip < 2) || tsx_weight)
 		pebs_data_cfg |= PEBS_DATACFG_GP;
 
-	if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
-	    (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
+	if (event_has_extended_regs(event))
 		pebs_data_cfg |= PEBS_DATACFG_XMMS;
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -2127,8 +2126,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 		}
 
 		if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) {
+			u64 mask = 0;
+
 			adaptive_pebs_save_regs(regs, gprs);
-			x86_pmu_setup_regs_data(event, data, regs);
+			if (format_group & PEBS_DATACFG_XMMS)
+				mask |= XFEATURE_MASK_SSE;
+			x86_pmu_setup_regs_data(event, data, regs, mask);
 		}
 	}
 
@@ -2755,6 +2758,7 @@ void __init intel_pebs_init(void)
 				x86_pmu.flags |= PMU_FL_PEBS_ALL;
 				x86_pmu.pebs_capable = ~0ULL;
 				pebs_qual = "-baseline";
+				x86_pmu.ext_regs_mask |= XFEATURE_MASK_SSE;
 				x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
 			} else {
 				/* Only basic record supported */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 12682a059608..7bf24842b1dc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -992,6 +992,12 @@ struct x86_pmu {
 	struct extra_reg *extra_regs;
 	unsigned int flags;
 
+	/*
+	 * Extended regs, e.g., vector registers
+	 * It uses the same format as XFEATURE_MASK_*.
+	 */
+	u64		ext_regs_mask;
+
 	/*
 	 * Intel host/guest support (KVM)
 	 */
@@ -1280,7 +1286,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
 
 void x86_pmu_setup_regs_data(struct perf_event *event,
 			     struct perf_sample_data *data,
-			     struct pt_regs *regs);
+			     struct pt_regs *regs,
+			     u64 ignore_mask);
 
 void x86_pmu_show_pmu_cap(struct pmu *pmu);
 
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 0c8b9251c29f..58bbdf9226d1 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -109,6 +109,8 @@ void xsaves(struct xregs_state *xsave, u64 mask);
 void xrstors(struct xregs_state *xsave, u64 mask);
 void xsaves_nmi(struct xregs_state *xsave, u64 mask);
 
+unsigned int xstate_calculate_size(u64 xfeatures, bool compacted);
+
 int xfd_enable_feature(u64 xfd_err);
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 70d1d94aca7e..f36f04bc95f1 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -592,7 +592,10 @@ extern void perf_events_lapic_init(void);
 struct pt_regs;
 struct x86_perf_regs {
 	struct pt_regs	regs;
-	u64		*xmm_regs;
+	union {
+		u64	*xmm_regs;
+		u32	*xmm_space;	/* for xsaves */
+	};
 };
 
 extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 8602683fcb12..4747b29608cd 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -583,7 +583,7 @@ static bool __init check_xstate_against_struct(int nr)
 	return true;
 }
 
-static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
 {
 	unsigned int topmost = fls64(xfeatures) -  1;
 	unsigned int offset, i;
-- 
2.38.1
Re: [PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER
Posted by Peter Zijlstra 5 months, 3 weeks ago
On Fri, Aug 15, 2025 at 02:34:23PM -0700, kan.liang@linux.intel.com wrote:
> [...]


I'm a little confused... *again* :-)

Specifically, we should consider two sets of registers:

 - the live set, as per the CPU (XSAVE)
 - the stored set, as per x86_task_fpu()

regs_intr should always get a copy of the live set; however
regs_user should not. It might need a copy of the x86_task_fpu() instead
of the live set, depending on TIF_NEED_FPU_LOAD (more or less, we need
another variable set in kernel_fpu_begin_mask() *after*
save_fpregs_to_fpstate() is completed).

I don't see this code make this distinction.

Consider getting a sample while the kernel is doing some avx enhanced
crypto and such.
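
A minimal sketch of that distinction, assuming the xsaves_nmi() helper
from this series; the x86_task_fpu() copy and the bare
TIF_NEED_FPU_LOAD check are illustrative only (as noted above, the
check alone is racy without an extra flag set in
kernel_fpu_begin_mask()):

	static void get_user_ext_regs_sketch(struct xregs_state *buf, u64 mask)
	{
		if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
			/*
			 * The user state is not live in the CPU; it was
			 * saved into the task's fpstate, e.g. by
			 * kernel_fpu_begin_mask() via save_fpregs_to_fpstate().
			 */
			memcpy(buf, &x86_task_fpu(current)->fpstate->regs.xsave,
			       sizeof(*buf));	/* size handling elided */
		} else {
			/* The live set is the user set; snapshot it. */
			xsaves_nmi(buf, mask);
		}
	}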
Re: [PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER
Posted by Liang, Kan 5 months, 3 weeks ago

On 2025-08-19 6:39 a.m., Peter Zijlstra wrote:
> On Fri, Aug 15, 2025 at 02:34:23PM -0700, kan.liang@linux.intel.com wrote:
>> [...]
> 
> 
> I'm a little confused... *again* :-)
> 
> Specifically, we should consider two sets of registers:
> 
>  - the live set, as per the CPU (XSAVE)
>  - the stored set, as per x86_task_fpu()
> 
> regs_intr should always get a copy of the live set; however
> regs_user should not. It might need a copy of the x86_task_fpu() instead
> of the live set, depending on TIF_NEED_FPU_LOAD (more or less, we need
> another variable set in kernel_fpu_begin_mask() *after*
> save_fpregs_to_fpstate() is completed).
> 
> I don't see this code make this distinction.
> 
> Consider getting a sample while the kernel is doing some avx enhanced
> crypto and such.

The regs_user only needs a set when the NMI hits user mode
(user_mode(regs)) or a non-kernel thread (!(current->flags &
PF_KTHREAD)). The live set is good enough for both cases.

I think the kernel crypto work should run in a kernel thread
(current->flags & PF_KTHREAD). If so, the regs_user should return NULL.
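
Condensed from the x86_pmu_setup_regs_data() hunk in the patch, the
three cases are (sketch only):

	if (user_mode(regs)) {
		/* NMI hit user mode: regs and the live XSAVE state
		 * are the user state. */
	} else if (!(current->flags & PF_KTHREAD)) {
		/* User task in kernel mode: GPRs come from
		 * task_pt_regs(current) via perf_get_regs_user(). */
	} else {
		/* Kernel thread: no user regs (PERF_SAMPLE_REGS_ABI_NONE). */
	}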

Thanks,
Kan
Re: [PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER
Posted by Mi, Dapeng 5 months, 2 weeks ago
On 8/19/2025 11:55 PM, Liang, Kan wrote:
>
> On 2025-08-19 6:39 a.m., Peter Zijlstra wrote:
>> On Fri, Aug 15, 2025 at 02:34:23PM -0700, kan.liang@linux.intel.com wrote:
>>> [...]
>>
>> I'm a little confused... *again* :-)
>>
>> Specifically, we should consider two sets of registers:
>>
>>  - the live set, as per the CPU (XSAVE)
>>  - the stored set, as per x86_task_fpu()
>>
>> regs_intr should always get a copy of the live set; however
>> regs_user should not. It might need a copy of the x86_task_fpu() instead
>> of the live set, depending on TIF_NEED_FPU_LOAD (more or less, we need
>> another variable set in kernel_fpu_begin_mask() *after*
>> save_fpregs_to_fpstate() is completed).
>>
>> I don't see this code make this distinction.
>>
>> Consider getting a sample while the kernel is doing some avx enhanced
>> crypto and such.
> The regs_user only needs a set when the NMI hits user mode
> (user_mode(regs)) or a non-kernel thread (!(current->flags &
> PF_KTHREAD)). The live set is good enough for both cases.

It's fine if the NMI hits user mode, but if the NMI hits kernel mode
(!(current->flags & PF_KTHREAD)), won't the kernel-space SIMD/eGPR regs
be exposed to user space via the user-regs option? I'm not sure whether
the kernel really uses these SIMD/eGPR regs right now, but it seems a
risk.


Re: [PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER
Posted by Liang, Kan 5 months, 2 weeks ago

On 2025-08-20 2:46 a.m., Mi, Dapeng wrote:
> 
> On 8/19/2025 11:55 PM, Liang, Kan wrote:
>>
>> On 2025-08-19 6:39 a.m., Peter Zijlstra wrote:
>>> On Fri, Aug 15, 2025 at 02:34:23PM -0700, kan.liang@linux.intel.com wrote:
>>>> [...]
>>>
>>> [...]
>> The regs_user only needs a set when the NMI hits user mode
>> (user_mode(regs)) or a non-kernel thread (!(current->flags &
>> PF_KTHREAD)). The live set is good enough for both cases.
> 
> It's fine if NMI hits user mode, but if NMI hits the kernel mode
> (!(current->flags &PF_KTHREAD)), won't the kernel space SIMD/eGPR regs be
> exposed to user space for user-regs option? I'm not sure if kernel space
> really use these SIMD/eGPR regs right now, but it seems a risk.
> 
>

I don't think it's possible for the existing kernel. But I cannot
guarantee future usage.

If the kernel mode handling is still a concern, I think we should drop
the SIMD/eGPR regs for this case for now, because:
- To profile a userspace application which requires SIMD/eGPR regs, the
NMI usually hits userspace. It's not common to hit kernel mode.
- The SIMD/eGPR regs cannot be retrieved from task_pt_regs(). Although
it's possible to retrieve the values when the TIF_NEED_FPU_LOAD flag is
set, I don't think it's worth introducing such complexity to handle an
uncommon case in the critical path.
- Furthermore, only checking the TIF_NEED_FPU_LOAD flag cannot cure
everything. Some corner cases cannot be handled either. For example, an
NMI can happen when the flag has just been switched but execution is
still in kernel mode.

We can always add the support later if someone thinks it's important to
retrieve the user SIMD/eGPR regs during a kernel syscall.
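
In code, the proposal would amount to a guard along these lines in
x86_pmu_setup_regs_data() (user_mask is a hypothetical split of the
existing mask; a sketch of the direction, not a final patch):

	if ((sample_type & PERF_SAMPLE_REGS_USER) &&
	    event_has_extended_regs(event)) {
		/*
		 * Only expose the user SIMD/eGPR regs when the NMI
		 * interrupted user mode, where the live set is the
		 * user set.
		 */
		if (user_mode(regs))
			user_mask |= XFEATURE_MASK_SSE;
	}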

Thanks,
Kan
Re: [PATCH V3 05/17] perf/x86: Support XMM register for non-PEBS and REGS_USER
Posted by Mi, Dapeng 5 months, 2 weeks ago
On 8/21/2025 2:03 AM, Liang, Kan wrote:
>
> On 2025-08-20 2:46 a.m., Mi, Dapeng wrote:
>> On 8/19/2025 11:55 PM, Liang, Kan wrote:
>>> On 2025-08-19 6:39 a.m., Peter Zijlstra wrote:
>>>> On Fri, Aug 15, 2025 at 02:34:23PM -0700, kan.liang@linux.intel.com wrote:
>>>>> [...]
>>>> [...]
>>> The regs_user only needs a set when the NMI hits user mode
>>> (user_mode(regs)) or a non-kernel thread (!(current->flags &
>>> PF_KTHREAD)). The live set is good enough for both cases.
>> It's fine if the NMI hits user mode, but if the NMI hits kernel mode
>> (!(current->flags & PF_KTHREAD)), won't the kernel-space SIMD/eGPR regs
>> be exposed to user space via the user-regs option? I'm not sure whether
>> the kernel really uses these SIMD/eGPR regs right now, but it seems a
>> risk.
>>
>>
> I don't think it's possible for the existing kernel. But I cannot
> guarantee future usage.
>
> If the kernel mode handling is still a concern, I think we should drop
> the SIMD/eGPR regs for this case for now, because:
> - To profile a userspace application which requires SIMD/eGPR regs, the
> NMI usually hits userspace. It's not common to hit kernel mode.
> - The SIMD/eGPR regs cannot be retrieved from task_pt_regs(). Although
> it's possible to retrieve the values when the TIF_NEED_FPU_LOAD flag is
> set, I don't think it's worth introducing such complexity to handle an
> uncommon case in the critical path.
> - Furthermore, only checking the TIF_NEED_FPU_LOAD flag cannot cure
> everything. Some corner cases cannot be handled either. For example, an
> NMI can happen when the flag has just been switched but execution is
> still in kernel mode.
>
> We can always add the support later if someone thinks it's important to
> retrieve the user SIMD/eGPR regs during a kernel syscall.

+1

