Arch-PEBS introduces a new MSR, IA32_PEBS_BASE, to store the physical
address of the arch-PEBS buffer. Allocate the arch-PEBS buffer and
initialize the IA32_PEBS_BASE MSR with the allocated buffer's physical
address.
Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
arch/x86/events/intel/core.c | 2 +
arch/x86/events/intel/ds.c | 73 +++++++++++++++++++++++++++------
arch/x86/events/perf_event.h | 7 +++-
arch/x86/include/asm/intel_ds.h | 3 +-
4 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4025ea7934ac..5e6ef9f3a077 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5460,6 +5460,7 @@ static void intel_pmu_cpu_starting(int cpu)
return;
init_debug_store_on_cpu(cpu);
+ init_arch_pebs_buf_on_cpu(cpu);
/*
* Deal with CPUs that don't clear their LBRs on power-up, and that may
* even boot with LBRs enabled.
@@ -5557,6 +5558,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
+ fini_arch_pebs_buf_on_cpu(cpu);
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index b6eface4dccd..72b925b8c482 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -625,13 +625,22 @@ static int alloc_pebs_buffer(int cpu)
int max, node = cpu_to_node(cpu);
void *buffer, *insn_buff, *cea;
- if (!x86_pmu.ds_pebs)
+ if (!intel_pmu_has_pebs())
return 0;
- buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+ /*
+ * alloc_pebs_buffer() could be called by init_arch_pebs_buf_on_cpu()
+ * which is in atomic context.
+ */
+ buffer = dsalloc_pages(bsiz, preemptible() ? GFP_KERNEL : GFP_ATOMIC, cpu);
if (unlikely(!buffer))
return -ENOMEM;
+ if (x86_pmu.arch_pebs) {
+ hwev->pebs_vaddr = buffer;
+ return 0;
+ }
+
/*
* HSW+ already provides us the eventing ip; no need to allocate this
* buffer then.
@@ -644,7 +653,7 @@ static int alloc_pebs_buffer(int cpu)
}
per_cpu(insn_buffer, cpu) = insn_buff;
}
- hwev->ds_pebs_vaddr = buffer;
+ hwev->pebs_vaddr = buffer;
/* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (unsigned long) cea;
@@ -660,17 +669,20 @@ static void release_pebs_buffer(int cpu)
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
void *cea;
- if (!x86_pmu.ds_pebs)
+ if (!intel_pmu_has_pebs())
return;
- kfree(per_cpu(insn_buffer, cpu));
- per_cpu(insn_buffer, cpu) = NULL;
+ if (x86_pmu.ds_pebs) {
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
- /* Clear the fixmap */
- cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
- ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
- dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
- hwev->ds_pebs_vaddr = NULL;
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
+ }
+
+ dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
@@ -823,6 +835,41 @@ void reserve_ds_buffers(void)
}
}
+void init_arch_pebs_buf_on_cpu(int cpu)
+{
+ struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ u64 arch_pebs_base;
+
+ if (!x86_pmu.arch_pebs)
+ return;
+
+ if (alloc_pebs_buffer(cpu) < 0 || !cpuc->pebs_vaddr) {
+ WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu);
+ x86_pmu.pebs_active = 0;
+ return;
+ }
+
+ /*
+ * 4KB-aligned pointer of the output buffer
+ * (__alloc_pages_node() return page aligned address)
+ * Buffer Size = 4KB * 2^SIZE
+ * contiguous physical buffer (__alloc_pages_node() with order)
+ */
+ arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT;
+ wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base,
+ (u32)(arch_pebs_base >> 32));
+ x86_pmu.pebs_active = 1;
+}
+
+void fini_arch_pebs_buf_on_cpu(int cpu)
+{
+ if (!x86_pmu.arch_pebs)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0);
+ release_pebs_buffer(cpu);
+}
+
/*
* BTS
*/
@@ -2877,8 +2924,8 @@ static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
return;
}
- base = cpuc->ds_pebs_vaddr;
- top = (void *)((u64)cpuc->ds_pebs_vaddr +
+ base = cpuc->pebs_vaddr;
+ top = (void *)((u64)cpuc->pebs_vaddr +
(index.split.wr << ARCH_PEBS_INDEX_WR_SHIFT));
mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a5145e8f1ddb..82e8c20611b9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -283,8 +283,9 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
- void *ds_pebs_vaddr;
void *ds_bts_vaddr;
+ /* DS based PEBS or arch-PEBS buffer address */
+ void *pebs_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
@@ -1618,6 +1619,10 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
int intel_pmu_init(void);
+void init_arch_pebs_buf_on_cpu(int cpu);
+
+void fini_arch_pebs_buf_on_cpu(int cpu);
+
void init_debug_store_on_cpu(int cpu);
void fini_debug_store_on_cpu(int cpu);
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
index 5dbeac48a5b9..023c2883f9f3 100644
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@@ -4,7 +4,8 @@
#include <linux/percpu-defs.h>
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SHIFT 4
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT)
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS_FMT4 8
--
2.43.0
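
As the comment in init_arch_pebs_buf_on_cpu() above notes, the value written
to IA32_PEBS_BASE packs both the 4KB-aligned physical address of the buffer
and the size exponent (buffer size = 4KB * 2^SIZE). Below is a minimal
stand-alone sketch of just that arithmetic, assuming the PEBS_BUFFER_SHIFT of
4 from the header change above and an illustrative physical address; it is
not kernel code.

#include <stdint.h>
#include <stdio.h>

#define PEBS_PAGE_SIZE		4096ULL
#define PEBS_BUFFER_SHIFT	4	/* buffer size = 4KB * 2^4 = 64KB */
#define PEBS_BUFFER_SIZE	(PEBS_PAGE_SIZE << PEBS_BUFFER_SHIFT)

int main(void)
{
	/* Hypothetical page-aligned physical address of the PEBS buffer. */
	uint64_t pebs_phys = 0x12345000ULL;
	/* Low bits carry the size exponent, as in init_arch_pebs_buf_on_cpu(). */
	uint64_t pebs_base = pebs_phys | PEBS_BUFFER_SHIFT;
	/* wrmsr_on_cpu() takes the low and high 32-bit halves separately. */
	uint32_t lo = (uint32_t)pebs_base;
	uint32_t hi = (uint32_t)(pebs_base >> 32);

	printf("IA32_PEBS_BASE = 0x%llx (lo=0x%x, hi=0x%x), buffer = %llu bytes\n",
	       (unsigned long long)pebs_base, (unsigned)lo, (unsigned)hi,
	       (unsigned long long)PEBS_BUFFER_SIZE);
	return 0;
}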
On Fri, Jun 20, 2025 at 10:39:03AM +0000, Dapeng Mi wrote:

> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index b6eface4dccd..72b925b8c482 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -625,13 +625,22 @@ static int alloc_pebs_buffer(int cpu)
> int max, node = cpu_to_node(cpu);
> void *buffer, *insn_buff, *cea;
>
> - if (!x86_pmu.ds_pebs)
> + if (!intel_pmu_has_pebs())
> return 0;
>
> - buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
> + /*
> + * alloc_pebs_buffer() could be called by init_arch_pebs_buf_on_cpu()
> + * which is in atomic context.
> + */
> + buffer = dsalloc_pages(bsiz, preemptible() ? GFP_KERNEL : GFP_ATOMIC, cpu);
> if (unlikely(!buffer))
> return -ENOMEM;

Here we go again.. that is CPU_STARTING context, that has IRQs disabled
and as such no allocation is allowed. Not even GFP_ATOMIC -- this will
break PREEMPT_RT.
On 6/21/2025 5:20 PM, Peter Zijlstra wrote:
> On Fri, Jun 20, 2025 at 10:39:03AM +0000, Dapeng Mi wrote:
>
>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
>> index b6eface4dccd..72b925b8c482 100644
>> --- a/arch/x86/events/intel/ds.c
>> +++ b/arch/x86/events/intel/ds.c
>> @@ -625,13 +625,22 @@ static int alloc_pebs_buffer(int cpu)
>> int max, node = cpu_to_node(cpu);
>> void *buffer, *insn_buff, *cea;
>>
>> - if (!x86_pmu.ds_pebs)
>> + if (!intel_pmu_has_pebs())
>> return 0;
>>
>> - buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
>> + /*
>> + * alloc_pebs_buffer() could be called by init_arch_pebs_buf_on_cpu()
>> + * which is in atomic context.
>> + */
>> + buffer = dsalloc_pages(bsiz, preemptible() ? GFP_KERNEL : GFP_ATOMIC, cpu);
>> if (unlikely(!buffer))
>> return -ENOMEM;
> Here we go again.. that is CPU_STARTING context, that has IRQs disabled
> and as such no allocation is allowed. Not even GFP_ATOMIC -- this will
> break PREEMPT_RT.

Thanks. So we could have to follow what current legacy PEBS does and defer
the PEBS buffer allocation until creating perf events
(x86_reserve_hardware()).
On Mon, Jun 23, 2025 at 09:17:23AM +0800, Mi, Dapeng wrote:
>
> On 6/21/2025 5:20 PM, Peter Zijlstra wrote:
> > On Fri, Jun 20, 2025 at 10:39:03AM +0000, Dapeng Mi wrote:
> >
> >> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> >> index b6eface4dccd..72b925b8c482 100644
> >> --- a/arch/x86/events/intel/ds.c
> >> +++ b/arch/x86/events/intel/ds.c
> >> @@ -625,13 +625,22 @@ static int alloc_pebs_buffer(int cpu)
> >> int max, node = cpu_to_node(cpu);
> >> void *buffer, *insn_buff, *cea;
> >>
> >> - if (!x86_pmu.ds_pebs)
> >> + if (!intel_pmu_has_pebs())
> >> return 0;
> >>
> >> - buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
> >> + /*
> >> + * alloc_pebs_buffer() could be called by init_arch_pebs_buf_on_cpu()
> >> + * which is in atomic context.
> >> + */
> >> + buffer = dsalloc_pages(bsiz, preemptible() ? GFP_KERNEL : GFP_ATOMIC, cpu);
> >> if (unlikely(!buffer))
> >> return -ENOMEM;
> > Here we go again.. that is CPU_STARTING context, that has IRQs disabled
> > and as such no allocation is allowed. Not even GFP_ATOMIC -- this will
> > break PREEMPT_RT.
>
> Thanks. So we could have to follow what current legacy PEBS does and defer
> the PEBS buffer allocation until creating perf events
> (x86_reserve_hardware()).

The normal way to do this kind of thing is allocate in prepare, use in
starting, and the reverse on down, stop using in dying and free in dead.

Specifically we have the callbacks:

CPUHP_PERF_X86_PREPARE -> x86_pmu.cpu_prepare() / x86_pmu.cpu_dead()
CPUHP_PERF_X86_STARTING -> x86_pmu.cpu_starting() / x86_pmu.cpu_dying()

to arrange for just such a setup.
On 6/23/2025 3:28 PM, Peter Zijlstra wrote:
> On Mon, Jun 23, 2025 at 09:17:23AM +0800, Mi, Dapeng wrote:
>> On 6/21/2025 5:20 PM, Peter Zijlstra wrote:
>>> On Fri, Jun 20, 2025 at 10:39:03AM +0000, Dapeng Mi wrote:
>>>
>>>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
>>>> index b6eface4dccd..72b925b8c482 100644
>>>> --- a/arch/x86/events/intel/ds.c
>>>> +++ b/arch/x86/events/intel/ds.c
>>>> @@ -625,13 +625,22 @@ static int alloc_pebs_buffer(int cpu)
>>>> int max, node = cpu_to_node(cpu);
>>>> void *buffer, *insn_buff, *cea;
>>>>
>>>> - if (!x86_pmu.ds_pebs)
>>>> + if (!intel_pmu_has_pebs())
>>>> return 0;
>>>>
>>>> - buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
>>>> + /*
>>>> + * alloc_pebs_buffer() could be called by init_arch_pebs_buf_on_cpu()
>>>> + * which is in atomic context.
>>>> + */
>>>> + buffer = dsalloc_pages(bsiz, preemptible() ? GFP_KERNEL : GFP_ATOMIC, cpu);
>>>> if (unlikely(!buffer))
>>>> return -ENOMEM;
>>> Here we go again.. that is CPU_STARTING context, that has IRQs disabled
>>> and as such no allocation is allowed. Not even GFP_ATOMIC -- this will
>>> break PREEMPT_RT.
>> Thanks. So we could have to follow what current legacy PEBS does and defer
>> the PEBS buffer allocation until creating perf events
>> (x86_reserve_hardware()).
> The normal way to do this kind of thing is allocate in prepare, use in
> starting, and the reverse on down, stop using in dying and free in dead.
>
> Specifically we have the callbacks:
>
> CPUHP_PERF_X86_PREPARE -> x86_pmu.cpu_prepare() / x86_pmu.cpu_dead()
> CPUHP_PERF_X86_STARTING -> x86_pmu.cpu_starting() / x86_pmu.cpu_dying()
>
> to arrange for just such a setup.

Sure. Would do. Thanks.
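
For context, a rough sketch of the prepare/starting split Peter describes,
using hypothetical helper names hung off the existing x86_pmu callbacks; the
eventual rework may of course look different.

/*
 * Hypothetical helpers illustrating the suggested split -- not the actual
 * follow-up patch.  The buffer is allocated from the PREPARE callback,
 * where sleeping allocations are fine, and the MSR is only written from
 * the STARTING/DYING callbacks, which run on the target CPU with IRQs off.
 */
static int arch_pebs_cpu_prepare(int cpu)		/* x86_pmu.cpu_prepare() */
{
	/* GFP_KERNEL allocation is allowed in this preemptible context. */
	return alloc_pebs_buffer(cpu);
}

static void arch_pebs_cpu_starting(int cpu)		/* x86_pmu.cpu_starting() */
{
	struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
	u64 base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT;

	/* Runs on @cpu itself, so a plain wrmsrl() is sufficient. */
	wrmsrl(MSR_IA32_PEBS_BASE, base);
}

static void arch_pebs_cpu_dying(int cpu)		/* x86_pmu.cpu_dying() */
{
	wrmsrl(MSR_IA32_PEBS_BASE, 0);
}

static void arch_pebs_cpu_dead(int cpu)			/* x86_pmu.cpu_dead() */
{
	release_pebs_buffer(cpu);			/* may sleep again here */
}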