Sampling is inherently a feature for CPU PMUs, given that the thing
to be sampled is a CPU context. These days, we have many more
uncore/system PMUs than CPU PMUs, so it no longer makes much sense to
assume sampling support by default and force the ever-growing majority
of drivers to opt out of it (or erroneously fail to). Instead, let's
introduce a positive opt-in capability that's more obvious and easier to
maintain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
arch/alpha/kernel/perf_event.c | 3 ++-
arch/arc/kernel/perf_event.c | 2 ++
arch/csky/kernel/perf_event.c | 2 ++
arch/loongarch/kernel/perf_event.c | 1 +
arch/mips/kernel/perf_event_mipsxx.c | 1 +
arch/powerpc/perf/core-book3s.c | 1 +
arch/powerpc/perf/core-fsl-emb.c | 1 +
arch/powerpc/perf/imc-pmu.c | 1 +
arch/s390/kernel/perf_cpum_cf.c | 1 +
arch/s390/kernel/perf_cpum_sf.c | 2 ++
arch/s390/kernel/perf_pai_crypto.c | 1 +
arch/s390/kernel/perf_pai_ext.c | 1 +
arch/sparc/kernel/perf_event.c | 1 +
arch/x86/events/amd/ibs.c | 2 ++
arch/x86/events/core.c | 4 +++-
arch/xtensa/kernel/perf_event.c | 1 +
drivers/perf/arm_pmu.c | 3 ++-
drivers/perf/arm_pmu_platform.c | 1 +
drivers/perf/arm_spe_pmu.c | 3 ++-
drivers/perf/riscv_pmu_sbi.c | 2 ++
include/linux/perf_event.h | 3 ++-
kernel/events/core.c | 20 +++++++++++---------
kernel/events/hw_breakpoint.c | 1 +
23 files changed, 44 insertions(+), 14 deletions(-)
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 8557165e64c0..4de1802d249f 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -761,7 +761,8 @@ static struct pmu pmu = {
.start = alpha_pmu_start,
.stop = alpha_pmu_stop,
.read = alpha_pmu_read,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .capabilities = PERF_PMU_CAP_SAMPLING |
+ PERF_PMU_CAP_NO_EXCLUDE,
};
diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c
index ed6d4f0cd621..1b99b0215027 100644
--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
@@ -818,6 +818,8 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
if (irq == -1)
arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+ else
+ arc_pmu->pmu.capabilities |= PERF_PMU_CAP_SAMPLING;
/*
* perf parser doesn't really like '-' symbol in events name, so let's
diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index e0a36acd265b..c5ba6e235a6f 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -1204,6 +1204,7 @@ int init_hw_perf_events(void)
}
csky_pmu.pmu = (struct pmu) {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = csky_pmu_enable,
.pmu_disable = csky_pmu_disable,
.event_init = csky_pmu_event_init,
@@ -1314,6 +1315,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
ret = csky_pmu_request_irq(csky_pmu_handle_irq);
if (ret) {
+ csky_pmu.pmu.capabilities &= ~PERF_PMU_CAP_SAMPLING;
csky_pmu.pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
pr_notice("[perf] PMU request irq fail!\n");
}
diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c
index 8ad098703488..341b17bedd0e 100644
--- a/arch/loongarch/kernel/perf_event.c
+++ b/arch/loongarch/kernel/perf_event.c
@@ -571,6 +571,7 @@ static int loongarch_pmu_event_init(struct perf_event *event)
}
static struct pmu pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = loongarch_pmu_enable,
.pmu_disable = loongarch_pmu_disable,
.event_init = loongarch_pmu_event_init,
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 196a070349b0..4c5d64d1158e 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -687,6 +687,7 @@ static int mipspmu_event_init(struct perf_event *event)
}
static struct pmu pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = mipspmu_enable,
.pmu_disable = mipspmu_disable,
.event_init = mipspmu_event_init,
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index d67f7d511f13..cfe7d3c120e1 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2207,6 +2207,7 @@ ssize_t power_events_sysfs_show(struct device *dev,
}
static struct pmu power_pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = power_pmu_enable,
.pmu_disable = power_pmu_disable,
.event_init = power_pmu_event_init,
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index 509932b91b75..62038ff3663f 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -570,6 +570,7 @@ static int fsl_emb_pmu_event_init(struct perf_event *event)
}
static struct pmu fsl_emb_pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = fsl_emb_pmu_enable,
.pmu_disable = fsl_emb_pmu_disable,
.event_init = fsl_emb_pmu_event_init,
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 8664a7d297ad..f352dda3baf9 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1507,6 +1507,7 @@ static int update_pmu_ops(struct imc_pmu *pmu)
pmu->pmu.commit_txn = thread_imc_pmu_commit_txn;
break;
case IMC_DOMAIN_TRACE:
+ pmu->pmu.capabilities |= PERF_PMU_CAP_SAMPLING;
pmu->pmu.event_init = trace_imc_event_init;
pmu->pmu.add = trace_imc_event_add;
pmu->pmu.del = trace_imc_event_del;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 4d09954ebf49..7d10842d54f0 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -1861,6 +1861,7 @@ static const struct attribute_group *cfdiag_attr_groups[] = {
*/
static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = cfdiag_event_init,
.pmu_enable = cpumf_pmu_enable,
.pmu_disable = cpumf_pmu_disable,
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index f432869f8921..3d2c400f0aaa 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1892,6 +1892,8 @@ static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
};
static struct pmu cpumf_sampling = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
+
.pmu_enable = cpumsf_pmu_enable,
.pmu_disable = cpumsf_pmu_disable,
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index f373a1009c45..a64b6b056a21 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -569,6 +569,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = {
/* Performance monitoring unit for mapped counters */
static struct pmu paicrypt = {
.task_ctx_nr = perf_hw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = paicrypt_event_init,
.add = paicrypt_add,
.del = paicrypt_del,
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index d827473e7f87..1261f80c6d52 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -595,6 +595,7 @@ static const struct attribute_group *paiext_attr_groups[] = {
/* Performance monitoring unit for mapped counters */
static struct pmu paiext = {
.task_ctx_nr = perf_hw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = paiext_event_init,
.add = paiext_add,
.del = paiext_del,
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 706127749c66..6ecea8e7b592 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1573,6 +1573,7 @@ static int sparc_pmu_commit_txn(struct pmu *pmu)
}
static struct pmu pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = sparc_pmu_enable,
.pmu_disable = sparc_pmu_disable,
.event_init = sparc_pmu_event_init,
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 95de309fc7d5..ed07d80b6fe0 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -768,6 +768,7 @@ static struct perf_ibs perf_ibs_fetch = {
.pmu = {
.task_ctx_nr = perf_hw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
@@ -793,6 +794,7 @@ static struct perf_ibs perf_ibs_op = {
.pmu = {
.task_ctx_nr = perf_hw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index eca5bb49aa85..72a4c43951ee 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1837,7 +1837,7 @@ static void __init pmu_check_apic(void)
* sample via a hrtimer based software event):
*/
pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
+ pmu.capabilities &= ~PERF_PMU_CAP_SAMPLING;
}
static struct attribute_group x86_pmu_format_group __ro_after_init = {
@@ -2698,6 +2698,8 @@ static bool x86_pmu_filter(struct pmu *pmu, int cpu)
}
static struct pmu pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
+
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 223f1d452310..b03a2feb0f92 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -397,6 +397,7 @@ irqreturn_t xtensa_pmu_irq_handler(int irq, void *dev_id)
}
static struct pmu xtensa_pmu = {
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.pmu_enable = xtensa_pmu_enable,
.pmu_disable = xtensa_pmu_disable,
.event_init = xtensa_pmu_event_init,
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 2c1af3a0207c..72d8f38d0aa5 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -876,7 +876,8 @@ struct arm_pmu *armpmu_alloc(void)
* PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a
* specific PMU.
*/
- .capabilities = PERF_PMU_CAP_EXTENDED_REGS |
+ .capabilities = PERF_PMU_CAP_SAMPLING |
+ PERF_PMU_CAP_EXTENDED_REGS |
PERF_PMU_CAP_EXTENDED_HW_TYPE,
};
diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 118170a5cede..ab7a802cd0d6 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -109,6 +109,7 @@ static int pmu_parse_irqs(struct arm_pmu *pmu)
*/
if (num_irqs == 0) {
dev_warn(dev, "no irqs for PMU, sampling events not supported\n");
+ pmu->pmu.capabilities &= ~PERF_PMU_CAP_SAMPLING;
pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
cpumask_setall(&pmu->supported_cpus);
return 0;
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 369e77ad5f13..dbd52851f5c6 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -955,7 +955,8 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
spe_pmu->pmu = (struct pmu) {
.module = THIS_MODULE,
.parent = &spe_pmu->pdev->dev,
- .capabilities = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
+ .capabilities = PERF_PMU_CAP_SAMPLING |
+ PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
.attr_groups = arm_spe_pmu_attr_groups,
/*
* We hitch a ride on the software context here, so that
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 698de8ddf895..d185ea8c47ba 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -1361,6 +1361,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
pr_info("Perf sampling/filtering is not supported as sscof extension is not available\n");
pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE;
+ } else {
+ pmu->pmu.capabilities |= PERF_PMU_CAP_SAMPLING;
}
pmu->pmu.attr_groups = riscv_pmu_attr_groups;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4d439c24c901..bf2cfbeabba2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -294,7 +294,7 @@ struct perf_event_pmu_context;
/**
* pmu::capabilities flags
*/
-#define PERF_PMU_CAP_NO_INTERRUPT 0x0001
+#define PERF_PMU_CAP_SAMPLING 0x0001
#define PERF_PMU_CAP_NO_NMI 0x0002
#define PERF_PMU_CAP_AUX_NO_SG 0x0004
#define PERF_PMU_CAP_EXTENDED_REGS 0x0008
@@ -305,6 +305,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
#define PERF_PMU_CAP_AUX_PAUSE 0x0200
#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400
+#define PERF_PMU_CAP_NO_INTERRUPT 0x0800
/**
* pmu::scope
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8060c2857bb2..71b2a6730705 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4359,7 +4359,7 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
continue;
if (!perf_pmu_ctx_is_active(pmu_ctx))
continue;
- if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ if (!(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_SAMPLING))
continue;
perf_pmu_disable(pmu_ctx->pmu);
@@ -10819,7 +10819,7 @@ static int perf_swevent_init(struct perf_event *event)
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_NMI,
+ .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_NO_NMI,
.event_init = perf_swevent_init,
.add = perf_swevent_add,
@@ -10861,6 +10861,7 @@ static int perf_tp_event_init(struct perf_event *event)
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = perf_tp_event_init,
.add = perf_trace_add,
.del = perf_trace_del,
@@ -11066,6 +11067,7 @@ static struct pmu perf_kprobe = {
.stop = perf_swevent_stop,
.read = perf_swevent_read,
.attr_groups = kprobe_attr_groups,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
};
static int perf_kprobe_event_init(struct perf_event *event)
@@ -11125,6 +11127,7 @@ static struct pmu perf_uprobe = {
.stop = perf_swevent_stop,
.read = perf_swevent_read,
.attr_groups = uprobe_attr_groups,
+ .capabilities = PERF_PMU_CAP_SAMPLING,
};
static int perf_uprobe_event_init(struct perf_event *event)
@@ -11899,7 +11902,7 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_NMI,
+ .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_NO_NMI,
.dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
@@ -11982,7 +11985,7 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_NMI,
+ .capabilities = PERF_PMU_CAP_SAMPLING | PERF_PMU_CAP_NO_NMI,
.dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
@@ -13476,11 +13479,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- if (is_sampling_event(event)) {
- if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -EOPNOTSUPP;
- goto err_alloc;
- }
+ if (is_sampling_event(event) &&
+ !(event->pmu->capabilities & PERF_PMU_CAP_SAMPLING)) {
+ err = -EOPNOTSUPP;
+ goto err_alloc;
}
/*
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 8ec2cb688903..604be7d7aecf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -996,6 +996,7 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
+ .capabilities = PERF_PMU_CAP_SAMPLING,
.event_init = hw_breakpoint_event_init,
.add = hw_breakpoint_add,
.del = hw_breakpoint_del,
--
2.39.2.101.g768bb238c484.dirty
On Wed, Aug 13, 2025 at 06:01:08PM +0100, Robin Murphy wrote: > Sampling is inherently a feature for CPU PMUs, given that the thing > to be sampled is a CPU context. These days, we have many more > uncore/system PMUs than CPU PMUs, so it no longer makes much sense to > assume sampling support by default and force the ever-growing majority > of drivers to opt out of it (or erroneously fail to). Instead, let's > introduce a positive opt-in capability that's more obvious and easier to > maintain. [...] > diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c > index 369e77ad5f13..dbd52851f5c6 100644 > --- a/drivers/perf/arm_spe_pmu.c > +++ b/drivers/perf/arm_spe_pmu.c > @@ -955,7 +955,8 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu) > spe_pmu->pmu = (struct pmu) { > .module = THIS_MODULE, > .parent = &spe_pmu->pdev->dev, > - .capabilities = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE, > + .capabilities = PERF_PMU_CAP_SAMPLING | > + PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE, > .attr_groups = arm_spe_pmu_attr_groups, > /* > * We hitch a ride on the software context here, so that The change in Arm SPE driver looks good to me. I noticed you did not set the flag for other AUX events, like Arm CoreSight, Intel PT and bts. The drivers are located in: drivers/hwtracing/coresight/coresight-etm-perf.c arch/x86/events/intel/bts.c arch/x86/events/intel/pt.c Generally, AUX events generate interrupts based on the AUX ring buffer watermark but not the period. Seems to me, it is correct to set the PERF_PMU_CAP_SAMPLING flag for them. A special case is that Arm CoreSight legacy sinks (like ETR/ETB, etc) don't have an interrupt. 
We might need to set or clear the flag on the fly based on sink type: diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index f1551c08ecb2..404edc94c198 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -433,6 +433,11 @@ static void *etm_setup_aux(struct perf_event *event, void **pages, if (!sink) goto err; + if (coresight_is_percpu_sink(sink)) + event->pmu.capabilities = PERF_PMU_CAP_SAMPLING; + else + event->pmu.capabilities &= ~PERF_PMU_CAP_SAMPLING; + Thanks, Leo
On 2025-08-26 2:11 pm, Leo Yan wrote: > On Wed, Aug 13, 2025 at 06:01:08PM +0100, Robin Murphy wrote: >> Sampling is inherently a feature for CPU PMUs, given that the thing >> to be sampled is a CPU context. These days, we have many more >> uncore/system PMUs than CPU PMUs, so it no longer makes much sense to >> assume sampling support by default and force the ever-growing majority >> of drivers to opt out of it (or erroneously fail to). Instead, let's >> introduce a positive opt-in capability that's more obvious and easier to >> maintain. > > [...] > >> diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c >> index 369e77ad5f13..dbd52851f5c6 100644 >> --- a/drivers/perf/arm_spe_pmu.c >> +++ b/drivers/perf/arm_spe_pmu.c >> @@ -955,7 +955,8 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu) >> spe_pmu->pmu = (struct pmu) { >> .module = THIS_MODULE, >> .parent = &spe_pmu->pdev->dev, >> - .capabilities = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE, >> + .capabilities = PERF_PMU_CAP_SAMPLING | >> + PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE, >> .attr_groups = arm_spe_pmu_attr_groups, >> /* >> * We hitch a ride on the software context here, so that > > The change in Arm SPE driver looks good to me. > > I noticed you did not set the flag for other AUX events, like Arm > CoreSight, Intel PT and bts. The drivers locate in: > > drivers/hwtracing/coresight/coresight-etm-perf.c > arch/x86/events/intel/bts.c > arch/x86/events/intel/pt.c > > Genearlly, AUX events generate interrupts based on AUX ring buffer > watermark but not the period. Seems to me, it is correct to set the > PERF_PMU_CAP_SAMPLING flag for them. This cap is given to drivers which handle event->attr.sample_period and call perf_event_overflow() - or in a few rare cases, perf_output_sample() directly - to do something meaningful with it, since the intent is to convey "I properly handle events for which is_sampling_event() is true". 
My understanding is that aux events are something else entirely, but I'm happy to be corrected. Otherwise, perhaps this suggests it deserves to be named a little more specifically for clarity, maybe PERF_CAP_SAMPLING_EVENTS? Thanks, Robin. > A special case is Arm CoreSight legacy sinks (like ETR/ETB, etc) > don't has interrupt. We might need set or clear the flag on the fly > based on sink type: > > diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c > index f1551c08ecb2..404edc94c198 100644 > --- a/drivers/hwtracing/coresight/coresight-etm-perf.c > +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c > @@ -433,6 +433,11 @@ static void *etm_setup_aux(struct perf_event *event, void **pages, > if (!sink) > goto err; > > + if (coresight_is_percpu_sink(sink)) > + event->pmu.capabilities = PERF_PMU_CAP_SAMPLING; > + else > + event->pmu.capabilities &= ~PERF_PMU_CAP_SAMPLING; > + > > Thanks, > Leo
On Tue, Aug 26, 2025 at 04:53:51PM +0100, Robin Murphy wrote: [...] > > Genearlly, AUX events generate interrupts based on AUX ring buffer > > watermark but not the period. Seems to me, it is correct to set the > > PERF_PMU_CAP_SAMPLING flag for them. > > This cap is given to drivers which handle event->attr.sample_period and call > perf_event_overflow() - or in a few rare cases, perf_output_sample() > directly - to do something meaningful with it, since the intent is to convey > "I properly handle events for which is_sampling_event() is true". My > understanding is that aux events are something else entirely, but I'm happy > to be corrected. If the discussion is based only on this patch, my understanding is that the PERF_PMU_CAP_SAMPLING flag replaces the PERF_PMU_CAP_NO_INTERRUPT flag for checking whether a PMU event needs to be re-enabled in perf_adjust_freq_unthr_context(). AUX events can trigger a large number of interrupts under certain conditions (e.g., if we set a very small watermark). This is why I conclude that we need to set the PERF_PMU_CAP_SAMPLING flag to ensure that AUX events are re-enabled properly after throttling (see perf_adjust_freq_unthr_events()). > Otherwise, perhaps this suggests it deserves to be named a little more > specifically for clarity, maybe PERF_CAP_SAMPLING_EVENTS? Seems to me, the naming is not critical. Without setting the PERF_PMU_CAP_SAMPLING flag, AUX events might lose the chance to be re-enabled after throttling. Thanks, Leo
On Wed, Aug 13, 2025 at 06:01:08PM +0100, Robin Murphy wrote: > Sampling is inherently a feature for CPU PMUs, given that the thing > to be sampled is a CPU context. These days, we have many more > uncore/system PMUs than CPU PMUs, so it no longer makes much sense to > assume sampling support by default and force the ever-growing majority > of drivers to opt out of it (or erroneously fail to). Instead, let's > introduce a positive opt-in capability that's more obvious and easier to > maintain. > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h > index 4d439c24c901..bf2cfbeabba2 100644 > --- a/include/linux/perf_event.h > +++ b/include/linux/perf_event.h > @@ -294,7 +294,7 @@ struct perf_event_pmu_context; > /** > * pmu::capabilities flags > */ > -#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 > +#define PERF_PMU_CAP_SAMPLING 0x0001 > #define PERF_PMU_CAP_NO_NMI 0x0002 > #define PERF_PMU_CAP_AUX_NO_SG 0x0004 > #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 > @@ -305,6 +305,7 @@ struct perf_event_pmu_context; > #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 > #define PERF_PMU_CAP_AUX_PAUSE 0x0200 > #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 > +#define PERF_PMU_CAP_NO_INTERRUPT 0x0800 So NO_INTERRUPT was supposed to be the negative of your new SAMPLING (and I agree with your reasoning). What I'm confused/curious about is why we retain NO_INTERRUPT?
On Tue, Aug 26, 2025 at 03:08:06PM +0200, Peter Zijlstra wrote: > On Wed, Aug 13, 2025 at 06:01:08PM +0100, Robin Murphy wrote: > > Sampling is inherently a feature for CPU PMUs, given that the thing > > to be sampled is a CPU context. These days, we have many more > > uncore/system PMUs than CPU PMUs, so it no longer makes much sense to > > assume sampling support by default and force the ever-growing majority > > of drivers to opt out of it (or erroneously fail to). Instead, let's > > introduce a positive opt-in capability that's more obvious and easier to > > maintain. > > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h > > index 4d439c24c901..bf2cfbeabba2 100644 > > --- a/include/linux/perf_event.h > > +++ b/include/linux/perf_event.h > > @@ -294,7 +294,7 @@ struct perf_event_pmu_context; > > /** > > * pmu::capabilities flags > > */ > > -#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 > > +#define PERF_PMU_CAP_SAMPLING 0x0001 > > #define PERF_PMU_CAP_NO_NMI 0x0002 > > #define PERF_PMU_CAP_AUX_NO_SG 0x0004 > > #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 > > @@ -305,6 +305,7 @@ struct perf_event_pmu_context; > > #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 > > #define PERF_PMU_CAP_AUX_PAUSE 0x0200 > > #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 > > +#define PERF_PMU_CAP_NO_INTERRUPT 0x0800 > > So NO_INTERRUPT was supposed to be the negative of your new SAMPLING > (and I agree with your reasoning). > > What I'm confused/curious about is why we retain NO_INTERRUPT? I see from your other reply that you spotted the next patch does that. For the sake of other reviewers or anyone digging through the git history it's probably worth adding a line to this commit message to say: | A subsequent patch will remove PERF_PMU_CAP_NO_INTERRUPT as this | requires some additional cleanup. Mark.
On 2025-08-26 2:28 pm, Mark Rutland wrote: > On Tue, Aug 26, 2025 at 03:08:06PM +0200, Peter Zijlstra wrote: >> On Wed, Aug 13, 2025 at 06:01:08PM +0100, Robin Murphy wrote: >>> Sampling is inherently a feature for CPU PMUs, given that the thing >>> to be sampled is a CPU context. These days, we have many more >>> uncore/system PMUs than CPU PMUs, so it no longer makes much sense to >>> assume sampling support by default and force the ever-growing majority >>> of drivers to opt out of it (or erroneously fail to). Instead, let's >>> introduce a positive opt-in capability that's more obvious and easier to >>> maintain. >> >>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h >>> index 4d439c24c901..bf2cfbeabba2 100644 >>> --- a/include/linux/perf_event.h >>> +++ b/include/linux/perf_event.h >>> @@ -294,7 +294,7 @@ struct perf_event_pmu_context; >>> /** >>> * pmu::capabilities flags >>> */ >>> -#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 >>> +#define PERF_PMU_CAP_SAMPLING 0x0001 >>> #define PERF_PMU_CAP_NO_NMI 0x0002 >>> #define PERF_PMU_CAP_AUX_NO_SG 0x0004 >>> #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 >>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context; >>> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 >>> #define PERF_PMU_CAP_AUX_PAUSE 0x0200 >>> #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 >>> +#define PERF_PMU_CAP_NO_INTERRUPT 0x0800 >> >> So NO_INTERRUPT was supposed to be the negative of your new SAMPLING >> (and I agree with your reasoning). >> >> What I'm confused/curious about is why we retain NO_INTERRUPT? > > I see from your other reply that you spotted the next patch does that. > > For the sake of other reviewers or anyone digging through the git > history it's probably worth adding a line to this commit message to say: > > | A subsequent patch will remove PERF_PMU_CAP_NO_INTERRUPT as this > | requires some additional cleanup. 
Yup, the main reason is that the set of drivers getting the new cap is smaller than the set of drivers currently not rejecting sampling events, so I wanted it to be clearly visible in the patch. Indeed I shall clarify the relationship to NO_INTERRUPT in the commit message. Thanks, Robin.
© 2016 - 2025 Red Hat, Inc.