Since the current IBS OP PMU does not have the capability to tag only
load/store instructions, tools like perf mem/c2c end up recording lots
of unwanted samples. So, introduce a load/store software filter in the
IBS OP PMU:

  ibs_op/swfilt=1,ldop=1/          --> Only load samples
  ibs_op/swfilt=1,stop=1/          --> Only store samples
  ibs_op/swfilt=1,ldop=1,stop=1/   --> Load OR store samples

Other HW or SW filters combine with this ld/st filter as a logical AND.
For example:

  ibs_op/swfilt=1,ldop=1,stop=1/u is
    "privilege == userspace && (ldop == 1 || stop == 1)"

  ibs_op/swfilt=1,ldop=1,stop=1,l3missonly=1/ is
    "l3missonly == 1 && (ldop == 1 || stop == 1)"
An alternate approach is a mem_op BPF filter:

  perf record --filter "mem_op == load || mem_op == store" ...

However, there are a few issues with it:

o BPF filter is called after preparing the entire perf sample. If the
  sample does not satisfy the filtering criteria, all the effort of
  preparing the perf sample is wasted (see the comparison below).
o BPF filter requires root privilege.
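For comparison, the two approaches would be invoked roughly as follows
(illustrative command lines; <workload> is a placeholder):

  # BPF filter: root-only, sample discarded after it has been prepared
  perf record -e ibs_op/swfilt=1/ --filter "mem_op == load || mem_op == store" -- <workload>

  # ld/st software filter: sample discarded early, inside the IBS driver
  perf record -e ibs_op/swfilt=1,ldop=1,stop=1/ -- <workload>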
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
---
 arch/x86/events/amd/ibs.c  | 73 +++++++++++++++++++++++++++++++++++---
 include/linux/perf_event.h | 14 ++++++++
 2 files changed, 83 insertions(+), 4 deletions(-)
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 0252b7ea8bca..d18ce6464b27 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -31,6 +31,10 @@ static u32 ibs_caps;
 /* attr.config2 */
 #define IBS_SW_FILTER_MASK	1
 
+/* attr.config1 */
+#define IBS_LDOP_FILTER_MASK	(1UL << 12)
+#define IBS_STOP_FILTER_MASK	(1UL << 13)
+
 /*
  * IBS states:
  *
@@ -308,6 +312,11 @@ static int perf_ibs_init(struct perf_event *event)
 	     event->attr.exclude_hv))
 		return -EINVAL;
 
+	if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
+	    (event->attr.config1 & (IBS_LDOP_FILTER_MASK |
+				    IBS_STOP_FILTER_MASK)))
+		return -EINVAL;
+
 	ret = validate_group(event);
 	if (ret)
 		return ret;
@@ -624,6 +633,10 @@ static struct attribute_group empty_caps_group = {
 PMU_FORMAT_ATTR(rand_en,	"config:57");
 PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
 PMU_FORMAT_ATTR(swfilt,	"config2:0");
+PMU_FORMAT_ATTR(ldop,	"config1:12"); /* IBS_LDOP_FILTER_MASK */
+PMU_FORMAT_ATTR(stop,	"config1:13"); /* IBS_STOP_FILTER_MASK */
+PMU_CAP_ATTR(swfilt_ldst, "1");
+
 PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
 PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
 PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
@@ -724,6 +737,8 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
 
 static struct attribute *op_attrs[] = {
 	&format_attr_swfilt.attr,
+	&format_attr_ldop.attr,
+	&format_attr_stop.attr,
 	NULL,
 };
 
@@ -737,11 +752,21 @@ static struct attribute *op_l3missonly_attrs[] = {
 	NULL,
 };
 
+static struct attribute *op_attrs_caps[] = {
+	&cap_attr_swfilt_ldst.attr,
+	NULL,
+};
+
 static struct attribute_group group_op_formats = {
 	.name = "format",
 	.attrs = op_attrs,
 };
 
+static struct attribute_group group_op_caps = {
+	.name = "caps",
+	.attrs = op_attrs_caps,
+};
+
 static struct attribute *ibs_op_ldlat_format_attrs[] = {
 	&ibs_op_ldlat_format.attr.attr,
 	NULL,
@@ -761,7 +786,7 @@ static struct attribute_group group_op_l3missonly = {
 
 static const struct attribute_group *op_attr_groups[] = {
 	&group_op_formats,
-	&empty_caps_group,
+	&group_op_caps,
 	NULL,
 };
 
@@ -1148,13 +1173,23 @@ static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
 					  PERF_SAMPLE_PHYS_ADDR);
 }
 
+static bool perf_ibs_ld_st_filter_event(struct perf_ibs *perf_ibs,
+					struct perf_event *event)
+{
+	return perf_ibs == &perf_ibs_op &&
+	       (event->attr.config2 & IBS_SW_FILTER_MASK) &&
+	       (event->attr.config1 & (IBS_LDOP_FILTER_MASK |
+				       IBS_STOP_FILTER_MASK));
+}
+
 static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
 				   struct perf_event *event,
 				   int check_rip)
 {
 	if (event->attr.sample_type & PERF_SAMPLE_RAW ||
 	    perf_ibs_is_mem_sample_type(perf_ibs, event) ||
-	    perf_ibs_ldlat_event(perf_ibs, event))
+	    perf_ibs_ldlat_event(perf_ibs, event) ||
+	    perf_ibs_ld_st_filter_event(perf_ibs, event))
 		return perf_ibs->offset_max;
 	else if (check_rip)
 		return 3;
@@ -1189,6 +1224,32 @@ static bool perf_ibs_is_kernel_br_target(struct perf_event *event,
 		op_data.op_brn_ret && kernel_ip(br_target));
 }
 
+/*
+ * ibs_op/swfilt=1,ldop=1/        --> Only load samples
+ * ibs_op/swfilt=1,stop=1/        --> Only store samples
+ * ibs_op/swfilt=1,ldop=1,stop=1/ --> Load OR store samples
+ */
+static bool perf_ibs_ld_st_filter(struct perf_event *event,
+				  struct perf_ibs_data *ibs_data)
+{
+	union ibs_op_data3 op_data3;
+
+	if (!(event->attr.config1 & (IBS_LDOP_FILTER_MASK |
+				     IBS_STOP_FILTER_MASK))) {
+		return false;
+	}
+
+	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+
+	if ((event->attr.config1 & IBS_LDOP_FILTER_MASK) && op_data3.ld_op)
+		return false;
+
+	if ((event->attr.config1 & IBS_STOP_FILTER_MASK) && op_data3.st_op)
+		return false;
+
+	return true;
+}
+
 static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event,
 				    struct pt_regs *regs, struct perf_ibs_data *ibs_data,
 				    int br_target_idx)
@@ -1196,9 +1257,12 @@ static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event
 	if (perf_exclude_event(event, regs))
 		return true;
 
-	if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel)
+	if (perf_ibs != &perf_ibs_op)
 		return false;
 
+	if (!event->attr.exclude_kernel)
+		goto ldst_filter;
+
 	if (perf_ibs_is_kernel_data_addr(event, ibs_data))
 		return true;
 
@@ -1206,7 +1270,8 @@ static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event
 	    perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx))
 		return true;
 
-	return false;
+ldst_filter:
+	return perf_ibs_ld_st_filter(event, ibs_data);
 }
 
 static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0069ba6866a4..dedb92d5cd61 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1998,6 +1998,20 @@ _name##_show(struct device *dev,				\
 									\
 static struct device_attribute format_attr_##_name = __ATTR_RO(_name)
 
+#define PMU_CAP_ATTR_SHOW(_name, _cap)					\
+static ssize_t								\
+_name##_show(struct device *dev, struct device_attribute *attr,	\
+	     char *page)						\
+{									\
+	BUILD_BUG_ON(sizeof(_cap) >= PAGE_SIZE);			\
+	return sprintf(page, _cap "\n");				\
+}
+
+#define PMU_CAP_ATTR(_name, _cap)					\
+	PMU_CAP_ATTR_SHOW(_name, _cap)					\
+									\
+static struct device_attribute cap_attr_##_name = __ATTR_RO(_name)
+
 /* Performance counter hotplug functions */
 #ifdef CONFIG_PERF_EVENTS
 int perf_event_init_cpu(unsigned int cpu);
--
2.43.0
* Ravi Bangoria <ravi.bangoria@amd.com> wrote:

> Since current IBS OP PMU does not have the capability to tag only load/
> stores instructions, tools like perf mem/c2c ends up recording lots of
> unwanted samples. So, introduce a load/store software filter in the IBS
> OP PMU:
>
> ibs_op/swfilt=1,ldop=1/ --> Only load samples
> ibs_op/swfilt=1,stop=1/ --> Only store samples
> ibs_op/swfilt=1,ldop=1,stop=1/ --> Load OR store samples
>
> Other HW or SW filters in combination with this ldst filter are logical
> AND. For ex:
>
> ibs_op/swfilt=1,ldop=1,stop=1/u is
> "privilege == userspace && (ldop == 1 || stop == 1)"
>
> ibs_op/swfilt=1,ldop=1,stop=1,l3missonly=1/ is
> "l3missonly == 1 && (ldop == 1 || stop == 1)"

No objections, but:

> An alternate approach is mem_op BPF filter:
>
> perf record --filter "mem_op == load || mem_op == store" ...
>
> However, there are few issues with it:
> o BPF filter is called after preparing entire perf sample. If the sample
> does not satisfy the filtering criteria, all the efforts of preparing
> perf sample gets wasted.

Could we add an 'early' BPF callback point as well, to fast-discard
samples?

> o BPF filter requires root privilege.

Could we add 'built-in', 'safe' BPF scripts that are specifically
prepared for perf events filtering purposes, that can be toggled by
non-root users as well? These could be toggled by tooling via sysfs or
so, or even via the perf syscall if that turns out to be the better
approach.

It would give us the flexibility and extensibility of BPF, combining it
with the safety & compatibility of the filtering functionality being
provided by the kernel.

It could be provided in the form of a BPF program crypto signature
registry of upstream-approved BPF scripts for perf BPF callback(s),
or so. (While root could load any BPF script.)

Thanks,

	Ingo
Hello,

+ bpf list

On Sat, May 31, 2025 at 09:53:44AM +0200, Ingo Molnar wrote:
>
> * Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>
> > Since current IBS OP PMU does not have the capability to tag only load/
> > stores instructions, tools like perf mem/c2c ends up recording lots of
> > unwanted samples. So, introduce a load/store software filter in the IBS
> > OP PMU:
> >
> > ibs_op/swfilt=1,ldop=1/ --> Only load samples
> > ibs_op/swfilt=1,stop=1/ --> Only store samples
> > ibs_op/swfilt=1,ldop=1,stop=1/ --> Load OR store samples
> >
> > Other HW or SW filters in combination with this ldst filter are logical
> > AND. For ex:
> >
> > ibs_op/swfilt=1,ldop=1,stop=1/u is
> > "privilege == userspace && (ldop == 1 || stop == 1)"
> >
> > ibs_op/swfilt=1,ldop=1,stop=1,l3missonly=1/ is
> > "l3missonly == 1 && (ldop == 1 || stop == 1)"
>
> No objections, but:
>
> > An alternate approach is mem_op BPF filter:
> >
> > perf record --filter "mem_op == load || mem_op == store" ...
> >
> > However, there are few issues with it:
> > o BPF filter is called after preparing entire perf sample. If the sample
> > does not satisfy the filtering criteria, all the efforts of preparing
> > perf sample gets wasted.
>
> Could we add an 'early' BPF callback point as well, to fast-discard
> samples?

I guess that would require a new BPF program type other than PERF_EVENT,
and handling of driver-specific details.

> > o BPF filter requires root privilege.
>
> Could we add 'built-in', 'safe' BPF scripts that are specifically
> prepared for perf events filtering purposes, that can be toggled by
> non-root users as well? These could be toggled by tooling via sysfs or
> so, or even via the perf syscall if that turns out to be the better
> approach.

We have a BPF filter framework in the perf tools and it can be run as a
normal user. But a root user should load and pin the BPF program prior
to use, like below:

  $ sudo perf record --setup-filter pin

  $ perf record -d -e ibs_op/swfilt/u --filter 'mem_op == load' ...

Thanks,
Namhyung

> It would give us the flexibility and extensibility of BPF, combining it
> with the safety & compatibility of the filtering functionality being
> provided by the kernel.
>
> It could be provided in the form of a BPF program crypto signature
> registry of upstream-approved BPF scripts for perf BPF callback(s),
> or so. (While root could load any BPF script.)
>
> Thanks,
>
> 	Ingo
Hi Ingo, Namhyung,

>>> An alternate approach is mem_op BPF filter:
>>>
>>> perf record --filter "mem_op == load || mem_op == store" ...
>>>
>>> However, there are few issues with it:
>>> o BPF filter is called after preparing entire perf sample. If the sample
>>> does not satisfy the filtering criteria, all the efforts of preparing
>>> perf sample gets wasted.
>>
>> Could we add an 'early' BPF callback point as well, to fast-discard
>> samples?
>
> I guess that would require a new BPF program type than PERF_EVENT and
> handle driver-specific details.

Right.

>>> o BPF filter requires root privilege.
>>
>> Could we add 'built-in', 'safe' BPF scripts that are specifically
>> prepared for perf events filtering purposes, that can be toggled by
>> non-root users as well? These could be toggled by tooling via sysfs or
>> so, or even via the perf syscall if that turns out to be the better
>> approach.
>
> We have BPF filter framework in the perf tools and it can be run as
> normal user. But root user should load and pin the BPF program prior
> to use like below.
>
> $ sudo perf record --setup-filter pin
>
> $ perf record -d -e ibs_op/swfilt/u --filter 'mem_op == load' ...

Thanks Namhyung.

Ingo, do you feel the idea of a perf-specific 'safe' BPF script is still
worth pursuing, given that similar functionality is already provided by
--setup-filter?

Thanks,
Ravi