When a perf_event is dropped due to some kind of (SW-based) filter, it
won't generate sample data. For example, software events drops samples
when it doesn't match to privilege from exclude_{user,kernel}.
In order to account such dropped samples, add a new counter in the
perf_event, and let users can read(2) the number with the new
PERF_FORMAT_DROPPED like the lost sample count.
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 5 ++++-
kernel/events/core.c | 12 ++++++++++++
3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209288d..c1e6340e561c400e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -830,6 +830,7 @@ struct perf_event {
u64 id;
atomic64_t lost_samples;
+ atomic64_t dropped_samples;
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4842c36fdf801996..7813e05218657713 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -347,6 +347,7 @@ enum {
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 id; } && PERF_FORMAT_ID
* { u64 lost; } && PERF_FORMAT_LOST
+ * { u64 dropped; } && PERF_FORMAT_DROPPED
* } && !PERF_FORMAT_GROUP
*
* { u64 nr;
@@ -355,6 +356,7 @@ enum {
* { u64 value;
* { u64 id; } && PERF_FORMAT_ID
* { u64 lost; } && PERF_FORMAT_LOST
+ * { u64 dropped; } && PERF_FORMAT_DROPPED
* } cntr[nr];
* } && PERF_FORMAT_GROUP
* };
@@ -365,8 +367,9 @@ enum perf_event_read_format {
PERF_FORMAT_ID = 1U << 2,
PERF_FORMAT_GROUP = 1U << 3,
PERF_FORMAT_LOST = 1U << 4,
+ PERF_FORMAT_DROPPED = 1U << 5,
- PERF_FORMAT_MAX = 1U << 5, /* non-ABI */
+ PERF_FORMAT_MAX = 1U << 6, /* non-ABI */
};
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4287cb458c..7e15fe0a8dee4ee7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5697,6 +5697,8 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&leader->dropped_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub, false);
@@ -5704,6 +5706,8 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&sub->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&sub->dropped_samples);
}
unlock:
@@ -5769,6 +5773,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = primary_event_id(event);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&event->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&event->dropped_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -7370,6 +7376,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
values[n++] = primary_event_id(event);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&event->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&event->dropped_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -7408,6 +7416,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&leader->dropped_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -7423,6 +7433,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&sub->lost_samples);
+ if (read_format & PERF_FORMAT_DROPPED)
+ values[n++] = atomic64_read(&sub->dropped_samples);
__output_copy(handle, values, n * sizeof(u64));
}
--
2.47.0.105.g07ac214952-goog
Namhyung Kim <namhyung@kernel.org> writes: > When a perf_event is dropped due to some kind of (SW-based) filter, it > won't generate sample data. For example, software events drops samples > when it doesn't match to privilege from exclude_{user,kernel}. > > In order to account such dropped samples, add a new counter in the > perf_event, and let users can read(2) the number with the new > PERF_FORMAT_DROPPED like the lost sample count. Are we sure there's no scenario where exposing the dropped event count gives an unprivileged user a way to probe what's happening in the kernel, which is supposed to be prevented by exclude_kernel? Clearly it provides an attacker with some information, ie. the event fired in the kernel and was dropped. For most events that's not very interesting, but for some maybe it could be a useful signal? On the other hand most CPU PMUs implement filtering in hardware, which this won't affect, so maybe I'm being too paranoid. cheers
Hello, On Wed, Oct 23, 2024 at 10:05:32PM +1100, Michael Ellerman wrote: > Namhyung Kim <namhyung@kernel.org> writes: > > When a perf_event is dropped due to some kind of (SW-based) filter, it > > won't generate sample data. For example, software events drops samples > > when it doesn't match to privilege from exclude_{user,kernel}. > > > > In order to account such dropped samples, add a new counter in the > > perf_event, and let users can read(2) the number with the new > > PERF_FORMAT_DROPPED like the lost sample count. > > Are we sure there's no scenario where exposing the dropped event count > gives an unprivileged user a way to probe what's happening in the > kernel, which is supposed to be prevented by exclude_kernel? > > Clearly it provides an attacker with some information, ie. the event > fired in the kernel and was dropped. > > For most events that's not very interesting, but for some maybe it could > be a useful signal? Hmm.. good point. It'd give some information to users. I'm not sure how much impact it'd have, but there are some folks who want to know exact number of samples including dropped ones to reconstruct total period for the monitoring session. > > On the other hand most CPU PMUs implement filtering in hardware, which > this won't affect, so maybe I'm being too paranoid. Right, it might be possible to estimate some numbers by comparing with similar events in the core PMU that implements HW filtering even without this interface IMHO. Thanks, Namhyung
>>> When a perf_event is dropped due to some kind of (SW-based) filter, it >>> won't generate sample data. For example, software events drops samples >>> when it doesn't match to privilege from exclude_{user,kernel}. >>> >>> In order to account such dropped samples, add a new counter in the >>> perf_event, and let users can read(2) the number with the new >>> PERF_FORMAT_DROPPED like the lost sample count. >> >> Are we sure there's no scenario where exposing the dropped event count >> gives an unprivileged user a way to probe what's happening in the >> kernel, which is supposed to be prevented by exclude_kernel? >> >> Clearly it provides an attacker with some information, ie. the event >> fired in the kernel and was dropped. >> >> For most events that's not very interesting, but for some maybe it could >> be a useful signal? > > Hmm.. good point. It'd give some information to users. I'm not sure > how much impact it'd have, but there are some folks who want to know > exact number of samples including dropped ones to reconstruct total > period for the monitoring session. Can we restrict PERF_FORMAT_DROPPED to perfmon_capable() if it's a genuine problem? Or would it defeat the whole purpose of _DROPPED count? Thanks, Ravi
Hello Ravi, On Thu, Oct 24, 2024 at 10:13:24AM +0530, Ravi Bangoria wrote: > >>> When a perf_event is dropped due to some kind of (SW-based) filter, it > >>> won't generate sample data. For example, software events drops samples > >>> when it doesn't match to privilege from exclude_{user,kernel}. > >>> > >>> In order to account such dropped samples, add a new counter in the > >>> perf_event, and let users can read(2) the number with the new > >>> PERF_FORMAT_DROPPED like the lost sample count. > >> > >> Are we sure there's no scenario where exposing the dropped event count > >> gives an unprivileged user a way to probe what's happening in the > >> kernel, which is supposed to be prevented by exclude_kernel? > >> > >> Clearly it provides an attacker with some information, ie. the event > >> fired in the kernel and was dropped. > >> > >> For most events that's not very interesting, but for some maybe it could > >> be a useful signal? > > > > Hmm.. good point. It'd give some information to users. I'm not sure > > how much impact it'd have, but there are some folks who want to know > > exact number of samples including dropped ones to reconstruct total > > period for the monitoring session. > > Can we restrict PERF_FORMAT_DROPPED to perfmon_capable() if it's a > genuine problem? Or would it defeat the whole purpose of _DROPPED > count? Right, that's the purpose of the PERF_FORMAT_DROPPED. But I think we can discuss this interface later and just focus on the IBS swfilt first. I'll remove this part now and add it back separately. Thanks, Namhyung
© 2016 - 2024 Red Hat, Inc.