From nobody Sun Feb 8 04:12:28 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id DEA8C43152; Tue, 30 Jul 2024 08:44:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329071; cv=none; b=aEplHHkwJzzPFG4lr3UqhDlslUk6+Rmfbd1bOX6kCgIbqcbnxnv4a+es/IuHOK1ZxTL0xmV+LX+4gab5geSom3p6BEZJYx/1IgZqHgc28QocjpWC4T0/92dm6uiqXOhcdNeFgbLF2URWtYS3DyjU+U3qNjRoDZHA56tpISz70JM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329071; c=relaxed/simple; bh=70etayXrUCfZdnKM3OyZxCdJ+uHIiB7Bw8Sf3hLSHjU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=jq5BSPd7/FcPOzVvjQt0GE8vryGxIPRs16JCSWRhCddLpFfMAucNweuEbrc6GiOpzBS99irKuiHHtrpYqhhuqxKnPZ3zSSA8mkTmn2BdIi0IUd3RAF0Fcs2o+TCzKm82IPNXfPNN/2dS86xojn0FqfhaoqZk0Thxw9jtxY8LaDY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id CF14C106F; Tue, 30 Jul 2024 01:44:54 -0700 (PDT) Received: from e126817.cambridge.arm.com (e126817.cambridge.arm.com [10.2.3.8]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 880233F766; Tue, 30 Jul 2024 01:44:27 -0700 (PDT) From: Ben Gainey To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org, namhyung@kernel.org Cc: james.clark@arm.com, mark.rutland@arm.com, alexander.shishkin@linux.intel.com, jolsa@kernel.org, irogers@google.com, adrian.hunter@intel.com, linux-perf-users@vger.kernel.org, 
linux-kernel@vger.kernel.org, Ben Gainey Subject: [PATCH v9 1/4] perf: Rename perf_event_context.nr_pending to nr_no_switch_fast. Date: Tue, 30 Jul 2024 09:44:14 +0100 Message-ID: <20240730084417.7693-2-ben.gainey@arm.com> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240730084417.7693-1-ben.gainey@arm.com> References: <20240730084417.7693-1-ben.gainey@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" nr_pending counts the number of events in the context that have either pending_sigtrap or pending_work, but it is used to prevent taking the fast path in perf_event_context_sched_out. Renamed to reflect what it is used for, rather than what it counts. This change allows using the field to track other event properties that also require skipping the fast path without possible confusion over the name. Signed-off-by: Ben Gainey --- include/linux/perf_event.h | 5 +++-- kernel/events/core.c | 12 ++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a8942277dda..87ccb7ca241f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -963,12 +963,13 @@ struct perf_event_context { struct rcu_head rcu_head; =20 /* - * Sum (event->pending_work + event->pending_work) + * The count of events for which using the switch-out fast path + * should be avoided. * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. 
*/ - local_t nr_pending; + local_t nr_no_switch_fast; }; =20 struct perf_cpu_pmu_context { diff --git a/kernel/events/core.c b/kernel/events/core.c index aa3450bdc227..e6cc354a3cee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3516,9 +3516,9 @@ perf_event_context_sched_out(struct task_struct *task= , struct task_struct *next) =20 perf_ctx_disable(ctx, false); =20 - /* PMIs are disabled; ctx->nr_pending is stable. */ - if (local_read(&ctx->nr_pending) || - local_read(&next_ctx->nr_pending)) { + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. @@ -5204,7 +5204,7 @@ static void perf_pending_task_sync(struct perf_event = *event) */ if (task_work_cancel(current, head)) { event->pending_work =3D 0; - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); return; } =20 @@ -6868,7 +6868,7 @@ static void perf_pending_task(struct callback_head *h= ead) if (event->pending_work) { event->pending_work =3D 0; perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); @@ -9740,7 +9740,7 @@ static int __perf_event_overflow(struct perf_event *e= vent, if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work =3D pending_id; - local_inc(&event->ctx->nr_pending); + local_inc(&event->ctx->nr_no_switch_fast); =20 event->pending_addr =3D 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) --=20 2.45.2 From nobody Sun Feb 8 04:12:28 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id BD153194151; Tue, 30 Jul 2024 08:44:31 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none 
smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329074; cv=none; b=jA+vZgvdDbSBKBXcdmdSN5/5Fxm+Cky4zpfBkAg/Wu2etF5SX6gkkvWnxy6Y+c1ztmeozYBmuAoZdcjXwEoaDS+4HKCi+irshkuWJcT38Uzk+DJtp2TIPypc6zqgcTumWbDGTas9VBkEY+Lgho9I5p1x5EMKu0Xwwvs441YSGsc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329074; c=relaxed/simple; bh=VzW8kdosZ9tUetiiJ0DDiela/paF00EnJ6dVFxGjhac=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=ew8bjXKsXoVbUcZlpcCzUppDsoybEmFbirJgj25MY59KKx/y1uRlgJmBDLanmgQZBi/zZMi/+wj5NotQmpTbVrdwe4gi4xjjg7AjJZMVHdLZCUH5iKw4Umllcn3PfCBSaQPPpLkXDE2MBCqHx25ylVGOk0KeNVfyHgMUXmG7mek= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id BA328152B; Tue, 30 Jul 2024 01:44:56 -0700 (PDT) Received: from e126817.cambridge.arm.com (e126817.cambridge.arm.com [10.2.3.8]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 735E63F766; Tue, 30 Jul 2024 01:44:29 -0700 (PDT) From: Ben Gainey To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org, namhyung@kernel.org Cc: james.clark@arm.com, mark.rutland@arm.com, alexander.shishkin@linux.intel.com, jolsa@kernel.org, irogers@google.com, adrian.hunter@intel.com, linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org, Ben Gainey Subject: [PATCH v9 2/4] perf: Support PERF_SAMPLE_READ with inherit Date: Tue, 30 Jul 2024 09:44:15 +0100 Message-ID: <20240730084417.7693-3-ben.gainey@arm.com> X-Mailer: git-send-email 2.45.2 In-Reply-To: 
<20240730084417.7693-1-ben.gainey@arm.com> References: <20240730084417.7693-1-ben.gainey@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This change allows events to use PERF_SAMPLE_READ with inherit so long as PERF_SAMPLE_TID is also set. This enables sample based profiling of a group of counters over a hierarchy of processes or threads. This is useful, for example, for collecting per-thread counters/metrics, event based sampling of multiple counters as a unit, access to the enabled and running time when using multiplexing and so on. Prior to this, users were restricted to either collecting aggregate statistics for a multi-threaded/-process application (e.g. with "perf stat"), or to sample individual threads, or to profile the entire system (which requires root or CAP_PERFMON, and may produce much more data than is required). Theoretically a tool could poll for or otherwise monitor thread/process creation and construct whatever events the user is interested in using perf_event_open, for each new thread or process, but this is racy, can lead to file-descriptor exhaustion, and ultimately just replicates the behaviour of inherit, but in userspace. This configuration differs from inherit without PERF_SAMPLE_READ in that the accumulated event count, and consequently any sample (such as if triggered by overflow of sample_period) will be on a per-thread rather than on an aggregate basis. The meaning of read_format::value field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE is changed such that if the sampled event uses this new configuration then the values reported will be per-thread rather than the global aggregate value. 
This is a change from the existing semantics of read_format (where PERF_SAMPLE_READ is used without inherit), but it is necessary to expose the per-thread counter values, and it avoids reinventing a separate "read_format_thread" field that otherwise replicates the same behaviour. This change should not break existing tools, since this configuration was not previously valid and was rejected by the kernel. Tools that opt into this new mode will need to account for this when calculating the counter delta for a given sample. Tools that wish to have both the per-thread and aggregate value can perform the global aggregation themselves from the per-thread values. The change to read_format::value does not affect existing valid perf_event_attr configurations, nor does it change the behaviour of calls to "read" on an event descriptor. Both continue to report the aggregate value for the entire thread/process hierarchy. The difference between the results reported by "read" and PERF_RECORD_SAMPLE in this new configuration is justified on the basis that it is not (easily) possible for "read" to target a specific thread (the caller only has the fd for the original parent event). Signed-off-by: Ben Gainey --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 55 ++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 87ccb7ca241f..6c96da389b30 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -966,6 +966,9 @@ struct perf_event_context { * The count of events for which using the switch-out fast path * should be avoided. * + * Sum (event->pending_work + events with + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) + * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. 
*/ diff --git a/kernel/events/core.c b/kernel/events/core.c index e6cc354a3cee..c01a32687dad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1767,6 +1767,14 @@ perf_event_groups_next(struct perf_event *event, str= uct pmu *pmu) event =3D rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) =20 +/* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *att= r) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -1797,6 +1805,8 @@ list_add_event(struct perf_event *event, struct perf_= event_context *ctx) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); =20 if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); @@ -2021,6 +2031,8 @@ list_del_event(struct perf_event *event, struct perf_= event_context *ctx) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); =20 list_del_rcu(&event->event_entry); =20 @@ -3522,6 +3534,11 @@ perf_event_context_sched_out(struct task_struct *tas= k, struct task_struct *next) /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. 
*/ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); @@ -4538,8 +4555,11 @@ static void __perf_event_read(void *info) raw_spin_unlock(&ctx->lock); } =20 -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } =20 @@ -5498,7 +5518,7 @@ static u64 __perf_event_read_value(struct perf_event = *event, u64 *enabled, u64 * mutex_lock(&event->child_mutex); =20 (void)perf_event_read(event, false); - total +=3D perf_event_count(event); + total +=3D perf_event_count(event, false); =20 *enabled +=3D event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -5507,7 +5527,7 @@ static u64 __perf_event_read_value(struct perf_event = *event, u64 *enabled, u64 * =20 list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total +=3D perf_event_count(child); + total +=3D perf_event_count(child, false); *enabled +=3D child->total_time_enabled; *running +=3D child->total_time_running; } @@ -5589,14 +5609,14 @@ static int __perf_read_group_add(struct perf_event = *leader, /* * Write {count,id} tuples for every sibling. 
*/ - values[n++] +=3D perf_event_count(leader); + values[n++] +=3D perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] =3D primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] =3D atomic64_read(&leader->lost_samples); =20 for_each_sibling_event(sub, leader) { - values[n++] +=3D perf_event_count(sub); + values[n++] +=3D perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] =3D primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -6176,7 +6196,7 @@ void perf_event_update_userpage(struct perf_event *ev= ent) ++userpg->lock; barrier(); userpg->index =3D perf_event_index(event); - userpg->offset =3D perf_event_count(event); + userpg->offset =3D perf_event_count(event, false); if (userpg->index) userpg->offset -=3D local64_read(&event->hw.prev_count); =20 @@ -7250,7 +7270,7 @@ static void perf_output_read_one(struct perf_output_h= andle *handle, u64 values[5]; int n =3D 0; =20 - values[n++] =3D perf_event_count(event); + values[n++] =3D perf_event_count(event, has_inherit_and_sample_read(&even= t->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] =3D enabled + atomic64_read(&event->child_total_time_enabled); @@ -7268,14 +7288,15 @@ static void perf_output_read_one(struct perf_output= _handle *handle, } =20 static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader =3D event->group_leader, *sub; u64 read_format =3D event->attr.read_format; unsigned long flags; u64 values[6]; int n =3D 0; + bool self =3D has_inherit_and_sample_read(&event->attr); =20 /* * Disabling interrupts avoids all counter scheduling @@ -7295,7 +7316,7 @@ static void perf_output_read_group(struct perf_output= _handle *handle, (leader->state =3D=3D PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); =20 - values[n++] =3D perf_event_count(leader); + 
values[n++] =3D perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] =3D primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) @@ -7310,7 +7331,7 @@ static void perf_output_read_group(struct perf_output= _handle *handle, (sub->state =3D=3D PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); =20 - values[n++] =3D perf_event_count(sub); + values[n++] =3D perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] =3D primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -7331,6 +7352,10 @@ static void perf_output_read_group(struct perf_outpu= t_handle *handle, * The problem is that its both hard and excessively expensive to iterate = the * child list, not to mention that its impossible to IPI the children runn= ing * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-= thread + * counts rather than attempting to accumulate some value across all child= ren on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -12057,10 +12082,12 @@ perf_event_alloc(struct perf_event_attr *attr, in= t cpu, local64_set(&hwc->period_left, hwc->sample_period); =20 /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). 
*/ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPL= E_TID)) goto err_ns; =20 if (!has_branch_stack(event)) @@ -13084,7 +13111,7 @@ static void sync_child_event(struct perf_event *chi= ld_event) perf_event_read_event(child_event, task); } =20 - child_val =3D perf_event_count(child_event); + child_val =3D perf_event_count(child_event, false); =20 /* * Add back the child's count to the parent's count: --=20 2.45.2 From nobody Sun Feb 8 04:12:28 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id A39F443152; Tue, 30 Jul 2024 08:44:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329075; cv=none; b=url/rjBLgYUQyW0jaRNeZhsN0O4kcA9E0o0ijZIXlFSvkOMpD7G5wrcNZ8pMQhtHskJiLOexgIIybf8+bJESFzm1CUrP6rItNmKqWURSzaEDrh1rfRfxXsGRR24Tkb+tF6Ox8qhkB9Flfqb4IPu8L7OD/sPRujCzKM7956PJJ4c= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329075; c=relaxed/simple; bh=dddagchXUe8texT49SKIcRmWx4Y+Lk2YGVYOMeJ/N6c=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=kCTYE0HOhSQ6HdXnT/xfFvgz7ziOgJItF52tQGhJr5hBD3Sn4fO2VnlGGbHSWBAkgAu65Siu3G0LS05lEGscFi5TtYMmE5F8FmVbqNMfKq4exSC3flaDHEAXAcwptVZKjHuXnB6es7AmFkfoYGv2OKfshun12aj2WK+jGzcaPK0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id A60C3153B; Tue, 30 Jul 2024 
01:44:58 -0700 (PDT) Received: from e126817.cambridge.arm.com (e126817.cambridge.arm.com [10.2.3.8]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 5EC683F766; Tue, 30 Jul 2024 01:44:31 -0700 (PDT) From: Ben Gainey To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org, namhyung@kernel.org Cc: james.clark@arm.com, mark.rutland@arm.com, alexander.shishkin@linux.intel.com, jolsa@kernel.org, irogers@google.com, adrian.hunter@intel.com, linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org, Ben Gainey Subject: [PATCH v9 3/4] tools/perf: Correctly calculate sample period for inherited SAMPLE_READ values Date: Tue, 30 Jul 2024 09:44:16 +0100 Message-ID: <20240730084417.7693-4-ben.gainey@arm.com> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240730084417.7693-1-ben.gainey@arm.com> References: <20240730084417.7693-1-ben.gainey@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Sample period calculation in deliver_sample_value is updated to calculate the per-thread period delta for events that are inherit + PERF_SAMPLE_READ. When the sampling event has this configuration, the read_format.id is used with the tid from the sample to lookup the storage of the previously accumulated counter total before calculating the delta. All existing valid configurations where read_format.value represents some global value continue to use just the read_format.id to locate the storage of the previously accumulated total. perf_sample_id is modified to support tracking per-thread values, along with the existing global per-id values. In the per-thread case, values are stored in a hash by tid within the perf_sample_id, and are dynamically allocated as the number is not known ahead of time. 
Signed-off-by: Ben Gainey --- tools/lib/perf/evsel.c | 48 +++++++++++++++++++ tools/lib/perf/include/internal/evsel.h | 63 ++++++++++++++++++++++++- tools/perf/util/session.c | 25 ++++++---- 3 files changed, 126 insertions(+), 10 deletions(-) diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index c07160953224..abdae2f9498b 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ void perf_evsel__init(struct perf_evsel *evsel, struct pe= rf_event_attr *attr, int idx) { INIT_LIST_HEAD(&evsel->node); + INIT_LIST_HEAD(&evsel->per_stream_periods); evsel->attr =3D *attr; evsel->idx =3D idx; evsel->leader =3D evsel; @@ -531,10 +533,56 @@ int perf_evsel__alloc_id(struct perf_evsel *evsel, in= t ncpus, int nthreads) =20 void perf_evsel__free_id(struct perf_evsel *evsel) { + struct perf_sample_id_period *pos, *n; + xyarray__delete(evsel->sample_id); evsel->sample_id =3D NULL; zfree(&evsel->id); evsel->ids =3D 0; + + perf_evsel_for_each_per_thread_period_safe(evsel, n, pos) { + list_del_init(&pos->node); + free(pos); + } +} + +bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evse= l) +{ + return (evsel->attr.sample_type & PERF_SAMPLE_READ) + && (evsel->attr.sample_type & PERF_SAMPLE_TID) + && evsel->attr.inherit; +} + +u64 *perf_sample_id__get_period_storage(struct perf_sample_id *sid, u32 ti= d, bool per_thread) +{ + struct hlist_head *head; + struct perf_sample_id_period *res; + int hash; + + if (!per_thread) + return &sid->period; + + hash =3D hash_32(tid, PERF_SAMPLE_ID__HLIST_BITS); + head =3D &sid->periods[hash]; + + hlist_for_each_entry(res, head, hnode) + if (res->tid =3D=3D tid) + return &res->period; + + if (sid->evsel =3D=3D NULL) + return NULL; + + res =3D zalloc(sizeof(struct perf_sample_id_period)); + if (res =3D=3D NULL) + return NULL; + + INIT_LIST_HEAD(&res->node); + res->tid =3D tid; + + list_add_tail(&res->node, 
&sid->evsel->per_stream_periods); + hlist_add_head(&res->hnode, &sid->periods[hash]); + + return &res->period; } =20 void perf_counts_values__scale(struct perf_counts_values *count, diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/inclu= de/internal/evsel.h index 5cd220a61962..ea78defa77d0 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -11,6 +11,32 @@ struct perf_thread_map; struct xyarray; =20 +/** + * The per-thread accumulated period storage node. + */ +struct perf_sample_id_period { + struct list_head node; + struct hlist_node hnode; + /* Holds total ID period value for PERF_SAMPLE_READ processing. */ + u64 period; + /* The TID that the value belongs to */ + u32 tid; +}; + +/** + * perf_evsel_for_each_per_thread_period_safe - safely iterate thru all the + * per_stream_periods + * @evsel: perf_evsel instance to iterate + * @item: struct perf_sample_id_period iterator + * @tmp: struct perf_sample_id_period temp iterator + */ +#define perf_evsel_for_each_per_thread_period_safe(evsel, tmp, item) \ + list_for_each_entry_safe(item, tmp, &(evsel)->per_stream_periods, node) + + +#define PERF_SAMPLE_ID__HLIST_BITS 4 +#define PERF_SAMPLE_ID__HLIST_SIZE (1 << PERF_SAMPLE_ID__HLIST_BITS) + /* * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there = are * more than one entry in the evlist. @@ -34,8 +60,32 @@ struct perf_sample_id { pid_t machine_pid; struct perf_cpu vcpu; =20 - /* Holds total ID period value for PERF_SAMPLE_READ processing. */ - u64 period; + /* + * Per-thread, and global event counts are mutually exclusive: + * Whilst it is possible to combine events into a group with differing + * values of PERF_SAMPLE_READ, it is not valid to have inconsistent + * values for `inherit`. 
Therefore it is not possible to have a + * situation where a per-thread event is sampled as a global event; + * all !inherit groups are global, and all groups where the sampling + * event is inherit + PERF_SAMPLE_READ will be per-thread. Any event + * that is part of such a group that is inherit but not PERF_SAMPLE_READ + * will be read as per-thread. If such an event can also trigger a + * sample (such as with sample_period > 0) then it will not cause + * `read_format` to be included in its PERF_RECORD_SAMPLE, and + * therefore will not expose the per-thread group members as global. + */ + union { + /* + * Holds total ID period value for PERF_SAMPLE_READ processing + * (when period is not per-thread). + */ + u64 period; + /* + * Holds total ID period value for PERF_SAMPLE_READ processing + * (when period is per-thread). + */ + struct hlist_head periods[PERF_SAMPLE_ID__HLIST_SIZE]; + }; }; =20 struct perf_evsel { @@ -58,6 +108,10 @@ struct perf_evsel { u32 ids; struct perf_evsel *leader; =20 + /* For events where the read_format value is per-thread rather than + * global, stores the per-thread cumulative period */ + struct list_head per_stream_periods; + /* parse modifier helper */ int nr_members; /* @@ -88,4 +142,9 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, c= onst char *filter); int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads= ); void perf_evsel__free_id(struct perf_evsel *evsel); =20 +bool perf_evsel__attr_has_per_thread_sample_period(struct perf_evsel *evse= l); + +u64 *perf_sample_id__get_period_storage(struct perf_sample_id *sid, u32 ti= d, + bool per_thread); + #endif /* __LIBPERF_INTERNAL_EVSEL_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5596bed1b8c8..fac0557ff6ea 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1474,18 +1474,24 @@ static int deliver_sample_value(struct evlist *evli= st, union perf_event *event, struct perf_sample *sample, struct 
sample_read_value *v, - struct machine *machine) + struct machine *machine, + bool per_thread) { struct perf_sample_id *sid =3D evlist__id2sid(evlist, v->id); struct evsel *evsel; + u64 *storage =3D NULL; =20 if (sid) { + storage =3D perf_sample_id__get_period_storage(sid, sample->tid, per_thr= ead); + } + + if (storage) { sample->id =3D v->id; - sample->period =3D v->value - sid->period; - sid->period =3D v->value; + sample->period =3D v->value - *storage; + *storage =3D v->value; } =20 - if (!sid || sid->evsel =3D=3D NULL) { + if (!storage || sid->evsel =3D=3D NULL) { ++evlist->stats.nr_unknown_id; return 0; } @@ -1506,14 +1512,15 @@ static int deliver_sample_group(struct evlist *evli= st, union perf_event *event, struct perf_sample *sample, struct machine *machine, - u64 read_format) + u64 read_format, + bool per_thread) { int ret =3D -EINVAL; struct sample_read_value *v =3D sample->read.group.values; =20 sample_read_group__for_each(v, sample->read.group.nr, read_format) { ret =3D deliver_sample_value(evlist, tool, event, sample, v, - machine); + machine, per_thread); if (ret) break; } @@ -1528,6 +1535,7 @@ static int evlist__deliver_sample(struct evlist *evli= st, struct perf_tool *tool, /* We know evsel !=3D NULL. */ u64 sample_type =3D evsel->core.attr.sample_type; u64 read_format =3D evsel->core.attr.read_format; + bool per_thread =3D perf_evsel__attr_has_per_thread_sample_period(&evsel-= >core); =20 /* Standard sample delivery. */ if (!(sample_type & PERF_SAMPLE_READ)) @@ -1536,10 +1544,11 @@ static int evlist__deliver_sample(struct evlist *ev= list, struct perf_tool *tool, /* For PERF_SAMPLE_READ we have either single or group mode. 
*/ if (read_format & PERF_FORMAT_GROUP) return deliver_sample_group(evlist, tool, event, sample, - machine, read_format); + machine, read_format, per_thread); else return deliver_sample_value(evlist, tool, event, sample, - &sample->read.one, machine); + &sample->read.one, machine, + per_thread); } =20 static int machines__deliver_event(struct machines *machines, --=20 2.45.2 From nobody Sun Feb 8 04:12:28 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 236D5192B95; Tue, 30 Jul 2024 08:44:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329076; cv=none; b=QQFYFAX9h3xaWWk5AVWmDKEhhwXfeyZLcqS+B2n86tUpTDKa75C3Nbo0dKoNtRwpkdsiHIj0hbIvvnCWLv9Tq89pFSebVFAdXDWYGEc0HP5vCg+L985hM0x9hwsfLFI3oms4XKBkQejhSrfhZ6NEJMLSScaL7CZ6M/TP9b43vxA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1722329076; c=relaxed/simple; bh=tlkpz5IM9AvXyxF/eoxqjC0QWstdR2IPthY0muHgUgM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=q/xb4H3DT3GyeG570d3qKcjiIFac86d9yi/RMbWG+HGopMOXADroAkgSKiC1T7EskMX+jUYbioh6NsynYJ+R7JWE/FT+WXndTolX4hxxEGBEs0qH3weF6o07riud4HQnBG6rBbcfPBtrgcdY2YMT+FL0vagZH+3eRfUdZNFnVno= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 910D3168F; Tue, 30 Jul 2024 01:45:00 -0700 (PDT) Received: from e126817.cambridge.arm.com (e126817.cambridge.arm.com [10.2.3.8]) by 
usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 4A1743F766; Tue, 30 Jul 2024 01:44:33 -0700 (PDT) From: Ben Gainey To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org, namhyung@kernel.org Cc: james.clark@arm.com, mark.rutland@arm.com, alexander.shishkin@linux.intel.com, jolsa@kernel.org, irogers@google.com, adrian.hunter@intel.com, linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org, Ben Gainey Subject: [PATCH v9 4/4] tools/perf: Allow inherit + PERF_SAMPLE_READ when opening events Date: Tue, 30 Jul 2024 09:44:17 +0100 Message-ID: <20240730084417.7693-5-ben.gainey@arm.com> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240730084417.7693-1-ben.gainey@arm.com> References: <20240730084417.7693-1-ben.gainey@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The "perf record" tool will now default to this new mode if the user specifies a sampling group when not in system-wide mode, and when "--no-inherit" is not specified. This change updates evsel to allow the combination of inherit and PERF_SAMPLE_READ. A fallback is implemented for kernel versions where this feature is not supported. 
Signed-off-by: Ben Gainey --- tools/perf/tests/attr/README | 2 + .../tests/attr/test-record-group-sampling | 3 +- .../tests/attr/test-record-group-sampling1 | 51 ++++++++++++++++ .../tests/attr/test-record-group-sampling2 | 61 +++++++++++++++++++ tools/perf/tests/attr/test-record-group2 | 1 + ...{test-record-group2 =3D> test-record-group3} | 10 +-- tools/perf/util/evsel.c | 19 +++++- tools/perf/util/evsel.h | 1 + 8 files changed, 141 insertions(+), 7 deletions(-) create mode 100644 tools/perf/tests/attr/test-record-group-sampling1 create mode 100644 tools/perf/tests/attr/test-record-group-sampling2 copy tools/perf/tests/attr/{test-record-group2 =3D> test-record-group3} (8= 1%) diff --git a/tools/perf/tests/attr/README b/tools/perf/tests/attr/README index 4066fec7180a..67c4ca76b85d 100644 --- a/tools/perf/tests/attr/README +++ b/tools/perf/tests/attr/README @@ -51,6 +51,8 @@ Following tests are defined (with perf commands): perf record --call-graph fp kill (test-record-graph-fp-aarc= h64) perf record -e '{cycles,instructions}' kill (test-record-group1) perf record -e '{cycles/period=3D1/,instructions/period=3D2/}:S' kill (t= est-record-group2) + perf record -e '{cycles,cache-misses}:S' kill (test-record-group-samplin= g1) + perf record -c 10000 -e '{cycles,cache-misses}:S' kill (test-record-grou= p-sampling2) perf record -D kill (test-record-no-delay) perf record -i kill (test-record-no-inherit) perf record -n kill (test-record-no-samples) diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/= tests/attr/test-record-group-sampling index 97e7e64a38f0..da7a5d10785f 100644 --- a/tools/perf/tests/attr/test-record-group-sampling +++ b/tools/perf/tests/attr/test-record-group-sampling @@ -2,6 +2,7 @@ command =3D record args =3D --no-bpf-event -e '{cycles,cache-misses}:S' kill >/dev/null 2>= &1 ret =3D 1 +kernel_until =3D 6.11 =20 [event-1:base-record] fd=3D1 @@ -18,7 +19,7 @@ group_fd=3D1 type=3D0 config=3D3 =20 -# default | PERF_SAMPLE_READ +# 
default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD sample_type=3D343 =20 # PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST diff --git a/tools/perf/tests/attr/test-record-group-sampling1 b/tools/perf= /tests/attr/test-record-group-sampling1 new file mode 100644 index 000000000000..b02de391718d --- /dev/null +++ b/tools/perf/tests/attr/test-record-group-sampling1 @@ -0,0 +1,51 @@ +[config] +command =3D record +args =3D --no-bpf-event -e '{cycles,cache-misses}:S' kill >/dev/null 2>= &1 +ret =3D 1 +kernel_since =3D 6.11 + +[event-1:base-record] +fd=3D1 +group_fd=3D-1 + +# cycles +type=3D0 +config=3D0 + +# default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD +sample_type=3D343 + +# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOT= AL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING +read_format=3D28|31 +task=3D1 +mmap=3D1 +comm=3D1 +enable_on_exec=3D1 +disabled=3D1 + +# inherit is enabled for group sampling +inherit=3D1 + +[event-2:base-record] +fd=3D2 +group_fd=3D1 + +# cache-misses +type=3D0 +config=3D3 + +# default | PERF_SAMPLE_READ | PERF_SAMPLE_PERIOD +sample_type=3D343 + +# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOT= AL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING +read_format=3D28|31 +task=3D0 +mmap=3D0 +comm=3D0 +enable_on_exec=3D0 +disabled=3D0 +freq=3D0 + +# inherit is enabled for group sampling +inherit=3D1 + diff --git a/tools/perf/tests/attr/test-record-group-sampling2 b/tools/perf= /tests/attr/test-record-group-sampling2 new file mode 100644 index 000000000000..060fd1d24f63 --- /dev/null +++ b/tools/perf/tests/attr/test-record-group-sampling2 @@ -0,0 +1,61 @@ +[config] +command =3D record +args =3D --no-bpf-event -c 10000 -e '{cycles,cache-misses}:S' kill >/de= v/null 2>&1 +ret =3D 1 +kernel_since =3D 6.11 + +[event-1:base-record] +fd=3D1 +group_fd=3D-1 + +# cycles +type=3D0 +config=3D0 + +# default | PERF_SAMPLE_READ +sample_type=3D87 + +# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | 
PERF_FORMAT_TOT= AL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING +read_format=3D28|31 +task=3D1 +mmap=3D1 +comm=3D1 +enable_on_exec=3D1 +disabled=3D1 + +# inherit is enabled for group sampling +inherit=3D1 + +# sampling disabled +sample_freq=3D0 +sample_period=3D10000 +freq=3D0 +write_backward=3D0 + +[event-2:base-record] +fd=3D2 +group_fd=3D1 + +# cache-misses +type=3D0 +config=3D3 + +# default | PERF_SAMPLE_READ +sample_type=3D87 + +# PERF_FORMAT_ID | PERF_FORMAT_GROUP | PERF_FORMAT_LOST | PERF_FORMAT_TOT= AL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING +read_format=3D28|31 +task=3D0 +mmap=3D0 +comm=3D0 +enable_on_exec=3D0 +disabled=3D0 + +# inherit is enabled for group sampling +inherit=3D1 + +# sampling disabled +sample_freq=3D0 +sample_period=3D0 +freq=3D0 +write_backward=3D0 diff --git a/tools/perf/tests/attr/test-record-group2 b/tools/perf/tests/at= tr/test-record-group2 index cebdaa8e64e4..ad97df77a506 100644 --- a/tools/perf/tests/attr/test-record-group2 +++ b/tools/perf/tests/attr/test-record-group2 @@ -2,6 +2,7 @@ command =3D record args =3D --no-bpf-event -e '{cycles/period=3D1234000/,instructions/peri= od=3D6789000/}:S' kill >/dev/null 2>&1 ret =3D 1 +kernel_until =3D 6.11 =20 [event-1:base-record] fd=3D1 diff --git a/tools/perf/tests/attr/test-record-group2 b/tools/perf/tests/at= tr/test-record-group3 similarity index 81% copy from tools/perf/tests/attr/test-record-group2 copy to tools/perf/tests/attr/test-record-group3 index cebdaa8e64e4..311afb478b85 100644 --- a/tools/perf/tests/attr/test-record-group2 +++ b/tools/perf/tests/attr/test-record-group3 @@ -2,6 +2,7 @@ command =3D record args =3D --no-bpf-event -e '{cycles/period=3D1234000/,instructions/peri= od=3D6789000/}:S' kill >/dev/null 2>&1 ret =3D 1 +kernel_since =3D 6.11 =20 [event-1:base-record] fd=3D1 @@ -9,8 +10,9 @@ group_fd=3D-1 config=3D0|1 sample_period=3D1234000 sample_type=3D87 -read_format=3D12|28 -inherit=3D0 +read_format=3D28|31 +disabled=3D1 +inherit=3D1 freq=3D0 =20 
[event-2:base-record] @@ -19,9 +21,9 @@ group_fd=3D1 config=3D0|1 sample_period=3D6789000 sample_type=3D87 -read_format=3D12|28 +read_format=3D28|31 disabled=3D0 -inherit=3D0 +inherit=3D1 mmap=3D0 comm=3D0 freq=3D0 diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index bc603193c477..ceb09b6a8c2f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1171,7 +1171,15 @@ void evsel__config(struct evsel *evsel, struct recor= d_opts *opts, */ if (leader->core.nr_members > 1) { attr->read_format |=3D PERF_FORMAT_GROUP; - attr->inherit =3D 0; + } + + /* + * Inherit + SAMPLE_READ requires SAMPLE_TID in the read_format + */ + if (attr->inherit) { + evsel__set_sample_bit(evsel, TID); + evsel->core.attr.read_format |=3D + PERF_FORMAT_ID; } } =20 @@ -2020,6 +2028,8 @@ static int __evsel__prepare_open(struct evsel *evsel,= struct perf_cpu_map *cpus, =20 static void evsel__disable_missing_features(struct evsel *evsel) { + if (perf_missing_features.inherit_sample_read) + evsel->core.attr.inherit =3D 0; if (perf_missing_features.branch_counters) evsel->core.attr.branch_sample_type &=3D ~PERF_SAMPLE_BRANCH_COUNTERS; if (perf_missing_features.read_lost) @@ -2075,7 +2085,12 @@ bool evsel__detect_missing_features(struct evsel *ev= sel) * Must probe features in the order they were added to the * perf_event_attr interface. 
*/ - if (!perf_missing_features.branch_counters && + if (!perf_missing_features.inherit_sample_read && + evsel->core.attr.inherit && (evsel->core.attr.sample_type & PERF_SAMP= LE_READ)) { + perf_missing_features.inherit_sample_read =3D true; + pr_debug2("Using PERF_SAMPLE_READ / :S modifier is not compatible with i= nherit, falling back to no-inherit.\n"); + return true; + } else if (!perf_missing_features.branch_counters && (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) { perf_missing_features.branch_counters =3D true; pr_debug2("switching off branch counters support\n"); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 80b5f6dd868e..bb0c91c23679 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -206,6 +206,7 @@ struct perf_missing_features { bool weight_struct; bool read_lost; bool branch_counters; + bool inherit_sample_read; }; =20 extern struct perf_missing_features perf_missing_features; --=20 2.45.2