From: Kan Liang <kan.liang@linux.intel.com>
The perf subsystem assumes that the counters of a PMU are per-CPU, so
the user-space tool reads a counter from each CPU in system-wide
mode. However, many PMUs don't have a per-CPU counter. The counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict counting to the one
CPU that stands for a specific scope. In case that CPU is removed,
hotplug support has to be implemented in each such driver.
The code to support the cpumask and hotplug is very similar across
drivers:
- Expose a cpumask in sysfs.
- Pick another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set the CPU from the cpumask in event->cpu.
This duplicated code is currently reimplemented by each such PMU driver.
It would be good to introduce a generic infrastructure to avoid the
duplication.
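For reference, the pattern each such driver duplicates today looks
roughly like the sketch below (hypothetical foo_pmu names, not taken
from any one driver):

struct foo_pmu {
        struct pmu pmu;
        int cpu;                        /* CPU standing for the scope */
        struct hlist_node node;
};

static int foo_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
        struct foo_pmu *foo = hlist_entry_safe(node, struct foo_pmu, node);
        int target;

        /* Only act when the outgoing CPU is the designated one. */
        if (cpu != foo->cpu)
                return 0;

        /* Pick another online CPU in the same scope (here: same package). */
        target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
        if (target >= nr_cpu_ids)
                return 0;

        perf_pmu_migrate_context(&foo->pmu, cpu, target);
        foo->cpu = target;              /* what the sysfs "cpumask" reports */
        return 0;
}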
Five common scopes are implemented here: core, die, cluster, pkg, and
system-wide. The scope can be set when a PMU is registered. If it is, a
"cpumask" attribute is automatically exposed for the PMU.
The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.
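For illustration, with the generic support a driver boils down to
declaring its scope at registration time; a minimal sketch (again with
hypothetical foo_pmu names):

static struct pmu foo_pmu = {
        .module         = THIS_MODULE,
        .task_ctx_nr    = perf_invalid_context,
        .scope          = PERF_PMU_SCOPE_PKG,   /* "cpumask" exposed automatically */
        .event_init     = foo_pmu_event_init,
        .add            = foo_pmu_add,
        .del            = foo_pmu_del,
        .start          = foo_pmu_start,
        .stop           = foo_pmu_stop,
        .read           = foo_pmu_read,
};

        /* At init time; no driver-private cpumask or hotplug callback needed. */
        ret = perf_pmu_register(&foo_pmu, "foo", -1);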
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 18 ++++
kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a8942277dda..1102d5c2be70 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+ PERF_PMU_SCOPE_NONE = 0,
+ PERF_PMU_SCOPE_CORE,
+ PERF_PMU_SCOPE_DIE,
+ PERF_PMU_SCOPE_CLUSTER,
+ PERF_PMU_SCOPE_PKG,
+ PERF_PMU_SCOPE_SYS_WIDE,
+ PERF_PMU_MAX_SCOPE,
+};
+
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
@@ -315,6 +328,11 @@ struct pmu {
*/
int capabilities;
+ /*
+ * PMU scope
+ */
+ unsigned int scope;
+
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa3450bdc227..5e1877c4cb4c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -407,6 +407,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -11477,10 +11482,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
@@ -11492,6 +11547,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
@@ -11576,6 +11635,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11730,6 +11794,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->cpu = cpu;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
@@ -13681,6 +13761,12 @@ static void __init perf_event_init_all_cpus(void)
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
@@ -13730,6 +13816,40 @@ static void __perf_event_exit_context(void *__info)
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13737,6 +13857,11 @@ static void perf_event_exit_cpu_context(int cpu)
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -13744,7 +13869,6 @@ static void perf_event_exit_cpu_context(int cpu)
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
@@ -13753,6 +13877,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ cpumask_set_cpu(cpu, perf_online_mask);
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (!topology_sibling_cpumask(cpu)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ return;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13761,7 +13921,7 @@ int perf_event_init_cpu(unsigned int cpu)
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
--
2.38.1
Hi Kan Liang,
(+ cc Perf ML)
On 02/08/2024 17:16, kan.liang@linux.intel.com wrote:
(...)
Thank you for the patch.
It looks like this modification causes the following warning on my side
when shutting down a VM running a kernel built with a debug config
including CONFIG_PROVE_RCU_LIST=y (and CONFIG_RCU_EXPERT=y):
> # /usr/lib/klibc/bin/poweroff
>
> =============================
> WARNING: suspicious RCU usage
> 6.12.0-rc3+ #3 Not tainted
> -----------------------------
> kernel/events/core.c:13962 RCU-list traversed in non-reader section!!
>
> other info that might help us debug this:
>
>
> rcu_scheduler_active = 2, debug_locks = 1
> 3 locks held by poweroff/11748:
> #0: ffffffff9b441e28 (system_transition_mutex){+.+.}-{3:3}, at: __do_sys_reboot (kernel/reboot.c:770)
> #1: ffffffff9b43eab0 ((reboot_notifier_list).rwsem){++++}-{3:3}, at: blocking_notifier_call_chain (kernel/notifier.c:388)
> #2: ffffffff9b6d06c8 (pmus_lock){+.+.}-{3:3}, at: perf_event_exit_cpu_context (kernel/events/core.c:13983)
>
> stack backtrace:
> CPU: 0 UID: 0 PID: 11748 Comm: poweroff Not tainted 6.12.0-rc3+ #3
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> Call Trace:
> <TASK>
> dump_stack_lvl (lib/dump_stack.c:123)
> lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822)
> perf_event_clear_cpumask (kernel/events/core.c:13962 (discriminator 9))
> ? __pfx_perf_event_clear_cpumask (kernel/events/core.c:13939)
> ? acpi_execute_simple_method (drivers/acpi/utils.c:679)
> ? __pfx_acpi_execute_simple_method (drivers/acpi/utils.c:679)
> ? md_notify_reboot (drivers/md/md.c:9860)
> perf_event_exit_cpu_context (kernel/events/core.c:13984 (discriminator 1))
> perf_reboot (kernel/events/core.c:14066 (discriminator 3))
> ? trace_notifier_run (include/trace/events/notifier.h:59 (discriminator 2))
> notifier_call_chain (kernel/notifier.c:93)
> blocking_notifier_call_chain (kernel/notifier.c:389)
> kernel_power_off (kernel/reboot.c:294)
> __do_sys_reboot (kernel/reboot.c:771)
> ? __pfx___do_sys_reboot (kernel/reboot.c:717)
> ? __pfx_ksys_sync (fs/sync.c:98)
> do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1))
> entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
It is very easy for me to reproduce: simply stop the VM. Just in case,
here are the steps I used to set up the same VM:
$ cd [kernel source code]
$ echo true > .virtme-exec-run
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-debug -e RCU_EXPERT -e PROVE_RCU_LIST
I have one question below about the modification you made here.
(...)
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index aa3450bdc227..5e1877c4cb4c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
(...)
> @@ -13730,6 +13816,40 @@ static void __perf_event_exit_context(void *__info)
> raw_spin_unlock(&ctx->lock);
> }
>
> +static void perf_event_clear_cpumask(unsigned int cpu)
> +{
> + int target[PERF_PMU_MAX_SCOPE];
> + unsigned int scope;
> + struct pmu *pmu;
> +
> + cpumask_clear_cpu(cpu, perf_online_mask);
> +
> + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
> + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
> + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
> +
> + target[scope] = -1;
> + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
> + continue;
> +
> + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
> + continue;
> + target[scope] = cpumask_any_but(cpumask, cpu);
> + if (target[scope] < nr_cpu_ids)
> + cpumask_set_cpu(target[scope], pmu_cpumask);
> + }
> +
> + /* migrate */
> + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
Here is the line causing the warning, because rcu_read_lock() is not
taken beforehand.
I don't know this code, but I guess you are not only doing read
operations here if you are migrating something, right? This operation is
done under the "pmus_lock", so maybe the "_rcu" variant is not needed here?
So just using this instead is maybe enough?
list_for_each_entry(pmu, &pmus, entry) {
> + if (pmu->scope == PERF_PMU_SCOPE_NONE ||
> + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
> + continue;
> +
> + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
> + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
> + }
> +}
> +
> static void perf_event_exit_cpu_context(int cpu)
> {
> struct perf_cpu_context *cpuctx;
> @@ -13737,6 +13857,11 @@ static void perf_event_exit_cpu_context(int cpu)
>
> // XXX simplify cpuctx->online
> mutex_lock(&pmus_lock);
> + /*
> + * Clear the cpumasks, and migrate to other CPUs if possible.
> + * Must be invoked before the __perf_event_exit_context.
> + */
> + perf_event_clear_cpumask(cpu);
> cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
> ctx = &cpuctx->ctx;
>
> @@ -13744,7 +13869,6 @@ static void perf_event_exit_cpu_context(int cpu)
> smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
> cpuctx->online = 0;
> mutex_unlock(&ctx->mutex);
> - cpumask_clear_cpu(cpu, perf_online_mask);
> mutex_unlock(&pmus_lock);
> }
> #else
(...)
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
On 2024-10-23 1:09 p.m., Matthieu Baerts wrote:
(...)
>> + /* migrate */
>> + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
>
>
> Here is the line causing the warning, because rcu_read_lock() is not
> taken beforehand.
>
> I don't know this code, but I guess you are not only doing read
> operations here if you are migrating something, right? This operation is
> done under the "pmus_lock", so maybe the "_rcu" variant is not needed here?
>
> So just using this instead is maybe enough?
>
> list_for_each_entry(pmu, &pmus, entry) {
Yes, it's good enough. A patch has been proposed, but it hasn't been
merged yet.
https://lore.kernel.org/lkml/20240913162340.2142976-1-kan.liang@linux.intel.com/
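It is essentially the plain-iteration variant you suggest, roughly (a
sketch; the linked patch is authoritative):

-	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+	list_for_each_entry(pmu, &pmus, entry) {

The traversal always runs with pmus_lock held, so neither the RCU
variant nor the lockdep annotation is needed there.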
Thanks,
Kan
On Wed, Oct 23, 2024 at 01:46:43PM -0400, Liang, Kan wrote:
> Yes, it's good enough. A patch has been proposed, but it hasn't been
> merged yet.
>
> https://lore.kernel.org/lkml/20240913162340.2142976-1-kan.liang@linux.intel.com/

Got it now.
On Fri, Aug 02, 2024 at 08:16:37AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
(...)
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>

This patch results in build failures in the mainline kernel.

Building arm:allmodconfig ... failed
Building arm64:allmodconfig ... failed
Building loongarch:allmodconfig ... failed
Building mips:allmodconfig ... failed
Building parisc:allmodconfig ... failed

[ and more ]

--------------
Error log:
kernel/events/core.c: In function 'perf_event_setup_cpumask':
kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL
On 2024-09-19 11:43 a.m., Guenter Roeck wrote:
> On Fri, Aug 02, 2024 at 08:16:37AM -0700, kan.liang@linux.intel.com wrote:
(...)
> This patch results in build failures in the mainline kernel.
(...)
> Error log:
> kernel/events/core.c: In function 'perf_event_setup_cpumask':
> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL

The patch has been posted, but it hasn't been merged yet.

https://lore.kernel.org/lkml/20240912145025.1574448-1-kan.liang@linux.intel.com/

Thanks,
Kan
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 4ba4f1afb6a9fed8ef896c2363076e36572f71da
Gitweb: https://git.kernel.org/tip/4ba4f1afb6a9fed8ef896c2363076e36572f71da
Author: Kan Liang <kan.liang@linux.intel.com>
AuthorDate: Fri, 02 Aug 2024 08:16:37 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Sep 2024 11:44:12 +02:00
perf: Generic hotplug support for a PMU with a scope
(...)
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com
---
include/linux/perf_event.h | 18 ++++-
kernel/events/core.c | 164 +++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
(...)
On 10/09/2024 10:59, tip-bot2 for Kan Liang wrote:
> The following commit has been merged into the perf/core branch of tip:
>
> Commit-ID: 4ba4f1afb6a9fed8ef896c2363076e36572f71da
> Gitweb: https://git.kernel.org/tip/4ba4f1afb6a9fed8ef896c2363076e36572f71da
> Author: Kan Liang <kan.liang@linux.intel.com>
> AuthorDate: Fri, 02 Aug 2024 08:16:37 -07:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 10 Sep 2024 11:44:12 +02:00
>
> perf: Generic hotplug support for a PMU with a scope
>
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 67e115d..5ff9735 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
[...]
> @@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
>
> #endif
>
> +static void perf_event_setup_cpumask(unsigned int cpu)
> +{
> + struct cpumask *pmu_cpumask;
> + unsigned int scope;
> +
> + cpumask_set_cpu(cpu, perf_online_mask);
> +
> + /*
> + * Early boot stage, the cpumask hasn't been set yet.
> + * The perf_online_<domain>_masks includes the first CPU of each domain.
> + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
^^^^^^^^^^^^^^^ typo
> + */
> + if (!topology_sibling_cpumask(cpu)) {
This causes a compiler warning:
> kernel/events/core.c: In function 'perf_event_setup_cpumask':
> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL [-Werror=address]
> 14012 | if (!topology_sibling_cpumask(cpu)) {
> | ^
> In file included from ./include/linux/topology.h:30,
> from ./include/linux/gfp.h:8,
> from ./include/linux/xarray.h:16,
> from ./include/linux/list_lru.h:14,
> from ./include/linux/fs.h:13,
> from kernel/events/core.c:11:
> ./include/linux/arch_topology.h:78:19: note: 'thread_sibling' declared here
> 78 | cpumask_t thread_sibling;
> | ^~~~~~~~~~~~~~
> cc1: all warnings being treated as errors
Steve
On 2024-09-12 6:12 a.m., Steven Price wrote:
> On 10/09/2024 10:59, tip-bot2 for Kan Liang wrote:
> [...]
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 67e115d..5ff9735 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
> [...]
>> @@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
>>
>> #endif
>>
>> +static void perf_event_setup_cpumask(unsigned int cpu)
>> +{
>> + struct cpumask *pmu_cpumask;
>> + unsigned int scope;
>> +
>> + cpumask_set_cpu(cpu, perf_online_mask);
>> +
>> + /*
>> + * Early boot stage, the cpumask hasn't been set yet.
>> + * The perf_online_<domain>_masks includes the first CPU of each domain.
>> + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
> ^^^^^^^^^^^^^^^ typo
>
>> + */
>> + if (!topology_sibling_cpumask(cpu)) {
>
> This causes a compiler warning:
>
>> kernel/events/core.c: In function 'perf_event_setup_cpumask':
>> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL [-Werror=address]
>> 14012 | if (!topology_sibling_cpumask(cpu)) {
>> | ^
>> In file included from ./include/linux/topology.h:30,
>> from ./include/linux/gfp.h:8,
>> from ./include/linux/xarray.h:16,
>> from ./include/linux/list_lru.h:14,
>> from ./include/linux/fs.h:13,
>> from kernel/events/core.c:11:
>> ./include/linux/arch_topology.h:78:19: note: 'thread_sibling' declared here
>> 78 | cpumask_t thread_sibling;
>> | ^~~~~~~~~~~~~~
>> cc1: all warnings being treated as errors
>
The patch to fix the warning has been posted.
https://lore.kernel.org/lkml/20240912145025.1574448-1-kan.liang@linux.intel.com/
Please give it a try.
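Roughly, the fix avoids comparing the address of a topology cpumask
(which can never be NULL) and detects the early-boot case another way; a
sketch of the idea, assuming the check becomes an emptiness test on
perf_online_mask (the linked patch is authoritative):

-	if (!topology_sibling_cpumask(cpu)) {
+	if (cpumask_empty(perf_online_mask)) {

(with the cpumask_set_cpu(cpu, perf_online_mask) then moved after this
check, so the mask is still empty when the boot CPU comes up).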
Thanks,
Kan