From: Kan Liang <kan.liang@linux.intel.com>
The perf subsystem assumes that the counters of a PMU are per-CPU, so
the user-space tool reads a counter from each CPU in system-wide
mode. However, many PMUs don't have a per-CPU counter. The counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict counting to the one
CPU that stands for a specific scope. In case that CPU is removed,
hotplug support has to be implemented in each such driver.
The code to support the cpumask and hotplug is very similar across
drivers:
- Expose a cpumask in sysfs.
- Pick another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set the CPU from the cpumask in event->cpu.
This duplicated code is currently reimplemented by each such PMU driver.
It would be good to introduce a generic infrastructure to avoid the
duplication.
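For reference, the pattern each such driver duplicates today looks
roughly like the sketch below (hypothetical foo_pmu names, not taken
from any one driver):

struct foo_pmu {
        struct pmu pmu;
        int cpu;                        /* CPU standing for the scope */
        struct hlist_node node;
};

static int foo_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
        struct foo_pmu *foo = hlist_entry_safe(node, struct foo_pmu, node);
        int target;

        /* Only act when the outgoing CPU is the designated one. */
        if (cpu != foo->cpu)
                return 0;

        /* Pick another online CPU in the same scope (here: same package). */
        target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
        if (target >= nr_cpu_ids)
                return 0;

        perf_pmu_migrate_context(&foo->pmu, cpu, target);
        foo->cpu = target;              /* what the sysfs "cpumask" reports */
        return 0;
}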
Five common scopes are implemented here: core, die, cluster, pkg, and
system-wide. The scope can be set when a PMU is registered. If it is, a
"cpumask" attribute is automatically exposed for the PMU.
The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.
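For illustration, with the generic support a driver boils down to
declaring its scope at registration time; a minimal sketch (again with
hypothetical foo_pmu names):

static struct pmu foo_pmu = {
        .module         = THIS_MODULE,
        .task_ctx_nr    = perf_invalid_context,
        .scope          = PERF_PMU_SCOPE_PKG,   /* "cpumask" exposed automatically */
        .event_init     = foo_pmu_event_init,
        .add            = foo_pmu_add,
        .del            = foo_pmu_del,
        .start          = foo_pmu_start,
        .stop           = foo_pmu_stop,
        .read           = foo_pmu_read,
};

        /* At init time; no driver-private cpumask or hotplug callback needed. */
        ret = perf_pmu_register(&foo_pmu, "foo", -1);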
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 18 ++++
kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a8942277dda..1102d5c2be70 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+ PERF_PMU_SCOPE_NONE = 0,
+ PERF_PMU_SCOPE_CORE,
+ PERF_PMU_SCOPE_DIE,
+ PERF_PMU_SCOPE_CLUSTER,
+ PERF_PMU_SCOPE_PKG,
+ PERF_PMU_SCOPE_SYS_WIDE,
+ PERF_PMU_MAX_SCOPE,
+};
+
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
@@ -315,6 +328,11 @@ struct pmu {
*/
int capabilities;
+ /*
+ * PMU scope
+ */
+ unsigned int scope;
+
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa3450bdc227..5e1877c4cb4c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -407,6 +407,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -11477,10 +11482,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
@@ -11492,6 +11547,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
@@ -11576,6 +11635,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11730,6 +11794,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->cpu = cpu;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
@@ -13681,6 +13761,12 @@ static void __init perf_event_init_all_cpus(void)
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
@@ -13730,6 +13816,40 @@ static void __perf_event_exit_context(void *__info)
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13737,6 +13857,11 @@ static void perf_event_exit_cpu_context(int cpu)
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -13744,7 +13869,6 @@ static void perf_event_exit_cpu_context(int cpu)
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
@@ -13753,6 +13877,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ cpumask_set_cpu(cpu, perf_online_mask);
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (!topology_sibling_cpumask(cpu)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ return;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13761,7 +13921,7 @@ int perf_event_init_cpu(unsigned int cpu)
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
--
2.38.1
Hi Kan Liang,
(+ cc Perf ML)
On 02/08/2024 17:16, kan.liang@linux.intel.com wrote:
(...)
Thank you for the patch.
It looks like this modification causes the following warning on my side
when shutting down a VM running a kernel built with a debug config
including CONFIG_PROVE_RCU_LIST=y (and CONFIG_RCU_EXPERT=y):
> # /usr/lib/klibc/bin/poweroff
>
> =============================
> WARNING: suspicious RCU usage
> 6.12.0-rc3+ #3 Not tainted
> -----------------------------
> kernel/events/core.c:13962 RCU-list traversed in non-reader section!!
>
> other info that might help us debug this:
>
>
> rcu_scheduler_active = 2, debug_locks = 1
> 3 locks held by poweroff/11748:
> #0: ffffffff9b441e28 (system_transition_mutex){+.+.}-{3:3}, at: __do_sys_reboot (kernel/reboot.c:770)
> #1: ffffffff9b43eab0 ((reboot_notifier_list).rwsem){++++}-{3:3}, at: blocking_notifier_call_chain (kernel/notifier.c:388)
> #2: ffffffff9b6d06c8 (pmus_lock){+.+.}-{3:3}, at: perf_event_exit_cpu_context (kernel/events/core.c:13983)
>
> stack backtrace:
> CPU: 0 UID: 0 PID: 11748 Comm: poweroff Not tainted 6.12.0-rc3+ #3
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> Call Trace:
> <TASK>
> dump_stack_lvl (lib/dump_stack.c:123)
> lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822)
> perf_event_clear_cpumask (kernel/events/core.c:13962 (discriminator 9))
> ? __pfx_perf_event_clear_cpumask (kernel/events/core.c:13939)
> ? acpi_execute_simple_method (drivers/acpi/utils.c:679)
> ? __pfx_acpi_execute_simple_method (drivers/acpi/utils.c:679)
> ? md_notify_reboot (drivers/md/md.c:9860)
> perf_event_exit_cpu_context (kernel/events/core.c:13984 (discriminator 1))
> perf_reboot (kernel/events/core.c:14066 (discriminator 3))
> ? trace_notifier_run (include/trace/events/notifier.h:59 (discriminator 2))
> notifier_call_chain (kernel/notifier.c:93)
> blocking_notifier_call_chain (kernel/notifier.c:389)
> kernel_power_off (kernel/reboot.c:294)
> __do_sys_reboot (kernel/reboot.c:771)
> ? __pfx___do_sys_reboot (kernel/reboot.c:717)
> ? __pfx_ksys_sync (fs/sync.c:98)
> do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1))
> entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
It is very easy for me to reproduce: simply stop the VM. Just in case,
here are the steps I used to set up the same VM:
$ cd [kernel source code]
$ echo true > .virtme-exec-run
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-debug -e RCU_EXPERT -e PROVE_RCU_LIST
I have one question below about the modification you made here.
(...)
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index aa3450bdc227..5e1877c4cb4c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
(...)
> @@ -13730,6 +13816,40 @@ static void __perf_event_exit_context(void *__info)
> raw_spin_unlock(&ctx->lock);
> }
>
> +static void perf_event_clear_cpumask(unsigned int cpu)
> +{
> + int target[PERF_PMU_MAX_SCOPE];
> + unsigned int scope;
> + struct pmu *pmu;
> +
> + cpumask_clear_cpu(cpu, perf_online_mask);
> +
> + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
> + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
> + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
> +
> + target[scope] = -1;
> + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
> + continue;
> +
> + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
> + continue;
> + target[scope] = cpumask_any_but(cpumask, cpu);
> + if (target[scope] < nr_cpu_ids)
> + cpumask_set_cpu(target[scope], pmu_cpumask);
> + }
> +
> + /* migrate */
> + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
Here is the line causing the warning, because rcu_read_lock() is not
taken beforehand.
I don't know this code, but I guess you are not only doing read
operations here if you are migrating something, right? This operation is
done under the "pmus_lock", so maybe the "_rcu" variant is not needed here?
So just using this instead is maybe enough?
list_for_each_entry(pmu, &pmus, entry) {
> + if (pmu->scope == PERF_PMU_SCOPE_NONE ||
> + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
> + continue;
> +
> + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
> + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
> + }
> +}
> +
> static void perf_event_exit_cpu_context(int cpu)
> {
> struct perf_cpu_context *cpuctx;
> @@ -13737,6 +13857,11 @@ static void perf_event_exit_cpu_context(int cpu)
>
> // XXX simplify cpuctx->online
> mutex_lock(&pmus_lock);
> + /*
> + * Clear the cpumasks, and migrate to other CPUs if possible.
> + * Must be invoked before the __perf_event_exit_context.
> + */
> + perf_event_clear_cpumask(cpu);
> cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
> ctx = &cpuctx->ctx;
>
> @@ -13744,7 +13869,6 @@ static void perf_event_exit_cpu_context(int cpu)
> smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
> cpuctx->online = 0;
> mutex_unlock(&ctx->mutex);
> - cpumask_clear_cpu(cpu, perf_online_mask);
> mutex_unlock(&pmus_lock);
> }
> #else
(...)
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
On 2024-10-23 1:09 p.m., Matthieu Baerts wrote:
(...)
>> + /* migrate */
>> + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
>
>
> Here is the line causing the warning, because rcu_read_lock() is not
> taken beforehand.
>
> I don't know this code, but I guess you are not only doing read
> operations here if you are migrating something, right? This operation is
> done under the "pmus_lock", so maybe the "_rcu" variant is not needed here?
>
> So just using this instead is maybe enough?
>
> list_for_each_entry(pmu, &pmus, entry) {
Yes, it's good enough. A patch has been proposed, but it hasn't been
merged yet.
https://lore.kernel.org/lkml/20240913162340.2142976-1-kan.liang@linux.intel.com/
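It is essentially the plain-iteration variant you suggest, roughly (a
sketch; the linked patch is authoritative):

-	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+	list_for_each_entry(pmu, &pmus, entry) {

The traversal always runs with pmus_lock held, so neither the RCU
variant nor the lockdep annotation is needed there.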
Thanks,
Kan
On Wed, Oct 23, 2024 at 01:46:43PM -0400, Liang, Kan wrote:
> Yes, it's good enough. A patch has been proposed, but it hasn't been
> merged yet.
>
> https://lore.kernel.org/lkml/20240913162340.2142976-1-kan.liang@linux.intel.com/

Got it now.
On Fri, Aug 02, 2024 at 08:16:37AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
(...)
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>

This patch results in build failures in the mainline kernel.

Building arm:allmodconfig ... failed
Building arm64:allmodconfig ... failed
Building loongarch:allmodconfig ... failed
Building mips:allmodconfig ... failed
Building parisc:allmodconfig ... failed

[ and more ]

--------------
Error log:
kernel/events/core.c: In function 'perf_event_setup_cpumask':
kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL
On 2024-09-19 11:43 a.m., Guenter Roeck wrote:
> On Fri, Aug 02, 2024 at 08:16:37AM -0700, kan.liang@linux.intel.com wrote:
(...)
> This patch results in build failures in the mainline kernel.
(...)
> Error log:
> kernel/events/core.c: In function 'perf_event_setup_cpumask':
> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL

The patch has been posted, but it hasn't been merged yet.

https://lore.kernel.org/lkml/20240912145025.1574448-1-kan.liang@linux.intel.com/

Thanks,
Kan
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 4ba4f1afb6a9fed8ef896c2363076e36572f71da
Gitweb: https://git.kernel.org/tip/4ba4f1afb6a9fed8ef896c2363076e36572f71da
Author: Kan Liang <kan.liang@linux.intel.com>
AuthorDate: Fri, 02 Aug 2024 08:16:37 -07:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 10 Sep 2024 11:44:12 +02:00
perf: Generic hotplug support for a PMU with a scope
(...)
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com
---
include/linux/perf_event.h | 18 ++++-
kernel/events/core.c | 164 +++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
(...)
On 10/09/2024 10:59, tip-bot2 for Kan Liang wrote:
> The following commit has been merged into the perf/core branch of tip:
>
> Commit-ID: 4ba4f1afb6a9fed8ef896c2363076e36572f71da
> Gitweb: https://git.kernel.org/tip/4ba4f1afb6a9fed8ef896c2363076e36572f71da
> Author: Kan Liang <kan.liang@linux.intel.com>
> AuthorDate: Fri, 02 Aug 2024 08:16:37 -07:00
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Tue, 10 Sep 2024 11:44:12 +02:00
>
> perf: Generic hotplug support for a PMU with a scope
>
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 67e115d..5ff9735 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
[...]
> @@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
>
> #endif
>
> +static void perf_event_setup_cpumask(unsigned int cpu)
> +{
> + struct cpumask *pmu_cpumask;
> + unsigned int scope;
> +
> + cpumask_set_cpu(cpu, perf_online_mask);
> +
> + /*
> + * Early boot stage, the cpumask hasn't been set yet.
> + * The perf_online_<domain>_masks includes the first CPU of each domain.
> + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
^^^^^^^^^^^^^^^ typo
> + */
> + if (!topology_sibling_cpumask(cpu)) {
This causes a compiler warning:
> kernel/events/core.c: In function 'perf_event_setup_cpumask':
> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL [-Werror=address]
> 14012 | if (!topology_sibling_cpumask(cpu)) {
> | ^
> In file included from ./include/linux/topology.h:30,
> from ./include/linux/gfp.h:8,
> from ./include/linux/xarray.h:16,
> from ./include/linux/list_lru.h:14,
> from ./include/linux/fs.h:13,
> from kernel/events/core.c:11:
> ./include/linux/arch_topology.h:78:19: note: 'thread_sibling' declared here
> 78 | cpumask_t thread_sibling;
> | ^~~~~~~~~~~~~~
> cc1: all warnings being treated as errors
Steve
On 2024-09-12 6:12 a.m., Steven Price wrote:
> On 10/09/2024 10:59, tip-bot2 for Kan Liang wrote:
> [...]
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 67e115d..5ff9735 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
> [...]
>> @@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
>>
>> #endif
>>
>> +static void perf_event_setup_cpumask(unsigned int cpu)
>> +{
>> + struct cpumask *pmu_cpumask;
>> + unsigned int scope;
>> +
>> + cpumask_set_cpu(cpu, perf_online_mask);
>> +
>> + /*
>> + * Early boot stage, the cpumask hasn't been set yet.
>> + * The perf_online_<domain>_masks includes the first CPU of each domain.
>> + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
> ^^^^^^^^^^^^^^^ typo
>
>> + */
>> + if (!topology_sibling_cpumask(cpu)) {
>
> This causes a compiler warning:
>
>> kernel/events/core.c: In function 'perf_event_setup_cpumask':
>> kernel/events/core.c:14012:13: error: the comparison will always evaluate as 'true' for the address of 'thread_sibling' will never be NULL [-Werror=address]
>> 14012 | if (!topology_sibling_cpumask(cpu)) {
>> | ^
>> In file included from ./include/linux/topology.h:30,
>> from ./include/linux/gfp.h:8,
>> from ./include/linux/xarray.h:16,
>> from ./include/linux/list_lru.h:14,
>> from ./include/linux/fs.h:13,
>> from kernel/events/core.c:11:
>> ./include/linux/arch_topology.h:78:19: note: 'thread_sibling' declared here
>> 78 | cpumask_t thread_sibling;
>> | ^~~~~~~~~~~~~~
>> cc1: all warnings being treated as errors
>
The patch to fix the warning has been posted.
https://lore.kernel.org/lkml/20240912145025.1574448-1-kan.liang@linux.intel.com/
Please give it a try.
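Roughly, the fix avoids comparing the address of a topology cpumask
(which can never be NULL) and detects the early-boot case another way; a
sketch of the idea, assuming the check becomes an emptiness test on
perf_online_mask (the linked patch is authoritative):

-	if (!topology_sibling_cpumask(cpu)) {
+	if (cpumask_empty(perf_online_mask)) {

(with the cpumask_set_cpu(cpu, perf_online_mask) then moved after this
check, so the mask is still empty when the boot CPU comes up).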
Thanks,
Kan