[v10] Support SMT control on arm64

[PATCH v10 3/4] arm64: topology: Support SMT control on ACPI based system

Posted by Yicong Yang 1 year, 1 month ago

From: Yicong Yang <yangyicong@hisilicon.com>

For ACPI we'll build the topology from PPTT and we cannot directly
get the SMT number of each core. Instead using a temporary xarray
to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
and SMT information of the first core in its heterogeneous CPU cluster
when building the topology. Then we can know the largest SMT number
in the system. If a homogeneous system's using ACPI 6.2 or later,
all the CPUs should be under the root node of PPTT. There'll be
only one entry in the xarray and all the CPUs in the system will
be assumed identical.

The core's SMT control provides two interface to the users [1]:
1) enable/disable SMT by writing on/off
2) enable/disable SMT by writing thread number 1/max_thread_number

If a system have more than one SMT thread number the 2) may
not handle it well, since there're multiple thread numbers in the
system and 2) only accept 1/max_thread_number. So issue a warning
to notify the users if such system detected.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/sysfs-devices-system-cpu#n542
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 arch/arm64/kernel/topology.c | 66 ++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 1a2c72f3e7f8..85cb18d72a29 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -15,8 +15,10 @@
 #include <linux/arch_topology.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_smt.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/xarray.h>
 
 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -37,17 +39,28 @@ static bool __init acpi_cpu_is_threaded(int cpu)
 	return !!is_threaded;
 }
 
+struct cpu_smt_info {
+	int thread_num;
+	int core_id;
+};
+
 /*
  * Propagate the topology information of the processor_topology_node tree to the
  * cpu_topology array.
  */
 int __init parse_acpi_topology(void)
 {
+	int max_smt_thread_num = 0;
+	struct cpu_smt_info *entry;
+	struct xarray hetero_cpu;
+	unsigned long hetero_id;
 	int cpu, topology_id;
 
 	if (acpi_disabled)
 		return 0;
 
+	xa_init(&hetero_cpu);
+
 	for_each_possible_cpu(cpu) {
 		topology_id = find_acpi_cpu_topology(cpu, 0);
 		if (topology_id < 0)
@@ -57,6 +70,32 @@ int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id = topology_id;
 			topology_id = find_acpi_cpu_topology(cpu, 1);
 			cpu_topology[cpu].core_id   = topology_id;
+
+			/*
+			 * In the PPTT, CPUs below a node with the 'identical
+			 * implementation' flag have the same number of threads.
+			 * Count the number of threads for only one CPU (i.e.
+			 * one core_id) among those with the same hetero_id.
+			 * See the comment of find_acpi_cpu_topology_hetero_id()
+			 * for more details.
+			 *
+			 * One entry is created for each node having:
+			 * - the 'identical implementation' flag
+			 * - its parent not having the flag
+			 */
+			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
+			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
+			if (!entry) {
+				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+				WARN_ON(!entry);
+
+				entry->core_id = topology_id;
+				entry->thread_num = 1;
+				xa_store(&hetero_cpu, hetero_id,
+					 entry, GFP_KERNEL);
+			} else if (entry->core_id == topology_id) {
+				entry->thread_num++;
+			}
 		} else {
 			cpu_topology[cpu].thread_id  = -1;
 			cpu_topology[cpu].core_id    = topology_id;
@@ -67,6 +106,33 @@ int __init parse_acpi_topology(void)
 		cpu_topology[cpu].package_id = topology_id;
 	}
 
+	/*
+	 * This should be a short loop depending on the number of heterogeneous
+	 * CPU clusters. Typically on a homogeneous system there's only one
+	 * entry in the XArray.
+	 */
+	xa_for_each(&hetero_cpu, hetero_id, entry) {
+		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
+			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
+
+		if (entry->thread_num > max_smt_thread_num)
+			max_smt_thread_num = entry->thread_num;
+
+		xa_erase(&hetero_cpu, hetero_id);
+		kfree(entry);
+	}
+
+	/*
+	 * Notify the CPU framework of the SMT support. Initialize the
+	 * max_smt_thread_num to 1 if no SMT support detected. A thread
+	 * number of 1 can be handled by the framework so we don't need
+	 * to check max_smt_thread_num to see we support SMT or not.
+	 */
+	if (!max_smt_thread_num)
+		max_smt_thread_num = 1;
+
+	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+	xa_destroy(&hetero_cpu);
 	return 0;
 }
 #endif
-- 
2.24.0

Re: [PATCH v10 3/4] arm64: topology: Support SMT control on ACPI based system

Posted by Jonathan Cameron 1 year, 1 month ago

On Fri, 20 Dec 2024 15:53:12 +0800
Yicong Yang <yangyicong@huawei.com> wrote:

> From: Yicong Yang <yangyicong@hisilicon.com>
> 
> For ACPI we'll build the topology from PPTT and we cannot directly
> get the SMT number of each core. Instead using a temporary xarray
> to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
> and SMT information of the first core in its heterogeneous CPU cluster
> when building the topology. Then we can know the largest SMT number
> in the system. If a homogeneous system's using ACPI 6.2 or later,
> all the CPUs should be under the root node of PPTT. There'll be
> only one entry in the xarray and all the CPUs in the system will
> be assumed identical.
> 
> The core's SMT control provides two interface to the users [1]:
> 1) enable/disable SMT by writing on/off
> 2) enable/disable SMT by writing thread number 1/max_thread_number
> 
> If a system have more than one SMT thread number the 2) may
> not handle it well, since there're multiple thread numbers in the
> system and 2) only accept 1/max_thread_number. So issue a warning
> to notify the users if such system detected.
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/sysfs-devices-system-cpu#n542
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>

A few trivial things inline.  Either way it's fine as really just my style
preferences

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> ---
>  arch/arm64/kernel/topology.c | 66 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 66 insertions(+)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..85cb18d72a29 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -15,8 +15,10 @@
>  #include <linux/arch_topology.h>
>  #include <linux/cacheinfo.h>
>  #include <linux/cpufreq.h>
> +#include <linux/cpu_smt.h>
>  #include <linux/init.h>
>  #include <linux/percpu.h>
> +#include <linux/xarray.h>
>  
>  #include <asm/cpu.h>
>  #include <asm/cputype.h>
> @@ -37,17 +39,28 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>  	return !!is_threaded;
>  }
>  
> +struct cpu_smt_info {
> +	int thread_num;
> +	int core_id;
> +};
> +
>  /*
>   * Propagate the topology information of the processor_topology_node tree to the
>   * cpu_topology array.
>   */
>  int __init parse_acpi_topology(void)
>  {
> +	int max_smt_thread_num = 0;
> +	struct cpu_smt_info *entry;
> +	struct xarray hetero_cpu;
> +	unsigned long hetero_id;
>  	int cpu, topology_id;
>  
>  	if (acpi_disabled)
>  		return 0;
>  
> +	xa_init(&hetero_cpu);
> +
>  	for_each_possible_cpu(cpu) {
>  		topology_id = find_acpi_cpu_topology(cpu, 0);
>  		if (topology_id < 0)
> @@ -57,6 +70,32 @@ int __init parse_acpi_topology(void)
>  			cpu_topology[cpu].thread_id = topology_id;
>  			topology_id = find_acpi_cpu_topology(cpu, 1);
>  			cpu_topology[cpu].core_id   = topology_id;
> +
> +			/*
> +			 * In the PPTT, CPUs below a node with the 'identical
> +			 * implementation' flag have the same number of threads.
> +			 * Count the number of threads for only one CPU (i.e.
> +			 * one core_id) among those with the same hetero_id.
> +			 * See the comment of find_acpi_cpu_topology_hetero_id()
> +			 * for more details.
> +			 *
> +			 * One entry is created for each node having:
> +			 * - the 'identical implementation' flag
> +			 * - its parent not having the flag
> +			 */
> +			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
> +			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);

Given xa_load returns a void *,

			entry = xa_load(&hetero_cpu, hetero_id);

should be fine (I haven't checked local style, so feel free to ignore if
local style is to cast anyway).  Maybe drag the definition of entry into
a more local scope as well.


> +			if (!entry) {
> +				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +				WARN_ON(!entry);
> +
> +				entry->core_id = topology_id;
> +				entry->thread_num = 1;
> +				xa_store(&hetero_cpu, hetero_id,
> +					 entry, GFP_KERNEL);
> +			} else if (entry->core_id == topology_id) {
> +				entry->thread_num++;
> +			}
>  		} else {
>  			cpu_topology[cpu].thread_id  = -1;
>  			cpu_topology[cpu].core_id    = topology_id;
> @@ -67,6 +106,33 @@ int __init parse_acpi_topology(void)
>  		cpu_topology[cpu].package_id = topology_id;
>  	}
>  
> +	/*
> +	 * This should be a short loop depending on the number of heterogeneous
> +	 * CPU clusters. Typically on a homogeneous system there's only one
> +	 * entry in the XArray.
> +	 */
> +	xa_for_each(&hetero_cpu, hetero_id, entry) {
> +		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
> +			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
> +
> +		if (entry->thread_num > max_smt_thread_num)
> +			max_smt_thread_num = entry->thread_num;

As with DT, maybe min is more informative?

		max_smt_thread_num = min(max_smt_thread_num, entry->thread_num);

I don't care strongly about it though.
> +
> +		xa_erase(&hetero_cpu, hetero_id);
> +		kfree(entry);
> +	}
> +
> +	/*
> +	 * Notify the CPU framework of the SMT support. Initialize the
> +	 * max_smt_thread_num to 1 if no SMT support detected. A thread
> +	 * number of 1 can be handled by the framework so we don't need
> +	 * to check max_smt_thread_num to see we support SMT or not.
> +	 */
> +	if (!max_smt_thread_num)
> +		max_smt_thread_num = 1;
> +
> +	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
> +	xa_destroy(&hetero_cpu);
>  	return 0;
>  }
>  #endif

Re: [PATCH v10 3/4] arm64: topology: Support SMT control on ACPI based system

Posted by Yicong Yang 1 year, 1 month ago

On 2024/12/24 0:40, Jonathan Cameron wrote:
> On Fri, 20 Dec 2024 15:53:12 +0800
> Yicong Yang <yangyicong@huawei.com> wrote:
> 
>> From: Yicong Yang <yangyicong@hisilicon.com>
>>
>> For ACPI we'll build the topology from PPTT and we cannot directly
>> get the SMT number of each core. Instead using a temporary xarray
>> to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
>> and SMT information of the first core in its heterogeneous CPU cluster
>> when building the topology. Then we can know the largest SMT number
>> in the system. If a homogeneous system's using ACPI 6.2 or later,
>> all the CPUs should be under the root node of PPTT. There'll be
>> only one entry in the xarray and all the CPUs in the system will
>> be assumed identical.
>>
>> The core's SMT control provides two interface to the users [1]:
>> 1) enable/disable SMT by writing on/off
>> 2) enable/disable SMT by writing thread number 1/max_thread_number
>>
>> If a system have more than one SMT thread number the 2) may
>> not handle it well, since there're multiple thread numbers in the
>> system and 2) only accept 1/max_thread_number. So issue a warning
>> to notify the users if such system detected.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/sysfs-devices-system-cpu#n542
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> 
> A few trivial things inline.  Either way it's fine as really just my style
> preferences
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
>> ---
>>  arch/arm64/kernel/topology.c | 66 ++++++++++++++++++++++++++++++++++++
>>  1 file changed, 66 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>> index 1a2c72f3e7f8..85cb18d72a29 100644
>> --- a/arch/arm64/kernel/topology.c
>> +++ b/arch/arm64/kernel/topology.c
>> @@ -15,8 +15,10 @@
>>  #include <linux/arch_topology.h>
>>  #include <linux/cacheinfo.h>
>>  #include <linux/cpufreq.h>
>> +#include <linux/cpu_smt.h>
>>  #include <linux/init.h>
>>  #include <linux/percpu.h>
>> +#include <linux/xarray.h>
>>  
>>  #include <asm/cpu.h>
>>  #include <asm/cputype.h>
>> @@ -37,17 +39,28 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>  	return !!is_threaded;
>>  }
>>  
>> +struct cpu_smt_info {
>> +	int thread_num;
>> +	int core_id;
>> +};
>> +
>>  /*
>>   * Propagate the topology information of the processor_topology_node tree to the
>>   * cpu_topology array.
>>   */
>>  int __init parse_acpi_topology(void)
>>  {
>> +	int max_smt_thread_num = 0;
>> +	struct cpu_smt_info *entry;
>> +	struct xarray hetero_cpu;
>> +	unsigned long hetero_id;
>>  	int cpu, topology_id;
>>  
>>  	if (acpi_disabled)
>>  		return 0;
>>  
>> +	xa_init(&hetero_cpu);
>> +
>>  	for_each_possible_cpu(cpu) {
>>  		topology_id = find_acpi_cpu_topology(cpu, 0);
>>  		if (topology_id < 0)
>> @@ -57,6 +70,32 @@ int __init parse_acpi_topology(void)
>>  			cpu_topology[cpu].thread_id = topology_id;
>>  			topology_id = find_acpi_cpu_topology(cpu, 1);
>>  			cpu_topology[cpu].core_id   = topology_id;
>> +
>> +			/*
>> +			 * In the PPTT, CPUs below a node with the 'identical
>> +			 * implementation' flag have the same number of threads.
>> +			 * Count the number of threads for only one CPU (i.e.
>> +			 * one core_id) among those with the same hetero_id.
>> +			 * See the comment of find_acpi_cpu_topology_hetero_id()
>> +			 * for more details.
>> +			 *
>> +			 * One entry is created for each node having:
>> +			 * - the 'identical implementation' flag
>> +			 * - its parent not having the flag
>> +			 */
>> +			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
>> +			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
> 
> Given xa_load returns a void *,
> 
> 			entry = xa_load(&hetero_cpu, hetero_id);
> 
> should be fine (I haven't checked local style, so feel free to ignore if
> local style is to cast anyway).  Maybe drag the definition of entry into
> a more local scope as well.
> 

sure. will get rid of the cast and checked it won't violate the local style.

> 
>> +			if (!entry) {
>> +				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
>> +				WARN_ON(!entry);
>> +
>> +				entry->core_id = topology_id;
>> +				entry->thread_num = 1;
>> +				xa_store(&hetero_cpu, hetero_id,
>> +					 entry, GFP_KERNEL);
>> +			} else if (entry->core_id == topology_id) {
>> +				entry->thread_num++;
>> +			}
>>  		} else {
>>  			cpu_topology[cpu].thread_id  = -1;
>>  			cpu_topology[cpu].core_id    = topology_id;
>> @@ -67,6 +106,33 @@ int __init parse_acpi_topology(void)
>>  		cpu_topology[cpu].package_id = topology_id;
>>  	}
>>  
>> +	/*
>> +	 * This should be a short loop depending on the number of heterogeneous
>> +	 * CPU clusters. Typically on a homogeneous system there's only one
>> +	 * entry in the XArray.
>> +	 */
>> +	xa_for_each(&hetero_cpu, hetero_id, entry) {
>> +		if (entry->thread_num != max_smt_thread_num && max_smt_thread_num)
>> +			pr_warn_once("Heterogeneous SMT topology is partly supported by SMT control\n");
>> +
>> +		if (entry->thread_num > max_smt_thread_num)
>> +			max_smt_thread_num = entry->thread_num;
> 
> As with DT, maybe min is more informative?
> 
> 		max_smt_thread_num = min(max_smt_thread_num, entry->thread_num);
> 
> I don't care strongly about it though.

either's ok with me, will keep it consistent with DT.

Thanks.

[PATCH v10 1/4] cpu/SMT: Provide a default topology_is_primary_thread()
[PATCH v10 2/4] arch_topology: Support SMT control for OF based system
[PATCH v10 3/4] arm64: topology: Support SMT control on ACPI based system
[PATCH v10 4/4] arm64: Kconfig: Enable HOTPLUG_SMT