Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
nodes. Systems may support splitting into either two or four nodes.
When SNC mode is enabled the effective amount of L3 cache available
for allocation is divided by the number of nodes per L3.
Detect which SNC mode is active by comparing the number of CPUs
that share a cache with CPU0, with the number of CPUs on node0.
This gives some hope of tests passing. But additional test
infrastructure changes are needed to bind tests to nodes and
guarantee memory allocation from the local node.
Reported-by: "Shaopeng Tan (Fujitsu)" <tan.shaopeng@fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
tools/testing/selftests/resctrl/resctrl.h | 1 +
tools/testing/selftests/resctrl/resctrlfs.c | 57 +++++++++++++++++++++
2 files changed, 58 insertions(+)
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 87e39456dee0..a8b43210b573 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -13,6 +13,7 @@
#include <signal.h>
#include <dirent.h>
#include <stdbool.h>
+#include <ctype.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index fb00245dee92..79eecbf9f863 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -130,6 +130,61 @@ int get_resource_id(int cpu_no, int *resource_id)
return 0;
}
+/*
+ * Count number of CPUs in a /sys bit map
+ */
+static int count_sys_bitmap_bits(char *name)
+{
+ FILE *fp = fopen(name, "r");
+ int count = 0, c;
+
+ if (!fp)
+ return 0;
+
+ while ((c = fgetc(fp)) != EOF) {
+ if (!isxdigit(c))
+ continue;
+ switch (c) {
+ case 'f':
+ count++;
+ case '7': case 'b': case 'd': case 'e':
+ count++;
+ case '3': case '5': case '6': case '9': case 'a': case 'c':
+ count++;
+ case '1': case '2': case '4': case '8':
+ count++;
+ }
+ }
+ fclose(fp);
+
+ return count;
+}
+
+/*
+ * Detect SNC by compating #CPUs in node0 with #CPUs sharing LLC with CPU0
+ * Try to get this right, even if a few CPUs are offline so that the number
+ * of CPUs in node0 is not exactly half or a quarter of the CPUs sharing the
+ * LLC of CPU0.
+ */
+static int snc_ways(void)
+{
+ int node_cpus, cache_cpus;
+
+ node_cpus = count_sys_bitmap_bits("/sys/devices/system/node/node0/cpumap");
+ cache_cpus = count_sys_bitmap_bits("/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_map");
+
+ if (!node_cpus || !cache_cpus) {
+ fprintf(stderr, "Warning could not determine Sub-NUMA Cluster mode\n");
+ return 1;
+ }
+
+ if (4 * node_cpus >= cache_cpus)
+ return 4;
+ else if (2 * node_cpus >= cache_cpus)
+ return 2;
+ return 1;
+}
+
/*
* get_cache_size - Get cache size for a specified CPU
* @cpu_no: CPU number
@@ -190,6 +245,8 @@ int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size)
break;
}
+ if (cache_num == 3)
+ *cache_size /= snc_ways();
return 0;
}
--
2.41.0
Hi Tony,
> Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
> nodes. Systems may support splitting into either two or four nodes.
>
> When SNC mode is enabled the effective amount of L3 cache available for
> allocation is divided by the number of nodes per L3.
>
> Detect which SNC mode is active by comparing the number of CPUs that share
> a cache with CPU0, with the number of CPUs on node0.
>
> This gives some hope of tests passing. But additional test infrastructure
> changes are needed to bind tests to nodes and guarantee memory allocation
> from the local node.
>
> Reported-by: "Shaopeng Tan (Fujitsu)" <tan.shaopeng@fujitsu.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
> tools/testing/selftests/resctrl/resctrl.h | 1 +
> tools/testing/selftests/resctrl/resctrlfs.c | 57
> +++++++++++++++++++++
> 2 files changed, 58 insertions(+)
>
> diff --git a/tools/testing/selftests/resctrl/resctrl.h
> b/tools/testing/selftests/resctrl/resctrl.h
> index 87e39456dee0..a8b43210b573 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -13,6 +13,7 @@
> #include <signal.h>
> #include <dirent.h>
> #include <stdbool.h>
> +#include <ctype.h>
> #include <sys/stat.h>
> #include <sys/ioctl.h>
> #include <sys/mount.h>
> diff --git a/tools/testing/selftests/resctrl/resctrlfs.c
> b/tools/testing/selftests/resctrl/resctrlfs.c
> index fb00245dee92..79eecbf9f863 100644
> --- a/tools/testing/selftests/resctrl/resctrlfs.c
> +++ b/tools/testing/selftests/resctrl/resctrlfs.c
> @@ -130,6 +130,61 @@ int get_resource_id(int cpu_no, int *resource_id)
> return 0;
> }
>
> +/*
> + * Count number of CPUs in a /sys bit map */ static int
> +count_sys_bitmap_bits(char *name) {
> + FILE *fp = fopen(name, "r");
> + int count = 0, c;
> +
> + if (!fp)
> + return 0;
> +
> + while ((c = fgetc(fp)) != EOF) {
> + if (!isxdigit(c))
> + continue;
> + switch (c) {
> + case 'f':
> + count++;
> + case '7': case 'b': case 'd': case 'e':
> + count++;
> + case '3': case '5': case '6': case '9': case 'a': case 'c':
> + count++;
> + case '1': case '2': case '4': case '8':
> + count++;
> + }
> + }
> + fclose(fp);
> +
> + return count;
> +}
> +
> +/*
> + * Detect SNC by compating #CPUs in node0 with #CPUs sharing LLC with
> +CPU0
> + * Try to get this right, even if a few CPUs are offline so that the
> +number
> + * of CPUs in node0 is not exactly half or a quarter of the CPUs
> +sharing the
> + * LLC of CPU0.
> + */
> +static int snc_ways(void)
> +{
> + int node_cpus, cache_cpus;
> +
> + node_cpus =
> count_sys_bitmap_bits("/sys/devices/system/node/node0/cpumap");
> + cache_cpus =
> +count_sys_bitmap_bits("/sys/devices/system/cpu/cpu0/cache/index3/sh
> ared
> +_cpu_map");
> +
> + if (!node_cpus || !cache_cpus) {
> + fprintf(stderr, "Warning could not determine Sub-NUMA
> Cluster mode\n");
> + return 1;
> + }
> +
> + if (4 * node_cpus >= cache_cpus)
> + return 4;
> + else if (2 * node_cpus >= cache_cpus)
> + return 2;
If "4 * node_cpus >= cache_cpus " is not true,
"2 * node_cpus >= cache_cpus" will never be true.
Is it the following code?
+ if (2 * node_cpus >= cache_cpus)
+ return 2;
+ else if (4 * node_cpus >= cache_cpus)
+ return 4;
Best regards,
Shaopeng TAN
> > + if (4 * node_cpus >= cache_cpus) > > + return 4; > > + else if (2 * node_cpus >= cache_cpus) > > + return 2; > > > If "4 * node_cpus >= cache_cpus " is not true, > "2 * node_cpus >= cache_cpus" will never be true. > Is it the following code? > > + if (2 * node_cpus >= cache_cpus) > + return 2; > + else if (4 * node_cpus >= cache_cpus) > + return 4; Shaopeng TAN, Good catch. Your solution is the correct one. Will fix in next post. -Tony
On 2023-09-07 at 16:19:37 +0000, Luck, Tony wrote: >> > + if (4 * node_cpus >= cache_cpus) >> > + return 4; >> > + else if (2 * node_cpus >= cache_cpus) >> > + return 2; >> >> >> If "4 * node_cpus >= cache_cpus " is not true, >> "2 * node_cpus >= cache_cpus" will never be true. >> Is it the following code? >> >> + if (2 * node_cpus >= cache_cpus) >> + return 2; >> + else if (4 * node_cpus >= cache_cpus) >> + return 4; > > >Shaopeng TAN, > >Good catch. Your solution is the correct one. > >Will fix in next post. I played around with this code a little and I think the logical expressions are returning wrong values. On a system that has SNC disabled the function reports both "node_cpus" and "cache_cpus" equal to 56. In this case snc_ways() returns "2". It is the same on a system with SNC enabled that reports the previously mentioned variables to be different by a factor of two (36 and 72). Is it possible for node_cpus and cache_cpus to not be multiples of each other? (as in for example cache_cpus being 10 and node_cpus being 21?). If not I'd suggest using "==" instead of ">=". If yes then I guess something like this could work? : + if (node_cpus >= cache_cpus) + return 1; + else if (2 * node_cpus >= cache_cpus) + return 2; + else if (4 * node_cpus >= cache_cpus) + return 4; PS. I did my tests on two Intel Ice Lakes. -- Kind regards Maciej Wieczór-Retman
> On a system that has SNC disabled the function reports both "node_cpus"
> and "cache_cpus" equal to 56. In this case snc_ways() returns "2". It is
> the same on a system with SNC enabled that reports the previously mentioned
> variables to be different by a factor of two (36 and 72).
> Is it possible for node_cpus and cache_cpus to not be multiples of each
> other? (as in for example cache_cpus being 10 and node_cpus being 21?).
> If not I'd suggest using "==" instead of ">=".
Some CPUs may be offline when the test is run. E.g. with one CPU offline on SNC
node 0, you'd see node_cpus = 35 and cache_cpus = 71. But with one CPU offline
on node 1, you'd have node_cpus = 36, cache_cpus = 71.
> If yes then I guess something like this could work? :
+ if (node_cpus >= cache_cpus)
+ return 1;
+ else if (2 * node_cpus >= cache_cpus)
+ return 2;
+ else if (4 * node_cpus >= cache_cpus)
+ return 4;
This returns "4" for the 36 71 case. But should still be "2".
>> PS. I did my tests on two Intel Ice Lakes.
Perhaps easier to play with the algorithm in user code?
#include <stdio.h>
#include <stdlib.h>
static int snc(int node_cpus, int cache_cpus)
{
if (node_cpus >= cache_cpus)
return 1;
else if (2 * node_cpus >= cache_cpus)
return 2;
else if (4 * node_cpus >= cache_cpus)
return 4;
return -1;
}
int main(int argc, char **argv)
{
printf("%d\n", snc(atoi(argv[1]), atoi(argv[2])));
return 0;
}
N.B. it's probably not possible to handle the case where somebody took ALL the CPUs in SNC
node 1 offline (or SNC nodes 1,2,3 for the SNC 4 case).
I think it reasonable that the code handle some simple "small number of CPUs offline" cases.
But don't worry too much about cases where the user has done something extreme.
-Tony
Hi, thanks for the reply
On 2023-09-19 at 14:36:06 +0000, Luck, Tony wrote:
>> On a system that has SNC disabled the function reports both "node_cpus"
>> and "cache_cpus" equal to 56. In this case snc_ways() returns "2". It is
>> the same on a system with SNC enabled that reports the previously mentioned
>> variables to be different by a factor of two (36 and 72).
>
>> Is it possible for node_cpus and cache_cpus to not be multiples of each
>> other? (as in for example cache_cpus being 10 and node_cpus being 21?).
>> If not I'd suggest using "==" instead of ">=".
>
>Some CPUs may be offline when the test is run. E.g. with one CPU offline on SNC
>node 0, you'd see node_cpus = 35 and cache_cpus = 71. But with one CPU offline
>on node 1, you'd have node_cpus = 36, cache_cpus = 71.
Okay, thanks, good to know. On systems with disabled SNC that number
should be equal even if some CPUs were offline, right? I was mostly
concerned that the previous version was returning the same number
whether SNC was enabled with 2 nodes or disabled.
>> If yes then I guess something like this could work? :
>
>+ if (node_cpus >= cache_cpus)
>+ return 1;
>+ else if (2 * node_cpus >= cache_cpus)
>+ return 2;
>+ else if (4 * node_cpus >= cache_cpus)
>+ return 4;
>
>This returns "4" for the 36 71 case. But should still be "2".
>
>>> PS. I did my tests on two Intel Ice Lakes.
>
>Perhaps easier to play with the algorithm in user code?
>
>#include <stdio.h>
>#include <stdlib.h>
>
>static int snc(int node_cpus, int cache_cpus)
>{
> if (node_cpus >= cache_cpus)
> return 1;
> else if (2 * node_cpus >= cache_cpus)
> return 2;
> else if (4 * node_cpus >= cache_cpus)
> return 4;
> return -1;
>}
>
>int main(int argc, char **argv)
>{
> printf("%d\n", snc(atoi(argv[1]), atoi(argv[2])));
>
> return 0;
>}
My previous understanding was that the presence of ">=" comparison
implied the number of node_cpus could somehow get larger. So I
assumed that keeping it that way would be sufficient but now I can see
that wouldn't be the case.
>
>N.B. it's probably not possible to handle the case where somebody took ALL the CPUs in SNC
>node 1 offline (or SNC nodes 1,2,3 for the SNC 4 case).
>
>I think it reasonable that the code handle some simple "small number of CPUs offline" cases.
>But don't worry too much about cases where the user has done something extreme.
>
>-Tony
What about outputing this value to userspace from resctrl? The ratio is
already saved inside snc_nodes_per_l3_cache variable. And that would
help avoid these difficult cases when some cpus are offline which could
cause snc_ways() to return a wrong value. Or are there some pitfalls
to that approach?
--
Kind regards
Maciej Wieczór-Retman
> What about outputing this value to userspace from resctrl? The ratio is > already saved inside snc_nodes_per_l3_cache variable. And that would > help avoid these difficult cases when some cpus are offline which could > cause snc_ways() to return a wrong value. Or are there some pitfalls > to that approach? My original patch series added an "snc_ways" file to the info/ directory to make this visible. But I was talked out of it because of a lack of clear user mode use case that needs it. https://lore.kernel.org/all/f0841866-315b-4727-0a6c-ec60d22ca29c@arm.com/ I don't know if the resctrl self tests constitute a valid use case. Perhaps not as they can figure this out. But ... maybe the difficulties that user mode has (because of possibly offline CPUs) are an indicator that the kernel should expose this. If only to make the kernel's view clear in case there are situations where it got the calculation wrong. E.g. kernel booted with a maxcpus=N parameter. -Tony
Hello,
On 2023-08-29 at 16:44:26 -0700, Tony Luck wrote:
>Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
>nodes. Systems may support splitting into either two or four nodes.
>
>When SNC mode is enabled the effective amount of L3 cache available
>for allocation is divided by the number of nodes per L3.
>
>Detect which SNC mode is active by comparing the number of CPUs
>that share a cache with CPU0, with the number of CPUs on node0.
>
>This gives some hope of tests passing. But additional test
>infrastructure changes are needed to bind tests to nodes and
>guarantee memory allocation from the local node.
>
>Reported-by: "Shaopeng Tan (Fujitsu)" <tan.shaopeng@fujitsu.com>
>Signed-off-by: Tony Luck <tony.luck@intel.com>
>---
> tools/testing/selftests/resctrl/resctrl.h | 1 +
> tools/testing/selftests/resctrl/resctrlfs.c | 57 +++++++++++++++++++++
> 2 files changed, 58 insertions(+)
>
>diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
>index 87e39456dee0..a8b43210b573 100644
>--- a/tools/testing/selftests/resctrl/resctrl.h
>+++ b/tools/testing/selftests/resctrl/resctrl.h
>@@ -13,6 +13,7 @@
> #include <signal.h>
> #include <dirent.h>
> #include <stdbool.h>
>+#include <ctype.h>
> #include <sys/stat.h>
> #include <sys/ioctl.h>
> #include <sys/mount.h>
>diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
>index fb00245dee92..79eecbf9f863 100644
>--- a/tools/testing/selftests/resctrl/resctrlfs.c
>+++ b/tools/testing/selftests/resctrl/resctrlfs.c
>@@ -130,6 +130,61 @@ int get_resource_id(int cpu_no, int *resource_id)
> return 0;
> }
>
>+/*
>+ * Count number of CPUs in a /sys bit map
>+ */
>+static int count_sys_bitmap_bits(char *name)
>+{
>+ FILE *fp = fopen(name, "r");
>+ int count = 0, c;
>+
>+ if (!fp)
>+ return 0;
>+
>+ while ((c = fgetc(fp)) != EOF) {
>+ if (!isxdigit(c))
>+ continue;
>+ switch (c) {
>+ case 'f':
>+ count++;
>+ case '7': case 'b': case 'd': case 'e':
>+ count++;
>+ case '3': case '5': case '6': case '9': case 'a': case 'c':
>+ count++;
>+ case '1': case '2': case '4': case '8':
>+ count++;
>+ }
>+ }
>+ fclose(fp);
>+
>+ return count;
>+}
>+
The resctrl selftest has a function for counting bits, could it be used
here instead of the switch statement like this for example?
count = count_bits(c);
Or is there some reason this wouldn't be a good fit here?
--
Kind regards
Maciej Wieczór-Retman
> >+static int count_sys_bitmap_bits(char *name)
> >+{
> >+ FILE *fp = fopen(name, "r");
> >+ int count = 0, c;
> >+
> >+ if (!fp)
> >+ return 0;
> >+
> >+ while ((c = fgetc(fp)) != EOF) {
> >+ if (!isxdigit(c))
> >+ continue;
> >+ switch (c) {
> >+ case 'f':
> >+ count++;
> >+ case '7': case 'b': case 'd': case 'e':
> >+ count++;
> >+ case '3': case '5': case '6': case '9': case 'a': case 'c':
> >+ count++;
> >+ case '1': case '2': case '4': case '8':
> >+ count++;
> >+ }
> >+ }
> >+ fclose(fp);
> >+
> >+ return count;
> >+}
> >+
>
> The resctrl selftest has a function for counting bits, could it be used
> here instead of the switch statement like this for example?
>
> count = count_bits(c);
>
> Or is there some reason this wouldn't be a good fit here?
Thanks for looking at my patch.
That count_bits() function is doing so with input from an "unsigned long"
argument. My function is parsing the string result from a sysfs file which
might look like this:
$ cat shared_cpu_map
0000,00000fff,ffffff00,0000000f,ffffffff
To use count_bits() I'd have to use something like strtol() on each of the
comma separated fields first to convert from ascii strings to binary
values to feed into count_bits().
-Tony
On 2023-08-30 at 15:43:05 +0000, Luck, Tony wrote:
>> >+static int count_sys_bitmap_bits(char *name)
>> >+{
>> >+ FILE *fp = fopen(name, "r");
>> >+ int count = 0, c;
>> >+
>> >+ if (!fp)
>> >+ return 0;
>> >+
>> >+ while ((c = fgetc(fp)) != EOF) {
>> >+ if (!isxdigit(c))
>> >+ continue;
>> >+ switch (c) {
>> >+ case 'f':
>> >+ count++;
>> >+ case '7': case 'b': case 'd': case 'e':
>> >+ count++;
>> >+ case '3': case '5': case '6': case '9': case 'a': case 'c':
>> >+ count++;
>> >+ case '1': case '2': case '4': case '8':
>> >+ count++;
>> >+ }
>> >+ }
>> >+ fclose(fp);
>> >+
>> >+ return count;
>> >+}
>> >+
>>
>> The resctrl selftest has a function for counting bits, could it be used
>> here instead of the switch statement like this for example?
>>
>> count = count_bits(c);
>>
>> Or is there some reason this wouldn't be a good fit here?
>
>Thanks for looking at my patch.
>
>That count_bits() function is doing so with input from an "unsigned long"
>argument. My function is parsing the string result from a sysfs file which
>might look like this:
>
>$ cat shared_cpu_map
>0000,00000fff,ffffff00,0000000f,ffffffff
>
>To use count_bits() I'd have to use something like strtol() on each of the
>comma separated fields first to convert from ascii strings to binary
>values to feed into count_bits().
I missed they are being read as characters and not bytes, sorry.
Out of curiosity, what about using fscanf instead of fgetc? With the
format being %x and reading one byte at the time. Then instead of
isxdigit just checking if the read number was bigger than 0xF.
I also remembered there is a gcc (and I think clang has it as well)
builtin function that returns the number of set bits in a number.
So it would look like this:
while ((fscanf(fp, "%x", c)) != EOF ) {
if (c > 0xF)
continue;
count = __builtin_popcount(c);
}
Are there some problems with an approach like that?
--
Kind regards
Maciej Wieczór-Retman
> So it would look like this:
>
> while ((fscanf(fp, "%x", c)) != EOF ) {
> if (c > 0xF)
> continue;
> count = __builtin_popcount(c);
> }
>
> Are there some problems with an approach like that?
I think I'd prefer something that does more checking on the input
(e.g. the hex numbers are separated by "," and terminated with
a '\n').
If Shuah Khan doesn't like my original patch I can re-write
to use fscanf() et. al.
-Tony
© 2016 - 2025 Red Hat, Inc.