[v1] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

[PATCH] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

Posted by Chun-Tse Shao 1 month ago

Like GNR [1], Emeraldrapids also supports sub-NUMA clusters (SNC).
Adjust the cpumasks using the same logic that [1] added for GNR.

Tested on Emeraldrapids with SNC2 enabled:
  $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1

   Performance counter stats for 'system wide':

  N0       30        72125876670      UNC_CHA_CLOCKTICKS
  N0        4         8815163648      UNC_M_CLOCKTICKS
  N1       30        72124958844      UNC_CHA_CLOCKTICKS
  N1        4         8815014974      UNC_M_CLOCKTICKS
  N2       30        72121049022      UNC_CHA_CLOCKTICKS
  N2        4         8814592626      UNC_M_CLOCKTICKS
  N3       30        72117133854      UNC_CHA_CLOCKTICKS
  N3        4         8814012840      UNC_M_CLOCKTICKS

         1.001574118 seconds time elapsed

[1] lore.kernel.org/20250515181417.491401-1-irogers@google.com

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index a3f96221758d..fad68a0f7b5d 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -22,20 +22,29 @@
 #include "util/env.h"
 #include "util/header.h"
 
-static bool x86__is_intel_graniterapids(void)
+static bool x86__is_snc_supported(void)
 {
-	static bool checked_if_graniterapids;
-	static bool is_graniterapids;
+	static bool checked_if_snc_supported;
+	static bool is_supported;
 
-	if (!checked_if_graniterapids) {
-		const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
+	if (!checked_if_snc_supported) {
+
+		/* Emeraldrapids and Graniterapids support SNC configuration. */
+		static const char *const supported_cpuids[] = {
+			"GenuineIntel-6-CF", /* Emeraldrapids */
+			"GenuineIntel-6-A[DE]", /* Graniterapids */
+		};
 		char *cpuid = get_cpuid_str((struct perf_cpu){0});
 
-		is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
+		for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
+			is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
+			if (is_supported)
+				break;
+		}
 		free(cpuid);
-		checked_if_graniterapids = true;
+		checked_if_snc_supported = true;
 	}
-	return is_graniterapids;
+	return checked_if_snc_supported;
 }
 
 static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
@@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
 			read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
 
 		snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
+
 		perf_cpu_map__put(cache_cpus);
 		perf_cpu_map__put(node_cpus);
 		checked_snc = true;
@@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
 	// Compute the IMC SNC using lookup tables.
 	unsigned int imc_num;
 	int snc_nodes = snc_nodes_per_l3_cache();
-	const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
-	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
+	const u8 snc2_map[] = {0, 0, 1, 1};
+	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
 	const u8 *snc_map;
 	size_t snc_map_len;
 
@@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
 		pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
 		return 0;
 	}
-	if (imc_num >= snc_map_len) {
+	if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
 		pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
 		return 0;
 	}
-	return snc_map[imc_num];
+	return snc_map[imc_num % snc_map_len];
 }
 
 static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
@@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
 	return cpu_adjust[pmu_snc];
 }
 
-static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
+static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
 {
 	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
 	// topology. For example, a two socket graniterapids machine may be set
@@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
 				pmu->mem_events = perf_mem_events_intel_aux;
 			else
 				pmu->mem_events = perf_mem_events_intel;
-		} else if (x86__is_intel_graniterapids()) {
+		} else if (x86__is_snc_supported()) {
 			if (starts_with(pmu->name, "uncore_cha_"))
-				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
-			else if (starts_with(pmu->name, "uncore_imc_"))
-				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
+				uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
+			else if (starts_with(pmu->name, "uncore_imc_") &&
+				 !starts_with(pmu->name, "uncore_imc_free_running"))
+				uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
 		}
 	}
 }
-- 
2.52.0.457.g6b5491de43-goog

Re: [PATCH] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

Posted by Ian Rogers 1 month ago

On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>
> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
> Adjust cpumasks as the logic for GNR in [1].
>
> Tested on Emeraldrapids with SNC2 enabled:
>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>
>    Performance counter stats for 'system wide':
>
>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>   N0        4         8815163648      UNC_M_CLOCKTICKS
>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>   N1        4         8815014974      UNC_M_CLOCKTICKS
>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>   N2        4         8814592626      UNC_M_CLOCKTICKS
>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>   N3        4         8814012840      UNC_M_CLOCKTICKS
>
>          1.001574118 seconds time elapsed
>
> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>
> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> ---
>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>  1 file changed, 28 insertions(+), 17 deletions(-)
>
> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> index a3f96221758d..fad68a0f7b5d 100644
> --- a/tools/perf/arch/x86/util/pmu.c
> +++ b/tools/perf/arch/x86/util/pmu.c
> @@ -22,20 +22,29 @@
>  #include "util/env.h"
>  #include "util/header.h"
>
> -static bool x86__is_intel_graniterapids(void)
> +static bool x86__is_snc_supported(void)
>  {
> -       static bool checked_if_graniterapids;
> -       static bool is_graniterapids;
> +       static bool checked_if_snc_supported;
> +       static bool is_supported;
>
> -       if (!checked_if_graniterapids) {
> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
> +       if (!checked_if_snc_supported) {
> +
> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
> +               static const char *const supported_cpuids[] = {
> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
> +               };
>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>
> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
> +                       if (is_supported)
> +                               break;
> +               }
>                 free(cpuid);
> -               checked_if_graniterapids = true;
> +               checked_if_snc_supported = true;
>         }
> -       return is_graniterapids;
> +       return checked_if_snc_supported;
>  }
>
>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>
>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
> +
>                 perf_cpu_map__put(cache_cpus);
>                 perf_cpu_map__put(node_cpus);
>                 checked_snc = true;
> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>         // Compute the IMC SNC using lookup tables.
>         unsigned int imc_num;
>         int snc_nodes = snc_nodes_per_l3_cache();
> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> +       const u8 snc2_map[] = {0, 0, 1, 1};

Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.

Thanks,
Ian

> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>         const u8 *snc_map;
>         size_t snc_map_len;
>
> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>                 return 0;
>         }
> -       if (imc_num >= snc_map_len) {
> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>                 return 0;
>         }
> -       return snc_map[imc_num];
> +       return snc_map[imc_num % snc_map_len];
>  }
>
>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>         return cpu_adjust[pmu_snc];
>  }
>
> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>  {
>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>         // topology. For example, a two socket graniterapids machine may be set
> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>                                 pmu->mem_events = perf_mem_events_intel_aux;
>                         else
>                                 pmu->mem_events = perf_mem_events_intel;
> -               } else if (x86__is_intel_graniterapids()) {
> +               } else if (x86__is_snc_supported()) {
>                         if (starts_with(pmu->name, "uncore_cha_"))
> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> -                       else if (starts_with(pmu->name, "uncore_imc_"))
> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>                 }
>         }
>  }
> --
> 2.52.0.457.g6b5491de43-goog
>

Re: [PATCH] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

Posted by Chun-Tse Shao 3 weeks, 4 days ago

Ping.

Thanks for your comment, Ian. To the Intel team: can we get confirmation
of the GNR SNC2 configuration?

-CT

On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>
> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
> >
> > Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
> > Adjust cpumasks as the logic for GNR in [1].
> >
> > Tested on Emeraldrapids with SNC2 enabled:
> >   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
> >
> >    Performance counter stats for 'system wide':
> >
> >   N0       30        72125876670      UNC_CHA_CLOCKTICKS
> >   N0        4         8815163648      UNC_M_CLOCKTICKS
> >   N1       30        72124958844      UNC_CHA_CLOCKTICKS
> >   N1        4         8815014974      UNC_M_CLOCKTICKS
> >   N2       30        72121049022      UNC_CHA_CLOCKTICKS
> >   N2        4         8814592626      UNC_M_CLOCKTICKS
> >   N3       30        72117133854      UNC_CHA_CLOCKTICKS
> >   N3        4         8814012840      UNC_M_CLOCKTICKS
> >
> >          1.001574118 seconds time elapsed
> >
> > [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
> >
> > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > ---
> >  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
> >  1 file changed, 28 insertions(+), 17 deletions(-)
> >
> > diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> > index a3f96221758d..fad68a0f7b5d 100644
> > --- a/tools/perf/arch/x86/util/pmu.c
> > +++ b/tools/perf/arch/x86/util/pmu.c
> > @@ -22,20 +22,29 @@
> >  #include "util/env.h"
> >  #include "util/header.h"
> >
> > -static bool x86__is_intel_graniterapids(void)
> > +static bool x86__is_snc_supported(void)
> >  {
> > -       static bool checked_if_graniterapids;
> > -       static bool is_graniterapids;
> > +       static bool checked_if_snc_supported;
> > +       static bool is_supported;
> >
> > -       if (!checked_if_graniterapids) {
> > -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
> > +       if (!checked_if_snc_supported) {
> > +
> > +               /* Emeraldrapids and Graniterapids support SNC configuration. */
> > +               static const char *const supported_cpuids[] = {
> > +                       "GenuineIntel-6-CF", /* Emeraldrapids */
> > +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
> > +               };
> >                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
> >
> > -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
> > +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
> > +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
> > +                       if (is_supported)
> > +                               break;
> > +               }
> >                 free(cpuid);
> > -               checked_if_graniterapids = true;
> > +               checked_if_snc_supported = true;
> >         }
> > -       return is_graniterapids;
> > +       return checked_if_snc_supported;
> >  }
> >
> >  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
> > @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
> >                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
> >
> >                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
> > +
> >                 perf_cpu_map__put(cache_cpus);
> >                 perf_cpu_map__put(node_cpus);
> >                 checked_snc = true;
> > @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >         // Compute the IMC SNC using lookup tables.
> >         unsigned int imc_num;
> >         int snc_nodes = snc_nodes_per_l3_cache();
> > -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> > -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> > +       const u8 snc2_map[] = {0, 0, 1, 1};
>
> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>
> Thanks,
> Ian
>
> > +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
> >         const u8 *snc_map;
> >         size_t snc_map_len;
> >
> > @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
> >                 return 0;
> >         }
> > -       if (imc_num >= snc_map_len) {
> > +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
> >                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
> >                 return 0;
> >         }
> > -       return snc_map[imc_num];
> > +       return snc_map[imc_num % snc_map_len];
> >  }
> >
> >  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> > @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> >         return cpu_adjust[pmu_snc];
> >  }
> >
> > -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> > +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> >  {
> >         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
> >         // topology. For example, a two socket graniterapids machine may be set
> > @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
> >                                 pmu->mem_events = perf_mem_events_intel_aux;
> >                         else
> >                                 pmu->mem_events = perf_mem_events_intel;
> > -               } else if (x86__is_intel_graniterapids()) {
> > +               } else if (x86__is_snc_supported()) {
> >                         if (starts_with(pmu->name, "uncore_cha_"))
> > -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> > -                       else if (starts_with(pmu->name, "uncore_imc_"))
> > -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> > +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> > +                       else if (starts_with(pmu->name, "uncore_imc_") &&
> > +                                !starts_with(pmu->name, "uncore_imc_free_running"))
> > +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> >                 }
> >         }
> >  }
> > --
> > 2.52.0.457.g6b5491de43-goog
> >

Re: [PATCH] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

Posted by Mi, Dapeng 2 weeks, 6 days ago

On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> Ping.
>
> Thanks for your comment, Ian. To Intel team, can we get confirmation
> of the GNR SNR2 configuration?

+ Zide

Zide will look at and verify the configuration. Thanks.


>
> -CT
>
> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
>>> Adjust cpumasks as the logic for GNR in [1].
>>>
>>> Tested on Emeraldrapids with SNC2 enabled:
>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>>>
>>>    Performance counter stats for 'system wide':
>>>
>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
>>>
>>>          1.001574118 seconds time elapsed
>>>
>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>>>
>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
>>> ---
>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>>>  1 file changed, 28 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
>>> index a3f96221758d..fad68a0f7b5d 100644
>>> --- a/tools/perf/arch/x86/util/pmu.c
>>> +++ b/tools/perf/arch/x86/util/pmu.c
>>> @@ -22,20 +22,29 @@
>>>  #include "util/env.h"
>>>  #include "util/header.h"
>>>
>>> -static bool x86__is_intel_graniterapids(void)
>>> +static bool x86__is_snc_supported(void)
>>>  {
>>> -       static bool checked_if_graniterapids;
>>> -       static bool is_graniterapids;
>>> +       static bool checked_if_snc_supported;
>>> +       static bool is_supported;
>>>
>>> -       if (!checked_if_graniterapids) {
>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
>>> +       if (!checked_if_snc_supported) {
>>> +
>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
>>> +               static const char *const supported_cpuids[] = {
>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
>>> +               };
>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>>>
>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
>>> +                       if (is_supported)
>>> +                               break;
>>> +               }
>>>                 free(cpuid);
>>> -               checked_if_graniterapids = true;
>>> +               checked_if_snc_supported = true;
>>>         }
>>> -       return is_graniterapids;
>>> +       return checked_if_snc_supported;
>>>  }
>>>
>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>>>
>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
>>> +
>>>                 perf_cpu_map__put(cache_cpus);
>>>                 perf_cpu_map__put(node_cpus);
>>>                 checked_snc = true;
>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>         // Compute the IMC SNC using lookup tables.
>>>         unsigned int imc_num;
>>>         int snc_nodes = snc_nodes_per_l3_cache();
>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>>
>> Thanks,
>> Ian
>>
>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>>>         const u8 *snc_map;
>>>         size_t snc_map_len;
>>>
>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>>>                 return 0;
>>>         }
>>> -       if (imc_num >= snc_map_len) {
>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>>>                 return 0;
>>>         }
>>> -       return snc_map[imc_num];
>>> +       return snc_map[imc_num % snc_map_len];
>>>  }
>>>
>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>         return cpu_adjust[pmu_snc];
>>>  }
>>>
>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>  {
>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>>>         // topology. For example, a two socket graniterapids machine may be set
>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
>>>                         else
>>>                                 pmu->mem_events = perf_mem_events_intel;
>>> -               } else if (x86__is_intel_graniterapids()) {
>>> +               } else if (x86__is_snc_supported()) {
>>>                         if (starts_with(pmu->name, "uncore_cha_"))
>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>                 }
>>>         }
>>>  }
>>> --
>>> 2.52.0.457.g6b5491de43-goog
>>>

Re: [PATCH] perf pmu intel: Adjust cpumasks for sub-NUMA clusters on Emeraldrapids

Posted by Chen, Zide 2 weeks, 2 days ago


On 1/18/2026 4:51 PM, Mi, Dapeng wrote:
> 
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
>> Ping.
>>
>> Thanks for your comment, Ian. To Intel team, can we get confirmation
>> of the GNR SNR2 configuration?
>>
>> -CT
>>
>> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
>>>> Adjust cpumasks as the logic for GNR in [1].
>>>>
>>>> Tested on Emeraldrapids with SNC2 enabled:
>>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>>>>
>>>>    Performance counter stats for 'system wide':
>>>>
>>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
>>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
>>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
>>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
>>>>
>>>>          1.001574118 seconds time elapsed
>>>>
>>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>>>>
>>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
>>>> ---
>>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>>>>  1 file changed, 28 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
>>>> index a3f96221758d..fad68a0f7b5d 100644
>>>> --- a/tools/perf/arch/x86/util/pmu.c
>>>> +++ b/tools/perf/arch/x86/util/pmu.c
>>>> @@ -22,20 +22,29 @@
>>>>  #include "util/env.h"
>>>>  #include "util/header.h"
>>>>
>>>> -static bool x86__is_intel_graniterapids(void)
>>>> +static bool x86__is_snc_supported(void)
>>>>  {
>>>> -       static bool checked_if_graniterapids;
>>>> -       static bool is_graniterapids;
>>>> +       static bool checked_if_snc_supported;
>>>> +       static bool is_supported;
>>>>
>>>> -       if (!checked_if_graniterapids) {
>>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
>>>> +       if (!checked_if_snc_supported) {
>>>> +
>>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
>>>> +               static const char *const supported_cpuids[] = {
>>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
>>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
>>>> +               };
>>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>>>>
>>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
>>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
>>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
>>>> +                       if (is_supported)
>>>> +                               break;
>>>> +               }
>>>>                 free(cpuid);
>>>> -               checked_if_graniterapids = true;
>>>> +               checked_if_snc_supported = true;
>>>>         }
>>>> -       return is_graniterapids;
>>>> +       return checked_if_snc_supported;
>>>>  }
>>>>
>>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
>>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>>>>
>>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
>>>> +
>>>>                 perf_cpu_map__put(cache_cpus);
>>>>                 perf_cpu_map__put(node_cpus);
>>>>                 checked_snc = true;
>>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>         // Compute the IMC SNC using lookup tables.
>>>>         unsigned int imc_num;
>>>>         int snc_nodes = snc_nodes_per_l3_cache();
>>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
>>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.

It appears to break GNR SNC2. While it works for the --per-node test, it
fails the following affinity test. Testing on EMR shows that it follows
the new lookup table. Should we use a model-specific lookup table here?

# Running workload on CPU0
$ taskset -c CPU0 stress-ng --vm 1 --vm-bytes 2G --vm-method all
--timeout 30s

# Profiling UNC_M_PRE_COUNT.ALL on all IMC boxes.
$ perf stat \
        -e uncore_imc_0/event=0x03,umask=0xFF/ \
        -e uncore_imc_1/event=0x03,umask=0xFF/ \
        -e uncore_imc_2/event=0x03,umask=0xFF/ \
        -e uncore_imc_3/event=0x03,umask=0xFF/ \
        -e uncore_imc_4/event=0x03,umask=0xFF/ \
        -e uncore_imc_5/event=0x03,umask=0xFF/ \
        -e uncore_imc_6/event=0x03,umask=0xFF/ \
        -e uncore_imc_7/event=0x03,umask=0xFF/ \
        -a -I 1000

This shows that the uncore_imc_[2|3|6|7] boxes are affinitized to CPU0.

     5.013638757          1,635,470      uncore_imc_0/event=0x03,umask=0xFF/
     5.013638757          1,638,157      uncore_imc_1/event=0x03,umask=0xFF/
     5.013638757         27,093,922      uncore_imc_2/event=0x03,umask=0xFF/
     5.013638757         27,025,980      uncore_imc_3/event=0x03,umask=0xFF/
     5.013638757          1,616,974      uncore_imc_4/event=0x03,umask=0xFF/
     5.013638757          1,627,251      uncore_imc_5/event=0x03,umask=0xFF/
     5.013638757         26,854,588      uncore_imc_6/event=0x03,umask=0xFF/
     5.013638757         26,974,506      uncore_imc_7/event=0x03,umask=0xFF/

Testing with additional CPUs confirms that the original GNR SNC2 lookup
table is correct.

                        CPU                     uncore_imc box
NUMA node0 CPU(s):      0-42,344-386            2 3 6 7
NUMA node1 CPU(s):      43-85,387-429           0 1 4 5
NUMA node2 CPU(s):      86-128,430-472          2 3 6 7
NUMA node3 CPU(s):      129-171,473-515         0 1 4 5
NUMA node4 CPU(s):      172-214,516-558         2 3 6 7
NUMA node5 CPU(s):      215-257,559-601         0 1 4 5
NUMA node6 CPU(s):      258-300,602-644         2 3 6 7
NUMA node7 CPU(s):      301-343,645-687         0 1 4 5


>>> Thanks,
>>> Ian
>>>
>>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>>>>         const u8 *snc_map;
>>>>         size_t snc_map_len;
>>>>
>>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>>>>                 return 0;
>>>>         }
>>>> -       if (imc_num >= snc_map_len) {
>>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>>>>                 return 0;
>>>>         }
>>>> -       return snc_map[imc_num];
>>>> +       return snc_map[imc_num % snc_map_len];
>>>>  }
>>>>
>>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>>         return cpu_adjust[pmu_snc];
>>>>  }
>>>>
>>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>>  {
>>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>>>>         // topology. For example, a two socket graniterapids machine may be set
>>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
>>>>                         else
>>>>                                 pmu->mem_events = perf_mem_events_intel;
>>>> -               } else if (x86__is_intel_graniterapids()) {
>>>> +               } else if (x86__is_snc_supported()) {
>>>>                         if (starts_with(pmu->name, "uncore_cha_"))
>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
>>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>>                 }
>>>>         }
>>>>  }
>>>> --
>>>> 2.52.0.457.g6b5491de43-goog
>>>>

Re: [PATCH] perf pmu intel: Adjust cpumaks for sub-NUMA clusters on Emeraldrapids

Posted by Chun-Tse Shao 2 weeks, 1 day ago

Thank you for the confirmation, Zide. I will submit another patch that
splits the SNC2 IMC map into separate EMR and GNR maps.

-CT

On Thu, Jan 22, 2026 at 2:06 PM Chen, Zide <zide.chen@intel.com> wrote:
>
>
>
> On 1/18/2026 4:51 PM, Mi, Dapeng wrote:
> >
> > On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> >> Ping.
> >>
> >> Thanks for your comment, Ian. To the Intel team: can we get confirmation
> >> of the GNR SNC2 configuration?
> >>
> >> -CT
> >>
> >> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
> >>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
> >>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
> >>>> Adjust cpumasks as the logic for GNR in [1].
> >>>>
> >>>> Tested on Emeraldrapids with SNC2 enabled:
> >>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
> >>>>
> >>>>    Performance counter stats for 'system wide':
> >>>>
> >>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
> >>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
> >>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
> >>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
> >>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
> >>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
> >>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
> >>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
> >>>>
> >>>>          1.001574118 seconds time elapsed
> >>>>
> >>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
> >>>>
> >>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> >>>> ---
> >>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
> >>>>  1 file changed, 28 insertions(+), 17 deletions(-)
> >>>>
> >>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> >>>> index a3f96221758d..fad68a0f7b5d 100644
> >>>> --- a/tools/perf/arch/x86/util/pmu.c
> >>>> +++ b/tools/perf/arch/x86/util/pmu.c
> >>>> @@ -22,20 +22,29 @@
> >>>>  #include "util/env.h"
> >>>>  #include "util/header.h"
> >>>>
> >>>> -static bool x86__is_intel_graniterapids(void)
> >>>> +static bool x86__is_snc_supported(void)
> >>>>  {
> >>>> -       static bool checked_if_graniterapids;
> >>>> -       static bool is_graniterapids;
> >>>> +       static bool checked_if_snc_supported;
> >>>> +       static bool is_supported;
> >>>>
> >>>> -       if (!checked_if_graniterapids) {
> >>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
> >>>> +       if (!checked_if_snc_supported) {
> >>>> +
> >>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
> >>>> +               static const char *const supported_cpuids[] = {
> >>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
> >>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
> >>>> +               };
> >>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
> >>>>
> >>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
> >>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
> >>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
> >>>> +                       if (is_supported)
> >>>> +                               break;
> >>>> +               }
> >>>>                 free(cpuid);
> >>>> -               checked_if_graniterapids = true;
> >>>> +               checked_if_snc_supported = true;
> >>>>         }
> >>>> -       return is_graniterapids;
> >>>> +       return checked_if_snc_supported;
> >>>>  }
> >>>>
> >>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
> >>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
> >>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
> >>>>
> >>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
> >>>> +
> >>>>                 perf_cpu_map__put(cache_cpus);
> >>>>                 perf_cpu_map__put(node_cpus);
> >>>>                 checked_snc = true;
> >>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >>>>         // Compute the IMC SNC using lookup tables.
> >>>>         unsigned int imc_num;
> >>>>         int snc_nodes = snc_nodes_per_l3_cache();
> >>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> >>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> >>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
> >>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>
> It appears to break GNR SNC2. While it works for the --per-node test, it
> fails the following affinity test. Testing on EMR shows that it follows
> the new lookup table. Should we use a model-specific lookup table here?
>
> # Running workload on CPU0
> $ taskset -c CPU0 stress-ng --vm 1 --vm-bytes 2G --vm-method all \
>         --timeout 30s
>
> # Profiling UNC_M_PRE_COUNT.ALL on all IMC boxes.
> $ perf stat \
>         -e uncore_imc_0/event=0x03,umask=0xFF/ \
>         -e uncore_imc_1/event=0x03,umask=0xFF/ \
>         -e uncore_imc_2/event=0x03,umask=0xFF/ \
>         -e uncore_imc_3/event=0x03,umask=0xFF/ \
>         -e uncore_imc_4/event=0x03,umask=0xFF/ \
>         -e uncore_imc_5/event=0x03,umask=0xFF/ \
>         -e uncore_imc_6/event=0x03,umask=0xFF/ \
>         -e uncore_imc_7/event=0x03,umask=0xFF/ \
>         -a -I 1000
>
> This shows that the uncore_imc_[2|3|6|7] boxes are affinitized to CPU0.
>
>      5.013638757          1,635,470      uncore_imc_0/event=0x03,umask=0xFF/
>      5.013638757          1,638,157      uncore_imc_1/event=0x03,umask=0xFF/
>      5.013638757         27,093,922      uncore_imc_2/event=0x03,umask=0xFF/
>      5.013638757         27,025,980      uncore_imc_3/event=0x03,umask=0xFF/
>      5.013638757          1,616,974      uncore_imc_4/event=0x03,umask=0xFF/
>      5.013638757          1,627,251      uncore_imc_5/event=0x03,umask=0xFF/
>      5.013638757         26,854,588      uncore_imc_6/event=0x03,umask=0xFF/
>      5.013638757         26,974,506      uncore_imc_7/event=0x03,umask=0xFF/
>
> Testing with additional CPUs confirms that the original GNR SNC2 lookup
> table is correct.
>
>                         CPU                     uncore_imc box
> NUMA node0 CPU(s):      0-42,344-386            2 3 6 7
> NUMA node1 CPU(s):      43-85,387-429           0 1 4 5
> NUMA node2 CPU(s):      86-128,430-472          2 3 6 7
> NUMA node3 CPU(s):      129-171,473-515         0 1 4 5
> NUMA node4 CPU(s):      172-214,516-558         2 3 6 7
> NUMA node5 CPU(s):      215-257,559-601         0 1 4 5
> NUMA node6 CPU(s):      258-300,602-644         2 3 6 7
> NUMA node7 CPU(s):      301-343,645-687         0 1 4 5
>
>
> >>> Thanks,
> >>> Ian
> >>>
> >>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
> >>>>         const u8 *snc_map;
> >>>>         size_t snc_map_len;
> >>>>
> >>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
> >>>>                 return 0;
> >>>>         }
> >>>> -       if (imc_num >= snc_map_len) {
> >>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
> >>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
> >>>>                 return 0;
> >>>>         }
> >>>> -       return snc_map[imc_num];
> >>>> +       return snc_map[imc_num % snc_map_len];
> >>>>  }
> >>>>
> >>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> >>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> >>>>         return cpu_adjust[pmu_snc];
> >>>>  }
> >>>>
> >>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> >>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> >>>>  {
> >>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
> >>>>         // topology. For example, a two socket graniterapids machine may be set
> >>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
> >>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
> >>>>                         else
> >>>>                                 pmu->mem_events = perf_mem_events_intel;
> >>>> -               } else if (x86__is_intel_graniterapids()) {
> >>>> +               } else if (x86__is_snc_supported()) {
> >>>>                         if (starts_with(pmu->name, "uncore_cha_"))
> >>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> >>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
> >>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> >>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> >>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
> >>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
> >>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> >>>>                 }
> >>>>         }
> >>>>  }
> >>>> --
> >>>> 2.52.0.457.g6b5491de43-goog
> >>>>
>

Re: [PATCH] perf pmu intel: Adjust cpumaks for sub-NUMA clusters on Emeraldrapids

Posted by Chen, Zide 2 weeks, 2 days ago


On 1/18/2026 4:51 PM, Mi, Dapeng wrote:
> 
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
>> Ping.
>>
>> Thanks for your comment, Ian. To the Intel team: can we get confirmation
>> of the GNR SNC2 configuration?
>>
>> -CT
>>
>> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
>>>> Adjust cpumasks as the logic for GNR in [1].
>>>>
>>>> Tested on Emeraldrapids with SNC2 enabled:
>>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>>>>
>>>>    Performance counter stats for 'system wide':
>>>>
>>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
>>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
>>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
>>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
>>>>
>>>>          1.001574118 seconds time elapsed
>>>>
>>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>>>>
>>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
>>>> ---
>>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>>>>  1 file changed, 28 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
>>>> index a3f96221758d..fad68a0f7b5d 100644
>>>> --- a/tools/perf/arch/x86/util/pmu.c
>>>> +++ b/tools/perf/arch/x86/util/pmu.c
>>>> @@ -22,20 +22,29 @@
>>>>  #include "util/env.h"
>>>>  #include "util/header.h"
>>>>
>>>> -static bool x86__is_intel_graniterapids(void)
>>>> +static bool x86__is_snc_supported(void)
>>>>  {
>>>> -       static bool checked_if_graniterapids;
>>>> -       static bool is_graniterapids;
>>>> +       static bool checked_if_snc_supported;
>>>> +       static bool is_supported;
>>>>
>>>> -       if (!checked_if_graniterapids) {
>>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
>>>> +       if (!checked_if_snc_supported) {
>>>> +
>>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
>>>> +               static const char *const supported_cpuids[] = {
>>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
>>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
>>>> +               };
>>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>>>>
>>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
>>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
>>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
>>>> +                       if (is_supported)
>>>> +                               break;
>>>> +               }
>>>>                 free(cpuid);
>>>> -               checked_if_graniterapids = true;
>>>> +               checked_if_snc_supported = true;
>>>>         }
>>>> -       return is_graniterapids;
>>>> +       return checked_if_snc_supported;
>>>>  }
>>>>
>>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
>>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>>>>
>>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
>>>> +
>>>>                 perf_cpu_map__put(cache_cpus);
>>>>                 perf_cpu_map__put(node_cpus);
>>>>                 checked_snc = true;
>>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>         // Compute the IMC SNC using lookup tables.
>>>>         unsigned int imc_num;
>>>>         int snc_nodes = snc_nodes_per_l3_cache();
>>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
>>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.

It appears to break GNR SNC2. While it works for the --per-node test, it
fails the following affinity test. Testing on EMR shows that it follows
the new lookup table. Should we use a model-specific lookup table here?

$ taskset -c CPU0 stress-ng --vm 1 --vm-bytes 2G --vm-method all \
        --timeout 30s

# UNC_M_PRE_COUNT.ALL
$ perf stat \
        -e uncore_imc_0/event=0x03,umask=0xFF/ \
        -e uncore_imc_1/event=0x03,umask=0xFF/ \
        -e uncore_imc_2/event=0x03,umask=0xFF/ \
        -e uncore_imc_3/event=0x03,umask=0xFF/ \
        -e uncore_imc_4/event=0x03,umask=0xFF/ \
        -e uncore_imc_5/event=0x03,umask=0xFF/ \
        -e uncore_imc_6/event=0x03,umask=0xFF/ \
        -e uncore_imc_7/event=0x03,umask=0xFF/ \
        -a -I 1000

This shows that the uncore_imc_[2|3|6|7] boxes are affinitized to CPU0.

     6.015968927          1,598,800      uncore_imc_0/event=0x03,umask=0xFF/
     6.015968927          1,605,301      uncore_imc_1/event=0x03,umask=0xFF/
     6.015968927         20,252,028      uncore_imc_2/event=0x03,umask=0xFF/
     6.015968927         20,256,187      uncore_imc_3/event=0x03,umask=0xFF/
     6.015968927          1,594,551      uncore_imc_4/event=0x03,umask=0xFF/
     6.015968927          1,598,350      uncore_imc_5/event=0x03,umask=0xFF/
     6.015968927         20,185,615      uncore_imc_6/event=0x03,umask=0xFF/
     6.015968927         20,128,015      uncore_imc_7/event=0x03,umask=0xFF/

Testing with additional CPUs confirms that the original GNR SNC2 lookup
table is correct.

                        CPU                     uncore_imc_n
NUMA node0 CPU(s):      0-42,344-386            2 3 6 7
NUMA node1 CPU(s):      43-85,387-429           0 1 4 5
NUMA node2 CPU(s):      86-128,430-472          2 3 6 7
NUMA node3 CPU(s):      129-171,473-515         0 1 4 5
NUMA node4 CPU(s):      172-214,516-558         2 3 6 7
NUMA node5 CPU(s):      215-257,559-601         0 1 4 5
NUMA node6 CPU(s):      258-300,602-644         2 3 6 7
NUMA node7 CPU(s):      301-343,645-687         0 1 4 5

>>>
>>> Thanks,
>>> Ian
>>>
>>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>>>>         const u8 *snc_map;
>>>>         size_t snc_map_len;
>>>>
>>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>>>>                 return 0;
>>>>         }
>>>> -       if (imc_num >= snc_map_len) {
>>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>>>>                 return 0;
>>>>         }
>>>> -       return snc_map[imc_num];
>>>> +       return snc_map[imc_num % snc_map_len];
>>>>  }
>>>>
>>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>>         return cpu_adjust[pmu_snc];
>>>>  }
>>>>
>>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>>  {
>>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>>>>         // topology. For example, a two socket graniterapids machine may be set
>>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
>>>>                         else
>>>>                                 pmu->mem_events = perf_mem_events_intel;
>>>> -               } else if (x86__is_intel_graniterapids()) {
>>>> +               } else if (x86__is_snc_supported()) {
>>>>                         if (starts_with(pmu->name, "uncore_cha_"))
>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
>>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>>                 }
>>>>         }
>>>>  }
>>>> --
>>>> 2.52.0.457.g6b5491de43-goog
>>>>

Re: [PATCH] perf pmu intel: Adjust cpumaks for sub-NUMA clusters on Emeraldrapids

Posted by Mi, Dapeng 3 weeks, 4 days ago

On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> Ping.
>
> Thanks for your comment, Ian. To the Intel team: can we get confirmation
> of the GNR SNC2 configuration?

It seems you missed Ian's comments. 

I have the same question as Ian: the SNC map for uncore IMC is changed. It
may not change the total count of uncore_imc events, but it would change
the count of the uncore_imc events for a specific SNC node. What's the
reason for the change?

I have no GNR/EMR machine on hand, so I can't check how the SNC nodes are
mapped on these two machines.


>
> -CT
>
> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
>>> Adjust cpumasks as the logic for GNR in [1].
>>>
>>> Tested on Emeraldrapids with SNC2 enabled:
>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>>>
>>>    Performance counter stats for 'system wide':
>>>
>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
>>>
>>>          1.001574118 seconds time elapsed
>>>
>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>>>
>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
>>> ---
>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>>>  1 file changed, 28 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
>>> index a3f96221758d..fad68a0f7b5d 100644
>>> --- a/tools/perf/arch/x86/util/pmu.c
>>> +++ b/tools/perf/arch/x86/util/pmu.c
>>> @@ -22,20 +22,29 @@
>>>  #include "util/env.h"
>>>  #include "util/header.h"
>>>
>>> -static bool x86__is_intel_graniterapids(void)
>>> +static bool x86__is_snc_supported(void)
>>>  {
>>> -       static bool checked_if_graniterapids;
>>> -       static bool is_graniterapids;
>>> +       static bool checked_if_snc_supported;
>>> +       static bool is_supported;
>>>
>>> -       if (!checked_if_graniterapids) {
>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
>>> +       if (!checked_if_snc_supported) {
>>> +
>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
>>> +               static const char *const supported_cpuids[] = {
>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
>>> +               };
>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>>>
>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
>>> +                       if (is_supported)
>>> +                               break;
>>> +               }
>>>                 free(cpuid);
>>> -               checked_if_graniterapids = true;
>>> +               checked_if_snc_supported = true;
>>>         }
>>> -       return is_graniterapids;
>>> +       return checked_if_snc_supported;
>>>  }
>>>
>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>>>
>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
>>> +
>>>                 perf_cpu_map__put(cache_cpus);
>>>                 perf_cpu_map__put(node_cpus);
>>>                 checked_snc = true;
>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>         // Compute the IMC SNC using lookup tables.
>>>         unsigned int imc_num;
>>>         int snc_nodes = snc_nodes_per_l3_cache();
>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>>
>> Thanks,
>> Ian
>>
>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>>>         const u8 *snc_map;
>>>         size_t snc_map_len;
>>>
>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>>>                 return 0;
>>>         }
>>> -       if (imc_num >= snc_map_len) {
>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>>>                 return 0;
>>>         }
>>> -       return snc_map[imc_num];
>>> +       return snc_map[imc_num % snc_map_len];
>>>  }
>>>
>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>         return cpu_adjust[pmu_snc];
>>>  }
>>>
>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>  {
>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>>>         // topology. For example, a two socket graniterapids machine may be set
>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
>>>                         else
>>>                                 pmu->mem_events = perf_mem_events_intel;
>>> -               } else if (x86__is_intel_graniterapids()) {
>>> +               } else if (x86__is_snc_supported()) {
>>>                         if (starts_with(pmu->name, "uncore_cha_"))
>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>                 }
>>>         }
>>>  }
>>> --
>>> 2.52.0.457.g6b5491de43-goog
>>>

Re: [PATCH] perf pmu intel: Adjust cpumaks for sub-NUMA clusters on Emeraldrapids

Posted by Chun-Tse Shao 3 weeks, 3 days ago

Hi Dapeng,

I set the SNC2 map to {0, 0, 1, 1} because that is the mapping my EMR
test machine shows with SNC2 enabled. If that is correct, I wonder whether
the SNC2 map for GNR is still {1, 1, 0, 0}.

Thanks,
CT

On Tue, Jan 13, 2026 at 4:42 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote:
>
>
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> > Ping.
> >
> > Thanks for your comment, Ian. To the Intel team: can we get confirmation
> > of the GNR SNC2 configuration?
>
> It seems you missed Ian's comments.
>
> I have the same question as Ian: the SNC map for uncore IMC is changed. It
> may not change the total count of uncore_imc events, but it would change
> the count of the uncore_imc events for a specific SNC node. What's the
> reason for the change?
>
> I have no GNR/EMR machine on hand, so I can't check how the SNC nodes are
> mapped on these two machines.
>
>
> >
> > -CT
> >
> > On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
> >> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
> >>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
> >>> Adjust cpumasks as the logic for GNR in [1].
> >>>
> >>> Tested on Emeraldrapids with SNC2 enabled:
> >>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
> >>>
> >>>    Performance counter stats for 'system wide':
> >>>
> >>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
> >>>   N0        4         8815163648      UNC_M_CLOCKTICKS
> >>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
> >>>   N1        4         8815014974      UNC_M_CLOCKTICKS
> >>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
> >>>   N2        4         8814592626      UNC_M_CLOCKTICKS
> >>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
> >>>   N3        4         8814012840      UNC_M_CLOCKTICKS
> >>>
> >>>          1.001574118 seconds time elapsed
> >>>
> >>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
> >>>
> >>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> >>> ---
> >>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
> >>>  1 file changed, 28 insertions(+), 17 deletions(-)
> >>>
> >>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> >>> index a3f96221758d..fad68a0f7b5d 100644
> >>> --- a/tools/perf/arch/x86/util/pmu.c
> >>> +++ b/tools/perf/arch/x86/util/pmu.c
> >>> @@ -22,20 +22,29 @@
> >>>  #include "util/env.h"
> >>>  #include "util/header.h"
> >>>
> >>> -static bool x86__is_intel_graniterapids(void)
> >>> +static bool x86__is_snc_supported(void)
> >>>  {
> >>> -       static bool checked_if_graniterapids;
> >>> -       static bool is_graniterapids;
> >>> +       static bool checked_if_snc_supported;
> >>> +       static bool is_supported;
> >>>
> >>> -       if (!checked_if_graniterapids) {
> >>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
> >>> +       if (!checked_if_snc_supported) {
> >>> +
> >>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
> >>> +               static const char *const supported_cpuids[] = {
> >>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
> >>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
> >>> +               };
> >>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
> >>>
> >>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
> >>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
> >>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
> >>> +                       if (is_supported)
> >>> +                               break;
> >>> +               }
> >>>                 free(cpuid);
> >>> -               checked_if_graniterapids = true;
> >>> +               checked_if_snc_supported = true;
> >>>         }
> >>> -       return is_graniterapids;
> >>> +       return checked_if_snc_supported;
> >>>  }
> >>>
> >>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
> >>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
> >>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
> >>>
> >>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
> >>> +
> >>>                 perf_cpu_map__put(cache_cpus);
> >>>                 perf_cpu_map__put(node_cpus);
> >>>                 checked_snc = true;
> >>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >>>         // Compute the IMC SNC using lookup tables.
> >>>         unsigned int imc_num;
> >>>         int snc_nodes = snc_nodes_per_l3_cache();
> >>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> >>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> >>> +       const u8 snc2_map[] = {0, 0, 1, 1};
> >> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
> >>
> >> Thanks,
> >> Ian
> >>
> >>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
> >>>         const u8 *snc_map;
> >>>         size_t snc_map_len;
> >>>
> >>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> >>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
> >>>                 return 0;
> >>>         }
> >>> -       if (imc_num >= snc_map_len) {
> >>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
> >>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
> >>>                 return 0;
> >>>         }
> >>> -       return snc_map[imc_num];
> >>> +       return snc_map[imc_num % snc_map_len];
> >>>  }
> >>>
> >>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> >>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
> >>>         return cpu_adjust[pmu_snc];
> >>>  }
> >>>
> >>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> >>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
> >>>  {
> >>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
> >>>         // topology. For example, a two socket graniterapids machine may be set
> >>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
> >>>                                 pmu->mem_events = perf_mem_events_intel_aux;
> >>>                         else
> >>>                                 pmu->mem_events = perf_mem_events_intel;
> >>> -               } else if (x86__is_intel_graniterapids()) {
> >>> +               } else if (x86__is_snc_supported()) {
> >>>                         if (starts_with(pmu->name, "uncore_cha_"))
> >>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> >>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
> >>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> >>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
> >>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
> >>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
> >>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
> >>>                 }
> >>>         }
> >>>  }
> >>> --
> >>> 2.52.0.457.g6b5491de43-goog
> >>>

Re: [PATCH] perf pmu intel: Adjust cpumaks for sub-NUMA clusters on Emeraldrapids

Posted by Mi, Dapeng 3 weeks, 3 days ago

On 1/15/2026 2:03 AM, Chun-Tse Shao wrote:
> Hi Dapeng,
>
> I set the SNC2 map to {0, 0, 1, 1} because that is what my EMR test
> machine shows with SNC2 enabled. If that is correct, I wonder whether
> the SNC2 map for GNR is still {1, 1, 0, 0}.

OK, let me find a GNR machine to double-check it. (Probably next week; I
have some higher-priority things on my hands this week.) Thanks.


>
> Thanks,
> CT
>
> On Tue, Jan 13, 2026 at 4:42 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote:
>>
>> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
>>> Ping.
>>>
>>> Thanks for your comment, Ian. To the Intel team: can we get confirmation
>>> of the GNR SNC2 configuration?
>> It seems you missed Ian's comments.
>>
>> I have the same question as Ian: the SNC map for uncore IMC has changed.
>> It may not change the total count of uncore_imc events, but it would
>> change the count of the uncore_imc events for a specific SNC node. What's
>> the reason for the change?
>>
>> I have no GNR/EMR machine on hand, so I can't check how the SNC nodes are
>> mapped on these two machines.
>>
>>
>>> -CT
>>>
>>> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>>>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>>>> Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
>>>>> Adjust cpumasks as the logic for GNR in [1].
>>>>>
>>>>> Tested on Emeraldrapids with SNC2 enabled:
>>>>>   $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
>>>>>
>>>>>    Performance counter stats for 'system wide':
>>>>>
>>>>>   N0       30        72125876670      UNC_CHA_CLOCKTICKS
>>>>>   N0        4         8815163648      UNC_M_CLOCKTICKS
>>>>>   N1       30        72124958844      UNC_CHA_CLOCKTICKS
>>>>>   N1        4         8815014974      UNC_M_CLOCKTICKS
>>>>>   N2       30        72121049022      UNC_CHA_CLOCKTICKS
>>>>>   N2        4         8814592626      UNC_M_CLOCKTICKS
>>>>>   N3       30        72117133854      UNC_CHA_CLOCKTICKS
>>>>>   N3        4         8814012840      UNC_M_CLOCKTICKS
>>>>>
>>>>>          1.001574118 seconds time elapsed
>>>>>
>>>>> [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
>>>>>
>>>>> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
>>>>> ---
>>>>>  tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
>>>>>  1 file changed, 28 insertions(+), 17 deletions(-)
>>>>>
>>>>> diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
>>>>> index a3f96221758d..fad68a0f7b5d 100644
>>>>> --- a/tools/perf/arch/x86/util/pmu.c
>>>>> +++ b/tools/perf/arch/x86/util/pmu.c
>>>>> @@ -22,20 +22,29 @@
>>>>>  #include "util/env.h"
>>>>>  #include "util/header.h"
>>>>>
>>>>> -static bool x86__is_intel_graniterapids(void)
>>>>> +static bool x86__is_snc_supported(void)
>>>>>  {
>>>>> -       static bool checked_if_graniterapids;
>>>>> -       static bool is_graniterapids;
>>>>> +       static bool checked_if_snc_supported;
>>>>> +       static bool is_supported;
>>>>>
>>>>> -       if (!checked_if_graniterapids) {
>>>>> -               const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
>>>>> +       if (!checked_if_snc_supported) {
>>>>> +
>>>>> +               /* Emeraldrapids and Graniterapids support SNC configuration. */
>>>>> +               static const char *const supported_cpuids[] = {
>>>>> +                       "GenuineIntel-6-CF", /* Emeraldrapids */
>>>>> +                       "GenuineIntel-6-A[DE]", /* Graniterapids */
>>>>> +               };
>>>>>                 char *cpuid = get_cpuid_str((struct perf_cpu){0});
>>>>>
>>>>> -               is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
>>>>> +               for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
>>>>> +                       is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
>>>>> +                       if (is_supported)
>>>>> +                               break;
>>>>> +               }
>>>>>                 free(cpuid);
>>>>> -               checked_if_graniterapids = true;
>>>>> +               checked_if_snc_supported = true;
>>>>>         }
>>>>> -       return is_graniterapids;
>>>>> +       return checked_if_snc_supported;
>>>>>  }
>>>>>
>>>>>  static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
>>>>> @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
>>>>>                         read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
>>>>>
>>>>>                 snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
>>>>> +
>>>>>                 perf_cpu_map__put(cache_cpus);
>>>>>                 perf_cpu_map__put(node_cpus);
>>>>>                 checked_snc = true;
>>>>> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>>         // Compute the IMC SNC using lookup tables.
>>>>>         unsigned int imc_num;
>>>>>         int snc_nodes = snc_nodes_per_l3_cache();
>>>>> -       const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>>>> -       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>>>> +       const u8 snc2_map[] = {0, 0, 1, 1};
>>>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>>>>
>>>> Thanks,
>>>> Ian
>>>>
>>>>> +       const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
>>>>>         const u8 *snc_map;
>>>>>         size_t snc_map_len;
>>>>>
>>>>> @@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
>>>>>                 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
>>>>>                 return 0;
>>>>>         }
>>>>> -       if (imc_num >= snc_map_len) {
>>>>> +       if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
>>>>>                 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
>>>>>                 return 0;
>>>>>         }
>>>>> -       return snc_map[imc_num];
>>>>> +       return snc_map[imc_num % snc_map_len];
>>>>>  }
>>>>>
>>>>>  static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>>> @@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
>>>>>         return cpu_adjust[pmu_snc];
>>>>>  }
>>>>>
>>>>> -static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>>> +static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
>>>>>  {
>>>>>         // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
>>>>>         // topology. For example, a two socket graniterapids machine may be set
>>>>> @@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
>>>>>                                 pmu->mem_events = perf_mem_events_intel_aux;
>>>>>                         else
>>>>>                                 pmu->mem_events = perf_mem_events_intel;
>>>>> -               } else if (x86__is_intel_graniterapids()) {
>>>>> +               } else if (x86__is_snc_supported()) {
>>>>>                         if (starts_with(pmu->name, "uncore_cha_"))
>>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>>> -                       else if (starts_with(pmu->name, "uncore_imc_"))
>>>>> -                               gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
>>>>> +                       else if (starts_with(pmu->name, "uncore_imc_") &&
>>>>> +                                !starts_with(pmu->name, "uncore_imc_free_running"))
>>>>> +                               uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
>>>>>                 }
>>>>>         }
>>>>>  }
>>>>> --
>>>>> 2.52.0.457.g6b5491de43-goog
>>>>>