Similar to GNR [1], Emeraldrapids supports sub-NUMA clustering (SNC) as well.
Adjust the uncore cpumasks using the same logic as for GNR in [1].
Tested on Emeraldrapids with SNC2 enabled:
$ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
Performance counter stats for 'system wide':
N0 30 72125876670 UNC_CHA_CLOCKTICKS
N0 4 8815163648 UNC_M_CLOCKTICKS
N1 30 72124958844 UNC_CHA_CLOCKTICKS
N1 4 8815014974 UNC_M_CLOCKTICKS
N2 30 72121049022 UNC_CHA_CLOCKTICKS
N2 4 8814592626 UNC_M_CLOCKTICKS
N3 30 72117133854 UNC_CHA_CLOCKTICKS
N3 4 8814012840 UNC_M_CLOCKTICKS
1.001574118 seconds time elapsed
[1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 17 deletions(-)
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index a3f96221758d..fad68a0f7b5d 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -22,20 +22,29 @@
#include "util/env.h"
#include "util/header.h"
-static bool x86__is_intel_graniterapids(void)
+static bool x86__is_snc_supported(void)
{
- static bool checked_if_graniterapids;
- static bool is_graniterapids;
+ static bool checked_if_snc_supported;
+ static bool is_supported;
- if (!checked_if_graniterapids) {
- const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
+ if (!checked_if_snc_supported) {
+
+ /* Emeraldrapids and Graniterapids support SNC configuration. */
+ static const char *const supported_cpuids[] = {
+ "GenuineIntel-6-CF", /* Emeraldrapids */
+ "GenuineIntel-6-A[DE]", /* Graniterapids */
+ };
char *cpuid = get_cpuid_str((struct perf_cpu){0});
- is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
+ for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
+ is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
+ if (is_supported)
+ break;
+ }
free(cpuid);
- checked_if_graniterapids = true;
+ checked_if_snc_supported = true;
}
- return is_graniterapids;
+ return is_supported;
}
static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
@@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
+
perf_cpu_map__put(cache_cpus);
perf_cpu_map__put(node_cpus);
checked_snc = true;
@@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
// Compute the IMC SNC using lookup tables.
unsigned int imc_num;
int snc_nodes = snc_nodes_per_l3_cache();
- const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
- const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
+ const u8 snc2_map[] = {0, 0, 1, 1};
+ const u8 snc3_map[] = {1, 1, 0, 0, 2, 2};
const u8 *snc_map;
size_t snc_map_len;
@@ -161,11 +171,11 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
return 0;
}
- if (imc_num >= snc_map_len) {
+ if (imc_num >= snc_map_len * perf_cpu_map__nr(pmu->cpus)) {
pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
return 0;
}
- return snc_map[imc_num];
+ return snc_map[imc_num % snc_map_len];
}
static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
@@ -205,7 +215,7 @@ static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
return cpu_adjust[pmu_snc];
}
-static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
+static void uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
{
// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
// topology. For example, a two socket graniterapids machine may be set
@@ -304,11 +314,12 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
pmu->mem_events = perf_mem_events_intel_aux;
else
pmu->mem_events = perf_mem_events_intel;
- } else if (x86__is_intel_graniterapids()) {
+ } else if (x86__is_snc_supported()) {
if (starts_with(pmu->name, "uncore_cha_"))
- gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
- else if (starts_with(pmu->name, "uncore_imc_"))
- gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
+ uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
+ else if (starts_with(pmu->name, "uncore_imc_") &&
+ !starts_with(pmu->name, "uncore_imc_free_running"))
+ uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
}
}
}
--
2.52.0.457.g6b5491de43-goog
On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>
> [...]
>
> @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> // Compute the IMC SNC using lookup tables.
> unsigned int imc_num;
> int snc_nodes = snc_nodes_per_l3_cache();
> - const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> - const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> + const u8 snc2_map[] = {0, 0, 1, 1};
Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
Thanks,
Ian
> [...]
Ping.
Thanks for your comment, Ian. To Intel team, can we get confirmation
of the GNR SNC2 configuration?
-CT
On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>
> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
> >
> > Similar to GNR [1], Emeraldrapids supports sub-NUMA clusters as well.
> > Adjust cpumasks as the logic for GNR in [1].
> >
> > Tested on Emeraldrapids with SNC2 enabled:
> > $ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a -- sleep 1
> >
> > Performance counter stats for 'system wide':
> >
> > N0 30 72125876670 UNC_CHA_CLOCKTICKS
> > N0 4 8815163648 UNC_M_CLOCKTICKS
> > N1 30 72124958844 UNC_CHA_CLOCKTICKS
> > N1 4 8815014974 UNC_M_CLOCKTICKS
> > N2 30 72121049022 UNC_CHA_CLOCKTICKS
> > N2 4 8814592626 UNC_M_CLOCKTICKS
> > N3 30 72117133854 UNC_CHA_CLOCKTICKS
> > N3 4 8814012840 UNC_M_CLOCKTICKS
> >
> > 1.001574118 seconds time elapsed
> >
> > [1] lore.kernel.org/20250515181417.491401-1-irogers@google.com
> >
> > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > ---
> > tools/perf/arch/x86/util/pmu.c | 45 +++++++++++++++++++++-------------
> > 1 file changed, 28 insertions(+), 17 deletions(-)
> >
> > diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
> > index a3f96221758d..fad68a0f7b5d 100644
> > --- a/tools/perf/arch/x86/util/pmu.c
> > +++ b/tools/perf/arch/x86/util/pmu.c
> > @@ -22,20 +22,29 @@
> > #include "util/env.h"
> > #include "util/header.h"
> >
> > -static bool x86__is_intel_graniterapids(void)
> > +static bool x86__is_snc_supported(void)
> > {
> > - static bool checked_if_graniterapids;
> > - static bool is_graniterapids;
> > + static bool checked_if_snc_supported;
> > + static bool is_supported;
> >
> > - if (!checked_if_graniterapids) {
> > - const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
> > + if (!checked_if_snc_supported) {
> > +
> > + /* Emeraldrapids and Graniterapids support SNC configuration. */
> > + static const char *const supported_cpuids[] = {
> > + "GenuineIntel-6-CF", /* Emeraldrapids */
> > + "GenuineIntel-6-A[DE]", /* Graniterapids */
> > + };
> > char *cpuid = get_cpuid_str((struct perf_cpu){0});
> >
> > - is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
> > + for (size_t i = 0; i < ARRAY_SIZE(supported_cpuids); i++) {
> > + is_supported = cpuid && strcmp_cpuid_str(supported_cpuids[i], cpuid) == 0;
> > + if (is_supported)
> > + break;
> > + }
> > free(cpuid);
> > - checked_if_graniterapids = true;
> > + checked_if_snc_supported = true;
> > }
> > - return is_graniterapids;
> > + return checked_if_snc_supported;
> > }
> >
> > static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
> > @@ -64,6 +73,7 @@ static int snc_nodes_per_l3_cache(void)
> > read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
> >
> > snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
> > +
> > perf_cpu_map__put(cache_cpus);
> > perf_cpu_map__put(node_cpus);
> > checked_snc = true;
> > @@ -137,8 +147,8 @@ static int uncore_imc_snc(struct perf_pmu *pmu)
> > // Compute the IMC SNC using lookup tables.
> > unsigned int imc_num;
> > int snc_nodes = snc_nodes_per_l3_cache();
> > - const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
> > - const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
> > + const u8 snc2_map[] = {0, 0, 1, 1};
>
> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
>
> Thanks,
> Ian
>
> > [...]
On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> Ping.
>
> Thanks for your comment, Ian. To Intel team, can we get confirmation
> of the GNR SNC2 configuration?
+ Zide
Zide will take a look and verify the configuration. Thanks.
>
> -CT
>
> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>> [...]
On 1/18/2026 4:51 PM, Mi, Dapeng wrote:
>
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
>> Ping.
>>
>> Thanks for your comment, Ian. To Intel team, can we get confirmation
>> of the GNR SNC2 configuration?
>>
>> -CT
>>
>> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>>> [...]
>>>> - const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>>> - const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>>> + const u8 snc2_map[] = {0, 0, 1, 1};
>>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
It appears to break GNR SNC2. While it works for the --per-node test, it
fails the following affinity test. Testing on EMR shows that it follows
the new lookup table. Should we use a model-specific lookup table here?
# Running workload on CPU0
$ taskset -c CPU0 stress-ng --vm 1 --vm-bytes 2G --vm-method all
--timeout 30s
# Profiling UNC_M_PRE_COUNT.ALL on all IMC boxes.
$ perf stat \
-e uncore_imc_0/event=0x03,umask=0xFF/ \
-e uncore_imc_1/event=0x03,umask=0xFF/ \
-e uncore_imc_2/event=0x03,umask=0xFF/ \
-e uncore_imc_3/event=0x03,umask=0xFF/ \
-e uncore_imc_4/event=0x03,umask=0xFF/ \
-e uncore_imc_5/event=0x03,umask=0xFF/ \
-e uncore_imc_6/event=0x03,umask=0xFF/ \
-e uncore_imc_7/event=0x03,umask=0xFF/ \
-a -I 1000
This shows that the uncore_imc_[2|3|6|7] boxes are affinitized to CPU0.
5.013638757 1,635,470 uncore_imc_0/event=0x03,umask=0xFF/
5.013638757 1,638,157 uncore_imc_1/event=0x03,umask=0xFF/
5.013638757 27,093,922 uncore_imc_2/event=0x03,umask=0xFF/
5.013638757 27,025,980 uncore_imc_3/event=0x03,umask=0xFF/
5.013638757 1,616,974 uncore_imc_4/event=0x03,umask=0xFF/
5.013638757 1,627,251 uncore_imc_5/event=0x03,umask=0xFF/
5.013638757 26,854,588 uncore_imc_6/event=0x03,umask=0xFF/
5.013638757 26,974,506 uncore_imc_7/event=0x03,umask=0xFF/
Testing with additional CPUs confirms that the original GNR SNC2 lookup
table is correct.
NUMA node / CPU(s)                      uncore_imc boxes
NUMA node0 CPU(s): 0-42,344-386 2 3 6 7
NUMA node1 CPU(s): 43-85,387-429 0 1 4 5
NUMA node2 CPU(s): 86-128,430-472 2 3 6 7
NUMA node3 CPU(s): 129-171,473-515 0 1 4 5
NUMA node4 CPU(s): 172-214,516-558 2 3 6 7
NUMA node5 CPU(s): 215-257,559-601 0 1 4 5
NUMA node6 CPU(s): 258-300,602-644 2 3 6 7
NUMA node7 CPU(s): 301-343,645-687 0 1 4 5
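
For reference, here is a tiny standalone sketch (not part of the patch) that
applies the snc_map[imc_num % snc_map_len] indexing from uncore_imc_snc() to
both candidate SNC2 tables, assuming the 8 IMC boxes per socket seen above:

  #include <stdio.h>

  /* Original GNR table {1, 1, 0, 0, 1, 1, 0, 0} folded to length 4. */
  static const unsigned char gnr_snc2_map[] = {1, 1, 0, 0};
  /* Table proposed in this patch. */
  static const unsigned char emr_snc2_map[] = {0, 0, 1, 1};

  int main(void)
  {
          for (unsigned int imc = 0; imc < 8; imc++)
                  printf("uncore_imc_%u: GNR map -> SNC%d, EMR map -> SNC%d\n",
                         imc,
                         gnr_snc2_map[imc % sizeof(gnr_snc2_map)],
                         emr_snc2_map[imc % sizeof(emr_snc2_map)]);
          return 0;
  }

With the original {1, 1, 0, 0} table, uncore_imc_[2|3|6|7] map to SNC node 0,
matching the affinity observed above; the proposed {0, 0, 1, 1} table assigns
uncore_imc_[0|1|4|5] to node 0 instead.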
>>> [...]
Thank you for the confirmation, Zide. I will submit another patch to
split the EMR and GNR SNC2 IMC maps.
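
Roughly, the idea would be to keep the original table for GNR and only use the
new one on EMR. A rough sketch of the selection (not the actual patch; the
model check is hypothetical here and the final shape may differ):

  /*
   * Sketch only: pick the SNC2 table by model. "is_emr" would come from a
   * cpuid check against "GenuineIntel-6-CF", mirroring the existing
   * Graniterapids match; such a helper does not exist yet.
   */
  static int uncore_imc_snc2_sketch(unsigned int imc_num, bool is_emr)
  {
          static const u8 gnr_snc2_map[] = {1, 1, 0, 0};
          static const u8 emr_snc2_map[] = {0, 0, 1, 1};
          const u8 *snc_map = is_emr ? emr_snc2_map : gnr_snc2_map;

          return snc_map[imc_num % ARRAY_SIZE(gnr_snc2_map)];
  }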
-CT
On Thu, Jan 22, 2026 at 2:06 PM Chen, Zide <zide.chen@intel.com> wrote:
>
> [...]
> It appears to break GNR SNC2. While it works for the --per-node test, it
> fails the following affinity test. Testing on EMR shows that it follows
> the new lookup table. Should we use a model-specific lookup table here?
>
> [...]
>
> Testing with additional CPUs confirms that the original GNR SNC2 lookup
> table is correct.
>
> [...]
On 1/18/2026 4:51 PM, Mi, Dapeng wrote:
>
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
>> Ping.
>>
>> Thanks for your comment, Ian. To Intel team, can we get confirmation
>> of the GNR SNC2 configuration?
>>
>> -CT
>>
>> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>>> On Thu, Jan 8, 2026 at 10:45 AM Chun-Tse Shao <ctshao@google.com> wrote:
>>>> [...]
>>>> - const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
>>>> - const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
>>>> + const u8 snc2_map[] = {0, 0, 1, 1};
>>> Does this alter the behavior on GNR? ie 1,1,0,0 vs 0,0,1,1.
It appears to break GNR SNC2. While it works for the --per-node test, it
fails the following affinity test. Testing on EMR shows that it follows
the new lookup table. Should we use a model-specific lookup table here?
$ taskset -c CPU0 stress-ng --vm 1 --vm-bytes 2G --vm-method all
--timeout 30s
# UNC_M_PRE_COUNT.ALL
$ perf stat \
-e uncore_imc_0/event=0x03,umask=0xFF/ \
-e uncore_imc_1/event=0x03,umask=0xFF/ \
-e uncore_imc_2/event=0x03,umask=0xFF/ \
-e uncore_imc_3/event=0x03,umask=0xFF/ \
-e uncore_imc_4/event=0x03,umask=0xFF/ \
-e uncore_imc_5/event=0x03,umask=0xFF/ \
-e uncore_imc_6/event=0x03,umask=0xFF/ \
-e uncore_imc_7/event=0x03,umask=0xFF/ \
-a -I 1000
This shows that the uncore_imc_[2|3|6|7] boxes are affinitized to CPU0.
6.015968927 1,598,800 uncore_imc_0/event=0x03,umask=0xFF/
6.015968927 1,605,301 uncore_imc_1/event=0x03,umask=0xFF/
6.015968927 20,252,028 uncore_imc_2/event=0x03,umask=0xFF/
6.015968927 20,256,187 uncore_imc_3/event=0x03,umask=0xFF/
6.015968927 1,594,551 uncore_imc_4/event=0x03,umask=0xFF/
6.015968927 1,598,350 uncore_imc_5/event=0x03,umask=0xFF/
6.015968927 20,185,615 uncore_imc_6/event=0x03,umask=0xFF/
6.015968927 20,128,015 uncore_imc_7/event=0x03,umask=0xFF/
Testing with additional CPUs confirms that the original GNR SNC2 lookup
table is correct.
NUMA node / CPU(s)                      uncore_imc boxes
NUMA node0 CPU(s): 0-42,344-386 2 3 6 7
NUMA node1 CPU(s): 43-85,387-429 0 1 4 5
NUMA node2 CPU(s): 86-128,430-472 2 3 6 7
NUMA node3 CPU(s): 129-171,473-515 0 1 4 5
NUMA node4 CPU(s): 172-214,516-558 2 3 6 7
NUMA node5 CPU(s): 215-257,559-601 0 1 4 5
NUMA node6 CPU(s): 258-300,602-644 2 3 6 7
NUMA node7 CPU(s): 301-343,645-687 0 1 4 5
>>> [...]
On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> Ping.
>
> Thanks for your comment, Ian. To Intel team, can we get confirmation
> of the GNR SNC2 configuration?
It seems you missed Ian's comments.

I have the same question as Ian: the SNC map for the uncore IMC is changed. It
may not change the total count of the uncore_imc events, but it would change
the count attributed to a specific SNC node. What is the reason for the change?

I have no GNR/EMR on hand, so I can't check how the SNC nodes are mapped on
these two machines.
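
To illustrate the point with arbitrary numbers (not measurements): if half of
the IMC boxes see the bulk of the traffic, swapping the table leaves the
machine-wide sum unchanged but moves those counts to the other --per-node
bucket:

  #include <stdio.h>

  /* Arbitrary per-box counts: boxes 2/3/6/7 are the busy ones here. */
  static const unsigned long imc_count[8] = {
          100, 100, 9000, 9000, 100, 100, 9000, 9000
  };
  static const unsigned char old_map[] = {1, 1, 0, 0}; /* {1,1,0,0,1,1,0,0} folded */
  static const unsigned char new_map[] = {0, 0, 1, 1}; /* proposed table */

  int main(void)
  {
          unsigned long old_node[2] = {0, 0}, new_node[2] = {0, 0};

          for (unsigned int imc = 0; imc < 8; imc++) {
                  old_node[old_map[imc % 4]] += imc_count[imc];
                  new_node[new_map[imc % 4]] += imc_count[imc];
          }
          printf("old map: N0=%lu N1=%lu, new map: N0=%lu N1=%lu\n",
                 old_node[0], old_node[1], new_node[0], new_node[1]);
          return 0;
  }

Both maps give the same total (36400 here), but node 0 gets 36000 with one map
and only 400 with the other.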
>
> -CT
>
> On Thu, Jan 8, 2026 at 11:19 AM Ian Rogers <irogers@google.com> wrote:
>> [...]
Hi Dapeng,
I set the SNC2 map to {0, 0, 1, 1} because that is what my EMR test
machine shows with SNC2 enabled. If that is correct, I wonder whether
the SNC2 map for GNR is still {1, 1, 0, 0}.
Thanks,
CT
On Tue, Jan 13, 2026 at 4:42 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote:
>
>
> On 1/14/2026 2:06 AM, Chun-Tse Shao wrote:
> > Ping.
> >
> > Thanks for your comment, Ian. To Intel team, can we get confirmation
> > of the GNR SNC2 configuration?
>
> It seems you missed Ian's comments.
>
> I have the same question as Ian: the SNC map for the uncore IMC is changed. It
> may not change the total count of the uncore_imc events, but it would change
> the count attributed to a specific SNC node. What is the reason for the change?
>
> I have no GNR/EMR on hand, so I can't check how the SNC nodes are mapped on
> these two machines.
>
>
> > [...]
On 1/15/2026 2:03 AM, Chun-Tse Shao wrote:
> Hi Dapeng,
>
> I set the SNC2 map to {0, 0, 1, 1} because that is what my EMR test
> machine shows with SNC2 enabled. If that is correct, I wonder whether
> the SNC2 map for GNR is still {1, 1, 0, 0}.
OK, let me find a GNR to double-check it. (Maybe next week; I have some
higher-priority things on my plate this week.) Thanks.
>
> Thanks,
> CT
>
> On Tue, Jan 13, 2026 at 4:42 PM Mi, Dapeng <dapeng1.mi@linux.intel.com> wrote:
>> [...]