[PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Tony Luck 2 years, 7 months ago
There isn't a simple hardware enumeration to indicate to software that
a system is running with Sub-NUMA Cluster (SNC) enabled.

Compare the number of NUMA nodes with the number of L3 caches to calculate
the number of Sub-NUMA nodes per L3 cache.
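
As an aside, the same ratio can be estimated from user space using the
sysfs node cpulist and cacheinfo id attributes. This is only an
illustrative sketch that mirrors the logic of find_snc_ways() below
(memory-only nodes are skipped; error handling is minimal):

/*
 * Illustrative sketch: count NUMA nodes that have CPUs and the distinct
 * L3 cache ids they map to, then take the ratio.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_IDS 64

int main(void)
{
	bool l3_seen[MAX_IDS] = { false };
	int nodes = 0, l3s = 0;
	char path[128];
	FILE *f;

	for (int node = 0; node < MAX_IDS; node++) {
		int cpu, l3_id;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/cpulist", node);
		f = fopen(path, "r");
		if (!f)
			continue;	/* node not present */
		if (fscanf(f, "%d", &cpu) != 1) {
			fclose(f);
			continue;	/* memory-only node: empty cpulist */
		}
		fclose(f);
		nodes++;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%d", &l3_id) == 1 &&
		    l3_id < MAX_IDS && !l3_seen[l3_id]) {
			l3_seen[l3_id] = true;
			l3s++;
		}
		fclose(f);
	}

	printf("SNC nodes per L3 cache: %d\n", l3s ? nodes / l3s : 1);
	return 0;
}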

When Sub-NUMA Cluster mode is enabled in BIOS setup, the RMID counters
are distributed equally among the SNC nodes within each socket.

E.g. if there are 400 RMID counters and the system is configured with
two SNC nodes per socket, then RMID counters 0..199 are used on SNC node
0 of each socket, and RMID counters 200..399 on SNC node 1.

A model specific MSR (0xca0) can change the configuration of the RMIDs
when SNC mode is enabled.

The MSR controls the interpretation of the RMID field in the
IA32_PQR_ASSOC MSR so that the appropriate hardware counters
within the SNC node are updated.

Also initialize a per-cpu RMID offset value. Use this
to calculate the value to write to the IA32_QM_EVTSEL MSR when
reading RMID event values.
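
For example, with the 400-RMID, two-SNC-node configuration above, the
arithmetic works out as in this sketch. It is a simplification of
set_per_cpu_rmid_offset() and __rmid_read() below; num_rmid here is
assumed to be the per-SNC-node count (400 / 2):

/* Sketch of the RMID offset arithmetic used in this patch. */
static const int snc_ways = 2;		/* SNC nodes per L3 cache */
static const int num_rmid = 200;	/* RMIDs per SNC node (400 / 2) */

/* Per-cpu offset: node 0 uses RMIDs 0..199, node 1 uses 200..399. */
static int rmid_offset(int node)
{
	return (node % snc_ways) * num_rmid;
}

/* Value programmed into the RMID field of IA32_QM_EVTSEL. */
static int hw_rmid(int rmid, int node)
{
	return rmid + rmid_offset(node);	/* e.g. RMID 5 on node 1 -> 205 */
}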

N.B. This works well for well-behaved NUMA applications that access
memory predominantly from the local memory node. For applications that
access memory across multiple nodes it may be necessary for the user
to read counters for all SNC nodes on a socket and add the values to
get the actual LLC occupancy or memory bandwidth. Perhaps this isn't
all that different from applications that span multiple sockets
in a legacy system.
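
For example, summing a monitor group's per-SNC-node counts from user
space might look like the sketch below. The mon_L3_<id> directory
naming, and the idea that with SNC each mon_data subdirectory is keyed
by SNC node id, are assumptions about how this series exposes the
domains; adjust to the actual layout:

/* Sketch: sum one group's llc_occupancy across the SNC nodes of a socket. */
#include <stdio.h>

static unsigned long long socket_llc_occupancy(const char *mon_data_dir,
					       const int *nodes, int n_nodes)
{
	unsigned long long total = 0, val;
	char path[256];
	FILE *f;

	for (int i = 0; i < n_nodes; i++) {
		snprintf(path, sizeof(path), "%s/mon_L3_%02d/llc_occupancy",
			 mon_data_dir, nodes[i]);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%llu", &val) == 1)
			total += val;
		fclose(f);
	}
	return total;
}

/* e.g. socket 0 with SNC nodes 0 and 1: */
/* socket_llc_occupancy("/sys/fs/resctrl/mon_data", (int[]){0, 1}, 2); */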

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/resctrl.h        |  2 +
 arch/x86/kernel/cpu/resctrl/core.c    | 99 ++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/resctrl/monitor.c |  2 +-
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 255a78d9d906..f95e69bacc65 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -35,6 +35,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
 DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
 
+DECLARE_PER_CPU(int, rmid_offset);
+
 /*
  * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
  *
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index af3be3c2db96..869cfb46e8e4 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -16,11 +16,14 @@
 
 #define pr_fmt(fmt)	"resctrl: " fmt
 
+#include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpuhotplug.h>
+#include <linux/mod_devicetable.h>
 
+#include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/resctrl.h>
 #include "internal.h"
@@ -524,6 +527,39 @@ static int get_domain_id(int cpu, enum resctrl_scope scope)
 	}
 }
 
+DEFINE_PER_CPU(int, rmid_offset);
+
+static void set_per_cpu_rmid_offset(int cpu, struct rdt_resource *r)
+{
+	this_cpu_write(rmid_offset, (cpu_to_node(cpu) % snc_ways) * r->num_rmid);
+}
+
+/*
+ * This MSR provides for configuration of RMIDs on Sub-NUMA Cluster
+ * systems.
+ * Bit0 = 1 (default) For legacy configuration
+ * Bit0 = 0 RMIDs are divided evenly between SNC nodes.
+ */
+#define MSR_RMID_SNC_CONFIG   0xCA0
+
+static void snc_add_pkg(void)
+{
+	u64	msrval;
+
+	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+	msrval |= BIT_ULL(0);
+	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}
+
+static void snc_remove_pkg(void)
+{
+	u64	msrval;
+
+	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+	msrval &= ~BIT_ULL(0);
+	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}
+
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -555,6 +591,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		cpumask_set_cpu(cpu, &d->cpu_mask);
 		if (r->cache.arch_has_per_cpu_cfg)
 			rdt_domain_reconfigure_cdp(r);
+		if (r->mon_capable)
+			set_per_cpu_rmid_offset(cpu, r);
 		return;
 	}
 
@@ -573,11 +611,17 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		return;
 	}
 
-	if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
-		domain_free(hw_dom);
-		return;
+	if (r->mon_capable) {
+		if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+			domain_free(hw_dom);
+			return;
+		}
+		set_per_cpu_rmid_offset(cpu, r);
 	}
 
+	if (r->pkg_actions)
+		snc_add_pkg();
+
 	list_add_tail(&d->list, add_pos);
 
 	err = resctrl_online_domain(r, d);
@@ -613,6 +657,9 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 			d->plr->d = NULL;
 		domain_free(hw_dom);
 
+		if (r->pkg_actions)
+			snc_remove_pkg();
+
 		return;
 	}
 
@@ -899,11 +946,57 @@ static __init bool get_rdt_resources(void)
 	return (rdt_mon_capable || rdt_alloc_capable);
 }
 
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0),
+	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0),
+	{}
+};
+
+/*
+ * There isn't a simple enumeration bit to show whether SNC mode
+ * is enabled. Look at the ratio of number of NUMA nodes to the
+ * number of distinct L3 caches. Take care to skip memory-only nodes.
+ */
+static __init int find_snc_ways(void)
+{
+	unsigned long *node_caches;
+	int mem_only_nodes = 0;
+	int cpu, node, ret;
+
+	if (!x86_match_cpu(snc_cpu_ids))
+		return 1;
+
+	node_caches = kcalloc(BITS_TO_LONGS(nr_node_ids), sizeof(*node_caches), GFP_KERNEL);
+	if (!node_caches)
+		return 1;
+
+	cpus_read_lock();
+	for_each_node(node) {
+		cpu = cpumask_first(cpumask_of_node(node));
+		if (cpu < nr_cpu_ids)
+			set_bit(get_cpu_cacheinfo_id(cpu, 3), node_caches);
+		else
+			mem_only_nodes++;
+	}
+	cpus_read_unlock();
+
+	ret = (nr_node_ids - mem_only_nodes) / bitmap_weight(node_caches, nr_node_ids);
+	kfree(node_caches);
+
+	if (ret > 1)
+		rdt_resources_all[RDT_RESOURCE_PKG].r_resctrl.pkg_actions = true;
+
+	return ret;
+}
+
 static __init void rdt_init_res_defs_intel(void)
 {
 	struct rdt_hw_resource *hw_res;
 	struct rdt_resource *r;
 
+	snc_ways = find_snc_ways();
+
 	for_each_rdt_resource(r) {
 		hw_res = resctrl_to_arch_res(r);
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index da3f36212898..74db99d299e1 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -160,7 +160,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
 	 * are error bits.
 	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid + this_cpu_read(rmid_offset));
 	rdmsrl(MSR_IA32_QM_CTR, msr_val);
 
 	if (msr_val & RMID_VAL_ERROR)
-- 
2.40.1
RE: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Shaopeng Tan (Fujitsu) 2 years, 7 months ago
Hi Tony,

> +static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
> +	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
> +	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0),
> +	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0),
> +	{}
> +};

Cascade Lake and Skylake also seem to support Sub-NUMA Cluster.
At least in my environment (Intel(R) Xeon(R) Gold 6254 CPU @ 3.10GHz),
Sub-NUMA Cluster is supported.

Best regards,
Shaopeng TAN
RE: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Luck, Tony 2 years, 7 months ago
>> +static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
>> +	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
>> +	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0),
>> +	X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0),
>> +	{}
>> +};
>
> Cascade Lake and Skylake also seem to support Sub-NUMA Cluster.
> At least in my environment (Intel(R) Xeon(R) Gold 6254 CPU @ 3.10GHz),
> Sub-NUMA Cluster is supported.

Shaopeng TAN,

This is true. But MSR_RMID_SNC_CONFIG (0xCA0) is not implemented
for CPUs before ICELAKE.

-Tony
Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Peter Newman 2 years, 7 months ago
Hi Tony,

On Wed, Jun 21, 2023 at 7:40 PM Tony Luck <tony.luck@intel.com> wrote:

> diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
> index 255a78d9d906..f95e69bacc65 100644
> --- a/arch/x86/include/asm/resctrl.h
> +++ b/arch/x86/include/asm/resctrl.h
> @@ -35,6 +35,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
>  DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
>  DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
>
> +DECLARE_PER_CPU(int, rmid_offset);
> +

I assumed that declarations in this file were those needed by
__resctrl_sched_in(). Now that rmid_offset is used when setting
PQR_ASSOC, would this go somewhere else?

Other than this and fixing the MSR update, the series looks fine to me.

Reviewed-By: Peter Newman <peternewman@google.com>

Thanks!
-Peter
RE: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Luck, Tony 2 years, 7 months ago
>> +++ b/arch/x86/include/asm/resctrl.h
>> @@ -35,6 +35,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
>>  DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
>>  DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
>>
>> +DECLARE_PER_CPU(int, rmid_offset);
>> +
>
> I assumed that declarations in this file were those needed by
> __resctrl_sched_in(). Now that rmid_offset is used when setting
> PQR_ASSOC, would this go somewhere else?

PQR_ASSOC no longer needs rmid_offset. But QM_EVTSEL still does.

I'll take a look to see if all the SNC detection code can move into
monitor.c. Then rmid_offset could be static in that file. But if that gets
complicated I may leave it alone (with rmid_offset set in core.c and used
in monitor.c).

> Other than this and fixing the MSR update, the series looks fine to me.
>
> Reviewed-By: Peter Newman <peternewman@google.com>

Peter,

Thanks for testing and review. Did you also test on one of your systems
with a memory-only node? I recall that was an issue with my very first
patch series.

-Tony
Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Peter Newman 2 years, 7 months ago
Hi Tony,

On Mon, Jun 26, 2023 at 5:52 PM Luck, Tony <tony.luck@intel.com> wrote:

> Thanks for testing and review. Did you also test on one of your systems
> with a memory-only node? I recall that was an issue with my very first
> patch series.

It turns out the people who reported the crash to me were running an
experiment, but they were able to find one machine they hadn't yanked
the AEP DIMMs out of yet.

It was a Cascade Lake, so I had to yank the CPU family match check to
ensure that your fix was actually exercised. I can confirm now that it
booted successfully.

-Peter
RE: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Luck, Tony 2 years, 7 months ago
> I'll take a look to see if all the SNC detection code can move into
> monitor.c. Then rmid_offset could be static in that file. But if that gets
> complicated I may leave it alone (with rmid_offset set in core.c and used
> in monitor.c).

Hmm. Setting rmid_offset is deeply tangled into the cpu/domain
hotplug code path, so I'm going to leave that in core.c.

-Tony
Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Peter Newman 2 years, 7 months ago
Hi Tony,

On Wed, Jun 21, 2023 at 7:40 PM Tony Luck <tony.luck@intel.com> wrote:
> A model specific MSR (0xca0) can change the configuration of the RMIDs
> when SNC mode is enabled.
>
> The MSR controls the interpretation of the RMID field in the
> IA32_PQR_ASSOC MSR so that the appropriate hardware counters
> within the SNC node are updated.

Unfortunately I'm not getting as good of results with the new series.
The main difference seems to be updating the 0xca0 MSR instead of
applying the offset to PQR_ASSOC.

In my test case of generating bandwidth on 20 random CPUs in 20 random
RMIDs, I'm only getting correct counts from CPUs in node 0. Node 1
CPUs are showing counts which are too small, and nodes 2 and 3 are
seeing no bandwidth at all:

(expected bandwidth is around 30 GB; the value in the first parentheses is the L3 cache id)

cpu: 134 (0), rmid: 30 (g29):   0 -> 30640791552 (30640791552)
cpu: 138 (0), rmid: 103 (g101): 0 -> 28196962304 (28196962304)

cpu: 35 (0), rmid: 211 (g209):  0 -> 3039232 (3039232)
cpu: 55 (0), rmid: 113 (g111):  0 -> 4874240 (4874240)
cpu: 41 (0), rmid: 83 (g81):    0 -> 2637824 (2637824)
cpu: 42 (0), rmid: 218 (g216):  0 -> 2408448 (2408448)
cpu: 161 (0), rmid: 8 (g7):     0 -> 7856128 (7856128)

cpu: 86 (1), rmid: 171 (g169):  0 -> 0 (0)
cpu: 86 (1), rmid: 121 (g119):  0 -> 0 (0)
cpu: 212 (1), rmid: 163 (g161): 0 -> 0 (0)
cpu: 180 (1), rmid: 129 (g127): 0 -> 0 (0)
cpu: 205 (1), rmid: 186 (g184): 0 -> 0 (0)
cpu: 194 (1), rmid: 160 (g158): 0 -> 0 (0)
cpu: 186 (1), rmid: 196 (g194): 0 -> 0 (0)
cpu: 106 (1), rmid: 93 (g91):   0 -> 0 (0)
cpu: 84 (1), rmid: 168 (g166):  0 -> 0 (0)
cpu: 197 (1), rmid: 104 (g102): 0 -> 0 (0)
cpu: 64 (1), rmid: 103 (g101):  0 -> 0 (0)
cpu: 71 (1), rmid: 81 (g79):    0 -> 0 (0)
cpu: 60 (1), rmid: 221 (g219):  0 -> 0 (0)

Here's the output of `cat /sys/devices/system/node/node*/cpulist` on
this machine for reference:

0-27,112-139
28-55,140-167
56-83,168-195
84-111,196-223

-Peter
RE: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Luck, Tony 2 years, 7 months ago
> Unfortunately I'm not getting as good of results with the new series.
> The main difference seems to be updating the 0xca0 MSR instead of
> applying the offset to PQR_ASSOC.

I think I may have reversed the actions to update the MSR in one of
my refactor/rebase. The comment here is correct, but that's not
what the code is doing :-(

Can you swap the bodies of these two functions and retest?

+/*
+ * This MSR provides for configuration of RMIDs on Sub-NUMA Cluster
+ * systems.
+ * Bit0 = 1 (default) For legacy configuration
+ * Bit0 = 0 RMIDs are divided evenly between SNC nodes.
+ */
+#define MSR_RMID_SNC_CONFIG   0xCA0
+
+static void snc_add_pkg(void)
+{
+       u64     msrval;
+
+       rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+       msrval |= BIT_ULL(0);
+       wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}
+
+static void snc_remove_pkg(void)
+{
+       u64     msrval;
+
+       rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+       msrval &= ~BIT_ULL(0);
+       wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+}

-Tony
Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Peter Newman 2 years, 7 months ago
Hi Tony,

On Thu, Jun 22, 2023 at 6:02 PM Luck, Tony <tony.luck@intel.com> wrote:
>
> > Unfortunately I'm not getting as good of results with the new series.
> > The main difference seems to be updating the 0xca0 MSR instead of
> > applying the offset to PQR_ASSOC.
>
> I think I may have reversed the actions to update the MSR in one of
> my refactor/rebase. The comment here is correct, but that's not
> what the code is doing :-(
>
> Can you swap the bodies of these two functions and retest?

It's a small improvement, but still not great. Still only node 0
giving believable results, but at least no more empty results from the
second package.

I poked around in /proc/kcore and noticed that my snc_ways is still 1, though.

-Peter
Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Tony Luck 2 years, 7 months ago
On Fri, Jun 23, 2023 at 05:19:37PM +0200, Peter Newman wrote:
> Hi Tony,
> 
> On Thu, Jun 22, 2023 at 6:02 PM Luck, Tony <tony.luck@intel.com> wrote:
> >
> > > Unfortunately I'm not getting as good of results with the new series.
> > > The main difference seems to be updating the 0xca0 MSR instead of
> > > applying the offset to PQR_ASSOC.
> >
> > I think I may have reversed the actions to update the MSR in one of
> > my refactor/rebase. The comment here is correct, but that's not
> > what the code is doing :-(
> >
> > Can you swap the bodies of these two functions and retest?
> 
> It's a small improvement, but still not great. Still only node 0
> giving believable results, but at least no more empty results from the
> second package.
> 
> I poked around in /proc/kcore and noticed that my snc_ways is still 1, though.

Below is the patch I applied to reverse the add/remove package actions,
together with some debug to make doubly sure that SNC is being detected
as I expect and that the right actions are taken.

When booting the debug messages say:

[    9.458624] resctrl: SNC_ways = 2
[    9.458801] resctrl: CPU0: set MSR_RMID_SNC_CONFIG to 0x0
[    9.461986] resctrl: CPU56: set MSR_RMID_SNC_CONFIG to 0x0

which is all good and correct.

For my tests I have a memory hog process that loops on a memcpy()
of 100 MBytes to generate enough traffic to be totally obvious when
looking at the MBM counters.

Test 1: I used taskset(1) to start a copy on the first CPU of node0
and checked the MBM counters. Both local and remote showed around
25 GB/s on node 0. Killed this process.

Tests 2, 3, 4: Same as 1, but started the process on the first CPU of
nodes 1, 2, 3. Same result. Around 25 GB/s appeared in the MBM counts
for the right node in each case.

Test 5: Started on node0, then periodically used taskset to bind the
running process onto a CPU on another node.  It looks like Linux
migrates the memory for the job shortly after the affinity of the
process is changed. A few seconds after each process migration, the
MBM counters reflect traffic on the node that is the new home of the
process.

-Tony

From 414341db02cd51daaf4a9ea8bd68b22a23cf4b59 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Fri, 23 Jun 2023 08:57:57 -0700
Subject: [PATCH] Fix inverted SNC enable/disable MSR writes. Add some debug
 too.

---
 arch/x86/kernel/cpu/resctrl/core.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 869cfb46e8e4..e66b2b84fe6f 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -546,18 +546,20 @@ static void snc_add_pkg(void)
 {
 	u64	msrval;
 
-	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
-	msrval |= BIT_ULL(0);
-	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
-}
-
-static void snc_remove_pkg(void)
-{
-	u64	msrval;
-
 	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
 	msrval &= ~BIT_ULL(0);
 	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+pr_info("CPU%d: set MSR_RMID_SNC_CONFIG to 0x%llx\n", raw_smp_processor_id(), msrval);
+}
+
+static void snc_remove_pkg(void)
+{
+	u64	msrval;
+
+	rdmsrl(MSR_RMID_SNC_CONFIG, msrval);
+	msrval |= BIT_ULL(0);
+	wrmsrl(MSR_RMID_SNC_CONFIG, msrval);
+pr_info("CPU%d: set MSR_RMID_SNC_CONFIG to 0x%llx\n", raw_smp_processor_id(), msrval);
 }
 
 /*
@@ -987,6 +989,7 @@ static __init int find_snc_ways(void)
 	if (ret > 1)
 		rdt_resources_all[RDT_RESOURCE_PKG].r_resctrl.pkg_actions = true;
 
+pr_info("SNC_ways = %d\n", ret);
 	return ret;
 }
 
-- 
2.40.1

Re: [PATCH v2 7/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.
Posted by Peter Newman 2 years, 7 months ago
Hi Tony,

On Fri, Jun 23, 2023 at 10:20 PM Tony Luck <tony.luck@intel.com> wrote:
> On Fri, Jun 23, 2023 at 05:19:37PM +0200, Peter Newman wrote:
> > On Thu, Jun 22, 2023 at 6:02 PM Luck, Tony <tony.luck@intel.com> wrote:
> > >
> > > > Unfortunately I'm not getting as good of results with the new series.
> > > > The main difference seems to be updating the 0xca0 MSR instead of
> > > > applying the offset to PQR_ASSOC.
> > >
> > > I think I may have reversed the actions to update the MSR in one of
> > > my refactor/rebase. The comment here is correct, but that's not
> > > what the code is doing :-(
> > >
> > > Can you swap the bodies of these two functions and retest?
> >
> > It's a small improvement, but still not great. Still only node 0
> > giving believable results, but at least no more empty results from the
> > second package.
> >
> > I poked around in /proc/kcore and noticed that my snc_ways is still 1, though.

It turns out I just forgot that I had KASLR on. snc_ways was in fact 2.

The real problem was that my test program assumed that the absence of
the snc_ways file meant no SNC, so it was using cache IDs instead of
node IDs when choosing a mon_data subdirectory to read results from.

With that fixed, the results look good:

cpu: 95 (3), rmid: 17 (g16): 0 -> 32313974784 (32313974784)
cpu: 198 (3), rmid: 103 (g102): 0 -> 26078961664 (26078961664)
cpu: 117 (0), rmid: 59 (g58): 0 -> 26692599808 (26692599808)
cpu: 152 (1), rmid: 113 (g112): 0 -> 33368244224 (33368244224)
cpu: 33 (1), rmid: 77 (g76): 0 -> 26902077440 (26902077440)
cpu: 63 (2), rmid: 76 (g75): 0 -> 32478494720 (32478494720)
cpu: 90 (3), rmid: 94 (g93): 0 -> 31206088704 (31206088704)
cpu: 136 (0), rmid: 13 (g12): 0 -> 28095463424 (28095463424)
cpu: 37 (1), rmid: 177 (g175): 0 -> 31255060480 (31255060480)
cpu: 124 (0), rmid: 6 (g5): 0 -> 31128502272 (31128502272)
cpu: 102 (3), rmid: 68 (g67): 0 -> 28848963584 (28848963584)
cpu: 103 (3), rmid: 62 (g61): 0 -> 26091233280 (26091233280)
cpu: 71 (2), rmid: 123 (g122): 0 -> 29157933056 (29157933056)
cpu: 94 (3), rmid: 69 (g68): 0 -> 27776458752 (27776458752)
cpu: 102 (3), rmid: 174 (g172): 0 -> 26349281280 (26349281280)
cpu: 155 (1), rmid: 3 (g2): 0 -> 33489125376 (33489125376)
cpu: 40 (1), rmid: 16 (g15): 0 -> 27142750208 (27142750208)
cpu: 69 (2), rmid: 156 (g154): 0 -> 29294411776 (29294411776)
cpu: 121 (0), rmid: 63 (g62): 0 -> 30636777472 (30636777472)
cpu: 171 (2), rmid: 93 (g92): 0 -> 26103046144 (26103046144)

(and it turns out I never needed to manually look up the node IDs,
because the test output would have already told me, had the test been
working correctly)

Sorry for the extra trouble. The series works well for me.

Tested-By: Peter Newman <peternewman@google.com>

Thanks!
-Peter