First step towards supporting resource control where the scope of
control operations is not the same as monitor operations.
Add an extra list in the rdt_resource structure. For now this will
just duplicate the existing list of domains based on the L3 cache
scope.
Refactor the domain_add_cpu() and domain_remove() functions to
build separate lists for r->alloc_capable and r->mon_capable
resources. Note that only the "L3" domain currently supports
both types.
Change all places where monitoring functions walk the list of
domains to use the new "mondomains" list instead of the old
"domains" list.
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
include/linux/resctrl.h | 10 +-
arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
6 files changed, 167 insertions(+), 74 deletions(-)
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 8334eeacfec5..1267d56f9e76 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -151,9 +151,11 @@ struct resctrl_schema;
* @mon_capable: Is monitor feature available on this machine
* @num_rmid: Number of RMIDs available
* @cache_level: Which cache level defines scope of this resource
+ * @mon_scope: Scope of this resource if different from cache_level
* @cache: Cache allocation related data
* @membw: If the component has bandwidth controls, their properties.
* @domains: All domains for this resource
+ * @mondomains: Monitor domains for this resource
* @name: Name to use in "schemata" file.
* @data_width: Character width of data when displaying
* @default_ctrl: Specifies default cache cbm or memory B/W percent.
@@ -169,9 +171,11 @@ struct rdt_resource {
bool mon_capable;
int num_rmid;
int cache_level;
+ int mon_scope;
struct resctrl_cache cache;
struct resctrl_membw membw;
struct list_head domains;
+ struct list_head mondomains;
char *name;
int data_width;
u32 default_ctrl;
@@ -217,8 +221,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
u32 closid, enum resctrl_conf_type type);
-int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
-void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d);
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d);
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d);
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d);
/**
* resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 85ceaf9a31ac..c5e2ac2a60cf 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -511,7 +511,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+struct rdt_domain *rdt_find_domain(struct list_head *h, int id,
struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 030d3b409768..274605aaa026 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -57,7 +57,7 @@ static void
mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
struct rdt_resource *r);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
+#define domain_init(id, field) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.field)
struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L3] =
@@ -66,7 +66,9 @@ struct rdt_hw_resource rdt_resources_all[] = {
.rid = RDT_RESOURCE_L3,
.name = "L3",
.cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_L3),
+ .mon_scope = 3,
+ .domains = domain_init(RDT_RESOURCE_L3, domains),
+ .mondomains = domain_init(RDT_RESOURCE_L3, mondomains),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -80,7 +82,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.rid = RDT_RESOURCE_L2,
.name = "L2",
.cache_level = 2,
- .domains = domain_init(RDT_RESOURCE_L2),
+ .domains = domain_init(RDT_RESOURCE_L2, domains),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -94,7 +96,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.rid = RDT_RESOURCE_MBA,
.name = "MB",
.cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_MBA),
+ .domains = domain_init(RDT_RESOURCE_MBA, domains),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -106,7 +108,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
.cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_SMBA),
+ .domains = domain_init(RDT_RESOURCE_SMBA, domains),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -384,14 +386,15 @@ void rdt_ctrl_update(void *arg)
}
/*
- * rdt_find_domain - Find a domain in a resource that matches input resource id
+ * rdt_find_domain - Find a domain in one of the lists for a resource that
+ * matches input resource id
*
* Search resource r's domain list to find the resource id. If the resource
* id is found in a domain, return the domain. Otherwise, if requested by
* caller, return the first domain whose id is bigger than the input id.
* The domain list is sorted by id in ascending order.
*/
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+struct rdt_domain *rdt_find_domain(struct list_head *h, int id,
struct list_head **pos)
{
struct rdt_domain *d;
@@ -400,7 +403,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
if (id < 0)
return ERR_PTR(-ENODEV);
- list_for_each(l, &r->domains) {
+ list_for_each(l, h) {
d = list_entry(l, struct rdt_domain, list);
/* When id is found, return its domain. */
if (id == d->id)
@@ -487,6 +490,94 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
return 0;
}
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_domain *hw_dom;
+ struct rdt_domain *d;
+ int err;
+
+ d = rdt_find_domain(&r->domains, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->id = id;
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (domain_setup_ctrlval(r, d)) {
+ domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail(&d->list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del(&d->list);
+ domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_cpu_cacheinfo_id(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_domain *hw_dom;
+ struct rdt_domain *d;
+ int err;
+
+ d = rdt_find_domain(&r->mondomains, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->id = id;
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+
+ if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail(&d->list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del(&d->list);
+ domain_free(hw_dom);
+ }
+}
+
/*
* domain_add_cpu - Add a cpu to a resource's domain list.
*
@@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
*/
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
- struct list_head *add_pos = NULL;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
- int err;
-
- d = rdt_find_domain(r, id, &add_pos);
- if (IS_ERR(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
- return;
- }
-
- if (d) {
- cpumask_set_cpu(cpu, &d->cpu_mask);
- if (r->cache.arch_has_per_cpu_cfg)
- rdt_domain_reconfigure_cdp(r);
- return;
- }
-
- hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
- if (!hw_dom)
- return;
-
- d = &hw_dom->d_resctrl;
- d->id = id;
- cpumask_set_cpu(cpu, &d->cpu_mask);
-
- rdt_domain_reconfigure_cdp(r);
-
- if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- domain_free(hw_dom);
- return;
- }
-
- if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
- domain_free(hw_dom);
- return;
- }
-
- list_add_tail(&d->list, add_pos);
-
- err = resctrl_online_domain(r, d);
- if (err) {
- list_del(&d->list);
- domain_free(hw_dom);
- }
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
}
-static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
{
int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
- d = rdt_find_domain(r, id, NULL);
+ d = rdt_find_domain(&r->domains, id, NULL);
if (IS_ERR_OR_NULL(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
return;
@@ -565,7 +614,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
- resctrl_offline_domain(r, d);
+ resctrl_offline_ctrl_domain(r, d);
list_del(&d->list);
/*
@@ -578,6 +627,30 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
return;
}
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ struct rdt_hw_domain *hw_dom;
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(&r->mondomains, id, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ return;
+ }
+ hw_dom = resctrl_to_arch_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->cpu_mask);
+ if (cpumask_empty(&d->cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del(&d->list);
+
+ domain_free(hw_dom);
+
+ return;
+ }
if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
@@ -592,6 +665,14 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
}
}
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
static void clear_closid_rmid(int cpu)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index b44c487727d4..839df83d1a0a 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -560,7 +560,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
evtid = md.u.evtid;
r = &rdt_resources_all[resid].r_resctrl;
- d = rdt_find_domain(r, domid, NULL);
+ d = rdt_find_domain(&r->mondomains, domid, NULL);
if (IS_ERR_OR_NULL(d)) {
ret = -ENOENT;
goto out;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index ded1fc7cb7cb..66beca785535 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -340,7 +340,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
entry->busy = 0;
cpu = get_cpu();
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->mondomains, list) {
if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
err = resctrl_arch_rmid_read(r, d, entry->rmid,
QOS_L3_OCCUP_EVENT_ID,
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 725344048f85..27753eb5d513 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1496,7 +1496,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mondomains, list) {
if (sep)
seq_puts(s, ";");
@@ -1619,7 +1619,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
return -EINVAL;
}
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->mondomains, list) {
if (d->id == dom_id) {
ret = mbm_config_write_domain(r, d, evtid, val);
if (ret)
@@ -2525,7 +2525,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (is_mbm_enabled()) {
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->domains, list)
+ list_for_each_entry(dom, &r->mondomains, list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
}
@@ -2919,7 +2919,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
struct rdt_domain *dom;
int ret;
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mondomains, list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
return ret;
@@ -3708,15 +3708,17 @@ static void domain_destroy_mon_state(struct rdt_domain *d)
kfree(d->mbm_local);
}
-void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d)
{
lockdep_assert_held(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
+}
- if (!r->mon_capable)
- return;
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
/*
* If resctrl is mounted, remove all the
@@ -3773,18 +3775,22 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
-int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_domain *d)
{
- int err;
-
lockdep_assert_held(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
/* RDT_RESOURCE_MBA is never mon_capable */
return mba_sc_domain_allocate(r, d);
- if (!r->mon_capable)
- return 0;
+ return 0;
+}
+
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+ int err;
+
+ lockdep_assert_held(&rdtgroup_mutex);
err = domain_setup_mon_state(r, d);
if (err)
--
2.40.1
Hi Tony,
On 7/22/2023 12:07 PM, Tony Luck wrote:
> First step towards supporting resource control where the scope of
> control operations is not the same as monitor operations.
Each changelog should stand on its own merit. This changelog
appears to be written as a continuation of the cover letter.
Please do ensure that each patch first establishes the context
before it describes the problem and solution. For example,
as a context this changelog can start by describing what the
resctrl domains list represents.
>
> Add an extra list in the rdt_resource structure. For now this will
> just duplicate the existing list of domains based on the L3 cache
> scope.
The above paragraph does not make this change appealing at all.
> Refactor the domain_add_cpu() and domain_remove() functions to
domain_remove() -> domain_remove_cpu()
> build separate lists for r->alloc_capable and r->mon_capable
> resources. Note that only the "L3" domain currently supports
> both types.
"L3" domain -> "L3" resource?
>
> Change all places where monitoring functions walk the list of
> domains to use the new "mondomains" list instead of the old
> "domains" list.
I would not refer to it as "the old domains list" as it creates
impression that this is being replaced. The changelog makes
no mention that domains list will remain and be dedicated to
control domains. I think this is important to include in description
of this change.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
> include/linux/resctrl.h | 10 +-
> arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
> arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
> arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
> 6 files changed, 167 insertions(+), 74 deletions(-)
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 8334eeacfec5..1267d56f9e76 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -151,9 +151,11 @@ struct resctrl_schema;
> * @mon_capable: Is monitor feature available on this machine
> * @num_rmid: Number of RMIDs available
> * @cache_level: Which cache level defines scope of this resource
> + * @mon_scope: Scope of this resource if different from cache_level
I think this addition should be deferred. As it is here, the "if different
from cache_level" also creates many questions (when will it be different?
how will it be determined that the scope is different in order to know that
mon_scope should be used?)
Looking ahead at how mon_scope is used, there does not seem to be an "if"
involved at all ... mon_scope is always the monitoring scope.
> * @cache: Cache allocation related data
> * @membw: If the component has bandwidth controls, their properties.
> * @domains: All domains for this resource
A change to the domains comment would also help - to highlight that it is
now dedicated to control domains.
> + * @mondomains: Monitor domains for this resource
> * @name: Name to use in "schemata" file.
> * @data_width: Character width of data when displaying
> * @default_ctrl: Specifies default cache cbm or memory B/W percent.
> @@ -169,9 +171,11 @@ struct rdt_resource {
> bool mon_capable;
> int num_rmid;
> int cache_level;
> + int mon_scope;
> struct resctrl_cache cache;
> struct resctrl_membw membw;
> struct list_head domains;
> + struct list_head mondomains;
> char *name;
> int data_width;
> u32 default_ctrl;
...
> @@ -384,14 +386,15 @@ void rdt_ctrl_update(void *arg)
> }
>
> /*
> - * rdt_find_domain - Find a domain in a resource that matches input resource id
> + * rdt_find_domain - Find a domain in one of the lists for a resource that
> + * matches input resource id
> *
This change makes the function more vague. I think the original summary is
still accurate; how the list is used can be described in the details below.
I see more changes to this function are upcoming and I will comment more
at those sites.
> * Search resource r's domain list to find the resource id. If the resource
> * id is found in a domain, return the domain. Otherwise, if requested by
> * caller, return the first domain whose id is bigger than the input id.
> * The domain list is sorted by id in ascending order.
> */
> -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
> +struct rdt_domain *rdt_find_domain(struct list_head *h, int id,
> struct list_head **pos)
> {
> struct rdt_domain *d;
> @@ -400,7 +403,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
> if (id < 0)
> return ERR_PTR(-ENODEV);
>
> - list_for_each(l, &r->domains) {
> + list_for_each(l, h) {
> d = list_entry(l, struct rdt_domain, list);
> /* When id is found, return its domain. */
> if (id == d->id)
> @@ -487,6 +490,94 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> return 0;
> }
>
> +static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
> +{
> + int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> + struct list_head *add_pos = NULL;
> + struct rdt_hw_domain *hw_dom;
> + struct rdt_domain *d;
> + int err;
> +
> + d = rdt_find_domain(&r->domains, id, &add_pos);
> + if (IS_ERR(d)) {
> + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> + return;
> + }
> +
> + if (d) {
> + cpumask_set_cpu(cpu, &d->cpu_mask);
> + if (r->cache.arch_has_per_cpu_cfg)
> + rdt_domain_reconfigure_cdp(r);
> + return;
> + }
> +
> + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> + if (!hw_dom)
> + return;
> +
> + d = &hw_dom->d_resctrl;
> + d->id = id;
> + cpumask_set_cpu(cpu, &d->cpu_mask);
> +
> + rdt_domain_reconfigure_cdp(r);
> +
> + if (domain_setup_ctrlval(r, d)) {
> + domain_free(hw_dom);
> + return;
> + }
> +
> + list_add_tail(&d->list, add_pos);
> +
> + err = resctrl_online_ctrl_domain(r, d);
> + if (err) {
> + list_del(&d->list);
> + domain_free(hw_dom);
> + }
> +}
> +
> +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
> +{
> + int id = get_cpu_cacheinfo_id(cpu, r->mon_scope);
Using a different scope variable but continuing to treat it
as a cache level creates unnecessary confusion at this point.
> + struct list_head *add_pos = NULL;
> + struct rdt_hw_domain *hw_dom;
> + struct rdt_domain *d;
> + int err;
> +
> + d = rdt_find_domain(&r->mondomains, id, &add_pos);
> + if (IS_ERR(d)) {
> + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
Note for future change ... this continues to refer to monitor scope as
a cache id. I did not see this changed in the later patch that actually
changes how scope is used.
> + return;
> + }
> +
> + if (d) {
> + cpumask_set_cpu(cpu, &d->cpu_mask);
> + if (r->cache.arch_has_per_cpu_cfg)
> + rdt_domain_reconfigure_cdp(r);
Copy & paste error?
> + return;
> + }
> +
> + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> + if (!hw_dom)
> + return;
> +
> + d = &hw_dom->d_resctrl;
> + d->id = id;
> + cpumask_set_cpu(cpu, &d->cpu_mask);
> +
> + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> + domain_free(hw_dom);
> + return;
> + }
> +
> + list_add_tail(&d->list, add_pos);
> +
> + err = resctrl_online_mon_domain(r, d);
> + if (err) {
> + list_del(&d->list);
> + domain_free(hw_dom);
> + }
> +}
> +
> /*
> * domain_add_cpu - Add a cpu to a resource's domain list.
> *
> @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> */
> static void domain_add_cpu(int cpu, struct rdt_resource *r)
> {
> - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> - struct list_head *add_pos = NULL;
> - struct rdt_hw_domain *hw_dom;
> - struct rdt_domain *d;
> - int err;
> -
> - d = rdt_find_domain(r, id, &add_pos);
> - if (IS_ERR(d)) {
> - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> - return;
> - }
> -
> - if (d) {
> - cpumask_set_cpu(cpu, &d->cpu_mask);
> - if (r->cache.arch_has_per_cpu_cfg)
> - rdt_domain_reconfigure_cdp(r);
> - return;
> - }
> -
> - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> - if (!hw_dom)
> - return;
> -
> - d = &hw_dom->d_resctrl;
> - d->id = id;
> - cpumask_set_cpu(cpu, &d->cpu_mask);
> -
> - rdt_domain_reconfigure_cdp(r);
> -
> - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> - domain_free(hw_dom);
> - return;
> - }
> -
> - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> - domain_free(hw_dom);
> - return;
> - }
> -
> - list_add_tail(&d->list, add_pos);
> -
> - err = resctrl_online_domain(r, d);
> - if (err) {
> - list_del(&d->list);
> - domain_free(hw_dom);
> - }
> + if (r->alloc_capable)
> + domain_add_cpu_ctrl(cpu, r);
> + if (r->mon_capable)
> + domain_add_cpu_mon(cpu, r);
> }
A resource could be both alloc and mon capable ... both
domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
Should domain_add_cpu_mon() still be run for a CPU if
domain_add_cpu_ctrl() failed?
Looking ahead the CPU should probably also not be added
to the default groups mask if a failure occurred.
> -static void domain_remove_cpu(int cpu, struct rdt_resource *r)
> +static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
> {
> int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> struct rdt_hw_domain *hw_dom;
> struct rdt_domain *d;
>
> - d = rdt_find_domain(r, id, NULL);
> + d = rdt_find_domain(&r->domains, id, NULL);
> if (IS_ERR_OR_NULL(d)) {
> pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> return;
> @@ -565,7 +614,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
>
> cpumask_clear_cpu(cpu, &d->cpu_mask);
> if (cpumask_empty(&d->cpu_mask)) {
> - resctrl_offline_domain(r, d);
> + resctrl_offline_ctrl_domain(r, d);
> list_del(&d->list);
>
> /*
> @@ -578,6 +627,30 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
>
> return;
> }
> +}
> +
> +static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
> +{
> + int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
Introducing mon_scope can really be deferred ... here the monitoring code
is not using mon_scope anyway.
> + struct rdt_hw_domain *hw_dom;
> + struct rdt_domain *d;
> +
> + d = rdt_find_domain(&r->mondomains, id, NULL);
> + if (IS_ERR_OR_NULL(d)) {
> + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> + return;
> + }
> + hw_dom = resctrl_to_arch_dom(d);
> +
> + cpumask_clear_cpu(cpu, &d->cpu_mask);
> + if (cpumask_empty(&d->cpu_mask)) {
> + resctrl_offline_mon_domain(r, d);
> + list_del(&d->list);
> +
> + domain_free(hw_dom);
> +
> + return;
> + }
>
> if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
> if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
Reinette
On Fri, Aug 11, 2023 at 10:29:25AM -0700, Reinette Chatre wrote:
> Hi Tony,
>
> On 7/22/2023 12:07 PM, Tony Luck wrote:
> > First step towards supporting resource control where the scope of
> > control operations is not the same as monitor operations.
>
> Each changelog should stand on its own merit. This changelog
> appears to be written as a continuation of the cover letter.
>
> Please do ensure that each patch first establishes the context
> before it describes the problem and solution. For example,
> as a context this changelog can start by describing what the
> resctrl domains list represents.
>
> >
> > Add an extra list in the rdt_resource structure. For now this will
> > just duplicate the existing list of domains based on the L3 cache
> > scope.
>
> The above paragraph does not make this change appealing at all.
>
> > Refactor the domain_add_cpu() and domain_remove() functions to
>
> domain_remove() -> domain_remove_cpu()
>
> > build separate lists for r->alloc_capable and r->mon_capable
> > resources. Note that only the "L3" domain currently supports
> > both types.
>
> "L3" domain -> "L3" resource?
>
> >
> > Change all places where monitoring functions walk the list of
> > domains to use the new "mondomains" list instead of the old
> > "domains" list.
>
> I would not refer to it as "the old domains list" as it creates
> impression that this is being replaced. The changelog makes
> no mention that domains list will remain and be dedicated to
> control domains. I think this is important to include in description
> of this change.
I've rewritten the entire commit message incorporating your suggestions.
V6 will be posted soon (after I get some time on an SNC SPR to check
that it all works!)
>
> >
> > Signed-off-by: Tony Luck <tony.luck@intel.com>
> > ---
> > include/linux/resctrl.h | 10 +-
> > arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
> > arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
> > arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
> > arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
> > arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
> > 6 files changed, 167 insertions(+), 74 deletions(-)
> >
> > diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> > index 8334eeacfec5..1267d56f9e76 100644
> > --- a/include/linux/resctrl.h
> > +++ b/include/linux/resctrl.h
> > @@ -151,9 +151,11 @@ struct resctrl_schema;
> > * @mon_capable: Is monitor feature available on this machine
> > * @num_rmid: Number of RMIDs available
> > * @cache_level: Which cache level defines scope of this resource
> > + * @mon_scope: Scope of this resource if different from cache_level
>
> I think this addition should be deferred. As it is here, the "if different
> from cache_level" also creates many questions (when will it be different?
> how will it be determined that the scope is different in order to know that
> mon_scope should be used?)
I've gone in a different direction. V6 renames "cache_level" to
"ctrl_scope". I think this makes intent clear from step #1.
>
> Looking ahead at how mon_scope is used, there does not seem to be an "if"
> involved at all ... mon_scope is always the monitoring scope.
Dropped the "if different" comment. You are correct that this is
unconditionally the monitor scope.
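The kerneldoc line now reads something like this (exact wording may
still change before v5 goes out):

 * @mon_scope:		Scope of this resource's monitor domains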
>
> > * @cache: Cache allocation related data
> > * @membw: If the component has bandwidth controls, their properties.
> > * @domains: All domains for this resource
>
> A change to the domains comment would also help - to highlight that it is
> now dedicated to control domains.
Done.
>
> > + * @mondomains: Monitor domains for this resource
> > * @name: Name to use in "schemata" file.
> > * @data_width: Character width of data when displaying
> > * @default_ctrl: Specifies default cache cbm or memory B/W percent.
> > @@ -169,9 +171,11 @@ struct rdt_resource {
> > bool mon_capable;
> > int num_rmid;
> > int cache_level;
> > + int mon_scope;
> > struct resctrl_cache cache;
> > struct resctrl_membw membw;
> > struct list_head domains;
> > + struct list_head mondomains;
> > char *name;
> > int data_width;
> > u32 default_ctrl;
>
> ...
>
> > @@ -384,14 +386,15 @@ void rdt_ctrl_update(void *arg)
> > }
> >
> > /*
> > - * rdt_find_domain - Find a domain in a resource that matches input resource id
> > + * rdt_find_domain - Find a domain in one of the lists for a resource that
> > + * matches input resource id
> > *
>
> This change makes the function more vague. I think the original summary is
> still accurate; how the list is used can be described in the details below.
> I see more changes to this function are upcoming and I will comment more
> at those sites.
Re-worked this based on your other suggestions to have separate find*
functions for the ctrl and mon cases calling a common __rdt_find_domain().
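Roughly like this (just a sketch of the v5 shape; the wrapper names are
my current working choices, and the body is the same walk that
rdt_find_domain() does today):

	static struct rdt_domain *__rdt_find_domain(struct list_head *h, int id,
						    struct list_head **pos)
	{
		struct rdt_domain *d;
		struct list_head *l;

		if (id < 0)
			return ERR_PTR(-ENODEV);

		list_for_each(l, h) {
			d = list_entry(l, struct rdt_domain, list);
			/* When id is found, return its domain. */
			if (id == d->id)
				return d;
			/* Stop searching when finding id's position in sorted list. */
			if (id < d->id)
				break;
		}

		if (pos)
			*pos = l;

		return NULL;
	}

	struct rdt_domain *rdt_find_ctrl_domain(struct rdt_resource *r, int id,
						struct list_head **pos)
	{
		return __rdt_find_domain(&r->domains, id, pos);
	}

	struct rdt_domain *rdt_find_mon_domain(struct rdt_resource *r, int id,
					       struct list_head **pos)
	{
		return __rdt_find_domain(&r->mondomains, id, pos);
	}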
>
> > * Search resource r's domain list to find the resource id. If the resource
> > * id is found in a domain, return the domain. Otherwise, if requested by
> > * caller, return the first domain whose id is bigger than the input id.
> > * The domain list is sorted by id in ascending order.
> > */
> > -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
> > +struct rdt_domain *rdt_find_domain(struct list_head *h, int id,
> > struct list_head **pos)
> > {
> > struct rdt_domain *d;
> > @@ -400,7 +403,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
> > if (id < 0)
> > return ERR_PTR(-ENODEV);
> >
> > - list_for_each(l, &r->domains) {
> > + list_for_each(l, h) {
> > d = list_entry(l, struct rdt_domain, list);
> > /* When id is found, return its domain. */
> > if (id == d->id)
> > @@ -487,6 +490,94 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> > return 0;
> > }
> >
> > +static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
> > +{
> > + int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> > + struct list_head *add_pos = NULL;
> > + struct rdt_hw_domain *hw_dom;
> > + struct rdt_domain *d;
> > + int err;
> > +
> > + d = rdt_find_domain(&r->domains, id, &add_pos);
> > + if (IS_ERR(d)) {
> > + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> > + return;
> > + }
> > +
> > + if (d) {
> > + cpumask_set_cpu(cpu, &d->cpu_mask);
> > + if (r->cache.arch_has_per_cpu_cfg)
> > + rdt_domain_reconfigure_cdp(r);
> > + return;
> > + }
> > +
> > + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> > + if (!hw_dom)
> > + return;
> > +
> > + d = &hw_dom->d_resctrl;
> > + d->id = id;
> > + cpumask_set_cpu(cpu, &d->cpu_mask);
> > +
> > + rdt_domain_reconfigure_cdp(r);
> > +
> > + if (domain_setup_ctrlval(r, d)) {
> > + domain_free(hw_dom);
> > + return;
> > + }
> > +
> > + list_add_tail(&d->list, add_pos);
> > +
> > + err = resctrl_online_ctrl_domain(r, d);
> > + if (err) {
> > + list_del(&d->list);
> > + domain_free(hw_dom);
> > + }
> > +}
> > +
> > +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
> > +{
> > + int id = get_cpu_cacheinfo_id(cpu, r->mon_scope);
>
> Using a different scope variable but continuing to treat it
> as a cache level creates unnecessary confusion at this point.
>
> > + struct list_head *add_pos = NULL;
> > + struct rdt_hw_domain *hw_dom;
> > + struct rdt_domain *d;
> > + int err;
> > +
> > + d = rdt_find_domain(&r->mondomains, id, &add_pos);
> > + if (IS_ERR(d)) {
> > + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
>
> Note for future change ... this continues to refer to monitor scope as
> a cache id. I did not see this changed in the later patch that actually
> changes how scope is used.
Changed all these messages to refer to scope instead of cache id.
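For example (the new wording here is illustrative, not final):

	pr_warn("Couldn't find scope id=%d for CPU %d\n", id, cpu);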
>
> > + return;
> > + }
> > +
> > + if (d) {
> > + cpumask_set_cpu(cpu, &d->cpu_mask);
> > + if (r->cache.arch_has_per_cpu_cfg)
> > + rdt_domain_reconfigure_cdp(r);
>
> Copy & paste error?
Indeed yes. This code isn't appropriate for the monitor case.
Deleted.
>
> > + return;
> > + }
> > +
> > + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> > + if (!hw_dom)
> > + return;
> > +
> > + d = &hw_dom->d_resctrl;
> > + d->id = id;
> > + cpumask_set_cpu(cpu, &d->cpu_mask);
> > +
> > + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> > + domain_free(hw_dom);
> > + return;
> > + }
> > +
> > + list_add_tail(&d->list, add_pos);
> > +
> > + err = resctrl_online_mon_domain(r, d);
> > + if (err) {
> > + list_del(&d->list);
> > + domain_free(hw_dom);
> > + }
> > +}
> > +
> > /*
> > * domain_add_cpu - Add a cpu to a resource's domain list.
> > *
> > @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> > */
> > static void domain_add_cpu(int cpu, struct rdt_resource *r)
> > {
> > - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> > - struct list_head *add_pos = NULL;
> > - struct rdt_hw_domain *hw_dom;
> > - struct rdt_domain *d;
> > - int err;
> > -
> > - d = rdt_find_domain(r, id, &add_pos);
> > - if (IS_ERR(d)) {
> > - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> > - return;
> > - }
> > -
> > - if (d) {
> > - cpumask_set_cpu(cpu, &d->cpu_mask);
> > - if (r->cache.arch_has_per_cpu_cfg)
> > - rdt_domain_reconfigure_cdp(r);
> > - return;
> > - }
> > -
> > - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> > - if (!hw_dom)
> > - return;
> > -
> > - d = &hw_dom->d_resctrl;
> > - d->id = id;
> > - cpumask_set_cpu(cpu, &d->cpu_mask);
> > -
> > - rdt_domain_reconfigure_cdp(r);
> > -
> > - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> > - domain_free(hw_dom);
> > - return;
> > - }
> > -
> > - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> > - domain_free(hw_dom);
> > - return;
> > - }
> > -
> > - list_add_tail(&d->list, add_pos);
> > -
> > - err = resctrl_online_domain(r, d);
> > - if (err) {
> > - list_del(&d->list);
> > - domain_free(hw_dom);
> > - }
> > + if (r->alloc_capable)
> > + domain_add_cpu_ctrl(cpu, r);
> > + if (r->mon_capable)
> > + domain_add_cpu_mon(cpu, r);
> > }
>
> A resource could be both alloc and mon capable ... both
> domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
> Should domain_add_cpu_mon() still be run for a CPU if
> domain_add_cpu_ctrl() failed?
>
> Looking ahead the CPU should probably also not be added
> to the default groups mask if a failure occurred.
Existing code doesn't do anything for the case where a CPU
can't be added to a domain (probably the only real error case
is failure to allocate memory for the domain structure).
Maybe something to tackle in a future series if anyone
thinks this is a serious problem and has suggestions on
what to do. It seems like a catastrophic problem to not
have some CPUs in some/all domains of some resources.
Maybe this should disable mounting the resctrl filesystem
completely?
>
> > -static void domain_remove_cpu(int cpu, struct rdt_resource *r)
> > +static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
> > {
> > int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> > struct rdt_hw_domain *hw_dom;
> > struct rdt_domain *d;
> >
> > - d = rdt_find_domain(r, id, NULL);
> > + d = rdt_find_domain(&r->domains, id, NULL);
> > if (IS_ERR_OR_NULL(d)) {
> > pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> > return;
> > @@ -565,7 +614,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
> >
> > cpumask_clear_cpu(cpu, &d->cpu_mask);
> > if (cpumask_empty(&d->cpu_mask)) {
> > - resctrl_offline_domain(r, d);
> > + resctrl_offline_ctrl_domain(r, d);
> > list_del(&d->list);
> >
> > /*
> > @@ -578,6 +627,30 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
> >
> > return;
> > }
> > +}
> > +
> > +static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
> > +{
> > + int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
>
> Introducing mon_scope can really be deferred ... here the monitoring code
> is not using mon_scope anyway.
Not deferring. But I did fix this to use r->mon_scope. Good catch.
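I.e. domain_remove_cpu_mon() now starts with the same lookup that
domain_add_cpu_mon() already uses:

	int id = get_cpu_cacheinfo_id(cpu, r->mon_scope);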
>
> > + struct rdt_hw_domain *hw_dom;
> > + struct rdt_domain *d;
> > +
> > + d = rdt_find_domain(&r->mondomains, id, NULL);
> > + if (IS_ERR_OR_NULL(d)) {
> > + pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> > + return;
> > + }
> > + hw_dom = resctrl_to_arch_dom(d);
> > +
> > + cpumask_clear_cpu(cpu, &d->cpu_mask);
> > + if (cpumask_empty(&d->cpu_mask)) {
> > + resctrl_offline_mon_domain(r, d);
> > + list_del(&d->list);
> > +
> > + domain_free(hw_dom);
> > +
> > + return;
> > + }
> >
> > if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
> > if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
>
> Reinette
-Tony
Hi Tony,
On 8/25/2023 10:05 AM, Tony Luck wrote:
> On Fri, Aug 11, 2023 at 10:29:25AM -0700, Reinette Chatre wrote:
>> On 7/22/2023 12:07 PM, Tony Luck wrote:
>>> Change all places where monitoring functions walk the list of
>>> domains to use the new "mondomains" list instead of the old
>>> "domains" list.
>>
>> I would not refer to it as "the old domains list" as it creates
>> impression that this is being replaced. The changelog makes
>> no mention that domains list will remain and be dedicated to
>> control domains. I think this is important to include in description
>> of this change.
>
> I've rewritten the entire commit message incorporating your suggestions.
> V6 will be posted soon (after I get some time on an SNC SPR to check
> that it all works!)
I seem to have missed v5.
>
>>
>>>
>>> Signed-off-by: Tony Luck <tony.luck@intel.com>
>>> ---
>>> include/linux/resctrl.h | 10 +-
>>> arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
>>> arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
>>> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
>>> arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
>>> 6 files changed, 167 insertions(+), 74 deletions(-)
>>>
>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>> index 8334eeacfec5..1267d56f9e76 100644
>>> --- a/include/linux/resctrl.h
>>> +++ b/include/linux/resctrl.h
>>> @@ -151,9 +151,11 @@ struct resctrl_schema;
>>> * @mon_capable: Is monitor feature available on this machine
>>> * @num_rmid: Number of RMIDs available
>>> * @cache_level: Which cache level defines scope of this resource
>>> + * @mon_scope: Scope of this resource if different from cache_level
>>
>> I think this addition should be deferred. As it is here, the "if different
>> from cache_level" also creates many questions (when will it be different?
>> how will it be determined that the scope is different in order to know that
>> mon_scope should be used?)
>
> I've gone in a different direction. V6 renames "cache_level" to
> "ctrl_scope". I think this makes intent clear from step #1.
>
This change is not clear to me. Previously you changed this name
but kept using it in code specific to cache levels. It is not clear
to me how this time's rename would be different.
...
>>> @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
>>> */
>>> static void domain_add_cpu(int cpu, struct rdt_resource *r)
>>> {
>>> - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
>>> - struct list_head *add_pos = NULL;
>>> - struct rdt_hw_domain *hw_dom;
>>> - struct rdt_domain *d;
>>> - int err;
>>> -
>>> - d = rdt_find_domain(r, id, &add_pos);
>>> - if (IS_ERR(d)) {
>>> - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
>>> - return;
>>> - }
>>> -
>>> - if (d) {
>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
>>> - if (r->cache.arch_has_per_cpu_cfg)
>>> - rdt_domain_reconfigure_cdp(r);
>>> - return;
>>> - }
>>> -
>>> - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
>>> - if (!hw_dom)
>>> - return;
>>> -
>>> - d = &hw_dom->d_resctrl;
>>> - d->id = id;
>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
>>> -
>>> - rdt_domain_reconfigure_cdp(r);
>>> -
>>> - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
>>> - domain_free(hw_dom);
>>> - return;
>>> - }
>>> -
>>> - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
>>> - domain_free(hw_dom);
>>> - return;
>>> - }
>>> -
>>> - list_add_tail(&d->list, add_pos);
>>> -
>>> - err = resctrl_online_domain(r, d);
>>> - if (err) {
>>> - list_del(&d->list);
>>> - domain_free(hw_dom);
>>> - }
>>> + if (r->alloc_capable)
>>> + domain_add_cpu_ctrl(cpu, r);
>>> + if (r->mon_capable)
>>> + domain_add_cpu_mon(cpu, r);
>>> }
>>
>> A resource could be both alloc and mon capable ... both
>> domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
>> Should domain_add_cpu_mon() still be run for a CPU if
>> domain_add_cpu_ctrl() failed?
>>
>> Looking ahead the CPU should probably also not be added
>> to the default groups mask if a failure occurred.
>
> Existing code doesn't do anything for the case where a CPU
> can't be added to a domain (probably the only real error case
> is failure to allocate memory for the domain structure).
Is my statement about CPU being added to default group mask
incorrect? Seems like a potential issue related to domain's
CPU mask also.
Please see my earlier question. Existing code does not proceed
with monitor initialization if control initialization fails and
undoes control initialization if monitor initialization fails.
> May be something to tackle in a future series if anyone
> thinks this is a serious problem and has suggestions on
> what to do. It seems like a catastrophic problem to not
> have some CPUs in some/all domains of some resources.
> Maybe this should disable mounting resctrl filesystem
> completely?
It is not clear to me how this is catastrophic but I
do not think resctrl should claim support for a resource
on a CPU if it was not able to complete initialization.
Reinette
On Mon, Aug 28, 2023 at 10:05:31AM -0700, Reinette Chatre wrote:
> Hi Tony,
>
> On 8/25/2023 10:05 AM, Tony Luck wrote:
> > On Fri, Aug 11, 2023 at 10:29:25AM -0700, Reinette Chatre wrote:
> >> On 7/22/2023 12:07 PM, Tony Luck wrote:
>
> >>> Change all places where monitoring functions walk the list of
> >>> domains to use the new "mondomains" list instead of the old
> >>> "domains" list.
> >>
> >> I would not refer to it as "the old domains list" as it creates
> >> impression that this is being replaced. The changelog makes
> >> no mention that domains list will remain and be dedicated to
> >> control domains. I think this is important to include in description
> >> of this change.
> >
> > I've rewritten the entire commit message incorporating your suggestions.
> > V6 will be posted soon (after I get some time on an SNC SPR to check
> > that it all works!)
>
> I seem to have missed v5.
I simply can't count. You are correct that the next version to be posted
will be v5.
> >
> >>
> >>>
> >>> Signed-off-by: Tony Luck <tony.luck@intel.com>
> >>> ---
> >>> include/linux/resctrl.h | 10 +-
> >>> arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
> >>> arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
> >>> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
> >>> arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
> >>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
> >>> 6 files changed, 167 insertions(+), 74 deletions(-)
> >>>
> >>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> >>> index 8334eeacfec5..1267d56f9e76 100644
> >>> --- a/include/linux/resctrl.h
> >>> +++ b/include/linux/resctrl.h
> >>> @@ -151,9 +151,11 @@ struct resctrl_schema;
> >>> * @mon_capable: Is monitor feature available on this machine
> >>> * @num_rmid: Number of RMIDs available
> >>> * @cache_level: Which cache level defines scope of this resource
> >>> + * @mon_scope: Scope of this resource if different from cache_level
> >>
> >> I think this addition should be deferred. As it is here, the "if different
> >> from cache_level" also creates many questions (when will it be different?
> >> how will it be determined that the scope is different in order to know that
> >> mon_scope should be used?)
> >
> > I've gone in a different direction. V6 renames "cache_level" to
> > "ctrl_scope". I think this makes intent clear from step #1.
> >
>
> This change is not clear to me. Previously you changed this name
> but kept using it in code specific to cache levels. It is not clear
> to me how this time's rename would be different.
The current "cache_level" field in the structure is describing
the scope of each instance using the cache level (2 or 3) as the
method to describe which CPUs are considered part of a group. Currently
the scope is the same for both control and monitor resources.
Would you like to see patches in this progression:
1) Rename "cache_level" to "scope". With commit comment that future
patches are going to base the scope on NUMA nodes in addition to sharing
caches at particular levels, and will split into separate control and
monitor scope.
2) Split the "scope" field from first patch into "ctrl_scope" and
"mon_scope" (also with the addition of the new list for the mon_scope).
3) Add "node" as a new option for scope in addition to L3 and L2 cache
(a sketch of this step is below).
> ...
>
> >>> @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> >>> */
> >>> static void domain_add_cpu(int cpu, struct rdt_resource *r)
> >>> {
> >>> - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> >>> - struct list_head *add_pos = NULL;
> >>> - struct rdt_hw_domain *hw_dom;
> >>> - struct rdt_domain *d;
> >>> - int err;
> >>> -
> >>> - d = rdt_find_domain(r, id, &add_pos);
> >>> - if (IS_ERR(d)) {
> >>> - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> >>> - return;
> >>> - }
> >>> -
> >>> - if (d) {
> >>> - cpumask_set_cpu(cpu, &d->cpu_mask);
> >>> - if (r->cache.arch_has_per_cpu_cfg)
> >>> - rdt_domain_reconfigure_cdp(r);
> >>> - return;
> >>> - }
> >>> -
> >>> - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> >>> - if (!hw_dom)
> >>> - return;
> >>> -
> >>> - d = &hw_dom->d_resctrl;
> >>> - d->id = id;
> >>> - cpumask_set_cpu(cpu, &d->cpu_mask);
> >>> -
> >>> - rdt_domain_reconfigure_cdp(r);
> >>> -
> >>> - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> >>> - domain_free(hw_dom);
> >>> - return;
> >>> - }
> >>> -
> >>> - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> >>> - domain_free(hw_dom);
> >>> - return;
> >>> - }
> >>> -
> >>> - list_add_tail(&d->list, add_pos);
> >>> -
> >>> - err = resctrl_online_domain(r, d);
> >>> - if (err) {
> >>> - list_del(&d->list);
> >>> - domain_free(hw_dom);
> >>> - }
> >>> + if (r->alloc_capable)
> >>> + domain_add_cpu_ctrl(cpu, r);
> >>> + if (r->mon_capable)
> >>> + domain_add_cpu_mon(cpu, r);
> >>> }
> >>
> >> A resource could be both alloc and mon capable ... both
> >> domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
> >> Should domain_add_cpu_mon() still be run for a CPU if
> >> domain_add_cpu_ctrl() failed?
> >>
> >> Looking ahead the CPU should probably also not be added
> >> to the default groups mask if a failure occurred.
> >
> > Existing code doesn't do anything for the case where a CPU
> > can't be added to a domain (probably the only real error case
> > is failure to allocate memory for the domain structure).
>
> Is my statement about CPU being added to default group mask
> incorrect? Seems like a potential issue related to domain's
> CPU mask also.
>
> Please see my earlier question. Existing code does not proceed
> with monitor initialization if control initialization fails and
> undoes control initialization if monitor initialization fails.
Existing code silently continues if a domain structure cannot
be allocated to add a CPU to a domain:
503 static void domain_add_cpu(int cpu, struct rdt_resource *r)
504 {
505 int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
506 struct list_head *add_pos = NULL;
507 struct rdt_hw_domain *hw_dom;
508 struct rdt_domain *d;
509 int err;
...
523
524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
525 if (!hw_dom)
526 return;
527
>
> > May be something to tackle in a future series if anyone
> > thinks this is a serious problem and has suggestions on
> > what to do. It seems like a catastrophic problem to not
> > have some CPUs in some/all domains of some resources.
> > Maybe this should disable mounting resctrl filesystem
> > completely?
>
> It is not clear to me how this is catastrophic but I
> do not think resctrl should claim support for a resource
> on a CPU if it was not able to complete initialization.
That's the status quo. See above code snippet.
-Tony
Hi Tony,
On 8/28/2023 11:46 AM, Tony Luck wrote:
> On Mon, Aug 28, 2023 at 10:05:31AM -0700, Reinette Chatre wrote:
>> Hi Tony,
>>
>> On 8/25/2023 10:05 AM, Tony Luck wrote:
>>> On Fri, Aug 11, 2023 at 10:29:25AM -0700, Reinette Chatre wrote:
>>>> On 7/22/2023 12:07 PM, Tony Luck wrote:
>>
>>>>> Change all places where monitoring functions walk the list of
>>>>> domains to use the new "mondomains" list instead of the old
>>>>> "domains" list.
>>>>
>>>> I would not refer to it as "the old domains list" as it creates
>>>> impression that this is being replaced. The changelog makes
>>>> no mention that domains list will remain and be dedicated to
>>>> control domains. I think this is important to include in description
>>>> of this change.
>>>
>>> I've rewritten the entire commit message incorporating your suggestions.
>>> V6 will be posted soon (after I get some time on an SNC SPR to check
>>> that it all works!)
>>
>> I seem to have missed v5.
>
> I simply can't count. You are correct that next version to be posted
> will be v5.
>
>>>
>>>>
>>>>>
>>>>> Signed-off-by: Tony Luck <tony.luck@intel.com>
>>>>> ---
>>>>> include/linux/resctrl.h | 10 +-
>>>>> arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
>>>>> arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
>>>>> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
>>>>> 6 files changed, 167 insertions(+), 74 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>> index 8334eeacfec5..1267d56f9e76 100644
>>>>> --- a/include/linux/resctrl.h
>>>>> +++ b/include/linux/resctrl.h
>>>>> @@ -151,9 +151,11 @@ struct resctrl_schema;
>>>>> * @mon_capable: Is monitor feature available on this machine
>>>>> * @num_rmid: Number of RMIDs available
>>>>> * @cache_level: Which cache level defines scope of this resource
>>>>> + * @mon_scope: Scope of this resource if different from cache_level
>>>>
>>>> I think this addition should be deferred. As it is here, the "if different
>>>> from cache_level" also creates many questions (when will it be different?
>>>> how will it be determined that the scope is different in order to know that
>>>> mon_scope should be used?)
>>>
>>> I've gone in a different direction. V6 renames "cache_level" to
>>> "ctrl_scope". I think this makes intent clear from step #1.
>>>
>>
>> This change is not clear to me. Previously you changed this name
>> but kept using it in code specific to cache levels. It is not clear
>> to me how this time's rename would be different.
>
> The current "cache_level" field in the structure is describing
> the scope of each instance using the cache level (2 or 3) as the
> method to describe which CPUs are considered part of a group. Currently
> the scope is the same for both control and monitor resources.
Right.
>
> Would you like to see patches in this progression:
>
> 1) Rename "cache_level" to "scope". With commit comment that future
> patches are going to base the scope on NUMA nodes in addition to sharing
> caches at particular levels, and will split into separate control and
> monitor scope.
>
> 2) Split the "scope" field from first patch into "ctrl_scope" and
> "mon_scope" (also with the addition of the new list for the mon_scope).
>
> 3) Add "node" as a new option for scope in addtion to L3 and L2 cache.
>
hmmm - my comment cannot be addressed through patch re-ordering.
If I understand correctly you plan to change the name of "cache_level"
to "ctrl_scope". My comment is that this obfuscates the code as long as
you use this variable to compare against data that can only represent cache
levels. This just repeats what I wrote in
https://lore.kernel.org/lkml/09847c37-66d7-c286-a313-308eaa338c64@intel.com/
>> ...
>>
>>>>> @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
>>>>> */
>>>>> static void domain_add_cpu(int cpu, struct rdt_resource *r)
>>>>> {
>>>>> - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
>>>>> - struct list_head *add_pos = NULL;
>>>>> - struct rdt_hw_domain *hw_dom;
>>>>> - struct rdt_domain *d;
>>>>> - int err;
>>>>> -
>>>>> - d = rdt_find_domain(r, id, &add_pos);
>>>>> - if (IS_ERR(d)) {
>>>>> - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
>>>>> - return;
>>>>> - }
>>>>> -
>>>>> - if (d) {
>>>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
>>>>> - if (r->cache.arch_has_per_cpu_cfg)
>>>>> - rdt_domain_reconfigure_cdp(r);
>>>>> - return;
>>>>> - }
>>>>> -
>>>>> - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
>>>>> - if (!hw_dom)
>>>>> - return;
>>>>> -
>>>>> - d = &hw_dom->d_resctrl;
>>>>> - d->id = id;
>>>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
>>>>> -
>>>>> - rdt_domain_reconfigure_cdp(r);
>>>>> -
>>>>> - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
>>>>> - domain_free(hw_dom);
>>>>> - return;
>>>>> - }
>>>>> -
>>>>> - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
>>>>> - domain_free(hw_dom);
>>>>> - return;
>>>>> - }
>>>>> -
>>>>> - list_add_tail(&d->list, add_pos);
>>>>> -
>>>>> - err = resctrl_online_domain(r, d);
>>>>> - if (err) {
>>>>> - list_del(&d->list);
>>>>> - domain_free(hw_dom);
>>>>> - }
>>>>> + if (r->alloc_capable)
>>>>> + domain_add_cpu_ctrl(cpu, r);
>>>>> + if (r->mon_capable)
>>>>> + domain_add_cpu_mon(cpu, r);
>>>>> }
>>>>
>>>> A resource could be both alloc and mon capable ... both
>>>> domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
>>>> Should domain_add_cpu_mon() still be run for a CPU if
>>>> domain_add_cpu_ctrl() failed?
>>>>
>>>> Looking ahead the CPU should probably also not be added
>>>> to the default groups mask if a failure occurred.
>>>
>>> Existing code doesn't do anything for the case where a CPU
>>> can't be added to a domain (probably the only real error case
>>> is failure to allocate memory for the domain structure).
>>
>> Is my statement about CPU being added to default group mask
>> incorrect? Seems like a potential issue related to domain's
>> CPU mask also.
>>
>> Please see my earlier question. Existing code does not proceed
>> with monitor initialization if control initialization fails and
>> undoes control initialization if monitor initialization fails.
>
> Existing code silently continues if a domain structure cannot
> be allocated to add a CPU to a domain:
>
> 503 static void domain_add_cpu(int cpu, struct rdt_resource *r)
> 504 {
> 505 int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> 506 struct list_head *add_pos = NULL;
> 507 struct rdt_hw_domain *hw_dom;
> 508 struct rdt_domain *d;
> 509 int err;
>
> ...
>
> 523
> 524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> 525 if (!hw_dom)
> 526 return;
> 527
>
Right ... and if it returns silently as above it runs:
static int resctrl_online_cpu(unsigned int cpu)
{
	for_each_capable_rdt_resource(r)
		domain_add_cpu(cpu, r);
>>>>>	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);	<<<<<<<<
}
Also, note within domain_add_cpu():
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
	...
	if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
		domain_free(hw_dom);
		return;
	}

	if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
		domain_free(hw_dom);
		return;
	}
	...
}
The above is the other item that I've been trying to discuss
with you. Note that existing resctrl will not initialize monitoring if
control could not be initialized.
Compare with this submission:
	if (r->alloc_capable)
		domain_add_cpu_ctrl(cpu, r);
	if (r->mon_capable)
		domain_add_cpu_mon(cpu, r);
>>
>>> May be something to tackle in a future series if anyone
>>> thinks this is a serious problem and has suggestions on
>>> what to do. It seems like a catastrophic problem to not
>>> have some CPUs in some/all domains of some resources.
>>> Maybe this should disable mounting resctrl filesystem
>>> completely?
>>
>> It is not clear to me how this is catastrophic but I
>> do not think resctrl should claim support for a resource
>> on a CPU if it was not able to complete initialization
>
> That's the status quo. See above code snippet.
Could you please elaborate what you mean by status quo?
Reinette
On Mon, Aug 28, 2023 at 12:56:27PM -0700, Reinette Chatre wrote:
> Hi Tony,
>
> On 8/28/2023 11:46 AM, Tony Luck wrote:
> > On Mon, Aug 28, 2023 at 10:05:31AM -0700, Reinette Chatre wrote:
> >> Hi Tony,
> >>
> >> On 8/25/2023 10:05 AM, Tony Luck wrote:
> >>> On Fri, Aug 11, 2023 at 10:29:25AM -0700, Reinette Chatre wrote:
> >>>> On 7/22/2023 12:07 PM, Tony Luck wrote:
> >>
> >>>>> Change all places where monitoring functions walk the list of
> >>>>> domains to use the new "mondomains" list instead of the old
> >>>>> "domains" list.
> >>>>
> >>>> I would not refer to it as "the old domains list" as it creates
> >>>> impression that this is being replaced. The changelog makes
> >>>> no mention that domains list will remain and be dedicated to
> >>>> control domains. I think this is important to include in description
> >>>> of this change.
> >>>
> >>> I've rewritten the entire commit message incorporating your suggestions.
> >>> V6 will be posted soon (after I get some time on an SNC SPR to check
> >>> that it all works!)
> >>
> >> I seem to have missed v5.
> >
> > I simply can't count. You are correct that next version to be posted
> > will be v5.
> >
> >>>
> >>>>
> >>>>>
> >>>>> Signed-off-by: Tony Luck <tony.luck@intel.com>
> >>>>> ---
> >>>>> include/linux/resctrl.h | 10 +-
> >>>>> arch/x86/kernel/cpu/resctrl/internal.h | 2 +-
> >>>>> arch/x86/kernel/cpu/resctrl/core.c | 195 +++++++++++++++-------
> >>>>> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 2 +-
> >>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
> >>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 30 ++--
> >>>>> 6 files changed, 167 insertions(+), 74 deletions(-)
> >>>>>
> >>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> >>>>> index 8334eeacfec5..1267d56f9e76 100644
> >>>>> --- a/include/linux/resctrl.h
> >>>>> +++ b/include/linux/resctrl.h
> >>>>> @@ -151,9 +151,11 @@ struct resctrl_schema;
> >>>>> * @mon_capable: Is monitor feature available on this machine
> >>>>> * @num_rmid: Number of RMIDs available
> >>>>> * @cache_level: Which cache level defines scope of this resource
> >>>>> + * @mon_scope: Scope of this resource if different from cache_level
> >>>>
> >>>> I think this addition should be deferred. As it is here, the "if different
> >>>> from cache_level" also creates many questions (when will it be different?
> >>>> how will it be determined that the scope is different in order to know that
> >>>> mon_scope should be used?)
> >>>
> >>> I've gone in a different direction. V6 renames "cache_level" to
> >>> "ctrl_scope". I think this makes intent clear from step #1.
> >>>
> >>
> >> This change is not clear to me. Previously you changed this name
> >> but kept using it in code specific to cache levels. It is not clear
> >> to me how this time's rename would be different.
> >
> > The current "cache_level" field in the structure is describing
> > the scope of each instance using the cache level (2 or 3) as the
> > method to describe which CPUs are considered part of a group. Currently
> > the scope is the same for both control and monitor resources.
>
> Right.
>
> >
> > Would you like to see patches in this progression:
> >
> > 1) Rename "cache_level" to "scope". With commit comment that future
> > patches are going to base the scope on NUMA nodes in addition to sharing
> > caches at particular levels, and will split into separate control and
> > monitor scope.
> >
> > 2) Split the "scope" field from the first patch into "ctrl_scope" and
> > "mon_scope" (also with the addition of the new list for the mon_scope).
> >
> > 3) Add "node" as a new option for scope in addition to L3 and L2 cache.
> >
>
> hmmm - my comment cannot be addressed through patch re-ordering.
> If I understand correctly you plan to change the name of "cache_level"
> to "ctrl_scope". My comment is that this obfuscates the code as long as
> you use this variable to compare against data that can only represent cache
> levels. This just repeats what I wrote in
> https://lore.kernel.org/lkml/09847c37-66d7-c286-a313-308eaa338c64@intel.com/
I'm proposing more than just re-ordering. The above sequence adds a
couple of extra patches to the series.
Existing state of code:
There is a single field named "cache_level" that describes how
CPUs are assigned to domains based on their sharing of a cache
at a particular level. Hard-coded values of "2" and "3" are used
to describe the level. This is just a scope description of which
CPUs are grouped together. But it is limited to just doing so
based on which caches are shared by those CPUs.
Step 1:
Change the name of the field s/cache_level/scope/. Provide an
enum with values RESCTRL_L3_CACHE and RESCTRL_L2_CACHE and use
those throughout the code instead of the raw 3 and 2, which are
currently also passed implicitly as the resctrl scope to
functions like get_cpu_cacheinfo_id().

Add a get_domain_id_from_scope() function that takes the enum
values for scope and converts them to "3" or "2" to pass to
get_cpu_cacheinfo_id() (see the sketch after Step 3 below).

No functional change. Just broadening the meaning of the field
so that future patches can use it to describe scopes that
aren't a function of sharing a cache at a particular level.

Step 2:
Break the single list of domains into two separate lists. One
for control domains, one for monitor domains. Change the name
of the "scope" field to "ctrl"scope", add a new field
"mon_scope".
Step 3:
Add RESCTRL_NODE as a new option in the enum for how CPUs are
grouped into domains.
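
To make Step 1 concrete, here is a minimal sketch of the enum and the
helper (names taken from the plan above; the exact signatures and the
-EINVAL fallback are my assumptions, not final code):

	enum resctrl_scope {
		RESCTRL_L2_CACHE = 2,
		RESCTRL_L3_CACHE = 3,
		/* Step 3 would add RESCTRL_NODE here */
	};

	static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
	{
		switch (scope) {
		case RESCTRL_L2_CACHE:
		case RESCTRL_L3_CACHE:
			/* Enum values deliberately match cache levels */
			return get_cpu_cacheinfo_id(cpu, scope);
		default:
			break;
		}
		return -EINVAL;
	}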
>
>
> >> ...
> >>
> >>>>> @@ -502,61 +593,19 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
> >>>>> */
> >>>>> static void domain_add_cpu(int cpu, struct rdt_resource *r)
> >>>>> {
> >>>>> - int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> >>>>> - struct list_head *add_pos = NULL;
> >>>>> - struct rdt_hw_domain *hw_dom;
> >>>>> - struct rdt_domain *d;
> >>>>> - int err;
> >>>>> -
> >>>>> - d = rdt_find_domain(r, id, &add_pos);
> >>>>> - if (IS_ERR(d)) {
> >>>>> - pr_warn("Couldn't find cache id for CPU %d\n", cpu);
> >>>>> - return;
> >>>>> - }
> >>>>> -
> >>>>> - if (d) {
> >>>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
> >>>>> - if (r->cache.arch_has_per_cpu_cfg)
> >>>>> - rdt_domain_reconfigure_cdp(r);
> >>>>> - return;
> >>>>> - }
> >>>>> -
> >>>>> - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> >>>>> - if (!hw_dom)
> >>>>> - return;
> >>>>> -
> >>>>> - d = &hw_dom->d_resctrl;
> >>>>> - d->id = id;
> >>>>> - cpumask_set_cpu(cpu, &d->cpu_mask);
> >>>>> -
> >>>>> - rdt_domain_reconfigure_cdp(r);
> >>>>> -
> >>>>> - if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> >>>>> - domain_free(hw_dom);
> >>>>> - return;
> >>>>> - }
> >>>>> -
> >>>>> - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> >>>>> - domain_free(hw_dom);
> >>>>> - return;
> >>>>> - }
> >>>>> -
> >>>>> - list_add_tail(&d->list, add_pos);
> >>>>> -
> >>>>> - err = resctrl_online_domain(r, d);
> >>>>> - if (err) {
> >>>>> - list_del(&d->list);
> >>>>> - domain_free(hw_dom);
> >>>>> - }
> >>>>> + if (r->alloc_capable)
> >>>>> + domain_add_cpu_ctrl(cpu, r);
> >>>>> + if (r->mon_capable)
> >>>>> + domain_add_cpu_mon(cpu, r);
> >>>>> }
> >>>>
> >>>> A resource could be both alloc and mon capable ... both
> >>>> domain_add_cpu_ctrl() and domain_add_cpu_mon() can fail.
> >>>> Should domain_add_cpu_mon() still be run for a CPU if
> >>>> domain_add_cpu_ctrl() failed?
> >>>>
> >>>> Looking ahead the CPU should probably also not be added
> >>>> to the default groups mask if a failure occurred.
> >>>
> >>> Existing code doesn't do anything for the case where a CPU
> >>> can't be added to a domain (probably the only real error case
> >>> is failure to allocate memory for the domain structure).
> >>
> >> Is my statement about CPU being added to default group mask
> >> incorrect? Seems like a potential issue related to domain's
> >> CPU mask also.
> >>
> >> Please see my earlier question. Existing code does not proceed
> >> with monitor initialization if control initialization fails and
> >> undoes control initialization if monitor initialization fails.
> >
> > Existing code silently continues if a domain structure cannot
> > be allocated to add a CPU to a domain:
> >
> > 503 static void domain_add_cpu(int cpu, struct rdt_resource *r)
> > 504 {
> > 505 int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> > 506 struct list_head *add_pos = NULL;
> > 507 struct rdt_hw_domain *hw_dom;
> > 508 struct rdt_domain *d;
> > 509 int err;
> >
> > ...
> >
> > 523
> > 524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> > 525 if (!hw_dom)
> > 526 return;
> > 527
> >
>
>
> Right ... and if it returns silently as above it runs:
>
> static int resctrl_online_cpu(unsigned int cpu)
> {
>
>
> for_each_capable_rdt_resource(r)
> domain_add_cpu(cpu, r);
> >>>>> cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); <<<<<<<<
>
> }
>
> Also, note within domain_add_cpu():
>
> static void domain_add_cpu(int cpu, struct rdt_resource *r)
> {
>
>
> ...
> if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> domain_free(hw_dom);
> return;
> }
>
> if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> domain_free(hw_dom);
> return;
> }
>
> ...
> }
>
> The above is the other item that I've been trying to discuss
> with you. Note that existing resctrl will not initialize monitoring if
> control could not be initialized.
> Compare with this submission:
>
> if (r->alloc_capable)
> domain_add_cpu_ctrl(cpu, r);
> if (r->mon_capable)
> domain_add_cpu_mon(cpu, r);
>
>
>
> >>
> >>> May be something to tackle in a future series if anyone
> >>> thinks this is a serious problem and has suggestions on
> >>> what to do. It seems like a catastrophic problem to not
> >>> have some CPUs in some/all domains of some resources.
> >>> Maybe this should disable mounting resctrl filesystem
> >>> completely?
> >>
> >> It is not clear to me how this is catastrophic but I
> >> do not think resctrl should claim support for a resource
> >> on a CPU if it was not able to complete initialization
> >
> > That's the status quo. See above code snippet.
>
> Could you please elaborate what you mean by status quo?
Status quo: the system may be booting and adding a bunch of CPUs to the
first domain for a resource. When it finds a CPU from a different
domain it calls domain_add_cpu(), but for some reason the allocation
of an rdt_hw_domain structure fails here:
524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
525 if (!hw_dom)
526 return;
domain_add_cpu() returns with no message to the console and no error code.
The system boots, but this CPU is missing from the domain (likely the whole
domain is missing, as the same allocation will most likely fail for the
remainder of the CPUs in this domain). The only indication to the user is
when they read the schemata file and see something like this:
$ cat /sys/fs/resctrl/schemata
MB:0=100
L3:0=7fff;1=7fff
(assuming that all domains were successfully allocated for L3, and the
failure described above happened on the allocation for the second domain
of the MB resource.)
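
An obvious minimal improvement (my sketch, not something this series
does today) would be to at least complain when the allocation fails:

	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
	if (!hw_dom) {
		pr_warn("Couldn't allocate domain for CPU %d\n", cpu);
		return;
	}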
-Tony
On 8/28/2023 1:59 PM, Tony Luck wrote:
> On Mon, Aug 28, 2023 at 12:56:27PM -0700, Reinette Chatre wrote:
>> ...
>>>
>>> Would you like to see patches in this progression:
>>>
>>> 1) Rename "cache_level" to "scope". With commit comment that future
>>> patches are going to base the scope on NUMA nodes in addition to sharing
>>> caches at particular levels, and will split into separate control and
>>> monitor scope.
>>>
>>> 2) Split the "scope" field from the first patch into "ctrl_scope" and
>>> "mon_scope" (also with the addition of the new list for the mon_scope).
>>>
>>> 3) Add "node" as a new option for scope in addition to L3 and L2 cache.
>>>
>>
>> hmmm - my comment cannot be addressed through patch re-ordering.
>> If I understand correctly you plan to change the name of "cache_level"
>> to "ctrl_scope". My comment is that this obfuscates the code as long as
>> you use this variable to compare against data that can only represent cache
>> levels. This just repeats what I wrote in
>> https://lore.kernel.org/lkml/09847c37-66d7-c286-a313-308eaa338c64@intel.com/
>
> I'm proposing more than just re-ordering. The above sequence adds a
> couple of extra patches to the series.
>
> Existing state of code:
>
> There is a single field named "cache_level" that describes how
> CPUs are assigned to domains based on their sharing of a cache
> at a particular level. Hard-coded values of "2" and "3" are used
> to describe the level. This is just a scope description of which
> CPUs are grouped together. But it is limited to just doing so
> based on which caches are shared by those CPUs.
>
> Step 1:
>
> Change the name of the field s/cache_level/scope/. Provide an
> enum with values RESCTRL_L3_CACHE and RESCTRL_L2_CACHE and use
> those throughout the code instead of the raw 3 and 2, which are
> currently also passed implicitly as the resctrl scope to
> functions like get_cpu_cacheinfo_id().
>
> Add a get_domain_id_from_scope() function that takes the enum
> values for scope and converts them to "3" or "2" to pass to
> get_cpu_cacheinfo_id().
>
> No functional change. Just broadening the meaning of the field
> so that future patches can use it to describe scopes that
> aren't a function of sharing a cache at a particular level.
Right. So if I understand you are planning the same change as you did
in V3, like below, and my original comment still stands.
@@ -1348,7 +1348,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
 	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
 	ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
 	for (i = 0; i < ci->num_leaves; i++) {
-		if (ci->info_list[i].level == r->cache_level) {
+		if (ci->info_list[i].level == r->scope) {
 			size = ci->info_list[i].size / r->cache.cbm_len * num_b;
 			break;
 		}
>>>> ...
>>>
>>> Existing code silently continues if a domain structure cannot
>>> be allocated to add a CPU to a domain:
>>>
>>> 503 static void domain_add_cpu(int cpu, struct rdt_resource *r)
>>> 504 {
>>> 505 int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
>>> 506 struct list_head *add_pos = NULL;
>>> 507 struct rdt_hw_domain *hw_dom;
>>> 508 struct rdt_domain *d;
>>> 509 int err;
>>>
>>> ...
>>>
>>> 523
>>> 524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
>>> 525 if (!hw_dom)
>>> 526 return;
>>> 527
>>>
>>
>>
>> Right ... and if it returns silently as above it runs:
>>
>> static int resctrl_online_cpu(unsigned int cpu)
>> {
>>
>>
>> for_each_capable_rdt_resource(r)
>> domain_add_cpu(cpu, r);
>> >>>>> cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); <<<<<<<<
>>
>> }
>>
>> Also, note within domain_add_cpu():
>>
>> static void domain_add_cpu(int cpu, struct rdt_resource *r)
>> {
>>
>>
>> ...
>> if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
>> domain_free(hw_dom);
>> return;
>> }
>>
>> if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
>> domain_free(hw_dom);
>> return;
>> }
>>
>> ...
>> }
>>
>> The above is the other item that I've been trying to discuss
>> with you. Note that existing resctrl will not initialize monitoring if
>> control could not be initialized.
>> Compare with this submission:
>>
>> if (r->alloc_capable)
>> domain_add_cpu_ctrl(cpu, r);
>> if (r->mon_capable)
>> domain_add_cpu_mon(cpu, r);
>>
>>
I'll stop trying
Reinette
On Mon, Aug 28, 2023 at 02:20:04PM -0700, Reinette Chatre wrote:
>
>
> On 8/28/2023 1:59 PM, Tony Luck wrote:
> > ...
> >
> > Step 1:
> >
> > Change the name of the field s/cache_level/scope/. Provide an
> > enum with values RESCTRL_L3_CACHE and RESCTRL_L2_CACHE and use
> > those throughout the code instead of the raw 3 and 2, which are
> > currently also passed implicitly as the resctrl scope to
> > functions like get_cpu_cacheinfo_id().
> >
> > Add a get_domain_id_from_scope() function that takes the enum
> > values for scope and converts them to "3" or "2" to pass to
> > get_cpu_cacheinfo_id().
> >
> > No functional change. Just broadening the meaning of the field
> > so that future patches can use it to describe scopes that
> > aren't a function of sharing a cache at a particular level.
>
> Right. So if I understand you are planning the same change as you did
> in V3, like below, and my original comment still stands.
>
> @@ -1348,7 +1348,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
> num_b = bitmap_weight(&cbm, r->cache.cbm_len);
> ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
> for (i = 0; i < ci->num_leaves; i++) {
> - if (ci->info_list[i].level == r->cache_level) {
> + if (ci->info_list[i].level == r->scope) {
> size = ci->info_list[i].size / r->cache.cbm_len * num_b;
> break;
No. I heard your complaint about mixing "scope" and "cache_level".
The latest version of that code looks like this:
1341 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1342 				  struct rdt_domain *d, unsigned long cbm)
1343 {
1344 	struct cpu_cacheinfo *ci;
1345 	unsigned int size = 0;
1346 	int cache_level;
1347 	int num_b, i;
1348
1349 	switch (r->ctrl_scope) {
1350 	case RESCTRL_L3_CACHE:
1351 		cache_level = 3;
1352 		break;
1353 	case RESCTRL_L2_CACHE:
1354 		cache_level = 2;
1355 		break;
1356 	default:
1357 		WARN_ON_ONCE(1);
1358 		return size;
1359 	}
1360
1361 	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1362 	ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1363 	for (i = 0; i < ci->num_leaves; i++) {
1364 		if (ci->info_list[i].level == cache_level) {
1365 			size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1366 			break;
1367 		}
1368 	}
1369
1370 	return size / snc_nodes_per_l3_cache;
1371 }
> }
>
>
> >>>> ...
> >>>
> >>> Existing code silently continues if a domain structure cannot
> >>> be allocated to add a CPU to a domain:
> >>>
> >>> 503 static void domain_add_cpu(int cpu, struct rdt_resource *r)
> >>> 504 {
> >>> 505 int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
> >>> 506 struct list_head *add_pos = NULL;
> >>> 507 struct rdt_hw_domain *hw_dom;
> >>> 508 struct rdt_domain *d;
> >>> 509 int err;
> >>>
> >>> ...
> >>>
> >>> 523
> >>> 524 hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
> >>> 525 if (!hw_dom)
> >>> 526 return;
> >>> 527
> >>>
> >>
> >>
> >> Right ... and if it returns silently as above it runs:
> >>
> >> static int resctrl_online_cpu(unsigned int cpu)
> >> {
> >>
> >>
> >> for_each_capable_rdt_resource(r)
> >> domain_add_cpu(cpu, r);
> >> >>>>> cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); <<<<<<<<
> >>
> >> }
> >>
> >> Also, note within domain_add_cpu():
> >>
> >> static void domain_add_cpu(int cpu, struct rdt_resource *r)
> >> {
> >>
> >>
> >> ...
> >> if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> >> domain_free(hw_dom);
> >> return;
> >> }
> >>
> >> if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> >> domain_free(hw_dom);
> >> return;
> >> }
> >>
> >> ...
> >> }
> >>
> >> The above is the other item that I've been trying to discuss
> >> with you. Note that existing resctrl will not initialize monitoring if
> >> control could not be initialized.
> >> Compare with this submission:
> >>
> >> if (r->alloc_capable)
> >> domain_add_cpu_ctrl(cpu, r);
> >> if (r->mon_capable)
> >> domain_add_cpu_mon(cpu, r);
> >>
> >>
>
> I'll stop trying
I'll see what I can do to improve the error handling.
-Tony
Trimming to the core of the discussion.
> > >> if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
> > >> domain_free(hw_dom);
> > >> return;
> > >> }
> > >>
> > >> if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
> > >> domain_free(hw_dom);
> > >> return;
> > >> }
In the current code the control and monitor functions share a domain
structure. So enabling one without the other would lead to a whole
slew of complications. Various other bits of code would need to change
to make run-time checks that the control/monitor pieces of the shared
domain structure had been completely set up. So the existing code takes
the easy path out and says "if I can't have both, I'll have
neither".
> > >> The above is the other item that I've been trying to discuss
> > >> with you. Note that existing resctrl will not initialize monitoring if
> > >> control could not be initialized.
> > >> Compare with this submission:
> > >>
> > >> if (r->alloc_capable)
> > >> domain_add_cpu_ctrl(cpu, r);
> > >> if (r->mon_capable)
> > >> domain_add_cpu_mon(cpu, r);
With the updates I'm making, there are separate domain structures for
control and monitor. They don't have to be tied together. It's possible
for initialization to fail on one while the other remains completely
functional (without adding "is this completely set up?" checks to
other code).
The question is what the code should do. Should it allow a partial
success (now that this is possible)? Or should it maintain legacy behavior
and block use of both control and monitor for a domain if either fails
to allocate the memory it needs to operate?
I'm generally in favor of legacy semantics. But it seems weird to make
the code deliberately worse to preserve this particular behavior.
Especially as it adds complexity to the code for a case that in practice
is never going to happen.
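
Purely to illustrate the trade-off, here is a minimal sketch of how
legacy semantics could be kept with the split helpers. It assumes the
helpers are changed to return an error code, and it invents a
domain_remove_cpu_ctrl() undo helper that does not exist in this
series:

	static void domain_add_cpu(int cpu, struct rdt_resource *r)
	{
		int err = 0;

		/* Don't set up monitoring if control setup failed */
		if (r->alloc_capable)
			err = domain_add_cpu_ctrl(cpu, r);

		/* Undo control setup if monitor setup fails */
		if (!err && r->mon_capable && domain_add_cpu_mon(cpu, r))
			domain_remove_cpu_ctrl(cpu, r);
	}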
-Tony