[v7] x86/resctrl: mba_MBps enhancements

[PATCH v7 1/4] x86/resctrl: Make input event for MBA Software Controller configurable

Posted by Tony Luck 1 month, 3 weeks ago

The MBA Software Controller(mba_sc) is a feedback loop that uses
measurements of local memory bandwidth to adjust MBA throttling levels
to keep workloads in a resctrl group within a target bandwidth set in
the schemata file.

Users may want to use total memory bandwidth instead of local to handle
workloads that have poor NUMA localization.

Update the once-per-second polling code to pick a chosen event (local
or total memory bandwidth).

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/resctrl.h                |  2 +
 arch/x86/kernel/cpu/resctrl/monitor.c  | 80 ++++++++++++--------------
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  2 +
 3 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index d94abba1c716..ccb0f50dc18c 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -161,6 +161,7 @@ enum membw_throttle_mode {
  * @throttle_mode:	Bandwidth throttling mode when threads request
  *			different memory bandwidths
  * @mba_sc:		True if MBA software controller(mba_sc) is enabled
+ * @mba_mbps_event:	Monitoring event guiding feedback loop when @mba_sc is true
  * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
  */
 struct resctrl_membw {
@@ -170,6 +171,7 @@ struct resctrl_membw {
 	bool				arch_needs_linear;
 	enum membw_throttle_mode	throttle_mode;
 	bool				mba_sc;
+	enum resctrl_event_id		mba_mbps_event;
 	u32				*mb_map;
 };
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 851b561850e0..2692ce7f708e 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -663,10 +663,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
  */
 static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
-	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-	struct mbm_state *m = &rr->d->mbm_local[idx];
 	u64 cur_bw, bytes, cur_bytes;
+	struct mbm_state *m;
 
+	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+	WARN_ON(!m);
 	cur_bytes = rr->val;
 	bytes = cur_bytes - m->prev_bw_bytes;
 	m->prev_bw_bytes = cur_bytes;
@@ -752,20 +753,22 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
 	u32 closid, rmid, cur_msr_val, new_msr_val;
 	struct mbm_state *pmbm_data, *cmbm_data;
 	struct rdt_ctrl_domain *dom_mba;
+	enum resctrl_event_id evt_id;
 	struct rdt_resource *r_mba;
-	u32 cur_bw, user_bw, idx;
 	struct list_head *head;
 	struct rdtgroup *entry;
+	u32 cur_bw, user_bw;
 
-	if (!is_mbm_local_enabled())
+	if (!is_mbm_enabled())
 		return;
 
 	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+	evt_id = r_mba->membw.mba_mbps_event;
 
 	closid = rgrp->closid;
 	rmid = rgrp->mon.rmid;
-	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-	pmbm_data = &dom_mbm->mbm_local[idx];
+	pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
+	WARN_ON(!pmbm_data);
 
 	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
 	if (!dom_mba) {
@@ -784,7 +787,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
 	 */
 	head = &rgrp->mon.crdtgrp_list;
 	list_for_each_entry(entry, head, mon.crdtgrp_list) {
-		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
+		WARN_ON(!cmbm_data);
 		cur_bw += cmbm_data->prev_bw;
 	}
 
@@ -813,54 +817,42 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
 	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
 }
 
-static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
-		       u32 closid, u32 rmid)
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+				 u32 closid, u32 rmid, enum resctrl_event_id evtid)
 {
+	struct rdt_resource *rmba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 	struct rmid_read rr = {0};
 
 	rr.r = r;
 	rr.d = d;
+	rr.evtid = evtid;
+	rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+	if (IS_ERR(rr.arch_mon_ctx)) {
+		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+				    PTR_ERR(rr.arch_mon_ctx));
+		return;
+	}
 
+	__mon_event_count(closid, rmid, &rr);
+
+	if (is_mba_sc(NULL) && rr.evtid == rmba->membw.mba_mbps_event)
+		mbm_bw_count(closid, rmid, &rr);
+
+	resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
+
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+		       u32 closid, u32 rmid)
+{
 	/*
 	 * This is protected from concurrent reads from user
 	 * as both the user and we hold the global mutex.
 	 */
-	if (is_mbm_total_enabled()) {
-		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
-		rr.val = 0;
-		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
-		if (IS_ERR(rr.arch_mon_ctx)) {
-			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-					    PTR_ERR(rr.arch_mon_ctx));
-			return;
-		}
-
-		__mon_event_count(closid, rmid, &rr);
-
-		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-	}
-	if (is_mbm_local_enabled()) {
-		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		rr.val = 0;
-		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
-		if (IS_ERR(rr.arch_mon_ctx)) {
-			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-					    PTR_ERR(rr.arch_mon_ctx));
-			return;
-		}
-
-		__mon_event_count(closid, rmid, &rr);
-
-		/*
-		 * Call the MBA software controller only for the
-		 * control groups and when user has enabled
-		 * the software controller explicitly.
-		 */
-		if (is_mba_sc(NULL))
-			mbm_bw_count(closid, rmid, &rr);
+	if (is_mbm_total_enabled())
+		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
 
-		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-	}
+	if (is_mbm_local_enabled())
+		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index d7163b764c62..aedb30120d50 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2505,6 +2505,7 @@ static void rdt_disable_ctx(void)
 
 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
 {
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 	int ret = 0;
 
 	if (ctx->enable_cdpl2) {
@@ -2520,6 +2521,7 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx)
 	}
 
 	if (ctx->enable_mba_mbps) {
+		r->membw.mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
 		ret = set_mba_sc(true);
 		if (ret)
 			goto out_cdpl3;
-- 
2.46.1

Re: [PATCH v7 1/4] x86/resctrl: Make input event for MBA Software Controller configurable

Posted by Reinette Chatre 1 month ago

Hi Tony,

On 10/3/24 12:12 PM, Tony Luck wrote:
> The MBA Software Controller(mba_sc) is a feedback loop that uses
> measurements of local memory bandwidth to adjust MBA throttling levels
> to keep workloads in a resctrl group within a target bandwidth set in
> the schemata file.
> 
> Users may want to use total memory bandwidth instead of local to handle
> workloads that have poor NUMA localization.
> 
> Update the once-per-second polling code to pick a chosen event (local
> or total memory bandwidth).
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  include/linux/resctrl.h                |  2 +
>  arch/x86/kernel/cpu/resctrl/monitor.c  | 80 ++++++++++++--------------
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c |  2 +
>  3 files changed, 40 insertions(+), 44 deletions(-)
> 
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index d94abba1c716..ccb0f50dc18c 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -161,6 +161,7 @@ enum membw_throttle_mode {
>   * @throttle_mode:	Bandwidth throttling mode when threads request
>   *			different memory bandwidths
>   * @mba_sc:		True if MBA software controller(mba_sc) is enabled
> + * @mba_mbps_event:	Monitoring event guiding feedback loop when @mba_sc is true
>   * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
>   */
>  struct resctrl_membw {
> @@ -170,6 +171,7 @@ struct resctrl_membw {
>  	bool				arch_needs_linear;
>  	enum membw_throttle_mode	throttle_mode;
>  	bool				mba_sc;
> +	enum resctrl_event_id		mba_mbps_event;
>  	u32				*mb_map;
>  };

I do still [1] think that mba_sc_event will be easier to understand to be a companion
of mba_sc .

>  
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 851b561850e0..2692ce7f708e 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -663,10 +663,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
>   */
>  static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
>  {
> -	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -	struct mbm_state *m = &rr->d->mbm_local[idx];
>  	u64 cur_bw, bytes, cur_bytes;
> +	struct mbm_state *m;
>  
> +	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> +	WARN_ON(!m);

In the unlikely case this WARN() is hit then this function should exit before attempting
to dereference this NULL pointer a few lines down.

>  	cur_bytes = rr->val;
>  	bytes = cur_bytes - m->prev_bw_bytes;
>  	m->prev_bw_bytes = cur_bytes;

I needed to refresh my understanding of this work by re-reading the previous discussions.
You mentioned in [2]:
	I tried out some code to make the event runtime selectable via a r/w file in the
	resctrl/info directories. But that got complicated because of the amount of state
	that needs to be updated when switching events.

Could you please clarify which state you referred to? I wonder if it may be the 
struct mbm_state state maintained by mbm_bw_count()? mbm_bw_count() is lightweight
and I see no problem with it being called for all supported MBM events when
the software controller is enabled. With state for all supported events always available
it seems simpler to runtime switch between which events guide the software controller?

Thinking about it more, it seems possible for the user to use different
MBM events to guide the software controller for different resource groups.

If it is possible to do runtime switching in this way I do think it will simplify this
implementation while not requiring the user to remount resctrl to make changes. You
mentioned [3] that "a separate patch series" may be coming to address this but doing this
now seems simpler while avoiding any future work as well as confusing duplicate ABI
... unless you were referring to other issues that needs to be addressed separately?

> @@ -752,20 +753,22 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
>  	u32 closid, rmid, cur_msr_val, new_msr_val;
>  	struct mbm_state *pmbm_data, *cmbm_data;
>  	struct rdt_ctrl_domain *dom_mba;
> +	enum resctrl_event_id evt_id;
>  	struct rdt_resource *r_mba;
> -	u32 cur_bw, user_bw, idx;
>  	struct list_head *head;
>  	struct rdtgroup *entry;
> +	u32 cur_bw, user_bw;
>  
> -	if (!is_mbm_local_enabled())
> +	if (!is_mbm_enabled())
>  		return;
>  
>  	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
> +	evt_id = r_mba->membw.mba_mbps_event;

I see no check that evt_id is actually supported by the system. The new "is_mbm_enabled()"
check a few lines up tests if _any_ MBM event is supported, which may
differ from the event hardcoded without checking in rdt_enable_ctx() below.


>  	closid = rgrp->closid;
>  	rmid = rgrp->mon.rmid;
> -	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -	pmbm_data = &dom_mbm->mbm_local[idx];
> +	pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
> +	WARN_ON(!pmbm_data);

same comment about WARN ... also, since these calls are made frequently it may be
better to use WARN_ON_ONCE()

>  
>  	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
>  	if (!dom_mba) {
> @@ -784,7 +787,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
>  	 */
>  	head = &rgrp->mon.crdtgrp_list;
>  	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> +		cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
> +		WARN_ON(!cmbm_data);

Same here.

>  		cur_bw += cmbm_data->prev_bw;
>  	}
>  
> @@ -813,54 +817,42 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
>  	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
>  }
>  
> -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> -		       u32 closid, u32 rmid)
> +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
> +				 u32 closid, u32 rmid, enum resctrl_event_id evtid)
>  {
> +	struct rdt_resource *rmba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
>  	struct rmid_read rr = {0};
>  
>  	rr.r = r;
>  	rr.d = d;
> +	rr.evtid = evtid;
> +	rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> +	if (IS_ERR(rr.arch_mon_ctx)) {
> +		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> +				    PTR_ERR(rr.arch_mon_ctx));
> +		return;
> +	}
>  
> +	__mon_event_count(closid, rmid, &rr);
> +
> +	if (is_mba_sc(NULL) && rr.evtid == rmba->membw.mba_mbps_event)
> +		mbm_bw_count(closid, rmid, &rr);

This is where I was thinking it could be simplified to instead always update all state:
	if (is_mba_sc(NULL))
		mbm_bw_count(closid, rmid, &rr);


> +
> +	resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +}
> +

...

> @@ -2520,6 +2521,7 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx)
>  	}
>  
>  	if (ctx->enable_mba_mbps) {
> +		r->membw.mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;

By dropping the earlier "if (!is_mbm_local_enabled())" check there remains no
check that local MBM is supported while hardcoding its use here.

>  		ret = set_mba_sc(true);
>  		if (ret)
>  			goto out_cdpl3;

Reinette

[1] https://lore.kernel.org/all/5215fe1e-52e1-4ca4-8bd2-a42152f3e0e3@intel.com/
[2] https://lore.kernel.org/all/20231128231439.81691-1-tony.luck@intel.com/
[3] https://lore.kernel.org/all/ZWpF5m4mIeZdK8kv@agluck-desk3/

RE: [PATCH v7 1/4] x86/resctrl: Make input event for MBA Software Controller configurable

Posted by Luck, Tony 1 month ago

Plucking out just the big, direction change, comment for discussion (which may make
several of the code comments moot).

> I needed to refresh my understanding of this work by re-reading the previous discussions.
> You mentioned in [2]:
>       I tried out some code to make the event runtime selectable via a r/w file in the
>       resctrl/info directories. But that got complicated because of the amount of state
>       that needs to be updated when switching events.
>
> Could you please clarify which state you referred to? I wonder if it may be the
> struct mbm_state state maintained by mbm_bw_count()? mbm_bw_count() is lightweight
> and I see no problem with it being called for all supported MBM events when
> the software controller is enabled. With state for all supported events always available
> it seems simpler to runtime switch between which events guide the software controller?
>
> Thinking about it more, it seems possible for the user to use different
> MBM events to guide the software controller for different resource groups.
>
> If it is possible to do runtime switching in this way I do think it will simplify this
> implementation while not requiring the user to remount resctrl to make changes. You
> mentioned [3] that "a separate patch series" may be coming to address this but doing this
> now seems simpler while avoiding any future work as well as confusing duplicate ABI
> ... unless you were referring to other issues that needs to be addressed separately?

Yes, the state maintained by mbm_bw_count() was the piece that worried me. After
a user switch to a different event there would be no bandwidth data until two updates
passed by (one to get a baseline, second to compute bandwidth). So update_mba_bw()
would need to be aware of this liminal period to avoid making updates with no data to
back them up.

Your solution is elegant. The cost to maintain bandwidth data for each event is indeed
very low. So there are no weird transition cases. update_mba_bw() can immediately 
compare bandwidth for the new event against the target bandwidth and make appropriate
adjustments.

This requires a new file in each CTRL_MON directory when mba_sc is enabled so
the user can make their selection.

Note that technically it would be possible to make a different selection for each domain.
But that seems like an option without an obvious use case and would just complicate
the syntax of the new file.

Maybe name this new file "mba_sc_event"[1] with contents that match the names of
the mbm_monitor events as listed in /sys/fs/resctrl/info/L3_MON/mon_features?

So default state when resctrl is mounted with the software controller enabled would
have:

$ cat /sys/fs/resctrl/mba_sc_event
mbm_local_bytes

User could switch to total with

# echo mbm_total_bytes > /sys/fs/resctrl/mba_sc_event

On systems where mbm_local_bytes is not supported default would be mbm_total_bytes.

New CTRL_MON directories would also default to mbm_local_bytes if it is supported.

-Tony

[1] Other suggestions welcome.

Re: [PATCH v7 1/4] x86/resctrl: Make input event for MBA Software Controller configurable

Posted by Reinette Chatre 1 month ago

Hi Tony,

On 10/25/24 1:42 PM, Luck, Tony wrote:
> Plucking out just the big, direction change, comment for discussion (which may make
> several of the code comments moot).
> 
>> I needed to refresh my understanding of this work by re-reading the previous discussions.
>> You mentioned in [2]:
>>       I tried out some code to make the event runtime selectable via a r/w file in the
>>       resctrl/info directories. But that got complicated because of the amount of state
>>       that needs to be updated when switching events.
>>
>> Could you please clarify which state you referred to? I wonder if it may be the
>> struct mbm_state state maintained by mbm_bw_count()? mbm_bw_count() is lightweight
>> and I see no problem with it being called for all supported MBM events when
>> the software controller is enabled. With state for all supported events always available
>> it seems simpler to runtime switch between which events guide the software controller?
>>
>> Thinking about it more, it seems possible for the user to use different
>> MBM events to guide the software controller for different resource groups.
>>
>> If it is possible to do runtime switching in this way I do think it will simplify this
>> implementation while not requiring the user to remount resctrl to make changes. You
>> mentioned [3] that "a separate patch series" may be coming to address this but doing this
>> now seems simpler while avoiding any future work as well as confusing duplicate ABI
>> ... unless you were referring to other issues that needs to be addressed separately?
> 
> Yes, the state maintained by mbm_bw_count() was the piece that worried me. After
> a user switch to a different event there would be no bandwidth data until two updates
> passed by (one to get a baseline, second to compute bandwidth). So update_mba_bw()
> would need to be aware of this liminal period to avoid making updates with no data to
> back them up.
> 
> Your solution is elegant. The cost to maintain bandwidth data for each event is indeed
> very low. So there are no weird transition cases. update_mba_bw() can immediately 
> compare bandwidth for the new event against the target bandwidth and make appropriate
> adjustments.

Thank you for considering it.

> 
> This requires a new file in each CTRL_MON directory when mba_sc is enabled so
> the user can make their selection.
> 
> Note that technically it would be possible to make a different selection for each domain.
> But that seems like an option without an obvious use case and would just complicate
> the syntax of the new file.

I did not consider this possibility. I agree with your assessment.

> 
> Maybe name this new file "mba_sc_event"[1] with contents that match the names of
> the mbm_monitor events as listed in /sys/fs/resctrl/info/L3_MON/mon_features?

I do like that the content is connected to existing user interface by using events
from mon_features. What do you think of connecting the filename to existing 
user interface (the mount option) also by, for example, being named "mba_MBps_event"?

> 
> So default state when resctrl is mounted with the software controller enabled would
> have:
> 
> $ cat /sys/fs/resctrl/mba_sc_event
> mbm_local_bytes
> 
> User could switch to total with
> 
> # echo mbm_total_bytes > /sys/fs/resctrl/mba_sc_event
> 
> On systems where mbm_local_bytes is not supported default would be mbm_total_bytes.
> 
> New CTRL_MON directories would also default to mbm_local_bytes if it is supported.

This sounds good to me. Thank you very much for considering the change.

Reinette

[PATCH v7 1/4] x86/resctrl: Make input event for MBA Software Controller configurable
[PATCH v7 2/4] x86/resctrl: Add mount option to pick input event for mba_MBps mode
[PATCH v7 3/4] x86/resctrl: Use total bandwidth for mba_MBps option when local isn't present
[PATCH v7 4/4] x86/resctrl: Add new "mba_MBps_event" mount option to documentation