Computing the bandwidth for an event is cheap, and only done once
per second. Doing so simplifies switching between events and allows
choosing different events per ctrl_mon group.
Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++---------------
1 file changed, 33 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 2176e355e864..da4ae21350c8 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
  */
 static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
-        u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-        struct mbm_state *m = &rr->d->mbm_local[idx];
         u64 cur_bw, bytes, cur_bytes;
+        struct mbm_state *m;
+
+        m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+        if (WARN_ON_ONCE(!m))
+                return;
 
         cur_bytes = rr->val;
         bytes = cur_bytes - m->prev_bw_bytes;
@@ -826,54 +829,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
         resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
 }
 
-static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
-                       u32 closid, u32 rmid)
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+                                 u32 closid, u32 rmid, enum resctrl_event_id evtid)
 {
         struct rmid_read rr = {0};
 
         rr.r = r;
         rr.d = d;
+        rr.evtid = evtid;
+        rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+        if (IS_ERR(rr.arch_mon_ctx)) {
+                pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+                                    PTR_ERR(rr.arch_mon_ctx));
+                return;
+        }
+
+        __mon_event_count(closid, rmid, &rr);
 
         /*
-         * This is protected from concurrent reads from user
-         * as both the user and we hold the global mutex.
+         * If the software controller is enabled, compute the
+         * bandwidth for this event id.
          */
-        if (is_mbm_total_enabled()) {
-                rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
-                rr.val = 0;
-                rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
-                if (IS_ERR(rr.arch_mon_ctx)) {
-                        pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-                                            PTR_ERR(rr.arch_mon_ctx));
-                        return;
-                }
-
-                __mon_event_count(closid, rmid, &rr);
+        if (is_mba_sc(NULL))
+                mbm_bw_count(closid, rmid, &rr);
 
-                resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-        }
-        if (is_mbm_local_enabled()) {
-                rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-                rr.val = 0;
-                rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
-                if (IS_ERR(rr.arch_mon_ctx)) {
-                        pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-                                            PTR_ERR(rr.arch_mon_ctx));
-                        return;
-                }
-
-                __mon_event_count(closid, rmid, &rr);
+        resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
 
-                /*
-                 * Call the MBA software controller only for the
-                 * control groups and when user has enabled
-                 * the software controller explicitly.
-                 */
-                if (is_mba_sc(NULL))
-                        mbm_bw_count(closid, rmid, &rr);
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+                       u32 closid, u32 rmid)
+{
+        /*
+         * This is protected from concurrent reads from user
+         * as both the user and we hold the global mutex.
+         */
+        if (is_mbm_total_enabled())
+                mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
 
-                resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-        }
+        if (is_mbm_local_enabled())
+                mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
 }
 
 /*
--
2.47.0
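
For reference, the get_mbm_state() helper that mbm_bw_count() now calls
selects the per-event MBM state array, replacing the hard-coded
mbm_local[] indexing. It is a pre-existing helper in monitor.c, not part
of this patch; its shape is approximately:

static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
                                       u32 rmid, enum resctrl_event_id evtid)
{
        u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);

        switch (evtid) {
        case QOS_L3_MBM_TOTAL_EVENT_ID:
                return &d->mbm_total[idx];
        case QOS_L3_MBM_LOCAL_EVENT_ID:
                return &d->mbm_local[idx];
        default:
                return NULL;
        }
}

This lookup is what lets mbm_bw_count() track bandwidth for whichever
MBM event a ctrl_mon group uses, rather than assuming the local event.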
Hi Tony,
On 11/13/24 4:17 PM, Tony Luck wrote:
> Computing the bandwidth for an event is cheap, and only done once
> per second. Doing so simplifies switching between events and allows
> choosing different events per ctrl_mon group.
This just reads like a general statement. Surely there can be
some context, a problem statement, and *some* description of how this
patch goes about addressing it?
>
> Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
> arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++---------------
> 1 file changed, 33 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 2176e355e864..da4ae21350c8 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> */
> static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> {
> - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> - struct mbm_state *m = &rr->d->mbm_local[idx];
> u64 cur_bw, bytes, cur_bytes;
> + struct mbm_state *m;
> +
> + m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> + if (WARN_ON_ONCE(!m))
> + return;
>
> cur_bytes = rr->val;
> bytes = cur_bytes - m->prev_bw_bytes;
> @@ -826,54 +829,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
> resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> }
>
> -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> - u32 closid, u32 rmid)
> +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid, enum resctrl_event_id evtid)
> {
> struct rmid_read rr = {0};
>
> rr.r = r;
> rr.d = d;
> + rr.evtid = evtid;
> + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> + if (IS_ERR(rr.arch_mon_ctx)) {
> + pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> + PTR_ERR(rr.arch_mon_ctx));
> + return;
> + }
> +
> + __mon_event_count(closid, rmid, &rr);
>
> /*
> - * This is protected from concurrent reads from user
> - * as both the user and we hold the global mutex.
> + * If the software controller is enabled, compute the
> + * bandwidth for this event id.
> */
> - if (is_mbm_total_enabled()) {
> - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + if (is_mba_sc(NULL))
> + mbm_bw_count(closid, rmid, &rr);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> - if (is_mbm_local_enabled()) {
> - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +}
>
> - /*
> - * Call the MBA software controller only for the
> - * control groups and when user has enabled
> - * the software controller explicitly.
> - */
> - if (is_mba_sc(NULL))
> - mbm_bw_count(closid, rmid, &rr);
> +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid)
> +{
> + /*
> + * This is protected from concurrent reads from user
> + * as both the user and we hold the global mutex.
I understand that you are just copy&pasting a comment here but could you please
help to avoid any obstacles by removing the code impersonation? Perhaps something like:
* This is protected from concurrent reads from user
* as both the user and overflow handler hold the global mutex.
(please feel free to improve)
> + */
> + if (is_mbm_total_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> + if (is_mbm_local_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
> }
>
> /*
Reinette
On Tue, Nov 19, 2024 at 07:45:01PM -0800, Reinette Chatre wrote:
> Hi Tony,
>
> On 11/13/24 4:17 PM, Tony Luck wrote:
> > Computing the bandwidth for an event is cheap, and only done once
> > per second. Doing so simplifies switching between events and allows
> > choosing different events per ctrl_mon group.
>
> This just reads like a general statement. Surely there can be
> some context, a problem statement, and *some* description of how this
> patch goes about addressing it?
I've rewritten this in the problem ... solution format.
> >
> > Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Tony Luck <tony.luck@intel.com>
> > ---
> > arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++---------------
> > 1 file changed, 33 insertions(+), 39 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> > index 2176e355e864..da4ae21350c8 100644
> > --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> > +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> > @@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> > */
> > static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> > {
> > - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> > - struct mbm_state *m = &rr->d->mbm_local[idx];
> > u64 cur_bw, bytes, cur_bytes;
> > + struct mbm_state *m;
> > +
> > + m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> > + if (WARN_ON_ONCE(!m))
> > + return;
> >
> > cur_bytes = rr->val;
> > bytes = cur_bytes - m->prev_bw_bytes;
> > @@ -826,54 +829,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
> > resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> > }
> >
> > -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> > - u32 closid, u32 rmid)
> > +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
> > + u32 closid, u32 rmid, enum resctrl_event_id evtid)
> > {
> > struct rmid_read rr = {0};
> >
> > rr.r = r;
> > rr.d = d;
> > + rr.evtid = evtid;
> > + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> > + if (IS_ERR(rr.arch_mon_ctx)) {
> > + pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> > + PTR_ERR(rr.arch_mon_ctx));
> > + return;
> > + }
> > +
> > + __mon_event_count(closid, rmid, &rr);
> >
> > /*
> > - * This is protected from concurrent reads from user
> > - * as both the user and we hold the global mutex.
> > + * If the software controller is enabled, compute the
> > + * bandwidth for this event id.
> > */
> > - if (is_mbm_total_enabled()) {
> > - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> > - rr.val = 0;
> > - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> > - if (IS_ERR(rr.arch_mon_ctx)) {
> > - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> > - PTR_ERR(rr.arch_mon_ctx));
> > - return;
> > - }
> > -
> > - __mon_event_count(closid, rmid, &rr);
> > + if (is_mba_sc(NULL))
> > + mbm_bw_count(closid, rmid, &rr);
> >
> > - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> > - }
> > - if (is_mbm_local_enabled()) {
> > - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> > - rr.val = 0;
> > - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> > - if (IS_ERR(rr.arch_mon_ctx)) {
> > - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> > - PTR_ERR(rr.arch_mon_ctx));
> > - return;
> > - }
> > -
> > - __mon_event_count(closid, rmid, &rr);
> > + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> > +}
> >
> > - /*
> > - * Call the MBA software controller only for the
> > - * control groups and when user has enabled
> > - * the software controller explicitly.
> > - */
> > - if (is_mba_sc(NULL))
> > - mbm_bw_count(closid, rmid, &rr);
> > +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> > + u32 closid, u32 rmid)
> > +{
> > + /*
> > + * This is protected from concurrent reads from user
> > + * as both the user and we hold the global mutex.
>
> I understand that you are just copy&pasting a comment here but could you please
> help to avoid any obstacles by removing the code impersonation? Perhaps something like:
>
> * This is protected from concurrent reads from user
> * as both the user and overflow handler hold the global mutex.
>
> (please feel free to improve)
No improvement to the text needed. But I did move some words from 2nd
line to the first to look better (IMHO).
> > + */
> > + if (is_mbm_total_enabled())
> > + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
> >
> > - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> > - }
> > + if (is_mbm_local_enabled())
> > + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
> > }
> >
> > /*
>
> Reinette
-Tony
Hi Tony,
On Thu, Nov 14, 2024 at 1:17 AM Tony Luck <tony.luck@intel.com> wrote:
>
> Computing the bandwidth for an event is cheap, and only done once
> per second. Doing so simplifies switching between events and allows
> choosing different events per ctrl_mon group.
>
> Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
> arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++---------------
> 1 file changed, 33 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 2176e355e864..da4ae21350c8 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> */
> static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> {
> - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> - struct mbm_state *m = &rr->d->mbm_local[idx];
> u64 cur_bw, bytes, cur_bytes;
> + struct mbm_state *m;
> +
> + m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> + if (WARN_ON_ONCE(!m))
> + return;
>
> cur_bytes = rr->val;
> bytes = cur_bytes - m->prev_bw_bytes;
> @@ -826,54 +829,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
> resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> }
>
> -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> - u32 closid, u32 rmid)
> +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid, enum resctrl_event_id evtid)
> {
> struct rmid_read rr = {0};
>
> rr.r = r;
> rr.d = d;
> + rr.evtid = evtid;
> + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> + if (IS_ERR(rr.arch_mon_ctx)) {
> + pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> + PTR_ERR(rr.arch_mon_ctx));
> + return;
> + }
> +
> + __mon_event_count(closid, rmid, &rr);
>
> /*
> - * This is protected from concurrent reads from user
> - * as both the user and we hold the global mutex.
> + * If the software controller is enabled, compute the
> + * bandwidth for this event id.
> */
> - if (is_mbm_total_enabled()) {
> - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + if (is_mba_sc(NULL))
> + mbm_bw_count(closid, rmid, &rr);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> - if (is_mbm_local_enabled()) {
> - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> - rr.val = 0;
> - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> - if (IS_ERR(rr.arch_mon_ctx)) {
> - pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> - PTR_ERR(rr.arch_mon_ctx));
> - return;
> - }
> -
> - __mon_event_count(closid, rmid, &rr);
> + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +}
>
> - /*
> - * Call the MBA software controller only for the
> - * control groups and when user has enabled
> - * the software controller explicitly.
> - */
> - if (is_mba_sc(NULL))
> - mbm_bw_count(closid, rmid, &rr);
> +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
> + u32 closid, u32 rmid)
> +{
> + /*
> + * This is protected from concurrent reads from user
> + * as both the user and we hold the global mutex.
> + */
> + if (is_mbm_total_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
>
> - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> - }
> + if (is_mbm_local_enabled())
> + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
> }
>
> /*
> --
> 2.47.0
>
I experimented with an all-groups, per-domain counter aggregation files
prototype, using this change as a starting point.
I'm happy to report that the values it reported looked fairly reasonable.
Tested-by: Peter Newman <peternewman@google.com>
I also did a quick experiment to see how close to 1 second apart the
readings were:
@@ -664,6 +664,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
         u64 cur_bw, bytes, cur_bytes;
         struct mbm_state *m;
+        u64 ts;
 
         m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
 
@@ -680,6 +681,10 @@ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
         cur_bw = bytes / SZ_1M;
 
         m->prev_bw = cur_bw;
+
+        ts = jiffies;
+        m->latency = ts - m->last_update_jiffies;
+        m->last_update_jiffies = ts;
 }
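
Note: the snippet assumes two instrumentation fields added to
struct mbm_state for the experiment (they are not upstream); roughly:

struct mbm_state {
        u64     prev_bw_bytes;          /* existing: byte count at last sample */
        u32     prev_bw;                /* existing: bandwidth (MBps) at last sample */
        u64     last_update_jiffies;    /* added: jiffies at last bw computation */
        u64     latency;                /* added: jiffies between computations */
};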
On an AMD EPYC 7B12 64-Core Processor, I saw a consistent 1.021-1.026
second period. Is this enough error that you would want to divide by
the actual period instead of assuming a denominator of 1 exactly?
We're mainly concerned with the relative bandwidth of jobs, so this
error isn't much concern as long as it doesn't favor any group.
The only thing I'd worry about is that if the user is using setitimer()
to keep a consistent 1-second period for reading the bandwidth rate, the
window of the resctrl updates would drift away from the userspace
consumer over time (at a ~1.025 second period, the update window slips a
full second relative to a 1 Hz reader roughly every 40 updates).
Thanks!
-Peter
> I experimented with an all-groups, per-domain counter aggregation files
> prototype, using this change as a starting point.
>
> I'm happy to report that the values it reported looked fairly reasonable.
>
> Tested-by: Peter Newman <peternewman@google.com>

Thanks for the test report.

> On an AMD EPYC 7B12 64-Core Processor, I saw a consistent 1.021-1.026
> second period. Is this enough error that you would want to divide by
> the actual period instead of assuming a denominator of 1 exactly?
> We're mainly concerned with the relative bandwidth of jobs, so this
> error isn't much concern as long as it doesn't favor any group.

I see pretty much the same delta_t on Intel Icelake. We could use
jiffies to get a bit more precision (depending on HZ value).

> The only thing I'd worry about is that if the user is using setitimer()
> to keep a consistent 1-second period for reading the bandwidth rate, the
> window of the resctrl updates would drift away from the userspace
> consumer over time.

One other thing I did in my resctrl2 summary code was to patch the
modification time of the summary file to the time when the kernel ran
mbm_handle_overflow(). That would allow users to check the update time
to stay in sync with kernel updates.

-Tony
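
As a rough illustration of the jiffies-based scaling mentioned above —
hypothetical and untested, reusing the last_update_jiffies field from
Peter's experiment — mbm_bw_count() could scale the byte delta by the
measured interval instead of assuming exactly one second:

        u64 now = jiffies;
        u64 delta = now - m->last_update_jiffies;       /* ticks since last sample */

        m->last_update_jiffies = now;
        /*
         * Convert bytes-per-interval to MBps: multiply by HZ ticks/sec,
         * divide by elapsed ticks. Fall back to the 1-second assumption
         * if no time appears to have passed.
         */
        if (delta)
                cur_bw = div64_u64(bytes * HZ, delta * SZ_1M);
        else
                cur_bw = bytes / SZ_1M;

This would keep the MBps values comparable across groups even when the
overflow handler's period slips, at the cost of one 64-bit division per
event per second.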