From: James Morse <james.morse@arm.com>
resctrl exposes a counter via a file named llc_occupancy. This isn't really
a counter, as its value goes up and down; it is a snapshot of the cache
storage usage monitor.
Add some picking code to find a cache as close as possible to the L3 that
supports the CSU monitor.
If there is an L3, but it doesn't have any controls, force the L3 resource
to exist. The existing topology_matches_l3() and
mpam_resctrl_domain_hdr_init() code will ensure this looks like the L3,
even if the class belongs to a later cache.
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: James Morse <james.morse@arm.com>
Co-developed-by: Dave Martin <dave.martin@arm.com>
Signed-off-by: Dave Martin <dave.martin@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
---
Changes since rfc:
Allow csu counters however many partid or pmg there are
else if -> if
reduce scope of local variables
drop has_csu
Changes since v2:
return -> break so works for mbwu in later patch
add for_each_mpam_resctrl_mon
return error from mpam_resctrl_monitor_init(). It may fail when abmc
allocation is introduced in a later patch.
Squashed in patch from Dave Martin:
https://lore.kernel.org/lkml/20250820131621.54983-1-Dave.Martin@arm.com/
---
drivers/resctrl/mpam_internal.h | 6 ++
drivers/resctrl/mpam_resctrl.c | 173 +++++++++++++++++++++++++++++++-
2 files changed, 174 insertions(+), 5 deletions(-)
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index f89ceaf7623d..21cc776e57aa 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -349,6 +349,12 @@ struct mpam_resctrl_res {
struct rdt_resource resctrl_res;
};
+struct mpam_resctrl_mon {
+ struct mpam_class *class;
+
+ /* per-class data that resctrl needs will live here */
+};
+
static inline int mpam_alloc_csu_mon(struct mpam_class *class)
{
struct mpam_props *cprops = &class->props;
diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
index 7402bf4293b6..5020a5faed96 100644
--- a/drivers/resctrl/mpam_resctrl.c
+++ b/drivers/resctrl/mpam_resctrl.c
@@ -37,6 +37,21 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
static DEFINE_MUTEX(domain_list_lock);
+/*
+ * The classes we've picked to map to resctrl events.
+ * Resctrl believes all the world's a Xeon, and these are all on the L3. This
+ * array lets us find the actual class backing the event counters. e.g.
+ * the only memory bandwidth counters may be on the memory controller, but to
+ * make use of them, we pretend they are on L3.
+ * Class pointer may be NULL.
+ */
+static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS];
+
+#define for_each_mpam_resctrl_mon(mon, eventid) \
+ for (eventid = 0, mon = &mpam_resctrl_counters[eventid]; \
+ eventid < QOS_NUM_EVENTS; \
+ eventid++, mon = &mpam_resctrl_counters[eventid])
+
static bool exposed_alloc_capable;
static bool exposed_mon_capable;
@@ -259,6 +274,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops)
return mba_class_use_mbw_max(cprops);
}
+static bool cache_has_usable_csu(struct mpam_class *class)
+{
+ struct mpam_props *cprops;
+
+ if (!class)
+ return false;
+
+ cprops = &class->props;
+
+ if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+ return false;
+
+ /*
+ * CSU counters settle on the value, so we can get away with
+ * having only one.
+ */
+ if (!cprops->num_csu_mon)
+ return false;
+
+ return true;
+}
+
/*
* Calculate the worst-case percentage change from each implemented step
* in the control.
@@ -507,6 +544,64 @@ static void mpam_resctrl_pick_mba(void)
}
}
+static void counter_update_class(enum resctrl_event_id evt_id,
+ struct mpam_class *class)
+{
+ struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
+
+ if (existing_class) {
+ if (class->level == 3) {
+ pr_debug("Existing class is L3 - L3 wins\n");
+ return;
+ }
+
+ if (existing_class->level < class->level) {
+ pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
+ existing_class->level, class->level);
+ return;
+ }
+ }
+
+ mpam_resctrl_counters[evt_id].class = class;
+ exposed_mon_capable = true;
+}
+
+static void mpam_resctrl_pick_counters(void)
+{
+ struct mpam_class *class;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (class->level < 3) {
+ pr_debug("class %u is before L3", class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u does not cover all CPUs",
+ class->level);
+ continue;
+ }
+
+ if (cache_has_usable_csu(class) && topology_matches_l3(class)) {
+ pr_debug("class %u has usable CSU, and matches L3 topology",
+ class->level);
+
+ /* CSU counters only make sense on a cache. */
+ switch (class->type) {
+ case MPAM_CLASS_CACHE:
+ counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+
static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
{
struct mpam_class *class = res->class;
@@ -582,6 +677,57 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
return comp->comp_id;
}
+static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
+ enum resctrl_event_id type)
+{
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+
+ lockdep_assert_cpus_held();
+
+ /* There also needs to be an L3 cache present */
+ if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1)
+ return 0;
+
+ /*
+ * If there are no MPAM resources on L3, force it into existence.
+ * topology_matches_l3() already ensures this looks like the L3.
+ * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
+ */
+ if (!res->class) {
+ pr_warn_once("Faking L3 MSC to enable counters.\n");
+ res->class = mpam_resctrl_counters[type].class;
+ }
+
+ /* Called multiple times, once per event type */
+ if (exposed_mon_capable) {
+ l3->mon_capable = true;
+
+ /* Setting name is necessary on monitor only platforms */
+ l3->name = "L3";
+ l3->mon_scope = RESCTRL_L3_CACHE;
+
+ resctrl_enable_mon_event(type);
+
+ /*
+ * Unfortunately, num_rmid doesn't mean anything for
+ * mpam, and it's exposed to user-space!
+ *
+ * num-rmid is supposed to mean the minimum number of
+ * monitoring groups that can exist simultaneously, including
+ * the default monitoring group for each control group.
+ *
+ * For mpam, each control group has its own pmg/rmid space, so
+ * it is not appropriate to advertise the whole rmid_idx space
+ * here. But the pmgs corresponding to the parent control
+ * group can be allocated freely:
+ */
+ l3->mon.num_rmid = mpam_pmg_max + 1;
+ }
+
+ return 0;
+}
+
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
@@ -958,6 +1104,8 @@ int mpam_resctrl_setup(void)
int err = 0;
struct mpam_resctrl_res *res;
enum resctrl_res_level rid;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
wait_event(wait_cacheinfo_ready, cacheinfo_ready);
@@ -980,16 +1128,26 @@ int mpam_resctrl_setup(void)
err = mpam_resctrl_control_init(res);
if (err) {
pr_debug("Failed to initialise rid %u\n", rid);
- break;
+ goto internal_error;
}
}
- cpus_read_unlock();
- if (err) {
- pr_debug("Internal error %d - resctrl not supported\n", err);
- return err;
+ /* Find some classes to use for monitors */
+ mpam_resctrl_pick_counters();
+
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ if (!mon->class)
+ continue; // dummy resource
+
+ err = mpam_resctrl_monitor_init(mon, eventid);
+ if (err) {
+ pr_debug("Failed to initialise event %u\n", eventid);
+ goto internal_error;
+ }
}
+ cpus_read_unlock();
+
if (!exposed_alloc_capable && !exposed_mon_capable) {
pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n",
exposed_alloc_capable, exposed_mon_capable);
@@ -999,6 +1157,11 @@ int mpam_resctrl_setup(void)
/* TODO: call resctrl_init() */
return 0;
+
+internal_error:
+ cpus_read_unlock();
+ pr_debug("Internal error %d - resctrl not supported\n", err);
+ return err;
}
static int __init __cacheinfo_ready(void)
--
2.43.0
Replying to myself...
On 1/12/26 16:58, Ben Horgan wrote:
> From: James Morse <james.morse@arm.com>
>
> resctrl exposes a counter via a file named llc_occupancy. This isn't really
> a counter as its value goes up and down, this is a snapshot of the cache
> storage usage monitor.
>
> Add some picking code to find a cache as close as possible to the L3 that
> supports the CSU monitor.
>
> If there is an L3, but it doesn't have any controls, force the L3 resource
> to exist. The existing topology_matches_l3() and
> mpam_resctrl_domain_hdr_init() code will ensure this looks like the L3,
> even if the class belongs to a later cache.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> Co-developed-by: Dave Martin <dave.martin@arm.com>
> Signed-off-by: Dave Martin <dave.martin@arm.com>
> Signed-off-by: Ben Horgan <ben.horgan@arm.com>
> ---
> Changes since rfc:
> Allow csu counters however many partid or pmg there are
> else if -> if
> reduce scope of local variables
> drop has_csu
>
> Changes since v2:
> return -> break so works for mbwu in later patch
> add for_each_mpam_resctrl_mon
> return error from mpam_resctrl_monitor_init(). It may fail when is abmc
> allocation introduced in a later patch.
> Squashed in patch from Dave Martin:
> https://lore.kernel.org/lkml/20250820131621.54983-1-Dave.Martin@arm.com/
> ---
> drivers/resctrl/mpam_internal.h | 6 ++
> drivers/resctrl/mpam_resctrl.c | 173 +++++++++++++++++++++++++++++++-
> 2 files changed, 174 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> index f89ceaf7623d..21cc776e57aa 100644
> --- a/drivers/resctrl/mpam_internal.h
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -349,6 +349,12 @@ struct mpam_resctrl_res {
> struct rdt_resource resctrl_res;
> };
>
> +struct mpam_resctrl_mon {
> + struct mpam_class *class;
> +
> + /* per-class data that resctrl needs will live here */
> +};
> +
> static inline int mpam_alloc_csu_mon(struct mpam_class *class)
> {
> struct mpam_props *cprops = &class->props;
> diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
> index 7402bf4293b6..5020a5faed96 100644
> --- a/drivers/resctrl/mpam_resctrl.c
> +++ b/drivers/resctrl/mpam_resctrl.c
> @@ -37,6 +37,21 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
> /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
> static DEFINE_MUTEX(domain_list_lock);
>
> +/*
> + * The classes we've picked to map to resctrl events.
> + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This
> + * array lets us find the actual class backing the event counters. e.g.
> + * the only memory bandwidth counters may be on the memory controller, but to
> + * make use of them, we pretend they are on L3.
> + * Class pointer may be NULL.
> + */
> +static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS];
> +
> +#define for_each_mpam_resctrl_mon(mon, eventid) \
> + for (eventid = 0, mon = &mpam_resctrl_counters[eventid]; \
> + eventid < QOS_NUM_EVENTS; \
> + eventid++, mon = &mpam_resctrl_counters[eventid])
> +
> static bool exposed_alloc_capable;
> static bool exposed_mon_capable;
>
> @@ -259,6 +274,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops)
> return mba_class_use_mbw_max(cprops);
> }
>
> +static bool cache_has_usable_csu(struct mpam_class *class)
> +{
> + struct mpam_props *cprops;
> +
> + if (!class)
> + return false;
> +
> + cprops = &class->props;
> +
> + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
> + return false;
> +
> + /*
> + * CSU counters settle on the value, so we can get away with
> + * having only one.
> + */
> + if (!cprops->num_csu_mon)
> + return false;
> +
> + return true;
> +}
> +
> /*
> * Calculate the worst-case percentage change from each implemented step
> * in the control.
> @@ -507,6 +544,64 @@ static void mpam_resctrl_pick_mba(void)
> }
> }
>
> +static void counter_update_class(enum resctrl_event_id evt_id,
> + struct mpam_class *class)
> +{
> + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
> +
> + if (existing_class) {
> + if (class->level == 3) {
> + pr_debug("Existing class is L3 - L3 wins\n");
> + return;
> + }
> +
> + if (existing_class->level < class->level) {
> + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
> + existing_class->level, class->level);
> + return;
> + }
> + }
> +
> + mpam_resctrl_counters[evt_id].class = class;
> + exposed_mon_capable = true;
> +}
> +
> +static void mpam_resctrl_pick_counters(void)
> +{
> + struct mpam_class *class;
> +
> + lockdep_assert_cpus_held();
> +
> + guard(srcu)(&mpam_srcu);
> + list_for_each_entry_srcu(class, &mpam_classes, classes_list,
> + srcu_read_lock_held(&mpam_srcu)) {
> + if (class->level < 3) {
> + pr_debug("class %u is before L3", class->level);
> + continue;
> + }
> +
> + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
> + pr_debug("class %u does not cover all CPUs",
> + class->level);
> + continue;
> + }
> +
> + if (cache_has_usable_csu(class) && topology_matches_l3(class)) {
> + pr_debug("class %u has usable CSU, and matches L3 topology",
> + class->level);
> +
> + /* CSU counters only make sense on a cache. */
> + switch (class->type) {
> + case MPAM_CLASS_CACHE:
> + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
As the counter is named llc_occupancy (llc = last level cache) and we
are naming the resource L3 it would be surprising to have llc_occupancy
on anything other than an L3. Also, that L3 should be a last level
cache. I'll update this as part of a general push to tighten up the
heuristics in this series to make sure that in the future when more
fitting user visible interfaces are added in resctrl we are able to use
them rather than being stuck with something that almost fits.
> + break;
> + default:
> + break;
> + }
> + }
> + }
> +}
> +
[...]
Thanks,
Ben
Hi Ben,
On 1/12/26 8:58 AM, Ben Horgan wrote:
> From: James Morse <james.morse@arm.com>
>
> resctrl exposes a counter via a file named llc_occupancy. This isn't really
> a counter as its value goes up and down, this is a snapshot of the cache
> storage usage monitor.
>
> Add some picking code to find a cache as close as possible to the L3 that
> supports the CSU monitor.
>
> If there is an L3, but it doesn't have any controls, force the L3 resource
> to exist. The existing topology_matches_l3() and
> mpam_resctrl_domain_hdr_init() code will ensure this looks like the L3,
> even if the class belongs to a later cache.
>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> Co-developed-by: Dave Martin <dave.martin@arm.com>
> Signed-off-by: Dave Martin <dave.martin@arm.com>
> Signed-off-by: Ben Horgan <ben.horgan@arm.com>
> ---
> Changes since rfc:
> Allow csu counters however many partid or pmg there are
> else if -> if
> reduce scope of local variables
> drop has_csu
>
> Changes since v2:
> return -> break so works for mbwu in later patch
> add for_each_mpam_resctrl_mon
> return error from mpam_resctrl_monitor_init(). It may fail when is abmc
> allocation introduced in a later patch.
> Squashed in patch from Dave Martin:
> https://lore.kernel.org/lkml/20250820131621.54983-1-Dave.Martin@arm.com/
> ---
> drivers/resctrl/mpam_internal.h | 6 ++
> drivers/resctrl/mpam_resctrl.c | 173 +++++++++++++++++++++++++++++++-
> 2 files changed, 174 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> index f89ceaf7623d..21cc776e57aa 100644
> --- a/drivers/resctrl/mpam_internal.h
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -349,6 +349,12 @@ struct mpam_resctrl_res {
> struct rdt_resource resctrl_res;
> };
>
> +struct mpam_resctrl_mon {
> + struct mpam_class *class;
> +
> + /* per-class data that resctrl needs will live here */
> +};
> +
> static inline int mpam_alloc_csu_mon(struct mpam_class *class)
> {
> struct mpam_props *cprops = &class->props;
> diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
> index 7402bf4293b6..5020a5faed96 100644
> --- a/drivers/resctrl/mpam_resctrl.c
> +++ b/drivers/resctrl/mpam_resctrl.c
> @@ -37,6 +37,21 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
> /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
> static DEFINE_MUTEX(domain_list_lock);
>
> +/*
> + * The classes we've picked to map to resctrl events.
> + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This
> + * array lets us find the actual class backing the event counters. e.g.
> + * the only memory bandwidth counters may be on the memory controller, but to
> + * make use of them, we pretend they are on L3.
> + * Class pointer may be NULL.
> + */
> +static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS];
> +
> +#define for_each_mpam_resctrl_mon(mon, eventid) \
> + for (eventid = 0, mon = &mpam_resctrl_counters[eventid]; \
> + eventid < QOS_NUM_EVENTS; \
> + eventid++, mon = &mpam_resctrl_counters[eventid])
> +
Reading the above loop and how it is used to call mpam_resctrl_monitor_init() for every event
it looks like there is an implicit assumption that MPAM supports all events known to
resctrl.
Please consider the most recent resctrl feature "telemetry monitoring" currently queued
for inclusion: https://lore.kernel.org/lkml/20251217172121.12030-1-tony.luck@intel.com/
(You can find latest resctrl code queued for inclusion on the x86/cache branch of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git)
New telemetry monitoring introduces several new events known to resctrl. Specifically, here
is how enum resctrl_event_id looks at the moment:
/* Event IDs */
enum resctrl_event_id {
/* Must match value of first event below */
QOS_FIRST_EVENT = 0x01,
/*
* These values match those used to program IA32_QM_EVTSEL before
* reading IA32_QM_CTR on RDT systems.
*/
QOS_L3_OCCUP_EVENT_ID = 0x01,
QOS_L3_MBM_TOTAL_EVENT_ID = 0x02,
QOS_L3_MBM_LOCAL_EVENT_ID = 0x03,
/* Intel Telemetry Events */
PMT_EVENT_ENERGY,
PMT_EVENT_ACTIVITY,
PMT_EVENT_STALLS_LLC_HIT,
PMT_EVENT_C1_RES,
PMT_EVENT_UNHALTED_CORE_CYCLES,
PMT_EVENT_STALLS_LLC_MISS,
PMT_EVENT_AUTO_C6_RES,
PMT_EVENT_UNHALTED_REF_CYCLES,
PMT_EVENT_UOPS_RETIRED,
/* Must be the last */
QOS_NUM_EVENTS,
};
...
> @@ -582,6 +677,57 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
> return comp->comp_id;
> }
>
> +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
> + enum resctrl_event_id type)
> +{
> + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
> + struct rdt_resource *l3 = &res->resctrl_res;
> +
> + lockdep_assert_cpus_held();
> +
> + /* There also needs to be an L3 cache present */
> + if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1)
> + return 0;
> +
> + /*
> + * If there are no MPAM resources on L3, force it into existence.
> + * topology_matches_l3() already ensures this looks like the L3.
> + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
> + */
> + if (!res->class) {
> + pr_warn_once("Faking L3 MSC to enable counters.\n");
> + res->class = mpam_resctrl_counters[type].class;
> + }
> +
> + /* Called multiple times!, once per event type */
> + if (exposed_mon_capable) {
> + l3->mon_capable = true;
> +
> + /* Setting name is necessary on monitor only platforms */
> + l3->name = "L3";
> + l3->mon_scope = RESCTRL_L3_CACHE;
> +
> + resctrl_enable_mon_event(type);
btw, the telemetry work also changed this function prototype to be:
bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
unsigned int binary_bits, void *arch_priv);
If I understand correctly resctrl_enable_mon_event() will be called for every event in
enum resctrl_event_id which now contains events that may not actually be supported. I think it
may be safer to be specific in which events MPAM wants to enable.
> +
> + /*
> + * Unfortunately, num_rmid doesn't mean anything for
> + * mpam, and its exposed to user-space!
> + *
The idea of adding a per MON group "num_mon_groups" file has been floated a couple of
times now. I have not heard any objections against doing something like this.
https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/
https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/
> + * num-rmid is supposed to mean the minimum number of
> + * monitoring groups that can exist simultaneously, including
> + * the default monitoring group for each control group.
> + *
> + * For mpam, each control group has its own pmg/rmid space, so
> + * it is not appropriate to advertise the whole rmid_idx space
> + * here. But the pmgs corresponding to the parent control
> + * group can be allocated freely:
> + */
> + l3->mon.num_rmid = mpam_pmg_max + 1;
> + }
> +
> + return 0;
> +}
> +
Reinette
Hi Reinette,
On 1/13/26 23:14, Reinette Chatre wrote:
> Hi Ben,
>
> On 1/12/26 8:58 AM, Ben Horgan wrote:
>> From: James Morse <james.morse@arm.com>
>>
>> resctrl exposes a counter via a file named llc_occupancy. This isn't really
>> a counter as its value goes up and down, this is a snapshot of the cache
>> storage usage monitor.
>>
>> Add some picking code to find a cache as close as possible to the L3 that
>> supports the CSU monitor.
>>
>> If there is an L3, but it doesn't have any controls, force the L3 resource
>> to exist. The existing topology_matches_l3() and
>> mpam_resctrl_domain_hdr_init() code will ensure this looks like the L3,
>> even if the class belongs to a later cache.
>>
>> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
>> Signed-off-by: James Morse <james.morse@arm.com>
>> Co-developed-by: Dave Martin <dave.martin@arm.com>
>> Signed-off-by: Dave Martin <dave.martin@arm.com>
>> Signed-off-by: Ben Horgan <ben.horgan@arm.com>
>> ---
>> Changes since rfc:
>> Allow csu counters however many partid or pmg there are
>> else if -> if
>> reduce scope of local variables
>> drop has_csu
>>
>> Changes since v2:
>> return -> break so works for mbwu in later patch
>> add for_each_mpam_resctrl_mon
>> return error from mpam_resctrl_monitor_init(). It may fail when is abmc
>> allocation introduced in a later patch.
>> Squashed in patch from Dave Martin:
>> https://lore.kernel.org/lkml/20250820131621.54983-1-Dave.Martin@arm.com/
>> ---
>> drivers/resctrl/mpam_internal.h | 6 ++
>> drivers/resctrl/mpam_resctrl.c | 173 +++++++++++++++++++++++++++++++-
>> 2 files changed, 174 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> index f89ceaf7623d..21cc776e57aa 100644
>> --- a/drivers/resctrl/mpam_internal.h
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -349,6 +349,12 @@ struct mpam_resctrl_res {
>> struct rdt_resource resctrl_res;
>> };
>>
>> +struct mpam_resctrl_mon {
>> + struct mpam_class *class;
>> +
>> + /* per-class data that resctrl needs will live here */
>> +};
>> +
>> static inline int mpam_alloc_csu_mon(struct mpam_class *class)
>> {
>> struct mpam_props *cprops = &class->props;
>> diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
>> index 7402bf4293b6..5020a5faed96 100644
>> --- a/drivers/resctrl/mpam_resctrl.c
>> +++ b/drivers/resctrl/mpam_resctrl.c
>> @@ -37,6 +37,21 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
>> /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
>> static DEFINE_MUTEX(domain_list_lock);
>>
>> +/*
>> + * The classes we've picked to map to resctrl events.
>> + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This
>> + * array lets us find the actual class backing the event counters. e.g.
>> + * the only memory bandwidth counters may be on the memory controller, but to
>> + * make use of them, we pretend they are on L3.
>> + * Class pointer may be NULL.
>> + */
>> +static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS];
>> +
>> +#define for_each_mpam_resctrl_mon(mon, eventid) \
>> + for (eventid = 0, mon = &mpam_resctrl_counters[eventid]; \
>> + eventid < QOS_NUM_EVENTS; \
>> + eventid++, mon = &mpam_resctrl_counters[eventid])
>> +
>
> Reading the above loop and how it is used to call mpam_resctrl_monitor_init() for every event
> it looks like there is an implicit assumption that MPAM supports all events known to
> resctrl.
>
> Please consider the most recent resctrl feature "telemetry monitoring" currently queued
> for inclusion: https://lore.kernel.org/lkml/20251217172121.12030-1-tony.luck@intel.com/
>
> (You can find latest resctrl code queued for inclusion on the x86/cache branch of
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git)
I'll test against this.
>
> New telemetry monitoring introduces several new events known to resctrl. Specifically, here
> is how enum resctrl_event_id looks at the moment:
>
> /* Event IDs */
> enum resctrl_event_id {
> /* Must match value of first event below */
> QOS_FIRST_EVENT = 0x01,
[...]
Thanks for bringing this to my attention. mpam_resctrl_monitor_init()
won't be called for all events known to resctrl as
mpam_resctrl_pick_counters() will only set a class for the 3 that MPAM
knows about. Still, it is probably best to restrict the iterator to the
relevant ones.
> };
>
> ...
>
>> @@ -582,6 +677,57 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
>> return comp->comp_id;
>> }
>>
>> +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
>> + enum resctrl_event_id type)
>> +{
>> + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
>> + struct rdt_resource *l3 = &res->resctrl_res;
>> +
>> + lockdep_assert_cpus_held();
>> +
>> + /* There also needs to be an L3 cache present */
>> + if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1)
>> + return 0;
>> +
>> + /*
>> + * If there are no MPAM resources on L3, force it into existence.
>> + * topology_matches_l3() already ensures this looks like the L3.
>> + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
>> + */
>> + if (!res->class) {
>> + pr_warn_once("Faking L3 MSC to enable counters.\n");
>> + res->class = mpam_resctrl_counters[type].class;
>> + }
>> +
>> + /* Called multiple times!, once per event type */
>> + if (exposed_mon_capable) {
>> + l3->mon_capable = true;
>> +
>> + /* Setting name is necessary on monitor only platforms */
>> + l3->name = "L3";
>> + l3->mon_scope = RESCTRL_L3_CACHE;
>> +
>> + resctrl_enable_mon_event(type);
>
> btw, the telemetry work also changed this function prototype to be:
> bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
> unsigned int binary_bits, void *arch_priv);
I will update to use the new signature.
>
> If I understand correctly resctrl_enable_mon_event() will be called for every event in
> enum resctrl_event_id which now contains events that may not actually be supported. I think it
> may be safer to be specific in which events MPAM wants to enable.
As noted above, this only happens for the ones chosen
mpam_resctrl_pick_counters().
>
>> +
>> + /*
>> + * Unfortunately, num_rmid doesn't mean anything for
>> + * mpam, and its exposed to user-space!
>> + *
>
> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of
> times now. I have not heard any objections against doing something like this.
> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/
> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/
Hmm, I see now that 'num_rmid' is documented as an upper bound and so
neither 1 nor mpam_pmg_max + 1 agrees with the documentation.
"
"num_rmids":
The number of RMIDs available. This is the
upper bound for how many "CTRL_MON" + "MON"
groups can be created.
"
So, if I understand correctly you're proposing setting
num_rmids = num_pmg * num_partids on arm platforms and that in the
interim this can then be used to calculate the num_pmg by calculating
num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be
added to make this consistent across architectures?
>
>> + * num-rmid is supposed to mean the minimum number of
>> + * monitoring groups that can exist simultaneously, including
>> + * the default monitoring group for each control group.
>> + *
>> + * For mpam, each control group has its own pmg/rmid space, so
>> + * it is not appropriate to advertise the whole rmid_idx space
>> + * here. But the pmgs corresponding to the parent control
>> + * group can be allocated freely:
>> + */
>> + l3->mon.num_rmid = mpam_pmg_max + 1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>
> Reinette
>
I appreciate that you have shared this resctrl knowledge with me.
Thanks,
Ben
Hi Ben, On 1/15/26 7:43 AM, Ben Horgan wrote: > On 1/13/26 23:14, Reinette Chatre wrote: >> On 1/12/26 8:58 AM, Ben Horgan wrote: ... >>> + >>> + /* >>> + * Unfortunately, num_rmid doesn't mean anything for >>> + * mpam, and its exposed to user-space! >>> + * >> >> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of >> times now. I have not heard any objections against doing something like this. >> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/ >> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/ > > Hmm, I see now that 'num_rmid' is documented as an upper bound and so > neither 1 or mpam_pmg_max + 1 agree with the documentation. > > " > "num_rmids": > The number of RMIDs available. This is the > upper bound for how many "CTRL_MON" + "MON" > groups can be created. > " Please note that this documentation has been refactored (without changing its meaning). The above quoted text is specific to L3 monitoring and with the addition of telemetry monitoring the relevant text now reads: The upper bound for how many "CTRL_MON" + "MON" can be created is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. > > So, if I understand correctly you're proposing setting > num_rmids = num_pmg * num_partids on arm platforms and that in the > interim this can then be used to calculate the num_pmg by calculating > num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be > added to make this consistent across architectures? Yes for num_rmids = num_pmg * num_partids. The motivation for this is that to me this looks like the value that best matches the num_rmids documentation. I understand the RMID vs PMG is difficult so my proposal is certainly not set in stone and I would like to hear motivation for different interpretations. "calculating num_pmg" is not obvious though. 
I interpret "num_pmg" here as number of monitor groups per control group and on an Arm system this is indeed num_closid/num_rmids (if num_rmids = num_pmg * num_partids) but on x86 it is just num_rmids. Having user space depend on such computation to determine how many monitor groups per control group would thus require that user space knows whether the underlying system is Arm or x86 and would go against goal of having resctrl as a generic interface. The way forward may be to deprecate (somehow) num_rmids and transition to something like "num_mon_groups" but it is currently vague how "num_mon_groups" may look like. That thread (https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/) fizzled out after raising a few options how it may look. Another proposal was to add a "mon_id_includes_control_id" to use as another "guide" to determine how many monitoring groups can be created but at the time it seemed an intermediary step for user to determine the number of monitor groups that resctrl can also provide. https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ Making this consistent across architectures is the goal since resctrl aims to be a generic interface. Users should not need to do things like infer which system they are running on by looking at output of resctrl files as mentioned. fwiw ... there seems to be a usage by Google to compare num_rmids to num_closids to determine how to interact with resctrl: https://lore.kernel.org/lkml/CALPaoCgSO7HzK9BjyM8yL50oPyq9kBj64Nkgyo1WEJrWy5uHUg@mail.gmail.com/ Reinette
Hi Reinette, Peter, On 1/15/26 18:54, Reinette Chatre wrote: > Hi Ben, > > On 1/15/26 7:43 AM, Ben Horgan wrote: >> On 1/13/26 23:14, Reinette Chatre wrote: >>> On 1/12/26 8:58 AM, Ben Horgan wrote: > ... >>>> + >>>> + /* >>>> + * Unfortunately, num_rmid doesn't mean anything for >>>> + * mpam, and its exposed to user-space! >>>> + * >>> >>> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of >>> times now. I have not heard any objections against doing something like this. >>> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/ >>> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/ >> >> Hmm, I see now that 'num_rmid' is documented as an upper bound and so >> neither 1 or mpam_pmg_max + 1 agree with the documentation. >> >> " >> "num_rmids": >> The number of RMIDs available. This is the >> upper bound for how many "CTRL_MON" + "MON" >> groups can be created. >> " > > Please note that this documentation has been refactored (without changing its > meaning). The above quoted text is specific to L3 monitoring and with the > addition of telemetry monitoring the relevant text now reads: > The upper bound for how many "CTRL_MON" + "MON" can be created > is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. > >> >> So, if I understand correctly you're proposing setting >> num_rmids = num_pmg * num_partids on arm platforms and that in the >> interim this can then be used to calculate the num_pmg by calculating >> num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be >> added to make this consistent across architectures? > > Yes for num_rmids = num_pmg * num_partids. Ok, I don't really see another option. The motivation for this is that to me > this looks like the value that best matches the num_rmids documentation. 
I understand > the RMID vs PMG is difficult so my proposal is certainly not set in stone and I would like to > hear motivation for different interpretations. "calculating num_pmg" is not obvious > though. I interpret "num_pmg" here as number of monitor groups per control group and on > an Arm system this is indeed num_closid/num_rmids (if num_rmids = num_pmg * num_partids) > but on x86 it is just num_rmids. Having user space depend on such computation to determine how > many monitor groups per control group would thus require that user space knows whether the > underlying system is Arm or x86 and would go against goal of having resctrl as a generic interface. > > The way forward may be to deprecate (somehow) num_rmids and transition to something > like "num_mon_groups" but it is currently vague how "num_mon_groups" may look like. That thread > (https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/) fizzled > out after raising a few options how it may look. > > Another proposal was to add a "mon_id_includes_control_id" to use as another "guide" to > determine how many monitoring groups can be created but at the time it seemed an intermediary > step for user to determine the number of monitor groups that resctrl can also provide. > https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ Just thinking about it now but the "mon_id_includes_control_id" option seems the best to me as it is a single bit option that along with "num_rmids" lets you know which monitor groups you can create and if it's sensible to move monitor groups between CTRL MON groups. The "num_mon_groups" per CTRL MON group would also need to be interpreted together with "num_rmid" to know if it is a global or per CTRL MON upper bound. This option also uses multiple files to give the same bit of information. > > Making this consistent across architectures is the goal since resctrl aims to be > a generic interface. 
Users should not need to do things like infer which system they > are running on by looking at output of resctrl files as mentioned. > > fwiw ... there seems to be a usage by Google to compare num_rmids to num_closids to determine > how to interact with resctrl: > https://lore.kernel.org/lkml/CALPaoCgSO7HzK9BjyM8yL50oPyq9kBj64Nkgyo1WEJrWy5uHUg@mail.gmail.com/ Unfortunately, it looks like we're about to break this heuristic :( At least, until a way to get this information generically in resctrl is decided upon. > > Reinette Thanks, Ben
Hi Ben, On Fri, Jan 16, 2026 at 11:29 AM Ben Horgan <ben.horgan@arm.com> wrote: > > Hi Reinette, Peter, > > On 1/15/26 18:54, Reinette Chatre wrote: > > Hi Ben, > > > > On 1/15/26 7:43 AM, Ben Horgan wrote: > >> On 1/13/26 23:14, Reinette Chatre wrote: > >>> On 1/12/26 8:58 AM, Ben Horgan wrote: > > ... > >>>> + > >>>> + /* > >>>> + * Unfortunately, num_rmid doesn't mean anything for > >>>> + * mpam, and its exposed to user-space! > >>>> + * > >>> > >>> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of > >>> times now. I have not heard any objections against doing something like this. > >>> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/ > >>> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/ > >> > >> Hmm, I see now that 'num_rmid' is documented as an upper bound and so > >> neither 1 or mpam_pmg_max + 1 agree with the documentation. > >> > >> " > >> "num_rmids": > >> The number of RMIDs available. This is the > >> upper bound for how many "CTRL_MON" + "MON" > >> groups can be created. > >> " > > > > Please note that this documentation has been refactored (without changing its > > meaning). The above quoted text is specific to L3 monitoring and with the > > addition of telemetry monitoring the relevant text now reads: > > The upper bound for how many "CTRL_MON" + "MON" can be created > > is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. > > > >> > >> So, if I understand correctly you're proposing setting > >> num_rmids = num_pmg * num_partids on arm platforms and that in the > >> interim this can then be used to calculate the num_pmg by calculating > >> num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be > >> added to make this consistent across architectures? > > > > Yes for num_rmids = num_pmg * num_partids. > > Ok, I don't really see another option. 
> > The motivation for this is that to me > > this looks like the value that best matches the num_rmids documentation. I understand > > the RMID vs PMG is difficult so my proposal is certainly not set in stone and I would like to > > hear motivation for different interpretations. "calculating num_pmg" is not obvious > > though. I interpret "num_pmg" here as number of monitor groups per control group and on > > an Arm system this is indeed num_closid/num_rmids (if num_rmids = num_pmg * num_partids) > > but on x86 it is just num_rmids. Having user space depend on such computation to determine how > > many monitor groups per control group would thus require that user space knows whether the > > underlying system is Arm or x86 and would go against goal of having resctrl as a generic interface. > > > > The way forward may be to deprecate (somehow) num_rmids and transition to something > > like "num_mon_groups" but it is currently vague how "num_mon_groups" may look like. That thread > > (https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/) fizzled > > out after raising a few options how it may look. > > > > Another proposal was to add a "mon_id_includes_control_id" to use as another "guide" to > > determine how many monitoring groups can be created but at the time it seemed an intermediary > > step for user to determine the number of monitor groups that resctrl can also provide. > > https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ > > Just thinking about it now but the "mon_id_includes_control_id" option > seems the best to me as it is a single bit option that along with > "num_rmids" let's you know which monitor groups you can create and if > it's sensible to move monitor groups between CTRL MON groups. > > The "num_mon_groups" per CTRL MON group would also need to be > interpreted together with "num_rmid" to know if it is a global or per > CTRL MON upper bound. 
This option also uses multiple files to give the > same bit of information. > > > > > Making this consistent across architectures is the goal since resctrl aims to be > > a generic interface. Users should not need to do things like infer which system they > > are running on by looking at output of resctrl files as mentioned. > > > > fwiw ... there seems to be a usage by Google to compare num_rmids to num_closids to determine > > how to interact with resctrl: > > https://lore.kernel.org/lkml/CALPaoCgSO7HzK9BjyM8yL50oPyq9kBj64Nkgyo1WEJrWy5uHUg@mail.gmail.com/ > > Unfortunately, it looks like we're about to break this heuristic :( At > least, until a way to get this information generically in resctrl is > decided upon. We actually ended up going with the "mon_id_includes_control_id" approach. The property it represents is rather fundamental to what a monitoring group actually is and is a low-level implementation detail that is difficult to hide. Google generally needs support for as many monitoring IDs as jobs it expects to be able to run on a machine, so the number of monitoring groups will be routinely maxed out (and there will be some jobs that are forever stuck in the default group because no RMIDs were free at the time it started[1]) Thanks, -Peter [1] https://lore.kernel.org/lkml/CALPaoCjTwySGX9i7uAtCWLKQpmELKP55xDLJhHmUve8ptsfFTw@mail.gmail.com/
Hi Ben and Peter, On 1/20/26 7:28 AM, Peter Newman wrote: > Hi Ben, > > On Fri, Jan 16, 2026 at 11:29 AM Ben Horgan <ben.horgan@arm.com> wrote: >> >> Hi Reinette, Peter, >> >> On 1/15/26 18:54, Reinette Chatre wrote: >>> Hi Ben, >>> >>> On 1/15/26 7:43 AM, Ben Horgan wrote: >>>> On 1/13/26 23:14, Reinette Chatre wrote: >>>>> On 1/12/26 8:58 AM, Ben Horgan wrote: >>> ... >>>>>> + >>>>>> + /* >>>>>> + * Unfortunately, num_rmid doesn't mean anything for >>>>>> + * mpam, and its exposed to user-space! >>>>>> + * >>>>> >>>>> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of >>>>> times now. I have not heard any objections against doing something like this. >>>>> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/ >>>>> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/ >>>> >>>> Hmm, I see now that 'num_rmid' is documented as an upper bound and so >>>> neither 1 or mpam_pmg_max + 1 agree with the documentation. >>>> >>>> " >>>> "num_rmids": >>>> The number of RMIDs available. This is the >>>> upper bound for how many "CTRL_MON" + "MON" >>>> groups can be created. >>>> " >>> >>> Please note that this documentation has been refactored (without changing its >>> meaning). The above quoted text is specific to L3 monitoring and with the >>> addition of telemetry monitoring the relevant text now reads: >>> The upper bound for how many "CTRL_MON" + "MON" can be created >>> is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. >>> >>>> >>>> So, if I understand correctly you're proposing setting >>>> num_rmids = num_pmg * num_partids on arm platforms and that in the >>>> interim this can then be used to calculate the num_pmg by calculating >>>> num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be >>>> added to make this consistent across architectures? >>> >>> Yes for num_rmids = num_pmg * num_partids. >> >> Ok, I don't really see another option. 
>> >> The motivation for this is that to me >>> this looks like the value that best matches the num_rmids documentation. I understand >>> the RMID vs PMG is difficult so my proposal is certainly not set in stone and I would like to >>> hear motivation for different interpretations. "calculating num_pmg" is not obvious >>> though. I interpret "num_pmg" here as number of monitor groups per control group and on >>> an Arm system this is indeed num_closid/num_rmids (if num_rmids = num_pmg * num_partids) >>> but on x86 it is just num_rmids. Having user space depend on such computation to determine how >>> many monitor groups per control group would thus require that user space knows whether the >>> underlying system is Arm or x86 and would go against goal of having resctrl as a generic interface. >>> >>> The way forward may be to deprecate (somehow) num_rmids and transition to something >>> like "num_mon_groups" but it is currently vague how "num_mon_groups" may look like. That thread >>> (https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/) fizzled >>> out after raising a few options how it may look. >>> >>> Another proposal was to add a "mon_id_includes_control_id" to use as another "guide" to >>> determine how many monitoring groups can be created but at the time it seemed an intermediary >>> step for user to determine the number of monitor groups that resctrl can also provide. >>> https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ >> >> Just thinking about it now but the "mon_id_includes_control_id" option >> seems the best to me as it is a single bit option that along with >> "num_rmids" let's you know which monitor groups you can create and if >> it's sensible to move monitor groups between CTRL MON groups. >> >> The "num_mon_groups" per CTRL MON group would also need to be >> interpreted together with "num_rmid" to know if it is a global or per >> CTRL MON upper bound. 
This option also uses multiple files to give the >> same bit of information. >> >>> >>> Making this consistent across architectures is the goal since resctrl aims to be >>> a generic interface. Users should not need to do things like infer which system they >>> are running on by looking at output of resctrl files as mentioned. >>> >>> fwiw ... there seems to be a usage by Google to compare num_rmids to num_closids to determine >>> how to interact with resctrl: >>> https://lore.kernel.org/lkml/CALPaoCgSO7HzK9BjyM8yL50oPyq9kBj64Nkgyo1WEJrWy5uHUg@mail.gmail.com/ >> >> Unfortunately, it looks like we're about to break this heuristic :( At >> least, until a way to get this information generically in resctrl is >> decided upon. > > We actually ended up going with the "mon_id_includes_control_id" approach. Thank you for confirming. I was hoping we could deprecate num_rmids after introducing a per resource group file but this does not seem to support all the use cases as highlighted by Ben. As I see it, a name like "mon_id_includes_control_id" also implies that "num_rmids", perhaps linked to a new "num_mon_ids" as Peter suggested in [2], should contain num_pmg * num_partids. One concern from earlier was that "mon_id_includes_control_id" may be used as a heuristic for whether monitor groups can be moved or not. Instead I seem to remember that there was a plan for MPAM to support moving monitor groups, with the caveat that counters will reset for which resctrl may need another flag. > The property it represents is rather fundamental to what a monitoring > group actually is and is a low-level implementation detail that is > difficult to hide. 
Google generally needs support for as many > monitoring IDs as jobs it expects to be able to run on a machine, so > the number of monitoring groups will be routinely maxed out (and there > will be some jobs that are forever stuck in the default group because > no RMIDs were free at the time it started[1]) > > Thanks, > -Peter > > [1] https://lore.kernel.org/lkml/CALPaoCjTwySGX9i7uAtCWLKQpmELKP55xDLJhHmUve8ptsfFTw@mail.gmail.com/ Reinette [2] https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/
Hi Reinette, Peter, On 1/21/26 17:58, Reinette Chatre wrote: > Hi Ben and Peter, > > On 1/20/26 7:28 AM, Peter Newman wrote: >> Hi Ben, >> >> On Fri, Jan 16, 2026 at 11:29 AM Ben Horgan <ben.horgan@arm.com> wrote: >>> >>> Hi Reinette, Peter, >>> >>> On 1/15/26 18:54, Reinette Chatre wrote: >>>> Hi Ben, >>>> >>>> On 1/15/26 7:43 AM, Ben Horgan wrote: >>>>> On 1/13/26 23:14, Reinette Chatre wrote: >>>>>> On 1/12/26 8:58 AM, Ben Horgan wrote: >>>> ... >>>>>>> + >>>>>>> + /* >>>>>>> + * Unfortunately, num_rmid doesn't mean anything for >>>>>>> + * mpam, and its exposed to user-space! >>>>>>> + * >>>>>> >>>>>> The idea of adding a per MON group "num_mon_groups" file has been floated a couple of >>>>>> times now. I have not heard any objections against doing something like this. >>>>>> https://lore.kernel.org/all/cbe665c2-fe83-e446-1696-7115c0f9fd76@arm.com/ >>>>>> https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/ >>>>> >>>>> Hmm, I see now that 'num_rmid' is documented as an upper bound and so >>>>> neither 1 or mpam_pmg_max + 1 agree with the documentation. >>>>> >>>>> " >>>>> "num_rmids": >>>>> The number of RMIDs available. This is the >>>>> upper bound for how many "CTRL_MON" + "MON" >>>>> groups can be created. >>>>> " >>>> >>>> Please note that this documentation has been refactored (without changing its >>>> meaning). The above quoted text is specific to L3 monitoring and with the >>>> addition of telemetry monitoring the relevant text now reads: >>>> The upper bound for how many "CTRL_MON" + "MON" can be created >>>> is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. >>>> >>>>> >>>>> So, if I understand correctly you're proposing setting >>>>> num_rmids = num_pmg * num_partids on arm platforms and that in the >>>>> interim this can then be used to calculate the num_pmg by calculating >>>>> num_closid/num_rmid but that a per CTRL_MON num_mon_groups should be >>>>> added to make this consistent across architectures? 
>>>> >>>> Yes for num_rmids = num_pmg * num_partids. >>> >>> Ok, I don't really see another option. >>> >>> The motivation for this is that to me >>>> this looks like the value that best matches the num_rmids documentation. I understand >>>> the RMID vs PMG is difficult so my proposal is certainly not set in stone and I would like to >>>> hear motivation for different interpretations. "calculating num_pmg" is not obvious >>>> though. I interpret "num_pmg" here as number of monitor groups per control group and on >>>> an Arm system this is indeed num_closid/num_rmids (if num_rmids = num_pmg * num_partids) >>>> but on x86 it is just num_rmids. Having user space depend on such computation to determine how >>>> many monitor groups per control group would thus require that user space knows whether the >>>> underlying system is Arm or x86 and would go against goal of having resctrl as a generic interface. >>>> >>>> The way forward may be to deprecate (somehow) num_rmids and transition to something >>>> like "num_mon_groups" but it is currently vague how "num_mon_groups" may look like. That thread >>>> (https://lore.kernel.org/lkml/46767ca7-1f1b-48e8-8ce6-be4b00d129f9@intel.com/) fizzled >>>> out after raising a few options how it may look. >>>> >>>> Another proposal was to add a "mon_id_includes_control_id" to use as another "guide" to >>>> determine how many monitoring groups can be created but at the time it seemed an intermediary >>>> step for user to determine the number of monitor groups that resctrl can also provide. >>>> https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ >>> >>> Just thinking about it now but the "mon_id_includes_control_id" option >>> seems the best to me as it is a single bit option that along with >>> "num_rmids" let's you know which monitor groups you can create and if >>> it's sensible to move monitor groups between CTRL MON groups. 
>>> >>> The "num_mon_groups" per CTRL MON group would also need to be >>> interpreted together with "num_rmid" to know if it is a global or per >>> CTRL MON upper bound. This option also uses multiple files to give the >>> same bit of information. >>> >>>> >>>> Making this consistent across architectures is the goal since resctrl aims to be >>>> a generic interface. Users should not need to do things like infer which system they >>>> are running on by looking at output of resctrl files as mentioned. >>>> >>>> fwiw ... there seems to be a usage by Google to compare num_rmids to num_closids to determine >>>> how to interact with resctrl: >>>> https://lore.kernel.org/lkml/CALPaoCgSO7HzK9BjyM8yL50oPyq9kBj64Nkgyo1WEJrWy5uHUg@mail.gmail.com/ >>> >>> Unfortunately, it looks like we're about to break this heuristic :( At >>> least, until a way to get this information generically in resctrl is >>> decided upon. >> >> We actually ended up going with the "mon_id_includes_control_id" approach. > > Thank you for confirming. I was hoping we could deprecate num_rmids after introducing a > per resource group file but this does not seem to support all the use cases as highlighted by > Ben. > > As I see it, a name like "mon_id_includes_control_id" also implies that "num_rmids", perhaps > linked to a new "num_mon_ids" as Peter suggested in [2], should contain num_pmg * num_partids. > > One concern from earlier was that "mon_id_includes_control_id" may be used as a > heuristic for whether monitor groups can be moved or not. Instead I seem to remember that > there was a plan for MPAM to support moving monitor groups, with the caveat that > counters will reset for which resctrl may need another flag. I had a chat offline with James about this. Currently, userspace expects either the copy to succeed and the counters not to glitch or the move to fail. 
If we were going to support a monitor move in MPAM with counter reset (or a best effort counter value) we would have to make this opt-in for userspace. If userspace tries the monitor move while being unaware of the new flag it would unexpectedly lose counter data. To get this opt-in behaviour there could be a mount option, "destructive_monitor_move" or such like. Although this was considered in the past, we're not currently aware of any use case for this destructive monitor move and so are not proposing adding it or changing the existing behaviour around this. This doesn't mean that a flag for indicating whether monitor move is supported or not is not useful; a user may want to know if monitor move is supported but not to do a monitor move at the current time. > >> The property it represents is rather fundamental to what a monitoring >> group actually is and is a low-level implementation detail that is >> difficult to hide. Google generally needs support for as many >> monitoring IDs as jobs it expects to be able to run on a machine, so >> the number of monitoring groups will be routinely maxed out (and there >> will be some jobs that are forever stuck in the default group because >> no RMIDs were free at the time it started[1]) >> >> Thanks, >> -Peter >> >> [1] https://lore.kernel.org/lkml/CALPaoCjTwySGX9i7uAtCWLKQpmELKP55xDLJhHmUve8ptsfFTw@mail.gmail.com/ > > Reinette > > [2] https://lore.kernel.org/lkml/CALPaoChad6=xqz+BQQd=dB915xhj1gusmcrS9ya+T2GyhTQc5Q@mail.gmail.com/ Thanks, Ben
© 2016 - 2026 Red Hat, Inc.