Provide the interface to display the number of free monitoring counters
available for assignment in each domain when mbm_cntr_assign is supported.
Signed-off-by: Babu Moger <babu.moger@amd.com>
---
v9: New patch.
---
Documentation/arch/x86/resctrl.rst | 4 ++++
arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
3 files changed, 38 insertions(+)
diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index 2f3a86278e84..2bc58d974934 100644
--- a/Documentation/arch/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -302,6 +302,10 @@ with the following files:
memory bandwidth tracking to a single memory bandwidth event per
monitoring group.
+"available_mbm_cntrs":
+ The number of free monitoring counters available assignment in each domain
+ when the architecture supports mbm_cntr_assign mode.
+
"max_threshold_occupancy":
Read/write file provides the largest value (in
bytes) at which a previously used LLC_occupancy
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 3996f7528b66..e8d38a963f39 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -1268,6 +1268,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO);
+ resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO);
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 654cdfee1b00..ef0c1246fa2a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -898,6 +898,33 @@ static int rdtgroup_num_mbm_cntrs_show(struct kernfs_open_file *of,
return 0;
}
+static int rdtgroup_available_mbm_cntrs_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+ u32 val;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ val = r->mon.num_mbm_cntrs - hweight64(*dom->mbm_cntr_map);
+ seq_printf(s, "%d=%d", dom->hdr.id, val);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return 0;
+}
+
#ifdef CONFIG_PROC_CPU_RESCTRL
/*
@@ -1984,6 +2011,12 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_num_mbm_cntrs_show,
},
+ {
+ .name = "available_mbm_cntrs",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_available_mbm_cntrs_show,
+ },
{
.name = "cpus_list",
.mode = 0644,
--
2.34.1
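For reference, the new file emits one "<domain-id>=<free-count>" pair per monitoring
domain, separated by ';', per the seq_printf() format in the patch. A minimal userspace
sketch of consuming that output follows; the info/L3_MON path is an assumption based on
the RFTYPE_MON_INFO flag and is not spelled out by this patch:

	#include <stdio.h>

	/* Assumed location; RFTYPE_MON_INFO files land under the info directory. */
	#define AVAIL_PATH "/sys/fs/resctrl/info/L3_MON/available_mbm_cntrs"

	int main(void)
	{
		FILE *f = fopen(AVAIL_PATH, "r");
		int dom_id, free_cntrs;

		if (!f) {
			perror("fopen");
			return 1;
		}

		/* Entries look like "0=30;1=32". */
		while (fscanf(f, "%d=%d", &dom_id, &free_cntrs) == 2) {
			printf("domain %d: %d free counters\n", dom_id, free_cntrs);
			if (fgetc(f) != ';')	/* stop at '\n' or EOF */
				break;
		}

		fclose(f);
		return 0;
	}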
Hi Babu,
On 10/29/24 4:21 PM, Babu Moger wrote:
> Provide the interface to display the number of free monitoring counters
> available for assignment in each doamin when mbm_cntr_assign is supported.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v9: New patch.
> ---
> Documentation/arch/x86/resctrl.rst | 4 ++++
> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
> 3 files changed, 38 insertions(+)
>
> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
> index 2f3a86278e84..2bc58d974934 100644
> --- a/Documentation/arch/x86/resctrl.rst
> +++ b/Documentation/arch/x86/resctrl.rst
> @@ -302,6 +302,10 @@ with the following files:
> memory bandwidth tracking to a single memory bandwidth event per
> monitoring group.
>
> +"available_mbm_cntrs":
> + The number of free monitoring counters available assignment in each domain
"The number of free monitoring counters available assignment" -> "The number of monitoring
counters available for assignment"?
(not taking into account how text may change after addressing Peter's feedback)
> + when the architecture supports mbm_cntr_assign mode.
> +
> "max_threshold_occupancy":
> Read/write file provides the largest value (in
> bytes) at which a previously used LLC_occupancy
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 3996f7528b66..e8d38a963f39 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -1268,6 +1268,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
> cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
> r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
> resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO);
> + resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO);
> }
> }
>
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 654cdfee1b00..ef0c1246fa2a 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -898,6 +898,33 @@ static int rdtgroup_num_mbm_cntrs_show(struct kernfs_open_file *of,
> return 0;
> }
>
> +static int rdtgroup_available_mbm_cntrs_show(struct kernfs_open_file *of,
> + struct seq_file *s, void *v)
> +{
> + struct rdt_resource *r = of->kn->parent->priv;
> + struct rdt_mon_domain *dom;
> + bool sep = false;
> + u32 val;
> +
> + cpus_read_lock();
> + mutex_lock(&rdtgroup_mutex);
> +
> + list_for_each_entry(dom, &r->mon_domains, hdr.list) {
> + if (sep)
> + seq_puts(s, ";");
> +
> + val = r->mon.num_mbm_cntrs - hweight64(*dom->mbm_cntr_map);
This should probably be bitmap_weight() to address warnings like below that are
encountered by build testing with various configs (32bit in this case). 0day does
not seem to automatically pick up patches just based on submission but it sure will
when these are merged to tip so this needs a clean slate.
>> arch/x86/kernel/cpu/resctrl/rdtgroup.c:916:32: warning: shift count >= width of type [-Wshift-count-overflow]
916 | val = r->mon.num_mbm_cntrs - hweight64(*dom->mbm_cntr_map);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/asm-generic/bitops/const_hweight.h:29:49: note: expanded from macro 'hweight64'
29 | #define hweight64(w) (__builtin_constant_p(w) ? __const_hweight64(w) : __arch_hweight64(w))
| ^~~~~~~~~~~~~~~~~~~~
include/asm-generic/bitops/const_hweight.h:21:76: note: expanded from macro '__const_hweight64'
21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32))
| ^ ~~
include/asm-generic/bitops/const_hweight.h:20:49: note: expanded from macro '__const_hweight32'
20 | #define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16))
| ^
include/asm-generic/bitops/const_hweight.h:19:48: note: expanded from macro '__const_hweight16'
19 | #define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8 ))
| ^
include/asm-generic/bitops/const_hweight.h:10:9: note: expanded from macro '__const_hweight8'
10 | ((!!((w) & (1ULL << 0))) + \
| ^
Reinette
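For illustration, the change Reinette suggests would look roughly like the loop below
(a sketch only; it assumes mbm_cntr_map is a per-domain bitmap of num_mbm_cntrs bits,
as the existing hweight64() use implies):

	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
		if (sep)
			seq_puts(s, ";");

		/*
		 * bitmap_weight() takes the bitmap length explicitly, so it
		 * stays correct on 32-bit builds where hweight64() on the
		 * dereferenced map triggers the shift-count warning above.
		 */
		val = r->mon.num_mbm_cntrs -
		      bitmap_weight(dom->mbm_cntr_map, r->mon.num_mbm_cntrs);
		seq_printf(s, "%d=%d", dom->hdr.id, val);
		sep = true;
	}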
Hi Reinette,
On 11/15/24 18:31, Reinette Chatre wrote:
> Hi Babu,
>
> On 10/29/24 4:21 PM, Babu Moger wrote:
>> Provide the interface to display the number of free monitoring counters
>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>
>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>> ---
>> v9: New patch.
>> ---
>> Documentation/arch/x86/resctrl.rst | 4 ++++
>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>> 3 files changed, 38 insertions(+)
>>
>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>> index 2f3a86278e84..2bc58d974934 100644
>> --- a/Documentation/arch/x86/resctrl.rst
>> +++ b/Documentation/arch/x86/resctrl.rst
>> @@ -302,6 +302,10 @@ with the following files:
>> memory bandwidth tracking to a single memory bandwidth event per
>> monitoring group.
>>
>> +"available_mbm_cntrs":
>> + The number of free monitoring counters available assignment in each domain
>
> "The number of free monitoring counters available assignment" -> "The number of monitoring
> counters available for assignment"?
>
> (not taking into account how text may change after addressing Peter's feedback)
How about this?
"The number of monitoring counters available for assignment in each domain
when the architecture supports mbm_cntr_assign mode. There are a total of
"num_mbm_cntrs" counters are available for assignment. Counters can be
assigned or unassigned individually in each domain. A counter is available
for new assignment if it is unassigned in all domains."
>
>> + when the architecture supports mbm_cntr_assign mode.
>> +
>> "max_threshold_occupancy":
>> Read/write file provides the largest value (in
>> bytes) at which a previously used LLC_occupancy
>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index 3996f7528b66..e8d38a963f39 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -1268,6 +1268,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
>> cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
>> r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
>> resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO);
>> + resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO);
>> }
>> }
>>
>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> index 654cdfee1b00..ef0c1246fa2a 100644
>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> @@ -898,6 +898,33 @@ static int rdtgroup_num_mbm_cntrs_show(struct kernfs_open_file *of,
>> return 0;
>> }
>>
>> +static int rdtgroup_available_mbm_cntrs_show(struct kernfs_open_file *of,
>> + struct seq_file *s, void *v)
>> +{
>> + struct rdt_resource *r = of->kn->parent->priv;
>> + struct rdt_mon_domain *dom;
>> + bool sep = false;
>> + u32 val;
>> +
>> + cpus_read_lock();
>> + mutex_lock(&rdtgroup_mutex);
>> +
>> + list_for_each_entry(dom, &r->mon_domains, hdr.list) {
>> + if (sep)
>> + seq_puts(s, ";");
>> +
>> + val = r->mon.num_mbm_cntrs - hweight64(*dom->mbm_cntr_map);
>
> This should probably be bitmap_weight() to address warnings like below that are
> encountered by build testing with various configs (32bit in this case). 0day does
> not seem to automatically pick up patches just based on submission but it sure will
> when these are merged to tip so this needs a clean slate.
Sure.
>
>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c:916:32: warning: shift count >= width of type [-Wshift-count-overflow]
> 916 | val = r->mon.num_mbm_cntrs - hweight64(*dom->mbm_cntr_map);
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> include/asm-generic/bitops/const_hweight.h:29:49: note: expanded from macro 'hweight64'
> 29 | #define hweight64(w) (__builtin_constant_p(w) ? __const_hweight64(w) : __arch_hweight64(w))
> | ^~~~~~~~~~~~~~~~~~~~
> include/asm-generic/bitops/const_hweight.h:21:76: note: expanded from macro '__const_hweight64'
> 21 | #define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32))
> | ^ ~~
> include/asm-generic/bitops/const_hweight.h:20:49: note: expanded from macro '__const_hweight32'
> 20 | #define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16))
> | ^
> include/asm-generic/bitops/const_hweight.h:19:48: note: expanded from macro '__const_hweight16'
> 19 | #define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8 ))
> | ^
> include/asm-generic/bitops/const_hweight.h:10:9: note: expanded from macro '__const_hweight8'
> 10 | ((!!((w) & (1ULL << 0))) + \
> | ^
>
>
> Reinette
>
>
>
--
Thanks
Babu Moger
Hi Babu,

On 11/19/24 11:20 AM, Moger, Babu wrote:
> Hi Reinette,
>
> On 11/15/24 18:31, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>> Provide the interface to display the number of free monitoring counters
>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>
>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>> ---
>>> v9: New patch.
>>> ---
>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>> 3 files changed, 38 insertions(+)
>>>
>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>> index 2f3a86278e84..2bc58d974934 100644
>>> --- a/Documentation/arch/x86/resctrl.rst
>>> +++ b/Documentation/arch/x86/resctrl.rst
>>> @@ -302,6 +302,10 @@ with the following files:
>>> memory bandwidth tracking to a single memory bandwidth event per
>>> monitoring group.
>>>
>>> +"available_mbm_cntrs":
>>> + The number of free monitoring counters available assignment in each domain
>>
>> "The number of free monitoring counters available assignment" -> "The number of monitoring counters available for assignment"?
>>
>> (not taking into account how text may change after addressing Peter's feedback)
>
> How about this?
>
> "The number of monitoring counters available for assignment in each domain
> when the architecture supports mbm_cntr_assign mode. There are a total of
> "num_mbm_cntrs" counters are available for assignment. Counters can be
> assigned or unassigned individually in each domain. A counter is available
> for new assignment if it is unassigned in all domains."

Please consider the context of this paragraph. It follows right after the description
of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
I think it is confusing to follow that with a paragraph that states "Counters can be
assigned or unassigned individually in each domain." I wonder if it may be helpful to
use a different term ... for example a counter is *assigned* to an event of a monitoring
group but this assignment may be to specified (not yet supported) or all (this work)
domains while it is only *programmed*/*activated* to specified domains. Of course, all
of this documentation needs to remain coherent if future work decides to indeed support
per-domain assignment.

Reinette
Hi Reinette,

On 11/21/2024 3:12 PM, Reinette Chatre wrote:
> Hi Babu,
>
> On 11/19/24 11:20 AM, Moger, Babu wrote:
>> Hi Reinette,
>>
>> On 11/15/24 18:31, Reinette Chatre wrote:
>>> Hi Babu,
>>>
>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>> Provide the interface to display the number of free monitoring counters
>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>
>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>> ---
>>>> v9: New patch.
>>>> ---
>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>> 3 files changed, 38 insertions(+)
>>>>
>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>> index 2f3a86278e84..2bc58d974934 100644
>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>> @@ -302,6 +302,10 @@ with the following files:
>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>> monitoring group.
>>>>
>>>> +"available_mbm_cntrs":
>>>> + The number of free monitoring counters available assignment in each domain
>>>
>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>> counters available for assignment"?
>>>
>>> (not taking into account how text may change after addressing Peter's feedback)
>>
>> How about this?
>>
>> "The number of monitoring counters available for assignment in each domain
>> when the architecture supports mbm_cntr_assign mode. There are a total of
>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>> assigned or unassigned individually in each domain. A counter is available
>> for new assignment if it is unassigned in all domains."
>
> Please consider the context of this paragraph. It follows right after the description
> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
> I think it is confusing to follow that with a paragraph that states "Counters can be
> assigned or unassigned individually in each domain." I wonder if it may be helpful to
> use a different term ... for example a counter is *assigned* to an event of a monitoring
> group but this assignment may be to specified (not yet supported) or all (this work) domains while
> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
> needs to remain coherent if future work decides to indeed support per-domain assignment.
>

Little bit lost here. Please help me.

"available_mbm_cntrs":
"The number of monitoring counters available for assignment in each domain when the architecture supports "mbm_cntr_assign" mode. There are a total of "num_mbm_cntrs" counters are available for assignment.
A counter is assigned to an event within a monitoring group and is available for activation across all domains. Users have the flexibility to activate it selectively within specific domains."

Thanks
- Babu Moger
Hi Babu,

On 11/22/24 3:36 PM, Moger, Babu wrote:
> Hi Reinette,
>
> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>> Hi Reinette,
>>>
>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>> Hi Babu,
>>>>
>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>> Provide the interface to display the number of free monitoring counters
>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>
>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>> ---
>>>>> v9: New patch.
>>>>> ---
>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>> 3 files changed, 38 insertions(+)
>>>>>
>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>> monitoring group.
>>>>> +"available_mbm_cntrs":
>>>>> + The number of free monitoring counters available assignment in each domain
>>>>
>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>> counters available for assignment"?
>>>>
>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>
>>> How about this?
>>>
>>> "The number of monitoring counters available for assignment in each domain
>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>> assigned or unassigned individually in each domain. A counter is available
>>> for new assignment if it is unassigned in all domains."
>>
>> Please consider the context of this paragraph. It follows right after the description
>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>> I think it is confusing to follow that with a paragraph that states "Counters can be
>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>
>
> Little bit lost here. Please help me.

I think this highlights the uncertainty this interface brings. How do you expect users
to use this interface? At this time I think this interface can create a lot of confusion.
For example, consider a hypothetical system with three domains and four counters that
has the following state per mbm_assign_control:

//0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
/m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
/m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
/m3/0=_;1=_;2=_

Since, in this system there are only four counters available, and
they have all been assigned, then there are no new counters available for
assignment.

If I understand correctly, available_mbm_cntrs will read:
0=1;1=3;2=1

How is a user to interpret the above numbers? It does not reflect
that no counter can be assigned to m3, instead it reflects which of the
already assigned counters still need to be activated on domains.
If, for example, a user is expected to use this file to know how
many counters can still be assigned, should it not reflect the actual
available counters. In the above scenario it will then be:
0=0;1=0;2=0

Of course, when doing the above the user may get impression that a counter
that has already been assigned, just not activated, is no longer available
for use.

> "available_mbm_cntrs":
> "The number of monitoring counters available for assignment in each domain when the architecture supports "mbm_cntr_assign" mode. There are a total of "num_mbm_cntrs" counters are available for assignment.
> A counter is assigned to an event within a monitoring group and is available for activation across all domains. Users have the flexibility to activate it selectively within specific domains."
>

Once we understand how users are to use this file the documentation should be easier to create.

Reinette
Hi Reinette,

On 11/25/2024 1:00 PM, Reinette Chatre wrote:
> Hi Babu,
>
> On 11/22/24 3:36 PM, Moger, Babu wrote:
>> Hi Reinette,
>>
>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>>> Hi Babu,
>>>
>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>>> Hi Reinette,
>>>>
>>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>>> Hi Babu,
>>>>>
>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>>> Provide the interface to display the number of free monitoring counters
>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>>
>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>>> ---
>>>>>> v9: New patch.
>>>>>> ---
>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>>> 3 files changed, 38 insertions(+)
>>>>>>
>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>>> monitoring group.
>>>>>> +"available_mbm_cntrs":
>>>>>> + The number of free monitoring counters available assignment in each domain
>>>>>
>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>>> counters available for assignment"?
>>>>>
>>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>>
>>>> How about this?
>>>>
>>>> "The number of monitoring counters available for assignment in each domain
>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>>> assigned or unassigned individually in each domain. A counter is available
>>>> for new assignment if it is unassigned in all domains."
>>>
>>> Please consider the context of this paragraph. It follows right after the description
>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>>> I think it is confusing to follow that with a paragraph that states "Counters can be
>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>>
>>
>> Little bit lost here. Please help me.
>
> I think this highlights the uncertainty this interface brings. How do you expect users
> to use this interface? At this time I think this interface can create a lot of confusion.
> For example, consider a hypothetical system with three domains and four counters that
> has the following state per mbm_assign_control:
>
> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
> /m3/0=_;1=_;2=_
>
> Since, in this system there are only four counters available, and
> they have all been assigned, then there are no new counters available for
> assignment.
>
> If I understand correctly, available_mbm_cntrs will read:
> 0=1;1=3;2=1

Yes. Exactly. This causes confusion to the user.

>
> How is a user to interpret the above numbers? It does not reflect
> that no counter can be assigned to m3, instead it reflects which of the
> already assigned counters still need to be activated on domains.
> If, for example, a user is expected to use this file to know how
> many counters can still be assigned, should it not reflect the actual
> available counters. In the above scenario it will then be:
> 0=0;1=0;2=0

We can also just print
#cat available_mbm_cntrs
0

The domain specific information is not important here.
That was my original idea. We can go back to that definition. That is more clear to the user.

> Of course, when doing the above the user may get impression that a counter
> that has already been assigned, just not activated, is no longer available
> for use.
>
>
>> "available_mbm_cntrs":
>> "The number of monitoring counters available for assignment in each domain when the architecture supports "mbm_cntr_assign" mode. There are a total of "num_mbm_cntrs" counters are available for assignment.
>> A counter is assigned to an event within a monitoring group and is available for activation across all domains. Users have the flexibility to activate it selectively within specific domains."
>>
>
> Once we understand how users are to use this file the documentation should be easier to create.
>
> Reinette
>
>

--
- Babu Moger
Hi Babu,

On 11/26/24 3:31 PM, Moger, Babu wrote:
> On 11/25/2024 1:00 PM, Reinette Chatre wrote:
>> On 11/22/24 3:36 PM, Moger, Babu wrote:
>>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>>>> Provide the interface to display the number of free monitoring counters
>>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>>>
>>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>>>> ---
>>>>>>> v9: New patch.
>>>>>>> ---
>>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>>>> 3 files changed, 38 insertions(+)
>>>>>>>
>>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>>>> monitoring group.
>>>>>>>
>>>>>>> +"available_mbm_cntrs":
>>>>>>> + The number of free monitoring counters available assignment in each domain
>>>>>>
>>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>>>> counters available for assignment"?
>>>>>>
>>>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>>>
>>>>> How about this?
>>>>>
>>>>> "The number of monitoring counters available for assignment in each domain
>>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>>>> assigned or unassigned individually in each domain. A counter is available
>>>>> for new assignment if it is unassigned in all domains."
>>>>
>>>> Please consider the context of this paragraph. It follows right after the description
>>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>>>> I think it is confusing to follow that with a paragraph that states "Counters can be
>>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>>>
>>>
>>> Little bit lost here. Please help me.
>>
>> I think this highlights the uncertainty this interface brings. How do you expect users
>> to use this interface? At this time I think this interface can create a lot of confusion.
>> For example, consider a hypothetical system with three domains and four counters that
>> has the following state per mbm_assign_control:
>>
>> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
>> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
>> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
>> /m3/0=_;1=_;2=_
>>
>> Since, in this system there are only four counters available, and
>> they have all been assigned, then there are no new counters available for
>> assignment.
>>
>> If I understand correctly, available_mbm_cntrs will read:
>> 0=1;1=3;2=1
>
> Yes. Exactly. This causes confusion to the user.
>>
>> How is a user to interpret the above numbers? It does not reflect
>> that no counter can be assigned to m3, instead it reflects which of the
>> already assigned counters still need to be activated on domains.
>> If, for example, a user is expected to use this file to know how
>> many counters can still be assigned, should it not reflect the actual
>> available counters. In the above scenario it will then be:
>> 0=0;1=0;2=0
>
> We can also just print
> #cat available_mbm_cntrs
> 0
>
> The domain specific information is not important here.
> That was my original idea. We can go back to that definition. That is more clear to the user.

Tony's response [1] still applies.

I believe Tony's suggestion [2] considered that the available counters will be the
same for every domain for this implementation. That is why my example noted:
"0=0;1=0;2=0"

The confusion surrounding the global allocator seems to be prevalent ([3], [4]) as folks
familiar with resctrl attempt to digest the work. The struggle to make this documentation clear
makes me more concerned how this feature will be perceived by users who are not as familiar with
resctrl internals. I think that it may be worth it to take a moment and investigate what it will take
to implement a per-domain counter allocator. The hardware supports it and I suspect that the upfront
work to do the enabling will make it easier for users to adopt and understand the feature.

What do you think?

Reinette

[1] https://lore.kernel.org/all/SJ1PR11MB6083DC9EA6D323356E957A87FC4E2@SJ1PR11MB6083.namprd11.prod.outlook.com/
[2] https://lore.kernel.org/all/SJ1PR11MB6083583A24FA3B3B7C2DCD64FC442@SJ1PR11MB6083.namprd11.prod.outlook.com/
[3] https://lore.kernel.org/all/ZwmadFbK--Qb8qWP@agluck-desk3.sc.intel.com/
[4] https://lore.kernel.org/all/CALPaoCh1BWdWww8Kztd13GBaY9mMeZX268fOQgECRytiKm-nPQ@mail.gmail.com/
Hi Reinette,
On 11/26/2024 5:56 PM, Reinette Chatre wrote:
> Hi Babu,
>
> On 11/26/24 3:31 PM, Moger, Babu wrote:
>> On 11/25/2024 1:00 PM, Reinette Chatre wrote:
>>> On 11/22/24 3:36 PM, Moger, Babu wrote:
>>>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>>>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>>>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>>>>> Provide the interface to display the number of free monitoring counters
>>>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>>>>
>>>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>>>>> ---
>>>>>>>> v9: New patch.
>>>>>>>> ---
>>>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>>>>> 3 files changed, 38 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>>>>> monitoring group.
>>>>>>>> +"available_mbm_cntrs":
>>>>>>>> + The number of free monitoring counters available assignment in each domain
>>>>>>>
>>>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>>>>> counters available for assignment"?
>>>>>>>
>>>>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>>>>
>>>>>> How about this?
>>>>>>
>>>>>> "The number of monitoring counters available for assignment in each domain
>>>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>>>>> assigned or unassigned individually in each domain. A counter is available
>>>>>> for new assignment if it is unassigned in all domains."
>>>>>
>>>>> Please consider the context of this paragraph. It follows right after the description
>>>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>>>>> I think it is confusing to follow that with a paragraph that states "Counters can be
>>>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>>>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>>>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>>>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>>>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>>>>
>>>>
>>>> Little bit lost here. Please help me.
>>>
>>> I think this highlights the uncertainty this interface brings. How do you expect users
>>> to use this interface? At this time I think this interface can create a lot of confusion.
>>> For example, consider a hypothetical system with three domains and four counters that
>>> has the following state per mbm_assign_control:
>>>
>>> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
>>> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
>>> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
>>> /m3/0=_;1=_;2=_
>>>
>>> Since, in this system there are only four counters available, and
>>> they have all been assigned, then there are no new counters available for
>>> assignment.
>>>
>>> If I understand correctly, available_mbm_cntrs will read:
>>> 0=1;1=3;2=1
>>
>> Yes. Exactly. This causes confusion to the user.
>>>
>>> How is a user to interpret the above numbers? It does not reflect
>>> that no counter can be assigned to m3, instead it reflects which of the
>>> already assigned counters still need to be activated on domains.
>>> If, for example, a user is expected to use this file to know how
>>> many counters can still be assigned, should it not reflect the actual
>>> available counters. In the above scenario it will then be:
>>> 0=0;1=0;2=0
>>
>> We can also just print
>> #cat available_mbm_cntrs
>> 0
>>
>> The domain specific information is not important here.
>> That was my original idea. We can go back to that definition. That is more clear to the user.
>
> Tony's response [1] still applies.
>
> I believe Tony's suggestion [2] considered that the available counters will be the
> same for every domain for this implementation. That is why my example noted:
> "0=0;1=0;2=0"
yes. We can keep it like this.
>
> The confusion surrounding the global allocator seems to be prevalent ([3], [4]) as folks
> familiar with resctrl attempt to digest the work. The struggle to make this documentation clear
> makes me more concerned how this feature will be perceived by users who are not as familiar with
> resctrl internals. I think that it may be worth it to take a moment and investigate what it will take
> to implement a per-domain counter allocator. The hardware supports it and I suspect that the upfront
> work to do the enabling will make it easier for users to adopt and understand the feature.
>
> What do you think?
It adds more complexity for sure.
1. Each group needs to remember counter ids in each domain for each event.
For example:
Resctrl group mon1
Total event
dom 0 cntr_id 1,
dom 1 cntr_id 10
dom 2 cntr_id 11
Local event
dom 0 cntr_id 2,
dom 1 cntr_id 15
dom 2 cntr_id 10
2. We should have a bitmap of "available counters" in each domain. We have
this already. But allocation method changes.
3. Dynamic allocation/free of the counters
There could be some more things which I can't think of right now; they might
come up when we start working on it.
It is doable. But is it worth adding this complexity? I am not sure.
Peter mentioned earlier that he was not interested in domain-specific
assignments. He was only interested in the all-domain ("*") implementation.
We can add the support, but I am not sure it is going to drastically help the
user.
Yes, we should keep the options open for supporting domain-level
allocation in the future.
For now, we can go ahead with the current implementation.
>
> Reinette
>
> [1] https://lore.kernel.org/all/SJ1PR11MB6083DC9EA6D323356E957A87FC4E2@SJ1PR11MB6083.namprd11.prod.outlook.com/
> [2] https://lore.kernel.org/all/SJ1PR11MB6083583A24FA3B3B7C2DCD64FC442@SJ1PR11MB6083.namprd11.prod.outlook.com/
> [3] https://lore.kernel.org/all/ZwmadFbK--Qb8qWP@agluck-desk3.sc.intel.com/
> [4] https://lore.kernel.org/all/CALPaoCh1BWdWww8Kztd13GBaY9mMeZX268fOQgECRytiKm-nPQ@mail.gmail.com/
>
--
- Babu Moger
Hi Babu,
On 11/27/24 6:57 AM, Moger, Babu wrote:
> Hi Reinette,
>
> On 11/26/2024 5:56 PM, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 11/26/24 3:31 PM, Moger, Babu wrote:
>>> On 11/25/2024 1:00 PM, Reinette Chatre wrote:
>>>> On 11/22/24 3:36 PM, Moger, Babu wrote:
>>>>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>>>>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>>>>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>>>>>> Provide the interface to display the number of free monitoring counters
>>>>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>>>>>> ---
>>>>>>>>> v9: New patch.
>>>>>>>>> ---
>>>>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>>>>>> 3 files changed, 38 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>>>>>> monitoring group.
>>>>>>>>> +"available_mbm_cntrs":
>>>>>>>>> + The number of free monitoring counters available assignment in each domain
>>>>>>>>
>>>>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>>>>>> counters available for assignment"?
>>>>>>>>
>>>>>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>>>>>
>>>>>>> How about this?
>>>>>>>
>>>>>>> "The number of monitoring counters available for assignment in each domain
>>>>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>>>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>>>>>> assigned or unassigned individually in each domain. A counter is available
>>>>>>> for new assignment if it is unassigned in all domains."
>>>>>>
>>>>>> Please consider the context of this paragraph. It follows right after the description
>>>>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>>>>>> I think it is confusing to follow that with a paragraph that states "Counters can be
>>>>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>>>>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>>>>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>>>>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>>>>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>>>>>
>>>>>
>>>>> Little bit lost here. Please help me.
>>>>
>>>> I think this highlights the uncertainty this interface brings. How do you expect users
>>>> to use this interface? At this time I think this interface can create a lot of confusion.
>>>> For example, consider a hypothetical system with three domains and four counters that
>>>> has the following state per mbm_assign_control:
>>>>
>>>> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
>>>> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
>>>> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
>>>> /m3/0=_;1=_;2=_
>>>>
>>>> Since, in this system there are only four counters available, and
>>>> they have all been assigned, then there are no new counters available for
>>>> assignment.
>>>>
>>>> If I understand correctly, available_mbm_cntrs will read:
>>>> 0=1;1=3;2=1
>>>
>>> Yes. Exactly. This causes confusion to the user.
>>>>
>>>> How is a user to interpret the above numbers? It does not reflect
>>>> that no counter can be assigned to m3, instead it reflects which of the
>>>> already assigned counters still need to be activated on domains.
>>>> If, for example, a user is expected to use this file to know how
>>>> many counters can still be assigned, should it not reflect the actual
>>>> available counters. In the above scenario it will then be:
>>>> 0=0;1=0;2=0
>>>
>>> We can also just print
>>> #cat available_mbm_cntrs
>>> 0
>>>
>>> The domain specific information is not important here.
>>> That was my original idea. We can go back to that definition. That is more clear to the user.
>>
>> Tony's response [1] still applies.
>>
>> I believe Tony's suggestion [2] considered that the available counters will be the
>> same for every domain for this implementation. That is why my example noted:
>> "0=0;1=0;2=0"
>
> yes. We can keep it like this.
>
>>
>> The confusion surrounding the global allocator seems to be prevalent ([3], [4]) as folks
>> familiar with resctrl attempt to digest the work. The struggle to make this documentation clear
>> makes me more concerned how this feature will be perceived by users who are not as familiar with
>> resctrl internals. I think that it may be worth it to take a moment and investigate what it will take
>> to implement a per-domain counter allocator. The hardware supports it and I suspect that the upfront
>> work to do the enabling will make it easier for users to adopt and understand the feature.
>>
>> What do you think?
>
> It adds more complexity for sure.
I do see a difference in data structures used but the additional complexity is not
obvious to me. It seems like there will be one fewer data structure, the
global bitmap, and I think that will actually bring with it some simplification since
there is no longer the need to coordinate between the per-domain and global counters,
for example the logic that only frees a global counter if it is no longer used by a domain.
This may also simplify the update of the monitor event config (BMEC) since it can be
done directly on counters of the domain instead of needing to go back and forth between
global and per-domain counters.
>
> 1. Each group needs to remember counter ids in each domain for each event.
> For example:
> Resctrl group mon1
> Total event
> dom 0 cntr_id 1,
> dom 1 cntr_id 10
> dom 2 cntr_id 11
>
> Local event
> dom 0 cntr_id 2,
> dom 1 cntr_id 15
> dom 2 cntr_id 10
Indeed. The challenge here is that domains may come and go so it cannot be a simple
static array. As an alternative it can be an xarray indexed by the domain ID with
pointers to a struct like below to contain the counters associated with the monitor
group:
struct cntr_id {
	u32 mbm_total;
	u32 mbm_local;
}
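A rough sketch of that idea (the helper and field names below are illustrative
assumptions, not part of any posted patch):

	#include <linux/slab.h>
	#include <linux/xarray.h>

	/* Per-domain counters owned by one monitor group, as suggested above. */
	struct cntr_id {
		u32 mbm_total;
		u32 mbm_local;
	};

	/*
	 * Hypothetical helper: record the counters a group owns in one domain,
	 * in an xarray indexed by the domain ID. On domain offline the entry
	 * would be removed with xa_erase() and kfree()d.
	 */
	static int cntr_id_store(struct xarray *dom_cntrs, unsigned long dom_id,
				 u32 total_cntr, u32 local_cntr)
	{
		struct cntr_id *cid = kzalloc(sizeof(*cid), GFP_KERNEL);

		if (!cid)
			return -ENOMEM;

		cid->mbm_total = total_cntr;
		cid->mbm_local = local_cntr;

		/* xa_store() returns the displaced entry or an xa_err() pointer. */
		return xa_err(xa_store(dom_cntrs, dom_id, cid, GFP_KERNEL));
	}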
Thinking more about how this array needs to be managed made me wonder how the
current implementation deals with domains that come and go. I do not think
this is currently handled. For example, if a new domain comes online and
monitoring groups had counters dynamically assigned, then these counters are
not configured to the newly online domain.
>
>
> 2. We should have a bitmap of "available counters" in each domain. We have
> this already. But allocation method changes.
Would allocation/free not be simpler with only the per-domain bitmap needing
to be consulted?
One implementation change I can think of is the dynamic assign of counters when
a monitor group is created. Now a free counter needs to be found in each
domain. Here it can be discussed if it should be an "all or nothing"
assignment but the handling does not seem to be complex and would need to be
solved eventually anyway.
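Roughly, with only the per-domain bitmaps from this series, the "all or nothing"
variant could look like the sketch below (structure and field names are reused from
the patch above; the helper itself is an assumption, not an implementation):

	/*
	 * Hypothetical all-or-nothing allocation across domains: fail up front
	 * if any domain's per-domain bitmap is already full, otherwise claim
	 * one counter in every domain.
	 */
	static int mbm_cntr_alloc_all_domains(struct rdt_resource *r)
	{
		struct rdt_mon_domain *dom;
		unsigned int cntr;

		list_for_each_entry(dom, &r->mon_domains, hdr.list) {
			if (bitmap_full(dom->mbm_cntr_map, r->mon.num_mbm_cntrs))
				return -ENOSPC;
		}

		list_for_each_entry(dom, &r->mon_domains, hdr.list) {
			cntr = find_first_zero_bit(dom->mbm_cntr_map,
						   r->mon.num_mbm_cntrs);
			__set_bit(cntr, dom->mbm_cntr_map);
			/* The counter id may differ per domain; record it per domain. */
		}

		return 0;
	}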
> 3. Dynamic allocation/free of the counters
>
> There could be some more things which I can't think right now. It might
> come up when we start working on it.
>
> It is doable. But, is it worth adding this complexity? I am not sure.
Please elaborate where you see that this is too complex.
>
> Peter mentioned earlier that he was not interested in domain specific
> assignments. He was only interested in all domain ("*") implementation.
Peter's most recent message indicates otherwise:
https://lore.kernel.org/all/CALPaoCgiHEaY_cDbCo=537JJ7mkYZDFFDs9heYvtQ80fXuuvWQ@mail.gmail.com/
>
> We can add the support but not sure if it is going to drastically help the
> user.
>
> Yes, We should keep the options open for supporting domain level
> allocation for future.
The current interface supports domain level allocation with this support in
mind. The complication is that the interface is not behaving in an intuitive
way when backing it with global allocation. So far I have seen a lot of
confusion from knowledgeable users and I'm afraid this will worsen when more
users are exposed to this work.
>
> For now, we can go ahead with the current implementation.
>
As I see it this will require detailed documentation to explain the interface
peculiarities. This documentation is made more complicated knowing that the
peculiarities will be temporary since Peter already indicated that he will
need to fix this to support his work.
Putting this all together I do think it may really just avoid a lot of confusion
and extra unnecessary work if this is done now.
Reinette
Hi Babu, Reinette,
On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
>
> Hi Babu,
>
> On 11/27/24 6:57 AM, Moger, Babu wrote:
> > Hi Reinette,
> >
> > On 11/26/2024 5:56 PM, Reinette Chatre wrote:
> >> Hi Babu,
> >>
> >> On 11/26/24 3:31 PM, Moger, Babu wrote:
> >>> On 11/25/2024 1:00 PM, Reinette Chatre wrote:
> >>>> On 11/22/24 3:36 PM, Moger, Babu wrote:
> >>>>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
> >>>>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
> >>>>>>> On 11/15/24 18:31, Reinette Chatre wrote:
> >>>>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
> >>>>>>>>> Provide the interface to display the number of free monitoring counters
> >>>>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
> >>>>>>>>> ---
> >>>>>>>>> v9: New patch.
> >>>>>>>>> ---
> >>>>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
> >>>>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
> >>>>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
> >>>>>>>>> 3 files changed, 38 insertions(+)
> >>>>>>>>>
> >>>>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
> >>>>>>>>> index 2f3a86278e84..2bc58d974934 100644
> >>>>>>>>> --- a/Documentation/arch/x86/resctrl.rst
> >>>>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
> >>>>>>>>> @@ -302,6 +302,10 @@ with the following files:
> >>>>>>>>> memory bandwidth tracking to a single memory bandwidth event per
> >>>>>>>>> monitoring group.
> >>>>>>>>> +"available_mbm_cntrs":
> >>>>>>>>> + The number of free monitoring counters available assignment in each domain
> >>>>>>>>
> >>>>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
> >>>>>>>> counters available for assignment"?
> >>>>>>>>
> >>>>>>>> (not taking into account how text may change after addressing Peter's feedback)
> >>>>>>>
> >>>>>>> How about this?
> >>>>>>>
> >>>>>>> "The number of monitoring counters available for assignment in each domain
> >>>>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
> >>>>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
> >>>>>>> assigned or unassigned individually in each domain. A counter is available
> >>>>>>> for new assignment if it is unassigned in all domains."
> >>>>>>
> >>>>>> Please consider the context of this paragraph. It follows right after the description
> >>>>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
> >>>>>> I think it is confusing to follow that with a paragraph that states "Counters can be
> >>>>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
> >>>>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
> >>>>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
> >>>>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
> >>>>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
> >>>>>>
> >>>>>
> >>>>> Little bit lost here. Please help me.
> >>>>
> >>>> I think this highlights the uncertainty this interface brings. How do you expect users
> >>>> to use this interface? At this time I think this interface can create a lot of confusion.
> >>>> For example, consider a hypothetical system with three domains and four counters that
> >>>> has the following state per mbm_assign_control:
> >>>>
> >>>> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
> >>>> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
> >>>> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
> >>>> /m3/0=_;1=_;2=_
> >>>>
> >>>> Since, in this system there are only four counters available, and
> >>>> they have all been assigned, then there are no new counters available for
> >>>> assignment.
> >>>>
> >>>> If I understand correctly, available_mbm_cntrs will read:
> >>>> 0=1;1=3;2=1
> >>>
> >>> Yes. Exactly. This causes confusion to the user.
> >>>>
> >>>> How is a user to interpret the above numbers? It does not reflect
> >>>> that no counter can be assigned to m3, instead it reflects which of the
> >>>> already assigned counters still need to be activated on domains.
> >>>> If, for example, a user is expected to use this file to know how
> >>>> many counters can still be assigned, should it not reflect the actual
> >>>> available counters. In the above scenario it will then be:
> >>>> 0=0;1=0;2=0
> >>>
> >>> We can also just print
> >>> #cat available_mbm_cntrs
> >>> 0
> >>>
> >>> The domain specific information is not important here.
> >>> That was my original idea. We can go back to that definition. That is more clear to the user.
> >>
> >> Tony's response [1] still applies.
> >>
> >> I believe Tony's suggestion [2] considered that the available counters will be the
> >> same for every domain for this implementation. That is why my example noted:
> >> "0=0;1=0;2=0"
> >
> > yes. We can keep it like this.
> >
> >>
> >> The confusion surrounding the global allocator seems to be prevalent ([3], [4]) as folks
> >> familiar with resctrl attempt to digest the work. The struggle to make this documentation clear
> >> makes me more concerned how this feature will be perceived by users who are not as familiar with
> >> resctrl internals. I think that it may be worth it to take a moment and investigate what it will take
> >> to implement a per-domain counter allocator. The hardware supports it and I suspect that the upfront
> >> work to do the enabling will make it easier for users to adopt and understand the feature.
> >>
> >> What do you think?
> >
> > It adds more complexity for sure.
>
> I do see a difference in data structures used but the additional complexity is not
> obvious to me. It seems like there will be one fewer data structure, the
> global bitmap, and I think that will actually bring with it some simplification since
> there is no longer the need to coordinate between the per-domain and global counters,
> for example the logic that only frees a global counter if it is no longer used by a domain.
>
> This may also simplify the update of the monitor event config (BMEC) since it can be
> done directly on counters of the domain instead of needing to go back and forth between
> global and per-domain counters.
>
> >
> > 1. Each group needs to remember counter ids in each domain for each event.
> > For example:
> > Resctrl group mon1
> > Total event
> > dom 0 cntr_id 1,
> > dom 1 cntr_id 10
> > dom 2 cntr_id 11
> >
> > Local event
> > dom 0 cntr_id 2,
> > dom 1 cntr_id 15
> > dom 2 cntr_id 10
>
> Indeed. The challenge here is that domains may come and go so it cannot be a simple
> static array. As an alternative it can be an xarray indexed by the domain ID with
> pointers to a struct like below to contain the counters associated with the monitor
> group:
> struct cntr_id {
> u32 mbm_total;
> u32 mbm_local;
> }
>
>
> Thinking more about how this array needs to be managed made me wonder how the
> current implementation deals with domains that come and go. I do not think
> this is currently handled. For example, if a new domain comes online and
> monitoring groups had counters dynamically assigned, then these counters are
> not configured to the newly online domain.
In my prototype, I allocated a counter id-indexed array to each
monitoring domain structure for tracking the counter allocations,
because the hardware counters are all domain-scoped. That way the
tracking data goes away when the hardware does.
I was focused on allowing all pending counter updates to a domain
resulting from a single mbm_assign_control write to be batched and
processed in a single IPI, so I structured the counter tracker
something like this:
struct resctrl_monitor_cfg {
	int closid;
	int rmid;
	int evtid;
	bool dirty;
};
This mirrors the info needed in whatever register configures the
counter, plus a dirty flag to skip over the ones that don't need to be
updated.
For the benefit of displaying mbm_assign_control, I put a pointer back
to any counter array entry allocated in the mbm_state struct only
because it's an existing structure that exists for every rmid-domain
combination.
I didn't need to change the rdtgroup structure.
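A simplified sketch of that batching scheme (everything beyond struct
resctrl_monitor_cfg itself, i.e. the stub and the flush helper, is an assumption for
illustration and not Peter's actual prototype):

	#include <linux/types.h>

	/* Counter-id-indexed tracker allocated per monitoring domain, as above. */
	struct resctrl_monitor_cfg {
		int closid;
		int rmid;
		int evtid;
		bool dirty;
	};

	/* Stub standing in for the register write that configures one counter. */
	static void program_cntr(int cntr_id, struct resctrl_monitor_cfg *cfg)
	{
	}

	/*
	 * Hypothetical flush, run on one CPU of the domain (e.g. via
	 * smp_call_function_any()) so that all counter updates from a single
	 * mbm_assign_control write cost one IPI per domain.
	 */
	static void flush_domain_cntrs(struct resctrl_monitor_cfg *cfg, int num_cntrs)
	{
		int i;

		for (i = 0; i < num_cntrs; i++) {
			if (!cfg[i].dirty)
				continue;	/* nothing changed for this counter */

			program_cntr(i, &cfg[i]);
			cfg[i].dirty = false;
		}
	}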
>
> >
> >
> > 2. We should have a bitmap of "available counters" in each domain. We have
> > this already. But allocation method changes.
>
> Would allocation/free not be simpler with only the per-domain bitmap needing
> to be consulted?
>
> One implementation change I can think of is the dynamic assign of counters when
> a monitor group is created. Now a free counter needs to be found in each
> domain. Here it can be discussed if it should be an "all or nothing"
> assignment but the handling does not seem to be complex and would need to be
> solved eventually anyway.
>
> > 3. Dynamic allocation/free of the counters
> >
> > There could be some more things which I can't think right now. It might
> > come up when we start working on it.
> >
> > It is doable. But, is it worth adding this complexity? I am not sure.
>
> Please elaborate where you see that this is too complex.
>
> >
> > Peter mentioned earlier that he was not interested in domain specific
> > assignments. He was only interested in all domain ("*") implementation.
>
> Peter's most recent message indicates otherwise:
> https://lore.kernel.org/all/CALPaoCgiHEaY_cDbCo=537JJ7mkYZDFFDs9heYvtQ80fXuuvWQ@mail.gmail.com/
For now, I'm focused on managing the domains locally whenever possible
to avoid all IPIs, as this gives me the least overhead.
I'm also prototyping the 'T' vs 't' approach that Reinette
suggested[1], as this may take a lot of performance pressure off the
mbm_assign_control interface, as most of the routine updates to
counter assignments would be automated.
-Peter
[1] https://lore.kernel.org/lkml/7ee63634-3b55-4427-8283-8e3d38105f41@intel.com/
Hi Peter,
On 11/28/2024 5:10 AM, Peter Newman wrote:
> Hi Babu, Reinette,
>
> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>>
>> Hi Babu,
>>
>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>> Hi Reinette,
>>>
>>> On 11/26/2024 5:56 PM, Reinette Chatre wrote:
>>>> Hi Babu,
>>>>
>>>> On 11/26/24 3:31 PM, Moger, Babu wrote:
>>>>> On 11/25/2024 1:00 PM, Reinette Chatre wrote:
>>>>>> On 11/22/24 3:36 PM, Moger, Babu wrote:
>>>>>>> On 11/21/2024 3:12 PM, Reinette Chatre wrote:
>>>>>>>> On 11/19/24 11:20 AM, Moger, Babu wrote:
>>>>>>>>> On 11/15/24 18:31, Reinette Chatre wrote:
>>>>>>>>>> On 10/29/24 4:21 PM, Babu Moger wrote:
>>>>>>>>>>> Provide the interface to display the number of free monitoring counters
>>>>>>>>>>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>> v9: New patch.
>>>>>>>>>>> ---
>>>>>>>>>>> Documentation/arch/x86/resctrl.rst | 4 ++++
>>>>>>>>>>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>>>>>>>>>>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>>>>>>>>>>> 3 files changed, 38 insertions(+)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>>>>>>>>>>> index 2f3a86278e84..2bc58d974934 100644
>>>>>>>>>>> --- a/Documentation/arch/x86/resctrl.rst
>>>>>>>>>>> +++ b/Documentation/arch/x86/resctrl.rst
>>>>>>>>>>> @@ -302,6 +302,10 @@ with the following files:
>>>>>>>>>>> memory bandwidth tracking to a single memory bandwidth event per
>>>>>>>>>>> monitoring group.
>>>>>>>>>>> +"available_mbm_cntrs":
>>>>>>>>>>> + The number of free monitoring counters available assignment in each domain
>>>>>>>>>>
>>>>>>>>>> "The number of free monitoring counters available assignment" -> "The number of monitoring
>>>>>>>>>> counters available for assignment"?
>>>>>>>>>>
>>>>>>>>>> (not taking into account how text may change after addressing Peter's feedback)
>>>>>>>>>
>>>>>>>>> How about this?
>>>>>>>>>
>>>>>>>>> "The number of monitoring counters available for assignment in each domain
>>>>>>>>> when the architecture supports mbm_cntr_assign mode. There are a total of
>>>>>>>>> "num_mbm_cntrs" counters are available for assignment. Counters can be
>>>>>>>>> assigned or unassigned individually in each domain. A counter is available
>>>>>>>>> for new assignment if it is unassigned in all domains."
>>>>>>>>
>>>>>>>> Please consider the context of this paragraph. It follows right after the description
>>>>>>>> of "num_mbm_cntrs" that states "Up to two counters can be assigned per monitoring group".
>>>>>>>> I think it is confusing to follow that with a paragraph that states "Counters can be
>>>>>>>> assigned or unassigned individually in each domain." I wonder if it may be helpful to
>>>>>>>> use a different term ... for example a counter is *assigned* to an event of a monitoring
>>>>>>>> group but this assignment may be to specified (not yet supported) or all (this work) domains while
>>>>>>>> it is only *programmed*/*activated* to specified domains. Of course, all of this documentation
>>>>>>>> needs to remain coherent if future work decides to indeed support per-domain assignment.
>>>>>>>>
>>>>>>>
>>>>>>> Little bit lost here. Please help me.
>>>>>>
>>>>>> I think this highlights the uncertainty this interface brings. How do you expect users
>>>>>> to use this interface? At this time I think this interface can create a lot of confusion.
>>>>>> For example, consider a hypothetical system with three domains and four counters that
>>>>>> has the following state per mbm_assign_control:
>>>>>>
>>>>>> //0=tl;1=_;2=l #default group uses counters 0 and 1 to monitor total and local MBM
>>>>>> /m1/0=_;1=t;2=t #monitor group m1 uses counter 2, just for total MBM
>>>>>> /m2/0=l;1=_;2=l #monitor group m2 uses counter 3, just for local MBM
>>>>>> /m3/0=_;1=_;2=_
>>>>>>
>>>>>> Since, in this system there are only four counters available, and
>>>>>> they have all been assigned, then there are no new counters available for
>>>>>> assignment.
>>>>>>
>>>>>> If I understand correctly, available_mbm_cntrs will read:
>>>>>> 0=1;1=3;2=1
>>>>>
>>>>> Yes. Exactly. This causes confusion to the user.
>>>>>>
>>>>>> How is a user to interpret the above numbers? It does not reflect
>>>>>> that no counter can be assigned to m3, instead it reflects which of the
>>>>>> already assigned counters still need to be activated on domains.
>>>>>> If, for example, a user is expected to use this file to know how
>>>>>> many counters can still be assigned, should it not reflect the actual
>>>>>> available counters. In the above scenario it will then be:
>>>>>> 0=0;1=0;2=0
>>>>>
>>>>> We can also just print
>>>>> #cat available_mbm_cntrs
>>>>> 0
>>>>>
>>>>> The domain specific information is not important here.
>>>>> That was my original idea. We can go back to that definition. That is more clear to the user.
>>>>
>>>> Tony's response [1] still applies.
>>>>
>>>> I believe Tony's suggestion [2] considered that the available counters will be the
>>>> same for every domain for this implementation. That is why my example noted:
>>>> "0=0;1=0;2=0"
>>>
>>> yes. We can keep it like this.
>>>
>>>>
>>>> The confusion surrounding the global allocator seems to be prevalent ([3], [4]) as folks
>>>> familiar with resctrl attempt to digest the work. The struggle to make this documentation clear
>>>> makes me more concerned how this feature will be perceived by users who are not as familiar with
>>>> resctrl internals. I think that it may be worth it to take a moment and investigate what it will take
>>>> to implement a per-domain counter allocator. The hardware supports it and I suspect that the upfront
>>>> work to do the enabling will make it easier for users to adopt and understand the feature.
>>>>
>>>> What do you think?
>>>
>>> It adds more complexity for sure.
>>
>> I do see a difference in data structures used but the additional complexity is not
>> obvious to me. It seems like there will be one fewer data structure, the
>> global bitmap, and I think that will actually bring with it some simplification since
>> there is no longer the need to coordinate between the per-domain and global counters,
>> for example the logic that only frees a global counter if it is no longer used by a domain.
>>
>> This may also simplify the update of the monitor event config (BMEC) since it can be
>> done directly on counters of the domain instead of needing to go back and forth between
>> global and per-domain counters.
>>
>>>
>>> 1. Each group needs to remember counter ids in each domain for each event.
>>> For example:
>>> Resctrl group mon1
>>> Total event
>>> dom 0 cntr_id 1,
>>> dom 1 cntr_id 10
>>> dom 2 cntr_id 11
>>>
>>> Local event
>>> dom 0 cntr_id 2,
>>> dom 1 cntr_id 15
>>> dom 2 cntr_id 10
>>
>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>> static array. As an alternative it can be an xarray indexed by the domain ID with
>> pointers to a struct like below to contain the counters associated with the monitor
>> group:
>> struct cntr_id {
>> u32 mbm_total;
>> u32 mbm_local;
>> }
>>
>>
>> Thinking more about how this array needs to be managed made me wonder how the
>> current implementation deals with domains that come and go. I do not think
>> this is currently handled. For example, if a new domain comes online and
>> monitoring groups had counters dynamically assigned, then these counters are
>> not configured to the newly online domain.
I am trying to understand the details of your approach here.
>
> In my prototype, I allocated a counter id-indexed array to each
> monitoring domain structure for tracking the counter allocations,
> because the hardware counters are all domain-scoped. That way the
> tracking data goes away when the hardware does.
>
> I was focused on allowing all pending counter updates to a domain
> resulting from a single mbm_assign_control write to be batched and
> processed in a single IPI, so I structured the counter tracker
> something like this:
Not sure what you meant here. How are you batching two IPIs for two domains?
#echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
This is still a single write. Two IPIs are sent separately, one for each
domain.
Are you doing something different?
>
> struct resctrl_monitor_cfg {
> int closid;
> int rmid;
> int evtid;
> bool dirty;
> };
>
> This mirrors the info needed in whatever register configures the
> counter, plus a dirty flag to skip over the ones that don't need to be
> updated.
This is my understanding of your implementation:
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index d94abba1c716..9cebf065cc97 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
u32 *mbps_val;
};
+struct resctrl_monitor_cfg {
+ int closid;
+ int rmid;
+ int evtid;
+ bool dirty;
+};
+
/**
* struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
* @hdr: common header for different domain types
@@ -116,6 +123,7 @@ struct rdt_mon_domain {
struct delayed_work cqm_limbo;
int mbm_work_cpu;
int cqm_work_cpu;
+ /* Allocate num_mbm_cntrs entries in each domain */
+ struct resctrl_monitor_cfg *mon_cfg;
};
When a user requests an assignment of the total event to the default group
for domain 0, you search the rdt_mon_domain (dom 0) mon_cfg array for an
empty entry.
If there is an empty entry, use that entry for the assignment and update
its closid, rmid and evtid and set dirty = 1. We can get all of this
information from the default group here.
Does this make sense?
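A rough sketch of that search, against the mon_cfg array above (treating
evtid == 0 as "unused" purely for illustration; a real version would need
a clearer "free" marker, as discussed later in this thread):

static int mon_cfg_assign(struct rdt_resource *r, struct rdt_mon_domain *d,
			  struct rdtgroup *rdtgrp,
			  enum resctrl_event_id evtid)
{
	int i;

	for (i = 0; i < r->mon.num_mbm_cntrs; i++) {
		struct resctrl_monitor_cfg *cfg = &d->mon_cfg[i];

		if (cfg->evtid)		/* entry already in use */
			continue;

		cfg->closid = rdtgrp->closid;
		cfg->rmid = rdtgrp->mon.rmid;
		cfg->evtid = evtid;
		cfg->dirty = true;	/* hardware update still pending */
		return i;		/* counter id == array index */
	}

	return -ENOSPC;			/* no free counter in this domain */
}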
>
> For the benefit of displaying mbm_assign_control, I put a pointer back
> to any counter array entry allocated in the mbm_state struct only
> because it's an existing structure that exists for every rmid-domain
> combination.
A pointer in mbm_state may not be required here.
We are going to loop over the resctrl groups anyway, so we can search the
rdt_mon_domain to see whether a specific closid, rmid, evtid combination
is already assigned in that domain.
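For the mbm_assign_control display that per-domain search could be as
simple as the following sketch (same illustration-only "unused" convention
as above):

static bool mon_cfg_is_assigned(struct rdt_resource *r,
				struct rdt_mon_domain *d,
				int closid, int rmid,
				enum resctrl_event_id evtid)
{
	int i;

	for (i = 0; i < r->mon.num_mbm_cntrs; i++) {
		struct resctrl_monitor_cfg *cfg = &d->mon_cfg[i];

		if (cfg->evtid == evtid && cfg->closid == closid &&
		    cfg->rmid == rmid)
			return true;
	}

	return false;
}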
>
> I didn't need to change the rdtgroup structure.
Ok. That is good.
>
>>
>>>
>>>
>>> 2. We should have a bitmap of "available counters" in each domain. We have
>>> this already. But allocation method changes.
>>
>> Would allocation/free not be simpler with only the per-domain bitmap needing
>> to be consulted?
>>
>> One implementation change I can think of is the dynamic assign of counters when
>> a monitor group is created. Now a free counter needs to be found in each
>> domain. Here it can be discussed if it should be an "all or nothing"
>> assignment but the handling does not seem to be complex and would need to be
>> solved eventually anyway.
>>
>>> 3. Dynamic allocation/free of the counters
>>>
>>> There could be some more things which I can't think right now. It might
>>> come up when we start working on it.
>>>
>>> It is doable. But, is it worth adding this complexity? I am not sure.
>>
>> Please elaborate where you see that this is too complex.
>>
>>>
>>> Peter mentioned earlier that he was not interested in domain specific
>>> assignments. He was only interested in all domain ("*") implementation.
>>
>> Peter's most recent message indicates otherwise:
>> https://lore.kernel.org/all/CALPaoCgiHEaY_cDbCo=537JJ7mkYZDFFDs9heYvtQ80fXuuvWQ@mail.gmail.com/
>
> For now, I'm focused on managing the domains locally whenever possible
> to avoid all IPIs, as this gives me the least overhead.
>
> I'm also prototyping the 'T' vs 't' approach that Reinette
> suggested[1], as this may take a lot of performance pressure off the
> mbm_assign_control interface, as most of the routine updates to
> counter assignments would be automated.
>
> -Peter
>
> [1] https://lore.kernel.org/lkml/7ee63634-3b55-4427-8283-8e3d38105f41@intel.com/
>
--
- Babu Moger
Hi Babu,
On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>
> Hi Peter,
>
> On 11/28/2024 5:10 AM, Peter Newman wrote:
> > Hi Babu, Reinette,
> >
> > On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
> > <reinette.chatre@intel.com> wrote:
> >>
> >> Hi Babu,
> >>
> >> On 11/27/24 6:57 AM, Moger, Babu wrote:
> >>> 1. Each group needs to remember counter ids in each domain for each event.
> >>> For example:
> >>> Resctrl group mon1
> >>> Total event
> >>> dom 0 cntr_id 1,
> >>> dom 1 cntr_id 10
> >>> dom 2 cntr_id 11
> >>>
> >>> Local event
> >>> dom 0 cntr_id 2,
> >>> dom 1 cntr_id 15
> >>> dom 2 cntr_id 10
> >>
> >> Indeed. The challenge here is that domains may come and go so it cannot be a simple
> >> static array. As an alternative it can be an xarray indexed by the domain ID with
> >> pointers to a struct like below to contain the counters associated with the monitor
> >> group:
> >> struct cntr_id {
> >> u32 mbm_total;
> >> u32 mbm_local;
> >> }
> >>
> >>
> >> Thinking more about how this array needs to be managed made me wonder how the
> >> current implementation deals with domains that come and go. I do not think
> >> this is currently handled. For example, if a new domain comes online and
> >> monitoring groups had counters dynamically assigned, then these counters are
> >> not configured to the newly online domain.
>
> I am trying to understand the details of your approach here.
> >
> > In my prototype, I allocated a counter id-indexed array to each
> > monitoring domain structure for tracking the counter allocations,
> > because the hardware counters are all domain-scoped. That way the
> > tracking data goes away when the hardware does.
> >
> > I was focused on allowing all pending counter updates to a domain
> > resulting from a single mbm_assign_control write to be batched and
> > processed in a single IPI, so I structured the counter tracker
> > something like this:
>
> Not sure what you meant here. How are you batching two IPIs for two domains?
>
> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>
> This is still a single write. Two IPIs are sent separately, one for each
> domain.
>
> Are you doing something different?
I said "all pending counter updates to a domain", whereby I meant
targeting a single domain.
Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
What is important is that the following write also requires 1 or 2 IPIs:
(assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
for readability)
echo $'//0=t;1=t\n
/g1/0=t;1=t\n
/g2/0=t;1=t\n
/g3/0=t;1=t\n
/g4/0=t;1=t\n
/g5/0=t;1=t\n
/g6/0=t;1=t\n
/g7/0=t;1=t\n
/g8/0=t;1=t\n
/g9/0=t;1=t\n
/g10/0=t;1=t\n
/g11/0=t;1=t\n
/g12/0=t;1=t\n
/g13/0=t;1=t\n
/g14/0=t;1=t\n
/g15/0=t;1=t\n
/g16/0=t;1=t\n
/g17/0=t;1=t\n
/g18/0=t;1=t\n
/g19/0=t;1=t\n
/g20/0=t;1=t\n
/g21/0=t;1=t\n
/g22/0=t;1=t\n
/g23/0=t;1=t\n
/g24/0=t;1=t\n
/g25/0=t;1=t\n
/g26/0=t;1=t\n
/g27/0=t;1=t\n
/g28/0=t;1=t\n
/g29/0=t;1=t\n
/g30/0=t;1=t\n
/g31/0=t;1=t\n'
My ultimate goal is for a thread bound to a particular domain to be
able to unassign and reassign the local domain's 32 counters in a
single write() with no IPIs at all. And when IPIs are required, then
no more than one per domain, regardless of the number of groups
updated.
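A rough sketch of what that per-domain flush could look like, assuming the
dirty flags were set while parsing the write and that
resctrl_arch_config_cntr() stands in for whatever register write the
architecture needs (both names are placeholders, not existing interfaces):

struct flush_ctx {
	struct rdt_resource *r;
	struct rdt_mon_domain *d;
};

static void flush_domain_cntrs(void *info)
{
	struct flush_ctx *ctx = info;
	int i;

	for (i = 0; i < ctx->r->mon.num_mbm_cntrs; i++) {
		struct resctrl_monitor_cfg *cfg = &ctx->d->mon_cfg[i];

		if (!cfg->dirty)
			continue;
		resctrl_arch_config_cntr(ctx->d, i, cfg); /* placeholder hook */
		cfg->dirty = false;
	}
}

static void flush_all_domains(struct rdt_resource *r)
{
	struct rdt_mon_domain *d;
	struct flush_ctx ctx;

	/*
	 * smp_call_function_any() prefers the current CPU, so a caller
	 * already running in the domain pays no IPI; otherwise one IPI
	 * per domain, regardless of how many groups were updated.
	 */
	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		ctx.r = r;
		ctx.d = d;
		smp_call_function_any(&d->hdr.cpu_mask, flush_domain_cntrs,
				      &ctx, 1);
	}
}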
>
> >
> > struct resctrl_monitor_cfg {
> > int closid;
> > int rmid;
> > int evtid;
> > bool dirty;
> > };
> >
> > This mirrors the info needed in whatever register configures the
> > counter, plus a dirty flag to skip over the ones that don't need to be
> > updated.
>
> This is what my understanding of your implementation.
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index d94abba1c716..9cebf065cc97 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
> u32 *mbps_val;
> };
>
> +struct resctrl_monitor_cfg {
> + int closid;
> + int rmid;
> + int evtid;
> + bool dirty;
> +};
> +
> /**
> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
> resource
> * @hdr: common header for different domain types
> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
> struct delayed_work cqm_limbo;
> int mbm_work_cpu;
> int cqm_work_cpu;
> + /* Allocate num_mbm_cntrs entries in each domain */
> + struct resctrl_monitor_cfg *mon_cfg;
> };
>
>
> When a user requests an assignment for total event to the default group
> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
> entry.
>
> If there is an empty entry, then use that entry for assignment and
> update closid, rmid, evtid and dirty = 1. We can get all these
> information from default group here.
>
> Does this make sense?
Yes, sounds correct.
>
> >
> > For the benefit of displaying mbm_assign_control, I put a pointer back
> > to any counter array entry allocated in the mbm_state struct only
> > because it's an existing structure that exists for every rmid-domain
> > combination.
>
> Pointer in mbm_state may not be required here.
>
> We are going to loop over resctrl groups. We can search the
> rdt_mon_domain to see if specific closid, rmid, evtid is already
> assigned or not in that domain.
No, not required I guess. High-performance CPUs can probably search a
32-entry array very quickly.
-Peter
Hi Peter, Reinette,
On 11/29/2024 3:59 AM, Peter Newman wrote:
> Hi Babu,
>
> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>
>> Hi Peter,
>>
>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>> Hi Babu, Reinette,
>>>
>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>> <reinette.chatre@intel.com> wrote:
>>>>
>>>> Hi Babu,
>>>>
>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>
>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>> For example:
>>>>> Resctrl group mon1
>>>>> Total event
>>>>> dom 0 cntr_id 1,
>>>>> dom 1 cntr_id 10
>>>>> dom 2 cntr_id 11
>>>>>
>>>>> Local event
>>>>> dom 0 cntr_id 2,
>>>>> dom 1 cntr_id 15
>>>>> dom 2 cntr_id 10
>>>>
>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>> group:
>>>> struct cntr_id {
>>>> u32 mbm_total;
>>>> u32 mbm_local;
>>>> }
>>>>
>>>>
>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>> current implementation deals with domains that come and go. I do not think
>>>> this is currently handled. For example, if a new domain comes online and
>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>> not configured to the newly online domain.
>>
>> I am trying to understand the details of your approach here.
>>>
>>> In my prototype, I allocated a counter id-indexed array to each
>>> monitoring domain structure for tracking the counter allocations,
>>> because the hardware counters are all domain-scoped. That way the
>>> tracking data goes away when the hardware does.
>>>
>>> I was focused on allowing all pending counter updates to a domain
>>> resulting from a single mbm_assign_control write to be batched and
>>> processed in a single IPI, so I structured the counter tracker
>>> something like this:
>>
>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>
>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>
>> This is still a single write. Two IPIs are sent separately, one for each
>> domain.
>>
>> Are you doing something different?
>
> I said "all pending counter updates to a domain", whereby I meant
> targeting a single domain.
>
> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>
> What is important is that the following write also requires 1 or 2 IPIs:
>
> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
> for readability)
>
> echo $'//0=t;1=t\n
> /g1/0=t;1=t\n
> /g2/0=t;1=t\n
> /g3/0=t;1=t\n
> /g4/0=t;1=t\n
> /g5/0=t;1=t\n
> /g6/0=t;1=t\n
> /g7/0=t;1=t\n
> /g8/0=t;1=t\n
> /g9/0=t;1=t\n
> /g10/0=t;1=t\n
> /g11/0=t;1=t\n
> /g12/0=t;1=t\n
> /g13/0=t;1=t\n
> /g14/0=t;1=t\n
> /g15/0=t;1=t\n
> /g16/0=t;1=t\n
> /g17/0=t;1=t\n
> /g18/0=t;1=t\n
> /g19/0=t;1=t\n
> /g20/0=t;1=t\n
> /g21/0=t;1=t\n
> /g22/0=t;1=t\n
> /g23/0=t;1=t\n
> /g24/0=t;1=t\n
> /g25/0=t;1=t\n
> /g26/0=t;1=t\n
> /g27/0=t;1=t\n
> /g28/0=t;1=t\n
> /g29/0=t;1=t\n
> /g30/0=t;1=t\n
> /g31/0=t;1=t\n'
>
> My ultimate goal is for a thread bound to a particular domain to be
> able to unassign and reassign the local domain's 32 counters in a
> single write() with no IPIs at all. And when IPIs are required, then
> no more than one per domain, regardless of the number of groups
> updated.
>
Yes. I think I got the idea. Thanks.
>
>>
>>>
>>> struct resctrl_monitor_cfg {
>>> int closid;
>>> int rmid;
>>> int evtid;
>>> bool dirty;
>>> };
>>>
>>> This mirrors the info needed in whatever register configures the
>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>> updated.
>>
>> This is what my understanding of your implementation.
>>
>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>> index d94abba1c716..9cebf065cc97 100644
>> --- a/include/linux/resctrl.h
>> +++ b/include/linux/resctrl.h
>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>> u32 *mbps_val;
>> };
>>
>> +struct resctrl_monitor_cfg {
>> + int closid;
>> + int rmid;
>> + int evtid;
>> + bool dirty;
>> +};
>> +
>> /**
>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>> resource
>> * @hdr: common header for different domain types
>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>> struct delayed_work cqm_limbo;
>> int mbm_work_cpu;
>> int cqm_work_cpu;
>> + /* Allocate num_mbm_cntrs entries in each domain */
>> + struct resctrl_monitor_cfg *mon_cfg;
>> };
>>
>>
>> When a user requests an assignment for total event to the default group
>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>> entry.
>>
>> If there is an empty entry, then use that entry for assignment and
>> update closid, rmid, evtid and dirty = 1. We can get all these
>> information from default group here.
>>
>> Does this make sense?
>
> Yes, sounds correct.
I will probably add cntr_id to the resctrl_monitor_cfg structure and
initialize it during allocation, and rename the field 'dirty' to 'active'
(or something similar) to hold the assign state for that entry.
That way we have all the information required for assignment in one
place and do not need to update the rdtgroup structure.
Reinette, What do you think about this approach?
>
>>
>>>
>>> For the benefit of displaying mbm_assign_control, I put a pointer back
>>> to any counter array entry allocated in the mbm_state struct only
>>> because it's an existing structure that exists for every rmid-domain
>>> combination.
>>
>> Pointer in mbm_state may not be required here.
>>
>> We are going to loop over resctrl groups. We can search the
>> rdt_mon_domain to see if specific closid, rmid, evtid is already
>> assigned or not in that domain.
>
> No, not required I guess. High-performance CPUs can probably search a
> 32-entry array very quickly.
Ok.
--
- Babu Moger
Hi Babu and Peter,
On 11/29/24 9:06 AM, Moger, Babu wrote:
> Hi Peter, Reinette,
>
> On 11/29/2024 3:59 AM, Peter Newman wrote:
>> Hi Babu,
>>
>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>
>>> Hi Peter,
>>>
>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>> Hi Babu, Reinette,
>>>>
>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>> <reinette.chatre@intel.com> wrote:
>>>>>
>>>>> Hi Babu,
>>>>>
>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>
>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>> For example:
>>>>>> Resctrl group mon1
>>>>>> Total event
>>>>>> dom 0 cntr_id 1,
>>>>>> dom 1 cntr_id 10
>>>>>> dom 2 cntr_id 11
>>>>>>
>>>>>> Local event
>>>>>> dom 0 cntr_id 2,
>>>>>> dom 1 cntr_id 15
>>>>>> dom 2 cntr_id 10
>>>>>
>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>> group:
>>>>> struct cntr_id {
>>>>> u32 mbm_total;
>>>>> u32 mbm_local;
>>>>> }
>>>>>
>>>>>
>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>> current implementation deals with domains that come and go. I do not think
>>>>> this is currently handled. For example, if a new domain comes online and
>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>> not configured to the newly online domain.
>>>
>>> I am trying to understand the details of your approach here.
>>>>
>>>> In my prototype, I allocated a counter id-indexed array to each
>>>> monitoring domain structure for tracking the counter allocations,
>>>> because the hardware counters are all domain-scoped. That way the
>>>> tracking data goes away when the hardware does.
>>>>
>>>> I was focused on allowing all pending counter updates to a domain
>>>> resulting from a single mbm_assign_control write to be batched and
>>>> processed in a single IPI, so I structured the counter tracker
>>>> something like this:
>>>
>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>
>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>
>>> This is still a single write. Two IPIs are sent separately, one for each
>>> domain.
>>>
>>> Are you doing something different?
>>
>> I said "all pending counter updates to a domain", whereby I meant
>> targeting a single domain.
>>
>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>
>> What is important is that the following write also requires 1 or 2 IPIs:
>>
>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>> for readability)
>>
>> echo $'//0=t;1=t\n
>> /g1/0=t;1=t\n
>> /g2/0=t;1=t\n
>> /g3/0=t;1=t\n
>> /g4/0=t;1=t\n
>> /g5/0=t;1=t\n
>> /g6/0=t;1=t\n
>> /g7/0=t;1=t\n
>> /g8/0=t;1=t\n
>> /g9/0=t;1=t\n
>> /g10/0=t;1=t\n
>> /g11/0=t;1=t\n
>> /g12/0=t;1=t\n
>> /g13/0=t;1=t\n
>> /g14/0=t;1=t\n
>> /g15/0=t;1=t\n
>> /g16/0=t;1=t\n
>> /g17/0=t;1=t\n
>> /g18/0=t;1=t\n
>> /g19/0=t;1=t\n
>> /g20/0=t;1=t\n
>> /g21/0=t;1=t\n
>> /g22/0=t;1=t\n
>> /g23/0=t;1=t\n
>> /g24/0=t;1=t\n
>> /g25/0=t;1=t\n
>> /g26/0=t;1=t\n
>> /g27/0=t;1=t\n
>> /g28/0=t;1=t\n
>> /g29/0=t;1=t\n
>> /g30/0=t;1=t\n
>> /g31/0=t;1=t\n'
>>
>> My ultimate goal is for a thread bound to a particular domain to be
>> able to unassign and reassign the local domain's 32 counters in a
>> single write() with no IPIs at all. And when IPIs are required, then
>> no more than one per domain, regardless of the number of groups
>> updated.
>>
>
> Yes. I think I got the idea. Thanks.
>
>>
>>>
>>>>
>>>> struct resctrl_monitor_cfg {
>>>> int closid;
>>>> int rmid;
>>>> int evtid;
>>>> bool dirty;
>>>> };
>>>>
>>>> This mirrors the info needed in whatever register configures the
>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>> updated.
>>>
>>> This is what my understanding of your implementation.
>>>
>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>> index d94abba1c716..9cebf065cc97 100644
>>> --- a/include/linux/resctrl.h
>>> +++ b/include/linux/resctrl.h
>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>> u32 *mbps_val;
>>> };
>>>
>>> +struct resctrl_monitor_cfg {
>>> + int closid;
>>> + int rmid;
>>> + int evtid;
>>> + bool dirty;
>>> +};
>>> +
>>> /**
>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>> resource
>>> * @hdr: common header for different domain types
>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>> struct delayed_work cqm_limbo;
>>> int mbm_work_cpu;
>>> int cqm_work_cpu;
>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>> + struct resctrl_monitor_cfg *mon_cfg;
>>> };
>>>
>>>
>>> When a user requests an assignment for total event to the default group
>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>> entry.
>>>
>>> If there is an empty entry, then use that entry for assignment and
>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>> information from default group here.
>>>
>>> Does this make sense?
>>
>> Yes, sounds correct.
>
> I will probably add cntr_id in resctrl_monitor_cfg structure and
> initialize during the allocation. And rename the field 'dirty' to
> 'active'(or something similar) to hold the assign state for that
> entry. That way we have all the information required for assignment
> at one place. We don't need to update the rdtgroup structure.
>
> Reinette, What do you think about this approach?
I think this approach is in the right direction. Thanks to Peter for
the guidance here.
I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
though, I think the cntr_id would be the index to the array instead?
It may also be worthwhile to consider using a pointer to the resource
group instead of storing closid and rmid directly. If used to indicate
initialization then an initialized pointer is easier to distinguish than
the closid/rmid that may have zero as valid values.
I expect evtid will be enum resctrl_event_id, and that raises the question
of whether "0" can indeed be used as an "uninitialized" value since doing
so would change the meaning of the enum. It may indeed be cleaner to keep
things separated: maintain evtid as an enum resctrl_event_id and note the
initialization differently ... either via a pointer to a resource group
or entirely separately as Babu indicates later.
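For illustration only (names not final), the entry could then end up
looking something like this:

struct resctrl_monitor_cfg {
	struct rdtgroup		*rdtgrp;  /* NULL => counter not assigned */
	enum resctrl_event_id	evtid;    /* valid only when rdtgrp != NULL */
	bool			dirty;    /* hardware update pending */
};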
>>>> For the benefit of displaying mbm_assign_control, I put a pointer back
>>>> to any counter array entry allocated in the mbm_state struct only
>>>> because it's an existing structure that exists for every rmid-domain
>>>> combination.
>>>
>>> Pointer in mbm_state may not be required here.
>>>
>>> We are going to loop over resctrl groups. We can search the
>>> rdt_mon_domain to see if specific closid, rmid, evtid is already
>>> assigned or not in that domain.
>>
>> No, not required I guess. High-performance CPUs can probably search a
>> 32-entry array very quickly.
>
> Ok.
>
This is not so clear to me. I am wondering about the scenario when a resource
group is removed and its counters need to be freed. Searching which counters
need to be freed would then require a search of the array within every domain,
of which I understand there can be many? Having a pointer from the mbm_state
may help here.
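As a sketch of that idea (the link structure and field names are only
illustrative), freeing a group's counter in a domain could then go straight
through the back-pointer instead of scanning the domain's array:

/* Hypothetical back-pointer kept in the per-rmid, per-domain MBM state. */
struct mbm_cntr_link {
	struct resctrl_monitor_cfg *cntr;  /* NULL if no counter assigned */
};

static void mbm_cntr_release(struct mbm_cntr_link *link)
{
	if (!link->cntr)
		return;

	link->cntr->rdtgrp = NULL;  /* counter is free again */
	link->cntr->dirty = true;   /* hardware still needs to be cleared */
	link->cntr = NULL;
}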
Reinette
Hi Reinette,
On 12/2/24 12:33, Reinette Chatre wrote:
> Hi Babu and Peter,
>
> On 11/29/24 9:06 AM, Moger, Babu wrote:
>> Hi Peter, Reinette,
>>
>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>> Hi Babu,
>>>
>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>
>>>> Hi Peter,
>>>>
>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>> Hi Babu, Reinette,
>>>>>
>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>
>>>>>> Hi Babu,
>>>>>>
>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>
>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>> For example:
>>>>>>> Resctrl group mon1
>>>>>>> Total event
>>>>>>> dom 0 cntr_id 1,
>>>>>>> dom 1 cntr_id 10
>>>>>>> dom 2 cntr_id 11
>>>>>>>
>>>>>>> Local event
>>>>>>> dom 0 cntr_id 2,
>>>>>>> dom 1 cntr_id 15
>>>>>>> dom 2 cntr_id 10
>>>>>>
>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>> group:
>>>>>> struct cntr_id {
>>>>>> u32 mbm_total;
>>>>>> u32 mbm_local;
>>>>>> }
>>>>>>
>>>>>>
>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>> not configured to the newly online domain.
>>>>
>>>> I am trying to understand the details of your approach here.
>>>>>
>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>> monitoring domain structure for tracking the counter allocations,
>>>>> because the hardware counters are all domain-scoped. That way the
>>>>> tracking data goes away when the hardware does.
>>>>>
>>>>> I was focused on allowing all pending counter updates to a domain
>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>> processed in a single IPI, so I structured the counter tracker
>>>>> something like this:
>>>>
>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>
>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>
>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>> domain.
>>>>
>>>> Are you doing something different?
>>>
>>> I said "all pending counter updates to a domain", whereby I meant
>>> targeting a single domain.
>>>
>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>
>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>
>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>> for readability)
>>>
>>> echo $'//0=t;1=t\n
>>> /g1/0=t;1=t\n
>>> /g2/0=t;1=t\n
>>> /g3/0=t;1=t\n
>>> /g4/0=t;1=t\n
>>> /g5/0=t;1=t\n
>>> /g6/0=t;1=t\n
>>> /g7/0=t;1=t\n
>>> /g8/0=t;1=t\n
>>> /g9/0=t;1=t\n
>>> /g10/0=t;1=t\n
>>> /g11/0=t;1=t\n
>>> /g12/0=t;1=t\n
>>> /g13/0=t;1=t\n
>>> /g14/0=t;1=t\n
>>> /g15/0=t;1=t\n
>>> /g16/0=t;1=t\n
>>> /g17/0=t;1=t\n
>>> /g18/0=t;1=t\n
>>> /g19/0=t;1=t\n
>>> /g20/0=t;1=t\n
>>> /g21/0=t;1=t\n
>>> /g22/0=t;1=t\n
>>> /g23/0=t;1=t\n
>>> /g24/0=t;1=t\n
>>> /g25/0=t;1=t\n
>>> /g26/0=t;1=t\n
>>> /g27/0=t;1=t\n
>>> /g28/0=t;1=t\n
>>> /g29/0=t;1=t\n
>>> /g30/0=t;1=t\n
>>> /g31/0=t;1=t\n'
>>>
>>> My ultimate goal is for a thread bound to a particular domain to be
>>> able to unassign and reassign the local domain's 32 counters in a
>>> single write() with no IPIs at all. And when IPIs are required, then
>>> no more than one per domain, regardless of the number of groups
>>> updated.
>>>
>>
>> Yes. I think I got the idea. Thanks.
>>
>>>
>>>>
>>>>>
>>>>> struct resctrl_monitor_cfg {
>>>>> int closid;
>>>>> int rmid;
>>>>> int evtid;
>>>>> bool dirty;
>>>>> };
>>>>>
>>>>> This mirrors the info needed in whatever register configures the
>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>> updated.
>>>>
>>>> This is what my understanding of your implementation.
>>>>
>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>> index d94abba1c716..9cebf065cc97 100644
>>>> --- a/include/linux/resctrl.h
>>>> +++ b/include/linux/resctrl.h
>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>> u32 *mbps_val;
>>>> };
>>>>
>>>> +struct resctrl_monitor_cfg {
>>>> + int closid;
>>>> + int rmid;
>>>> + int evtid;
>>>> + bool dirty;
>>>> +};
>>>> +
>>>> /**
>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>> resource
>>>> * @hdr: common header for different domain types
>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>> struct delayed_work cqm_limbo;
>>>> int mbm_work_cpu;
>>>> int cqm_work_cpu;
>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>> };
>>>>
>>>>
>>>> When a user requests an assignment for total event to the default group
>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>> entry.
>>>>
>>>> If there is an empty entry, then use that entry for assignment and
>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>> information from default group here.
>>>>
>>>> Does this make sense?
>>>
>>> Yes, sounds correct.
>>
>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>> initialize during the allocation. And rename the field 'dirty' to
>> 'active'(or something similar) to hold the assign state for that
>> entry. That way we have all the information required for assignment
>> at one place. We don't need to update the rdtgroup structure.
>>
>> Reinette, What do you think about this approach?
>
> I think this approach is in the right direction. Thanks to Peter for
> the guidance here.
> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
> though, I think the cntr_id would be the index to the array instead?
Yes. I think we can use the index as the cntr_id. Will let you know otherwise.
>
> It may also be worthwhile to consider using a pointer to the resource
> group instead of storing closid and rmid directly. If used to indicate
> initialization then an initialized pointer is easier to distinguish than
> the closid/rmid that may have zero as valid values.
Sure. Sounds good.
>
> I expect evtid will be enum resctrl_event_id and that raises the question
> of whether "0" can indeed be used as an "uninitialized" value since doing
> so would change the meaning of the enum. It may indeed keep things
> separated by maintaining evtid as an enum resctrl_event_id and note the
> initialization differently ... either via a pointer to a resource group
> or entirely separately as Babu indicates later.
Sure. Will add evtid as enum resctrl_event_id and use the "state" to
indicate assign/unassign/dirty status.
>
>>>>> For the benefit of displaying mbm_assign_control, I put a pointer back
>>>>> to any counter array entry allocated in the mbm_state struct only
>>>>> because it's an existing structure that exists for every rmid-domain
>>>>> combination.
>>>>
>>>> Pointer in mbm_state may not be required here.
>>>>
>>>> We are going to loop over resctrl groups. We can search the
>>>> rdt_mon_domain to see if specific closid, rmid, evtid is already
>>>> assigned or not in that domain.
>>>
>>> No, not required I guess. High-performance CPUs can probably search a
>>> 32-entry array very quickly.
>>
>> Ok.
>>
>
> This is not so clear to me. I am wondering about the scenario when a resource
> group is removed and its counters need to be freed. Searching which counters
> need to be freed would then require a search of the array within every domain,
> of which I understand there can be many? Having a pointer from the mbm_state
> may help here.
Sure. Will add the allocated entry pointer in mbm_state.
>
> Reinette
>
>
--
Thanks
Babu Moger
Hi Babu,
On 12/2/24 11:48 AM, Moger, Babu wrote:
> On 12/2/24 12:33, Reinette Chatre wrote:
>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>
>>>>>>> Hi Babu,
>>>>>>>
>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>
>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>> For example:
>>>>>>>> Resctrl group mon1
>>>>>>>> Total event
>>>>>>>> dom 0 cntr_id 1,
>>>>>>>> dom 1 cntr_id 10
>>>>>>>> dom 2 cntr_id 11
>>>>>>>>
>>>>>>>> Local event
>>>>>>>> dom 0 cntr_id 2,
>>>>>>>> dom 1 cntr_id 15
>>>>>>>> dom 2 cntr_id 10
>>>>>>>
>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>> group:
>>>>>>> struct cntr_id {
>>>>>>> u32 mbm_total;
>>>>>>> u32 mbm_local;
>>>>>>> }
>>>>>>>
>>>>>>>
>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>> not configured to the newly online domain.
>>>>>
>>>>> I am trying to understand the details of your approach here.
>>>>>>
>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>> tracking data goes away when the hardware does.
>>>>>>
>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>> something like this:
>>>>>
>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>
>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>
>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>> domain.
>>>>>
>>>>> Are you doing something different?
>>>>
>>>> I said "all pending counter updates to a domain", whereby I meant
>>>> targeting a single domain.
>>>>
>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>
>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>
>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>> for readability)
>>>>
>>>> echo $'//0=t;1=t\n
>>>> /g1/0=t;1=t\n
>>>> /g2/0=t;1=t\n
>>>> /g3/0=t;1=t\n
>>>> /g4/0=t;1=t\n
>>>> /g5/0=t;1=t\n
>>>> /g6/0=t;1=t\n
>>>> /g7/0=t;1=t\n
>>>> /g8/0=t;1=t\n
>>>> /g9/0=t;1=t\n
>>>> /g10/0=t;1=t\n
>>>> /g11/0=t;1=t\n
>>>> /g12/0=t;1=t\n
>>>> /g13/0=t;1=t\n
>>>> /g14/0=t;1=t\n
>>>> /g15/0=t;1=t\n
>>>> /g16/0=t;1=t\n
>>>> /g17/0=t;1=t\n
>>>> /g18/0=t;1=t\n
>>>> /g19/0=t;1=t\n
>>>> /g20/0=t;1=t\n
>>>> /g21/0=t;1=t\n
>>>> /g22/0=t;1=t\n
>>>> /g23/0=t;1=t\n
>>>> /g24/0=t;1=t\n
>>>> /g25/0=t;1=t\n
>>>> /g26/0=t;1=t\n
>>>> /g27/0=t;1=t\n
>>>> /g28/0=t;1=t\n
>>>> /g29/0=t;1=t\n
>>>> /g30/0=t;1=t\n
>>>> /g31/0=t;1=t\n'
>>>>
>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>> able to unassign and reassign the local domain's 32 counters in a
>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>> no more than one per domain, regardless of the number of groups
>>>> updated.
>>>>
>>>
>>> Yes. I think I got the idea. Thanks.
>>>
>>>>
>>>>>
>>>>>>
>>>>>> struct resctrl_monitor_cfg {
>>>>>> int closid;
>>>>>> int rmid;
>>>>>> int evtid;
>>>>>> bool dirty;
>>>>>> };
>>>>>>
>>>>>> This mirrors the info needed in whatever register configures the
>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>> updated.
>>>>>
>>>>> This is what my understanding of your implementation.
>>>>>
>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>> --- a/include/linux/resctrl.h
>>>>> +++ b/include/linux/resctrl.h
>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>> u32 *mbps_val;
>>>>> };
>>>>>
>>>>> +struct resctrl_monitor_cfg {
>>>>> + int closid;
>>>>> + int rmid;
>>>>> + int evtid;
>>>>> + bool dirty;
>>>>> +};
>>>>> +
>>>>> /**
>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>> resource
>>>>> * @hdr: common header for different domain types
>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>> struct delayed_work cqm_limbo;
>>>>> int mbm_work_cpu;
>>>>> int cqm_work_cpu;
>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>> };
>>>>>
>>>>>
>>>>> When a user requests an assignment for total event to the default group
>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>> entry.
>>>>>
>>>>> If there is an empty entry, then use that entry for assignment and
>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>> information from default group here.
>>>>>
>>>>> Does this make sense?
>>>>
>>>> Yes, sounds correct.
>>>
>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>> initialize during the allocation. And rename the field 'dirty' to
>>> 'active'(or something similar) to hold the assign state for that
>>> entry. That way we have all the information required for assignment
>>> at one place. We don't need to update the rdtgroup structure.
>>>
>>> Reinette, What do you think about this approach?
>>
>> I think this approach is in the right direction. Thanks to Peter for
>> the guidance here.
>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>> though, I think the cntr_id would be the index to the array instead?
>
> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>
>
>>
>> It may also be worthwhile to consider using a pointer to the resource
>> group instead of storing closid and rmid directly. If used to indicate
>> initialization then an initialized pointer is easier to distinguish than
>> the closid/rmid that may have zero as valid values.
>
> Sure. Sounds good.
>
>>
>> I expect evtid will be enum resctrl_event_id and that raises the question
>> of whether "0" can indeed be used as an "uninitialized" value since doing
>> so would change the meaning of the enum. It may indeed keep things
>> separated by maintaining evtid as an enum resctrl_event_id and note the
>> initialization differently ... either via a pointer to a resource group
>> or entirely separately as Babu indicates later.
>
> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
> indicate assign/unassign/dirty status.
Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
to the resource group to which the counter has been assigned then I expect NULL
means unassigned and a value means assigned?
Reinette
Hi Reinette,
On 12/2/24 14:15, Reinette Chatre wrote:
> Hi Babu,
>
> On 12/2/24 11:48 AM, Moger, Babu wrote:
>> On 12/2/24 12:33, Reinette Chatre wrote:
>>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>>
>>>>>>>> Hi Babu,
>>>>>>>>
>>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>>
>>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>>> For example:
>>>>>>>>> Resctrl group mon1
>>>>>>>>> Total event
>>>>>>>>> dom 0 cntr_id 1,
>>>>>>>>> dom 1 cntr_id 10
>>>>>>>>> dom 2 cntr_id 11
>>>>>>>>>
>>>>>>>>> Local event
>>>>>>>>> dom 0 cntr_id 2,
>>>>>>>>> dom 1 cntr_id 15
>>>>>>>>> dom 2 cntr_id 10
>>>>>>>>
>>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>>> group:
>>>>>>>> struct cntr_id {
>>>>>>>> u32 mbm_total;
>>>>>>>> u32 mbm_local;
>>>>>>>> }
>>>>>>>>
>>>>>>>>
>>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>>> not configured to the newly online domain.
>>>>>>
>>>>>> I am trying to understand the details of your approach here.
>>>>>>>
>>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>>> tracking data goes away when the hardware does.
>>>>>>>
>>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>>> something like this:
>>>>>>
>>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>>
>>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>>
>>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>>> domain.
>>>>>>
>>>>>> Are you doing something different?
>>>>>
>>>>> I said "all pending counter updates to a domain", whereby I meant
>>>>> targeting a single domain.
>>>>>
>>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>>
>>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>>
>>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>>> for readability)
>>>>>
>>>>> echo $'//0=t;1=t\n
>>>>> /g1/0=t;1=t\n
>>>>> /g2/0=t;1=t\n
>>>>> /g3/0=t;1=t\n
>>>>> /g4/0=t;1=t\n
>>>>> /g5/0=t;1=t\n
>>>>> /g6/0=t;1=t\n
>>>>> /g7/0=t;1=t\n
>>>>> /g8/0=t;1=t\n
>>>>> /g9/0=t;1=t\n
>>>>> /g10/0=t;1=t\n
>>>>> /g11/0=t;1=t\n
>>>>> /g12/0=t;1=t\n
>>>>> /g13/0=t;1=t\n
>>>>> /g14/0=t;1=t\n
>>>>> /g15/0=t;1=t\n
>>>>> /g16/0=t;1=t\n
>>>>> /g17/0=t;1=t\n
>>>>> /g18/0=t;1=t\n
>>>>> /g19/0=t;1=t\n
>>>>> /g20/0=t;1=t\n
>>>>> /g21/0=t;1=t\n
>>>>> /g22/0=t;1=t\n
>>>>> /g23/0=t;1=t\n
>>>>> /g24/0=t;1=t\n
>>>>> /g25/0=t;1=t\n
>>>>> /g26/0=t;1=t\n
>>>>> /g27/0=t;1=t\n
>>>>> /g28/0=t;1=t\n
>>>>> /g29/0=t;1=t\n
>>>>> /g30/0=t;1=t\n
>>>>> /g31/0=t;1=t\n'
>>>>>
>>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>>> able to unassign and reassign the local domain's 32 counters in a
>>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>>> no more than one per domain, regardless of the number of groups
>>>>> updated.
>>>>>
>>>>
>>>> Yes. I think I got the idea. Thanks.
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> struct resctrl_monitor_cfg {
>>>>>>> int closid;
>>>>>>> int rmid;
>>>>>>> int evtid;
>>>>>>> bool dirty;
>>>>>>> };
>>>>>>>
>>>>>>> This mirrors the info needed in whatever register configures the
>>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>>> updated.
>>>>>>
>>>>>> This is what my understanding of your implementation.
>>>>>>
>>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>>> --- a/include/linux/resctrl.h
>>>>>> +++ b/include/linux/resctrl.h
>>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>>> u32 *mbps_val;
>>>>>> };
>>>>>>
>>>>>> +struct resctrl_monitor_cfg {
>>>>>> + int closid;
>>>>>> + int rmid;
>>>>>> + int evtid;
>>>>>> + bool dirty;
>>>>>> +};
>>>>>> +
>>>>>> /**
>>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>>> resource
>>>>>> * @hdr: common header for different domain types
>>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>>> struct delayed_work cqm_limbo;
>>>>>> int mbm_work_cpu;
>>>>>> int cqm_work_cpu;
>>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>>> };
>>>>>>
>>>>>>
>>>>>> When a user requests an assignment for total event to the default group
>>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>>> entry.
>>>>>>
>>>>>> If there is an empty entry, then use that entry for assignment and
>>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>>> information from default group here.
>>>>>>
>>>>>> Does this make sense?
>>>>>
>>>>> Yes, sounds correct.
>>>>
>>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>>> initialize during the allocation. And rename the field 'dirty' to
>>>> 'active'(or something similar) to hold the assign state for that
>>>> entry. That way we have all the information required for assignment
>>>> at one place. We don't need to update the rdtgroup structure.
>>>>
>>>> Reinette, What do you think about this approach?
>>>
>>> I think this approach is in the right direction. Thanks to Peter for
>>> the guidance here.
>>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>>> though, I think the cntr_id would be the index to the array instead?
>>
>> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>>
>>
>>>
>>> It may also be worthwhile to consider using a pointer to the resource
>>> group instead of storing closid and rmid directly. If used to indicate
>>> initialization then an initialized pointer is easier to distinguish than
>>> the closid/rmid that may have zero as valid values.
>>
>> Sure. Sounds good.
>>
>>>
>>> I expect evtid will be enum resctrl_event_id and that raises the question
>>> of whether "0" can indeed be used as an "uninitialized" value since doing
>>> so would change the meaning of the enum. It may indeed keep things
>>> separated by maintaining evtid as an enum resctrl_event_id and note the
>>> initialization differently ... either via a pointer to a resource group
>>> or entirely separately as Babu indicates later.
>>
>> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
>> indicate assign/unassign/dirty status.
>
> Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
> to the resource group to which the counter has been assigned then I expect NULL
> means unassigned and a value means assigned?
Yes. We can use the rdtgroup pointer to check the assigned/unassigned state.
I will drop the 'state' field. Peter can add a state field later when he
wants to use it for optimization.
I think we still need the 'cntr_id' field in resctrl_monitor_cfg: when we
reach the entry through the pointer in mbm_state, we won't know which
cntr_id index it came from.
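Roughly (illustrative names only, using the rdtgrp pointer and cntr_id
field discussed above; resctrl_arch_reset_cntr() is just a placeholder for
the architecture hook that clears the counter), the free path would then
look like:

static void mon_cfg_free(struct rdt_mon_domain *d,
			 struct resctrl_monitor_cfg *cfg)
{
	/*
	 * cfg may have been reached through the mbm_state back-pointer,
	 * so cfg->cntr_id is what identifies which hardware counter of
	 * this domain to clear.
	 */
	resctrl_arch_reset_cntr(d, cfg->cntr_id);  /* placeholder hook */
	cfg->rdtgrp = NULL;
}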
--
Thanks
Babu Moger
Hi Babu,
On 12/2/24 12:42 PM, Moger, Babu wrote:
> Hi Reinette,
>
> On 12/2/24 14:15, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 12/2/24 11:48 AM, Moger, Babu wrote:
>>> On 12/2/24 12:33, Reinette Chatre wrote:
>>>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>>>
>>>>>>>>> Hi Babu,
>>>>>>>>>
>>>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>>>
>>>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>>>> For example:
>>>>>>>>>> Resctrl group mon1
>>>>>>>>>> Total event
>>>>>>>>>> dom 0 cntr_id 1,
>>>>>>>>>> dom 1 cntr_id 10
>>>>>>>>>> dom 2 cntr_id 11
>>>>>>>>>>
>>>>>>>>>> Local event
>>>>>>>>>> dom 0 cntr_id 2,
>>>>>>>>>> dom 1 cntr_id 15
>>>>>>>>>> dom 2 cntr_id 10
>>>>>>>>>
>>>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>>>> group:
>>>>>>>>> struct cntr_id {
>>>>>>>>> u32 mbm_total;
>>>>>>>>> u32 mbm_local;
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>>>> not configured to the newly online domain.
>>>>>>>
>>>>>>> I am trying to understand the details of your approach here.
>>>>>>>>
>>>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>>>> tracking data goes away when the hardware does.
>>>>>>>>
>>>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>>>> something like this:
>>>>>>>
>>>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>>>
>>>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>>>
>>>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>>>> domain.
>>>>>>>
>>>>>>> Are you doing something different?
>>>>>>
>>>>>> I said "all pending counter updates to a domain", whereby I meant
>>>>>> targeting a single domain.
>>>>>>
>>>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>>>
>>>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>>>
>>>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>>>> for readability)
>>>>>>
>>>>>> echo $'//0=t;1=t\n
>>>>>> /g1/0=t;1=t\n
>>>>>> /g2/0=t;1=t\n
>>>>>> /g3/0=t;1=t\n
>>>>>> /g4/0=t;1=t\n
>>>>>> /g5/0=t;1=t\n
>>>>>> /g6/0=t;1=t\n
>>>>>> /g7/0=t;1=t\n
>>>>>> /g8/0=t;1=t\n
>>>>>> /g9/0=t;1=t\n
>>>>>> /g10/0=t;1=t\n
>>>>>> /g11/0=t;1=t\n
>>>>>> /g12/0=t;1=t\n
>>>>>> /g13/0=t;1=t\n
>>>>>> /g14/0=t;1=t\n
>>>>>> /g15/0=t;1=t\n
>>>>>> /g16/0=t;1=t\n
>>>>>> /g17/0=t;1=t\n
>>>>>> /g18/0=t;1=t\n
>>>>>> /g19/0=t;1=t\n
>>>>>> /g20/0=t;1=t\n
>>>>>> /g21/0=t;1=t\n
>>>>>> /g22/0=t;1=t\n
>>>>>> /g23/0=t;1=t\n
>>>>>> /g24/0=t;1=t\n
>>>>>> /g25/0=t;1=t\n
>>>>>> /g26/0=t;1=t\n
>>>>>> /g27/0=t;1=t\n
>>>>>> /g28/0=t;1=t\n
>>>>>> /g29/0=t;1=t\n
>>>>>> /g30/0=t;1=t\n
>>>>>> /g31/0=t;1=t\n'
>>>>>>
>>>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>>>> able to unassign and reassign the local domain's 32 counters in a
>>>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>>>> no more than one per domain, regardless of the number of groups
>>>>>> updated.
>>>>>>
>>>>>
>>>>> Yes. I think I got the idea. Thanks.
>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> struct resctrl_monitor_cfg {
>>>>>>>> int closid;
>>>>>>>> int rmid;
>>>>>>>> int evtid;
>>>>>>>> bool dirty;
>>>>>>>> };
>>>>>>>>
>>>>>>>> This mirrors the info needed in whatever register configures the
>>>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>>>> updated.
>>>>>>>
>>>>>>> This is what my understanding of your implementation.
>>>>>>>
>>>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>>>> --- a/include/linux/resctrl.h
>>>>>>> +++ b/include/linux/resctrl.h
>>>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>>>> u32 *mbps_val;
>>>>>>> };
>>>>>>>
>>>>>>> +struct resctrl_monitor_cfg {
>>>>>>> + int closid;
>>>>>>> + int rmid;
>>>>>>> + int evtid;
>>>>>>> + bool dirty;
>>>>>>> +};
>>>>>>> +
>>>>>>> /**
>>>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>>>> resource
>>>>>>> * @hdr: common header for different domain types
>>>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>>>> struct delayed_work cqm_limbo;
>>>>>>> int mbm_work_cpu;
>>>>>>> int cqm_work_cpu;
>>>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>>>> };
>>>>>>>
>>>>>>>
>>>>>>> When a user requests an assignment for total event to the default group
>>>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>>>> entry.
>>>>>>>
>>>>>>> If there is an empty entry, then use that entry for assignment and
>>>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>>>> information from default group here.
>>>>>>>
>>>>>>> Does this make sense?
>>>>>>
>>>>>> Yes, sounds correct.
>>>>>
>>>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>>>> initialize during the allocation. And rename the field 'dirty' to
>>>>> 'active'(or something similar) to hold the assign state for that
>>>>> entry. That way we have all the information required for assignment
>>>>> at one place. We don't need to update the rdtgroup structure.
>>>>>
>>>>> Reinette, What do you think about this approach?
>>>>
>>>> I think this approach is in the right direction. Thanks to Peter for
>>>> the guidance here.
>>>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>>>> though, I think the cntr_id would be the index to the array instead?
>>>
>>> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>>>
>>>
>>>>
>>>> It may also be worthwhile to consider using a pointer to the resource
>>>> group instead of storing closid and rmid directly. If used to indicate
>>>> initialization then an initialized pointer is easier to distinguish than
>>>> the closid/rmid that may have zero as valid values.
>>>
>>> Sure. Sounds good.
>>>
>>>>
>>>> I expect evtid will be enum resctrl_event_id and that raises the question
>>>> of whether "0" can indeed be used as an "uninitialized" value since doing
>>>> so would change the meaning of the enum. It may indeed keep things
>>>> separated by maintaining evtid as an enum resctrl_event_id and note the
>>>> initialization differently ... either via a pointer to a resource group
>>>> or entirely separately as Babu indicates later.
>>>
>>> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
>>> indicate assign/unassign/dirty status.
>>
>> Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
>> to the resource group to which the counter has been assigned then I expect NULL
>> means unassigned and a value means assigned?
>
> Yes. We use the rdtgroup pointer to check the assign/unassign state.
>
> I will drop the 'state' field. Peter can add state when he wants use it
> for optimization later.
>
> I think we need to have the 'cntr_id" field here in resctrl_monitor_cfg.
> When we access the pointer from mbm_state, we wont know what is cntr_id
> index it came from.
>
oh, good point. I wonder how Peter addressed this in his PoC. As an alternative,
could the cntr_id be used in mbm_state instead of a pointer?
Reinette
Hi Reinette,
On 12/2/24 15:09, Reinette Chatre wrote:
> Hi Babu,
>
> On 12/2/24 12:42 PM, Moger, Babu wrote:
>> Hi Reinette,
>>
>> On 12/2/24 14:15, Reinette Chatre wrote:
>>> Hi Babu,
>>>
>>> On 12/2/24 11:48 AM, Moger, Babu wrote:
>>>> On 12/2/24 12:33, Reinette Chatre wrote:
>>>>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>>>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>>>>
>>>>>>>>>> Hi Babu,
>>>>>>>>>>
>>>>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>>>>
>>>>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>>>>> For example:
>>>>>>>>>>> Resctrl group mon1
>>>>>>>>>>> Total event
>>>>>>>>>>> dom 0 cntr_id 1,
>>>>>>>>>>> dom 1 cntr_id 10
>>>>>>>>>>> dom 2 cntr_id 11
>>>>>>>>>>>
>>>>>>>>>>> Local event
>>>>>>>>>>> dom 0 cntr_id 2,
>>>>>>>>>>> dom 1 cntr_id 15
>>>>>>>>>>> dom 2 cntr_id 10
>>>>>>>>>>
>>>>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>>>>> group:
>>>>>>>>>> struct cntr_id {
>>>>>>>>>> u32 mbm_total;
>>>>>>>>>> u32 mbm_local;
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>>>>> not configured to the newly online domain.
>>>>>>>>
>>>>>>>> I am trying to understand the details of your approach here.
>>>>>>>>>
>>>>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>>>>> tracking data goes away when the hardware does.
>>>>>>>>>
>>>>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>>>>> something like this:
>>>>>>>>
>>>>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>>>>
>>>>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>>>>
>>>>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>>>>> domain.
>>>>>>>>
>>>>>>>> Are you doing something different?
>>>>>>>
>>>>>>> I said "all pending counter updates to a domain", whereby I meant
>>>>>>> targeting a single domain.
>>>>>>>
>>>>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>>>>
>>>>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>>>>
>>>>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>>>>> for readability)
>>>>>>>
>>>>>>> echo $'//0=t;1=t\n
>>>>>>> /g1/0=t;1=t\n
>>>>>>> /g2/0=t;1=t\n
>>>>>>> /g3/0=t;1=t\n
>>>>>>> /g4/0=t;1=t\n
>>>>>>> /g5/0=t;1=t\n
>>>>>>> /g6/0=t;1=t\n
>>>>>>> /g7/0=t;1=t\n
>>>>>>> /g8/0=t;1=t\n
>>>>>>> /g9/0=t;1=t\n
>>>>>>> /g10/0=t;1=t\n
>>>>>>> /g11/0=t;1=t\n
>>>>>>> /g12/0=t;1=t\n
>>>>>>> /g13/0=t;1=t\n
>>>>>>> /g14/0=t;1=t\n
>>>>>>> /g15/0=t;1=t\n
>>>>>>> /g16/0=t;1=t\n
>>>>>>> /g17/0=t;1=t\n
>>>>>>> /g18/0=t;1=t\n
>>>>>>> /g19/0=t;1=t\n
>>>>>>> /g20/0=t;1=t\n
>>>>>>> /g21/0=t;1=t\n
>>>>>>> /g22/0=t;1=t\n
>>>>>>> /g23/0=t;1=t\n
>>>>>>> /g24/0=t;1=t\n
>>>>>>> /g25/0=t;1=t\n
>>>>>>> /g26/0=t;1=t\n
>>>>>>> /g27/0=t;1=t\n
>>>>>>> /g28/0=t;1=t\n
>>>>>>> /g29/0=t;1=t\n
>>>>>>> /g30/0=t;1=t\n
>>>>>>> /g31/0=t;1=t\n'
>>>>>>>
>>>>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>>>>> able to unassign and reassign the local domain's 32 counters in a
>>>>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>>>>> no more than one per domain, regardless of the number of groups
>>>>>>> updated.
>>>>>>>
>>>>>>
>>>>>> Yes. I think I got the idea. Thanks.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> struct resctrl_monitor_cfg {
>>>>>>>>> int closid;
>>>>>>>>> int rmid;
>>>>>>>>> int evtid;
>>>>>>>>> bool dirty;
>>>>>>>>> };
>>>>>>>>>
>>>>>>>>> This mirrors the info needed in whatever register configures the
>>>>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>>>>> updated.
>>>>>>>>
>>>>>>>> This is what my understanding of your implementation.
>>>>>>>>
>>>>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>>>>> --- a/include/linux/resctrl.h
>>>>>>>> +++ b/include/linux/resctrl.h
>>>>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>>>>> u32 *mbps_val;
>>>>>>>> };
>>>>>>>>
>>>>>>>> +struct resctrl_monitor_cfg {
>>>>>>>> + int closid;
>>>>>>>> + int rmid;
>>>>>>>> + int evtid;
>>>>>>>> + bool dirty;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>>>>> resource
>>>>>>>> * @hdr: common header for different domain types
>>>>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>>>>> struct delayed_work cqm_limbo;
>>>>>>>> int mbm_work_cpu;
>>>>>>>> int cqm_work_cpu;
>>>>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>>>>> };
>>>>>>>>
>>>>>>>>
>>>>>>>> When a user requests an assignment for total event to the default group
>>>>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>>>>> entry.
>>>>>>>>
>>>>>>>> If there is an empty entry, then use that entry for assignment and
>>>>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>>>>> information from default group here.
>>>>>>>>
>>>>>>>> Does this make sense?
>>>>>>>
>>>>>>> Yes, sounds correct.
>>>>>>
>>>>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>>>>> initialize during the allocation. And rename the field 'dirty' to
>>>>>> 'active'(or something similar) to hold the assign state for that
>>>>>> entry. That way we have all the information required for assignment
>>>>>> at one place. We don't need to update the rdtgroup structure.
>>>>>>
>>>>>> Reinette, What do you think about this approach?
>>>>>
>>>>> I think this approach is in the right direction. Thanks to Peter for
>>>>> the guidance here.
>>>>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>>>>> though, I think the cntr_id would be the index to the array instead?
>>>>
>>>> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>>>>
>>>>
>>>>>
>>>>> It may also be worthwhile to consider using a pointer to the resource
>>>>> group instead of storing closid and rmid directly. If used to indicate
>>>>> initialization then an initialized pointer is easier to distinguish than
>>>>> the closid/rmid that may have zero as valid values.
>>>>
>>>> Sure. Sounds good.
>>>>
>>>>>
>>>>> I expect evtid will be enum resctrl_event_id and that raises the question
>>>>> of whether "0" can indeed be used as an "uninitialized" value since doing
>>>>> so would change the meaning of the enum. It may indeed keep things
>>>>> separated by maintaining evtid as an enum resctrl_event_id and note the
>>>>> initialization differently ... either via a pointer to a resource group
>>>>> or entirely separately as Babu indicates later.
>>>>
>>>> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
>>>> indicate assign/unassign/dirty status.
>>>
>>> Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
>>> to the resource group to which the counter has been assigned then I expect NULL
>>> means unassigned and a value means assigned?
>>
>> Yes. We use the rdtgroup pointer to check the assign/unassign state.
>>
>> I will drop the 'state' field. Peter can add state when he wants use it
>> for optimization later.
>>
>> I think we need to have the 'cntr_id" field here in resctrl_monitor_cfg.
>> When we access the pointer from mbm_state, we wont know what is cntr_id
>> index it came from.
>>
>
> oh, good point. I wonder how Peter addressed this in his PoC. As an alternative,
> could the cntr_id be used in mbm_state instead of a pointer?
>
Yes. It can be done.
I thought it would be better to have everything in one place.
struct resctrl_monitor_cfg {
        unsigned int cntr_id;
        enum resctrl_event_id evtid;
        struct rdtgroup *rgtgrp;
};
This will have everything required to assign/unassign the event.
Thanks
Babu Moger
Hi Babu,
On 12/2/24 1:28 PM, Moger, Babu wrote:
> Hi Reinette,
>
> On 12/2/24 15:09, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 12/2/24 12:42 PM, Moger, Babu wrote:
>>> Hi Reinette,
>>>
>>> On 12/2/24 14:15, Reinette Chatre wrote:
>>>> Hi Babu,
>>>>
>>>> On 12/2/24 11:48 AM, Moger, Babu wrote:
>>>>> On 12/2/24 12:33, Reinette Chatre wrote:
>>>>>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>>>>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>>>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> Hi Babu,
>>>>>>>>>>>
>>>>>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>>>>>
>>>>>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>>>>>> For example:
>>>>>>>>>>>> Resctrl group mon1
>>>>>>>>>>>> Total event
>>>>>>>>>>>> dom 0 cntr_id 1,
>>>>>>>>>>>> dom 1 cntr_id 10
>>>>>>>>>>>> dom 2 cntr_id 11
>>>>>>>>>>>>
>>>>>>>>>>>> Local event
>>>>>>>>>>>> dom 0 cntr_id 2,
>>>>>>>>>>>> dom 1 cntr_id 15
>>>>>>>>>>>> dom 2 cntr_id 10
>>>>>>>>>>>
>>>>>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>>>>>> group:
>>>>>>>>>>> struct cntr_id {
>>>>>>>>>>> u32 mbm_total;
>>>>>>>>>>> u32 mbm_local;
>>>>>>>>>>> }
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>>>>>> not configured to the newly online domain.
>>>>>>>>>
>>>>>>>>> I am trying to understand the details of your approach here.
>>>>>>>>>>
>>>>>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>>>>>> tracking data goes away when the hardware does.
>>>>>>>>>>
>>>>>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>>>>>> something like this:
>>>>>>>>>
>>>>>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>>>>>
>>>>>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>>>>>
>>>>>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>>>>>> domain.
>>>>>>>>>
>>>>>>>>> Are you doing something different?
>>>>>>>>
>>>>>>>> I said "all pending counter updates to a domain", whereby I meant
>>>>>>>> targeting a single domain.
>>>>>>>>
>>>>>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>>>>>
>>>>>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>>>>>
>>>>>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>>>>>> for readability)
>>>>>>>>
>>>>>>>> echo $'//0=t;1=t\n
>>>>>>>> /g1/0=t;1=t\n
>>>>>>>> /g2/0=t;1=t\n
>>>>>>>> /g3/0=t;1=t\n
>>>>>>>> /g4/0=t;1=t\n
>>>>>>>> /g5/0=t;1=t\n
>>>>>>>> /g6/0=t;1=t\n
>>>>>>>> /g7/0=t;1=t\n
>>>>>>>> /g8/0=t;1=t\n
>>>>>>>> /g9/0=t;1=t\n
>>>>>>>> /g10/0=t;1=t\n
>>>>>>>> /g11/0=t;1=t\n
>>>>>>>> /g12/0=t;1=t\n
>>>>>>>> /g13/0=t;1=t\n
>>>>>>>> /g14/0=t;1=t\n
>>>>>>>> /g15/0=t;1=t\n
>>>>>>>> /g16/0=t;1=t\n
>>>>>>>> /g17/0=t;1=t\n
>>>>>>>> /g18/0=t;1=t\n
>>>>>>>> /g19/0=t;1=t\n
>>>>>>>> /g20/0=t;1=t\n
>>>>>>>> /g21/0=t;1=t\n
>>>>>>>> /g22/0=t;1=t\n
>>>>>>>> /g23/0=t;1=t\n
>>>>>>>> /g24/0=t;1=t\n
>>>>>>>> /g25/0=t;1=t\n
>>>>>>>> /g26/0=t;1=t\n
>>>>>>>> /g27/0=t;1=t\n
>>>>>>>> /g28/0=t;1=t\n
>>>>>>>> /g29/0=t;1=t\n
>>>>>>>> /g30/0=t;1=t\n
>>>>>>>> /g31/0=t;1=t\n'
>>>>>>>>
>>>>>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>>>>>> able to unassign and reassign the local domain's 32 counters in a
>>>>>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>>>>>> no more than one per domain, regardless of the number of groups
>>>>>>>> updated.
>>>>>>>>
>>>>>>>
>>>>>>> Yes. I think I got the idea. Thanks.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> struct resctrl_monitor_cfg {
>>>>>>>>>> int closid;
>>>>>>>>>> int rmid;
>>>>>>>>>> int evtid;
>>>>>>>>>> bool dirty;
>>>>>>>>>> };
>>>>>>>>>>
>>>>>>>>>> This mirrors the info needed in whatever register configures the
>>>>>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>>>>>> updated.
>>>>>>>>>
>>>>>>>>> This is what my understanding of your implementation.
>>>>>>>>>
>>>>>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>>>>>> --- a/include/linux/resctrl.h
>>>>>>>>> +++ b/include/linux/resctrl.h
>>>>>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>>>>>> u32 *mbps_val;
>>>>>>>>> };
>>>>>>>>>
>>>>>>>>> +struct resctrl_monitor_cfg {
>>>>>>>>> + int closid;
>>>>>>>>> + int rmid;
>>>>>>>>> + int evtid;
>>>>>>>>> + bool dirty;
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> /**
>>>>>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>>>>>> resource
>>>>>>>>> * @hdr: common header for different domain types
>>>>>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>>>>>> struct delayed_work cqm_limbo;
>>>>>>>>> int mbm_work_cpu;
>>>>>>>>> int cqm_work_cpu;
>>>>>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>>>>>> };
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> When a user requests an assignment for total event to the default group
>>>>>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>>>>>> entry.
>>>>>>>>>
>>>>>>>>> If there is an empty entry, then use that entry for assignment and
>>>>>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>>>>>> information from default group here.
>>>>>>>>>
>>>>>>>>> Does this make sense?
>>>>>>>>
>>>>>>>> Yes, sounds correct.
>>>>>>>
>>>>>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>>>>>> initialize during the allocation. And rename the field 'dirty' to
>>>>>>> 'active'(or something similar) to hold the assign state for that
>>>>>>> entry. That way we have all the information required for assignment
>>>>>>> at one place. We don't need to update the rdtgroup structure.
>>>>>>>
>>>>>>> Reinette, What do you think about this approach?
>>>>>>
>>>>>> I think this approach is in the right direction. Thanks to Peter for
>>>>>> the guidance here.
>>>>>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>>>>>> though, I think the cntr_id would be the index to the array instead?
>>>>>
>>>>> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>>>>>
>>>>>
>>>>>>
>>>>>> It may also be worthwhile to consider using a pointer to the resource
>>>>>> group instead of storing closid and rmid directly. If used to indicate
>>>>>> initialization then an initialized pointer is easier to distinguish than
>>>>>> the closid/rmid that may have zero as valid values.
>>>>>
>>>>> Sure. Sounds good.
>>>>>
>>>>>>
>>>>>> I expect evtid will be enum resctrl_event_id and that raises the question
>>>>>> of whether "0" can indeed be used as an "uninitialized" value since doing
>>>>>> so would change the meaning of the enum. It may indeed keep things
>>>>>> separated by maintaining evtid as an enum resctrl_event_id and note the
>>>>>> initialization differently ... either via a pointer to a resource group
>>>>>> or entirely separately as Babu indicates later.
>>>>>
>>>>> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
>>>>> indicate assign/unassign/dirty status.
>>>>
>>>> Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
>>>> to the resource group to which the counter has been assigned then I expect NULL
>>>> means unassigned and a value means assigned?
>>>
>>> Yes. We use the rdtgroup pointer to check the assign/unassign state.
>>>
>>> I will drop the 'state' field. Peter can add state when he wants use it
>>> for optimization later.
>>>
>>> I think we need to have the 'cntr_id" field here in resctrl_monitor_cfg.
>>> When we access the pointer from mbm_state, we wont know what is cntr_id
>>> index it came from.
>>>
>>
>> oh, good point. I wonder how Peter addressed this in his PoC. As an alternative,
>> could the cntr_id be used in mbm_state instead of a pointer?
>>
>
> Yes. It can be done.
>
> I thought it would be better to have everything at once place.
>
> struct resctrl_monitor_cfg {
> unsigned int cntr_id;
> enum resctrl_event_id evtid;
> struct rdtgroup *rgtgrp;
> };
>
> This will have everything required to assign/unassign the event.
>
The "everything in one place" argument is not clear to me since the cntr_id
is indeed present already as the index to the array that stores this structure.
Including the cntr_id seems redundant to me. This is similar to several
other data structures in resctrl that are indexed either by closid or rmid,
without needing to store closid/rmid in these data structures themselves.
Reinette
Hi Reinette,
On 12/2/24 15:47, Reinette Chatre wrote:
> Hi Babu,
>
> On 12/2/24 1:28 PM, Moger, Babu wrote:
>> Hi Reinette,
>>
>> On 12/2/24 15:09, Reinette Chatre wrote:
>>> Hi Babu,
>>>
>>> On 12/2/24 12:42 PM, Moger, Babu wrote:
>>>> Hi Reinette,
>>>>
>>>> On 12/2/24 14:15, Reinette Chatre wrote:
>>>>> Hi Babu,
>>>>>
>>>>> On 12/2/24 11:48 AM, Moger, Babu wrote:
>>>>>> On 12/2/24 12:33, Reinette Chatre wrote:
>>>>>>> On 11/29/24 9:06 AM, Moger, Babu wrote:
>>>>>>>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>>>>>>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>>>>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>>>>>>>>>>> On Wed, Nov 27, 2024 at 8:05 PM Reinette Chatre
>>>>>>>>>>> <reinette.chatre@intel.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> Hi Babu,
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/27/24 6:57 AM, Moger, Babu wrote:
>>>>>>>>>
>>>>>>>>>>>>> 1. Each group needs to remember counter ids in each domain for each event.
>>>>>>>>>>>>> For example:
>>>>>>>>>>>>> Resctrl group mon1
>>>>>>>>>>>>> Total event
>>>>>>>>>>>>> dom 0 cntr_id 1,
>>>>>>>>>>>>> dom 1 cntr_id 10
>>>>>>>>>>>>> dom 2 cntr_id 11
>>>>>>>>>>>>>
>>>>>>>>>>>>> Local event
>>>>>>>>>>>>> dom 0 cntr_id 2,
>>>>>>>>>>>>> dom 1 cntr_id 15
>>>>>>>>>>>>> dom 2 cntr_id 10
>>>>>>>>>>>>
>>>>>>>>>>>> Indeed. The challenge here is that domains may come and go so it cannot be a simple
>>>>>>>>>>>> static array. As an alternative it can be an xarray indexed by the domain ID with
>>>>>>>>>>>> pointers to a struct like below to contain the counters associated with the monitor
>>>>>>>>>>>> group:
>>>>>>>>>>>> struct cntr_id {
>>>>>>>>>>>> u32 mbm_total;
>>>>>>>>>>>> u32 mbm_local;
>>>>>>>>>>>> }
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Thinking more about how this array needs to be managed made me wonder how the
>>>>>>>>>>>> current implementation deals with domains that come and go. I do not think
>>>>>>>>>>>> this is currently handled. For example, if a new domain comes online and
>>>>>>>>>>>> monitoring groups had counters dynamically assigned, then these counters are
>>>>>>>>>>>> not configured to the newly online domain.
>>>>>>>>>>
>>>>>>>>>> I am trying to understand the details of your approach here.
>>>>>>>>>>>
>>>>>>>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>>>>>>>> monitoring domain structure for tracking the counter allocations,
>>>>>>>>>>> because the hardware counters are all domain-scoped. That way the
>>>>>>>>>>> tracking data goes away when the hardware does.
>>>>>>>>>>>
>>>>>>>>>>> I was focused on allowing all pending counter updates to a domain
>>>>>>>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>>>>>>>> processed in a single IPI, so I structured the counter tracker
>>>>>>>>>>> something like this:
>>>>>>>>>>
>>>>>>>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>>>>>>>
>>>>>>>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>>>>>>>
>>>>>>>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>>>>>>>> domain.
>>>>>>>>>>
>>>>>>>>>> Are you doing something different?
>>>>>>>>>
>>>>>>>>> I said "all pending counter updates to a domain", whereby I meant
>>>>>>>>> targeting a single domain.
>>>>>>>>>
>>>>>>>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>>>>>>>
>>>>>>>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>>>>>>>
>>>>>>>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>>>>>>>> for readability)
>>>>>>>>>
>>>>>>>>> echo $'//0=t;1=t\n
>>>>>>>>> /g1/0=t;1=t\n
>>>>>>>>> /g2/0=t;1=t\n
>>>>>>>>> /g3/0=t;1=t\n
>>>>>>>>> /g4/0=t;1=t\n
>>>>>>>>> /g5/0=t;1=t\n
>>>>>>>>> /g6/0=t;1=t\n
>>>>>>>>> /g7/0=t;1=t\n
>>>>>>>>> /g8/0=t;1=t\n
>>>>>>>>> /g9/0=t;1=t\n
>>>>>>>>> /g10/0=t;1=t\n
>>>>>>>>> /g11/0=t;1=t\n
>>>>>>>>> /g12/0=t;1=t\n
>>>>>>>>> /g13/0=t;1=t\n
>>>>>>>>> /g14/0=t;1=t\n
>>>>>>>>> /g15/0=t;1=t\n
>>>>>>>>> /g16/0=t;1=t\n
>>>>>>>>> /g17/0=t;1=t\n
>>>>>>>>> /g18/0=t;1=t\n
>>>>>>>>> /g19/0=t;1=t\n
>>>>>>>>> /g20/0=t;1=t\n
>>>>>>>>> /g21/0=t;1=t\n
>>>>>>>>> /g22/0=t;1=t\n
>>>>>>>>> /g23/0=t;1=t\n
>>>>>>>>> /g24/0=t;1=t\n
>>>>>>>>> /g25/0=t;1=t\n
>>>>>>>>> /g26/0=t;1=t\n
>>>>>>>>> /g27/0=t;1=t\n
>>>>>>>>> /g28/0=t;1=t\n
>>>>>>>>> /g29/0=t;1=t\n
>>>>>>>>> /g30/0=t;1=t\n
>>>>>>>>> /g31/0=t;1=t\n'
>>>>>>>>>
>>>>>>>>> My ultimate goal is for a thread bound to a particular domain to be
>>>>>>>>> able to unassign and reassign the local domain's 32 counters in a
>>>>>>>>> single write() with no IPIs at all. And when IPIs are required, then
>>>>>>>>> no more than one per domain, regardless of the number of groups
>>>>>>>>> updated.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Yes. I think I got the idea. Thanks.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> struct resctrl_monitor_cfg {
>>>>>>>>>>> int closid;
>>>>>>>>>>> int rmid;
>>>>>>>>>>> int evtid;
>>>>>>>>>>> bool dirty;
>>>>>>>>>>> };
>>>>>>>>>>>
>>>>>>>>>>> This mirrors the info needed in whatever register configures the
>>>>>>>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>>>>>>>> updated.
>>>>>>>>>>
>>>>>>>>>> This is what my understanding of your implementation.
>>>>>>>>>>
>>>>>>>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>>>>>>>> index d94abba1c716..9cebf065cc97 100644
>>>>>>>>>> --- a/include/linux/resctrl.h
>>>>>>>>>> +++ b/include/linux/resctrl.h
>>>>>>>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>>>>>>>> u32 *mbps_val;
>>>>>>>>>> };
>>>>>>>>>>
>>>>>>>>>> +struct resctrl_monitor_cfg {
>>>>>>>>>> + int closid;
>>>>>>>>>> + int rmid;
>>>>>>>>>> + int evtid;
>>>>>>>>>> + bool dirty;
>>>>>>>>>> +};
>>>>>>>>>> +
>>>>>>>>>> /**
>>>>>>>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>>>>>>>> resource
>>>>>>>>>> * @hdr: common header for different domain types
>>>>>>>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>>>>>>>> struct delayed_work cqm_limbo;
>>>>>>>>>> int mbm_work_cpu;
>>>>>>>>>> int cqm_work_cpu;
>>>>>>>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>>>>>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>>>>>>>> };
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> When a user requests an assignment for total event to the default group
>>>>>>>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>>>>>>>> entry.
>>>>>>>>>>
>>>>>>>>>> If there is an empty entry, then use that entry for assignment and
>>>>>>>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>>>>>>>> information from default group here.
>>>>>>>>>>
>>>>>>>>>> Does this make sense?
>>>>>>>>>
>>>>>>>>> Yes, sounds correct.
>>>>>>>>
>>>>>>>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>>>>>>>> initialize during the allocation. And rename the field 'dirty' to
>>>>>>>> 'active'(or something similar) to hold the assign state for that
>>>>>>>> entry. That way we have all the information required for assignment
>>>>>>>> at one place. We don't need to update the rdtgroup structure.
>>>>>>>>
>>>>>>>> Reinette, What do you think about this approach?
>>>>>>>
>>>>>>> I think this approach is in the right direction. Thanks to Peter for
>>>>>>> the guidance here.
>>>>>>> I do not think that it is necessary to add cntr_id to resctrl_monitor_cfg
>>>>>>> though, I think the cntr_id would be the index to the array instead?
>>>>>>
>>>>>> Yes. I think We can use the index as cntn_id. Will let you know otherwise.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> It may also be worthwhile to consider using a pointer to the resource
>>>>>>> group instead of storing closid and rmid directly. If used to indicate
>>>>>>> initialization then an initialized pointer is easier to distinguish than
>>>>>>> the closid/rmid that may have zero as valid values.
>>>>>>
>>>>>> Sure. Sounds good.
>>>>>>
>>>>>>>
>>>>>>> I expect evtid will be enum resctrl_event_id and that raises the question
>>>>>>> of whether "0" can indeed be used as an "uninitialized" value since doing
>>>>>>> so would change the meaning of the enum. It may indeed keep things
>>>>>>> separated by maintaining evtid as an enum resctrl_event_id and note the
>>>>>>> initialization differently ... either via a pointer to a resource group
>>>>>>> or entirely separately as Babu indicates later.
>>>>>>
>>>>>> Sure. Will add evtid as enum resctrl_event_id and use the "state" to
>>>>>> indicate assign/unassign/dirty status.
>>>>>
>>>>> Is "assign/unassign" state needed? If resctrl_monitor_cfg contains a pointer
>>>>> to the resource group to which the counter has been assigned then I expect NULL
>>>>> means unassigned and a value means assigned?
>>>>
>>>> Yes. We use the rdtgroup pointer to check the assign/unassign state.
>>>>
>>>> I will drop the 'state' field. Peter can add state when he wants use it
>>>> for optimization later.
>>>>
>>>> I think we need to have the 'cntr_id" field here in resctrl_monitor_cfg.
>>>> When we access the pointer from mbm_state, we wont know what is cntr_id
>>>> index it came from.
>>>>
>>>
>>> oh, good point. I wonder how Peter addressed this in his PoC. As an alternative,
>>> could the cntr_id be used in mbm_state instead of a pointer?
>>>
>>
>> Yes. It can be done.
>>
>> I thought it would be better to have everything at once place.
>>
>> struct resctrl_monitor_cfg {
>> unsigned int cntr_id;
>> enum resctrl_event_id evtid;
>> struct rdtgroup *rgtgrp;
>> };
>>
>> This will have everything required to assign/unassign the event.
>>
>
> The "everything in one place" argument is not clear to me since the cntr_id
> is indeed present already as the index to the array that stores this structure.
> Including the cntr_id seems redundant to me. This is similar to several
> other data structures in resctrl that are indexed either by closid or rmid,
> without needing to store closid/rmid in these data structures self.
>
Ok. That is fine. Will remove the cntr_id index from resctrl_monitor_cfg
and add it to mbm_state instead. That should be good.
--
Thanks
Babu Moger
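As a rough sketch of the outcome agreed on above, the per-domain MBM state could
carry the assigned counter ID so the read path gets back to the mon_cfg entry
without a search. The field names, the sentinel value, and the helper are
assumptions for illustration; the existing mbm_state members are replaced by a
stand-in here.

/* Illustrative sketch only. */
#define MBM_CNTR_UNSET  U32_MAX

struct mbm_state_sketch {
        u64 prev_bytes; /* stand-in for the existing per-event MBM state */
        u32 cntr_id;    /* counter assigned in this domain, or MBM_CNTR_UNSET */
};

/* Look up the cached configuration of the counter backing this state. */
static struct resctrl_monitor_cfg *
mbm_state_cfg_sketch(struct rdt_mon_domain *d, struct mbm_state_sketch *m)
{
        if (m->cntr_id == MBM_CNTR_UNSET)
                return NULL;
        return &d->mon_cfg[m->cntr_id];
}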
Hi Babu,
On Fri, Nov 29, 2024 at 6:06 PM Moger, Babu <bmoger@amd.com> wrote:
>
> Hi Peter, Reinette,
>
> On 11/29/2024 3:59 AM, Peter Newman wrote:
> > Hi Babu,
> >
> > On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
> >>
> >> Hi Peter,
> >>
> >> On 11/28/2024 5:10 AM, Peter Newman wrote:
> >>> In my prototype, I allocated a counter id-indexed array to each
> >>> monitoring domain structure for tracking the counter allocations,
> >>> because the hardware counters are all domain-scoped. That way the
> >>> tracking data goes away when the hardware does.
> >>>
> >>> I was focused on allowing all pending counter updates to a domain
> >>> resulting from a single mbm_assign_control write to be batched and
> >>> processed in a single IPI, so I structured the counter tracker
> >>> something like this:
> >>
> >> Not sure what you meant here. How are you batching two IPIs for two domains?
> >>
> >> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
> >>
> >> This is still a single write. Two IPIs are sent separately, one for each
> >> domain.
> >>
> >> Are you doing something different?
> >
> > I said "all pending counter updates to a domain", whereby I meant
> > targeting a single domain.
> >
> > Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
> >
> > What is important is that the following write also requires 1 or 2 IPIs:
> >
> > (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
> > for readability)
> >
> > echo $'//0=t;1=t\n
> > /g1/0=t;1=t\n
> > /g2/0=t;1=t\n
> > /g3/0=t;1=t\n
> > /g4/0=t;1=t\n
> > /g5/0=t;1=t\n
> > /g6/0=t;1=t\n
> > /g7/0=t;1=t\n
> > /g8/0=t;1=t\n
> > /g9/0=t;1=t\n
> > /g10/0=t;1=t\n
> > /g11/0=t;1=t\n
> > /g12/0=t;1=t\n
> > /g13/0=t;1=t\n
> > /g14/0=t;1=t\n
> > /g15/0=t;1=t\n
> > /g16/0=t;1=t\n
> > /g17/0=t;1=t\n
> > /g18/0=t;1=t\n
> > /g19/0=t;1=t\n
> > /g20/0=t;1=t\n
> > /g21/0=t;1=t\n
> > /g22/0=t;1=t\n
> > /g23/0=t;1=t\n
> > /g24/0=t;1=t\n
> > /g25/0=t;1=t\n
> > /g26/0=t;1=t\n
> > /g27/0=t;1=t\n
> > /g28/0=t;1=t\n
> > /g29/0=t;1=t\n
> > /g30/0=t;1=t\n
> > /g31/0=t;1=t\n'
> >
> > My ultimate goal is for a thread bound to a particular domain to be
> > able to unassign and reassign the local domain's 32 counters in a
> > single write() with no IPIs at all. And when IPIs are required, then
> > no more than one per domain, regardless of the number of groups
> > updated.
> >
>
> Yes. I think I got the idea. Thanks.
>
> >
> >>
> >>>
> >>> struct resctrl_monitor_cfg {
> >>> int closid;
> >>> int rmid;
> >>> int evtid;
> >>> bool dirty;
> >>> };
> >>>
> >>> This mirrors the info needed in whatever register configures the
> >>> counter, plus a dirty flag to skip over the ones that don't need to be
> >>> updated.
> >>
> >> This is what my understanding of your implementation.
> >>
> >> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> >> index d94abba1c716..9cebf065cc97 100644
> >> --- a/include/linux/resctrl.h
> >> +++ b/include/linux/resctrl.h
> >> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
> >> u32 *mbps_val;
> >> };
> >>
> >> +struct resctrl_monitor_cfg {
> >> + int closid;
> >> + int rmid;
> >> + int evtid;
> >> + bool dirty;
> >> +};
> >> +
> >> /**
> >> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
> >> resource
> >> * @hdr: common header for different domain types
> >> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
> >> struct delayed_work cqm_limbo;
> >> int mbm_work_cpu;
> >> int cqm_work_cpu;
> >> + /* Allocate num_mbm_cntrs entries in each domain */
> >> + struct resctrl_monitor_cfg *mon_cfg;
> >> };
> >>
> >>
> >> When a user requests an assignment for total event to the default group
> >> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
> >> entry.
> >>
> >> If there is an empty entry, then use that entry for assignment and
> >> update closid, rmid, evtid and dirty = 1. We can get all these
> >> information from default group here.
> >>
> >> Does this make sense?
> >
> > Yes, sounds correct.
>
> I will probably add cntr_id in resctrl_monitor_cfg structure and
> initialize during the allocation. And rename the field 'dirty' to
> 'active'(or something similar) to hold the assign state for that entry.
> That way we have all the information required for assignment at one
> place. We don't need to update the rdtgroup structure.
It concerns me that you want to say "active" instead of "dirty". What
I'm proposing is a write-back cache of the config values so that a
series of remote updates to many groups can be written back to
hardware all at once.
Therefore we want to track which entries are "dirty", implying that
they differ from what was last written to the registers and therefore
need to be flushed by the remote domain. Whether the counter is
enabled or not is already implicit in the configuration values (evtid
!= 0).
-Peter
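To make the write-back idea concrete, here is a rough sketch of a per-domain
flush that a single IPI could run, programming only the entries that changed
since the last write. It assumes the dirty-flag layout of resctrl_monitor_cfg
quoted above from Peter's prototype; the helper name and the info structure are
assumptions, not his actual code.

struct cntr_flush_info {
        struct resctrl_monitor_cfg *cfg;        /* per-domain counter array */
        u32 num_cntrs;                          /* e.g. r->mon.num_mbm_cntrs */
};

/* Runs on one CPU of the target domain, e.g. via smp_call_function_any(). */
static void mbm_cntr_flush_domain_sketch(void *info)
{
        struct cntr_flush_info *fi = info;
        u32 cntr_id;

        for (cntr_id = 0; cntr_id < fi->num_cntrs; cntr_id++) {
                struct resctrl_monitor_cfg *cfg = &fi->cfg[cntr_id];

                if (!cfg->dirty)
                        continue;       /* already matches the hardware */
                /*
                 * Architecture-specific programming of the counter from
                 * cfg->closid/rmid/evtid (an MSR write) would go here.
                 */
                cfg->dirty = false;
        }
}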
Hi Peter,
On 12/2/24 04:43, Peter Newman wrote:
> Hi Babu,
>
> On Fri, Nov 29, 2024 at 6:06 PM Moger, Babu <bmoger@amd.com> wrote:
>>
>> Hi Peter, Reinette,
>>
>> On 11/29/2024 3:59 AM, Peter Newman wrote:
>>> Hi Babu,
>>>
>>> On Thu, Nov 28, 2024 at 8:35 PM Moger, Babu <bmoger@amd.com> wrote:
>>>>
>>>> Hi Peter,
>>>>
>>>> On 11/28/2024 5:10 AM, Peter Newman wrote:
>
>>>>> In my prototype, I allocated a counter id-indexed array to each
>>>>> monitoring domain structure for tracking the counter allocations,
>>>>> because the hardware counters are all domain-scoped. That way the
>>>>> tracking data goes away when the hardware does.
>>>>>
>>>>> I was focused on allowing all pending counter updates to a domain
>>>>> resulting from a single mbm_assign_control write to be batched and
>>>>> processed in a single IPI, so I structured the counter tracker
>>>>> something like this:
>>>>
>>>> Not sure what you meant here. How are you batching two IPIs for two domains?
>>>>
>>>> #echo "//0=t;1=t" > /sys/fs/resctrl/info/L3_MON/mbm_assign_control
>>>>
>>>> This is still a single write. Two IPIs are sent separately, one for each
>>>> domain.
>>>>
>>>> Are you doing something different?
>>>
>>> I said "all pending counter updates to a domain", whereby I meant
>>> targeting a single domain.
>>>
>>> Depending on the CPU of the caller, your example write requires 1 or 2 IPIs.
>>>
>>> What is important is that the following write also requires 1 or 2 IPIs:
>>>
>>> (assuming /sys/fs/resctrl/mon_groups/[g1-g31] exist, line breaks added
>>> for readability)
>>>
>>> echo $'//0=t;1=t\n
>>> /g1/0=t;1=t\n
>>> /g2/0=t;1=t\n
>>> /g3/0=t;1=t\n
>>> /g4/0=t;1=t\n
>>> /g5/0=t;1=t\n
>>> /g6/0=t;1=t\n
>>> /g7/0=t;1=t\n
>>> /g8/0=t;1=t\n
>>> /g9/0=t;1=t\n
>>> /g10/0=t;1=t\n
>>> /g11/0=t;1=t\n
>>> /g12/0=t;1=t\n
>>> /g13/0=t;1=t\n
>>> /g14/0=t;1=t\n
>>> /g15/0=t;1=t\n
>>> /g16/0=t;1=t\n
>>> /g17/0=t;1=t\n
>>> /g18/0=t;1=t\n
>>> /g19/0=t;1=t\n
>>> /g20/0=t;1=t\n
>>> /g21/0=t;1=t\n
>>> /g22/0=t;1=t\n
>>> /g23/0=t;1=t\n
>>> /g24/0=t;1=t\n
>>> /g25/0=t;1=t\n
>>> /g26/0=t;1=t\n
>>> /g27/0=t;1=t\n
>>> /g28/0=t;1=t\n
>>> /g29/0=t;1=t\n
>>> /g30/0=t;1=t\n
>>> /g31/0=t;1=t\n'
>>>
>>> My ultimate goal is for a thread bound to a particular domain to be
>>> able to unassign and reassign the local domain's 32 counters in a
>>> single write() with no IPIs at all. And when IPIs are required, then
>>> no more than one per domain, regardless of the number of groups
>>> updated.
>>>
>>
>> Yes. I think I got the idea. Thanks.
>>
>>>
>>>>
>>>>>
>>>>> struct resctrl_monitor_cfg {
>>>>> int closid;
>>>>> int rmid;
>>>>> int evtid;
>>>>> bool dirty;
>>>>> };
>>>>>
>>>>> This mirrors the info needed in whatever register configures the
>>>>> counter, plus a dirty flag to skip over the ones that don't need to be
>>>>> updated.
>>>>
>>>> This is what my understanding of your implementation.
>>>>
>>>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>>>> index d94abba1c716..9cebf065cc97 100644
>>>> --- a/include/linux/resctrl.h
>>>> +++ b/include/linux/resctrl.h
>>>> @@ -94,6 +94,13 @@ struct rdt_ctrl_domain {
>>>> u32 *mbps_val;
>>>> };
>>>>
>>>> +struct resctrl_monitor_cfg {
>>>> + int closid;
>>>> + int rmid;
>>>> + int evtid;
>>>> + bool dirty;
>>>> +};
>>>> +
>>>> /**
>>>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor
>>>> resource
>>>> * @hdr: common header for different domain types
>>>> @@ -116,6 +123,7 @@ struct rdt_mon_domain {
>>>> struct delayed_work cqm_limbo;
>>>> int mbm_work_cpu;
>>>> int cqm_work_cpu;
>>>> + /* Allocate num_mbm_cntrs entries in each domain */
>>>> + struct resctrl_monitor_cfg *mon_cfg;
>>>> };
>>>>
>>>>
>>>> When a user requests an assignment for total event to the default group
>>>> for domain 0, you go search in rdt_mon_domain(dom 0) for empty mon_cfg
>>>> entry.
>>>>
>>>> If there is an empty entry, then use that entry for assignment and
>>>> update closid, rmid, evtid and dirty = 1. We can get all these
>>>> information from default group here.
>>>>
>>>> Does this make sense?
>>>
>>> Yes, sounds correct.
>>
>> I will probably add cntr_id in resctrl_monitor_cfg structure and
>> initialize during the allocation. And rename the field 'dirty' to
>> 'active'(or something similar) to hold the assign state for that entry.
>> That way we have all the information required for assignment at one
>> place. We don't need to update the rdtgroup structure.
>
> It concerns me that you want to say "active" instead of "dirty". What
> I'm proposing is a write-back cache of the config values so that a
> series of remote updates to many groups can be written back to
> hardware all at once.
>
> Therefore we want to track which entries are "dirty", implying that
> they differ from what was last written to the registers and therefore
> need to be flushed by the remote domain. Whether the counter is
> enabled or not is already implicit in the configuration values (evtid
> != 0).
>
That is correct. But I wanted to add the "state" explicitly; it makes it
easier to search. We can overload it if you want both.
int state;
#define ASSIGN_STATE_ACTIVE BIT(0)
#define ASSIGN_STATE_DIRTY BIT(1)
--
Thanks
Babu Moger
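A small sketch of how the overloaded state field proposed above could behave,
assuming resctrl_monitor_cfg carries the int state member: assignment sets both
bits, the per-domain flush clears only DIRTY, and unassignment clears ACTIVE
while leaving DIRTY set so the next flush disables the counter. The helper names
are illustrative only.

static inline void cntr_cfg_assign_sketch(struct resctrl_monitor_cfg *cfg)
{
        cfg->state |= ASSIGN_STATE_ACTIVE | ASSIGN_STATE_DIRTY;
}

static inline void cntr_cfg_unassign_sketch(struct resctrl_monitor_cfg *cfg)
{
        cfg->state &= ~ASSIGN_STATE_ACTIVE;
        cfg->state |= ASSIGN_STATE_DIRTY;       /* flush must disable it */
}

static inline void cntr_cfg_flushed_sketch(struct resctrl_monitor_cfg *cfg)
{
        cfg->state &= ~ASSIGN_STATE_DIRTY;      /* hardware now matches */
}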
Hi Babu,
On Wed, Oct 30, 2024 at 12:24 AM Babu Moger <babu.moger@amd.com> wrote:
>
> Provide the interface to display the number of free monitoring counters
> available for assignment in each doamin when mbm_cntr_assign is supported.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v9: New patch.
> ---
> Documentation/arch/x86/resctrl.rst | 4 ++++
> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
> 3 files changed, 38 insertions(+)
>
> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
> index 2f3a86278e84..2bc58d974934 100644
> --- a/Documentation/arch/x86/resctrl.rst
> +++ b/Documentation/arch/x86/resctrl.rst
> @@ -302,6 +302,10 @@ with the following files:
> memory bandwidth tracking to a single memory bandwidth event per
> monitoring group.
>
> +"available_mbm_cntrs":
> + The number of free monitoring counters available assignment in each domain
> + when the architecture supports mbm_cntr_assign mode.
It seems you need to clarify that counters are only available to a
domain when they're available in all domains:
resctrl# for i in `seq 100`; do
> mkdir mon_groups/m${i}
> done
resctrl# cat info/L3_MON/available_mbm_cntrs
0=0;1=0;2=0;3=0;4=0;5=0;6=0;7=0;8=0;9=0;10=0;11=0;12=0;16=0;17=0;18=0;19=0;20=0;21=0;22=0;23=0;24=0;25=0;26=0;27=0;28=0
resctrl# cd info/L3_MON/
L3_MON# echo '/m1/0=_' > mbm_assign_control
L3_MON# cat available_mbm_cntrs
0=2;1=0;2=0;3=0;4=0;5=0;6=0;7=0;8=0;9=0;10=0;11=0;12=0;16=0;17=0;18=0;19=0;20=0;21=0;22=0;23=0;24=0;25=0;26=0;27=0;28=0
L3_MON# echo '/m16/0+t' > mbm_assign_control
-bash: echo: write error: Invalid argument
L3_MON# cat ../last_cmd_status
Out of MBM assignable counters
Assign operation '+t' failed on the group /m16/
L3_MON# rmdir ../../mon_groups/m1
L3_MON# cat available_mbm_cntrs
0=2;1=2;2=2;3=2;4=2;5=2;6=2;7=2;8=2;9=2;10=2;11=2;12=2;16=2;17=2;18=2;19=2;20=2;21=2;22=2;23=2;24=2;25=2;26=2;27=2;28=2
L3_MON# echo '/m16/0+t' > mbm_assign_control
L3_MON#
-Peter
Hi Peter,
On 11/4/24 08:14, Peter Newman wrote:
> Hi Babu,
>
> On Wed, Oct 30, 2024 at 12:24 AM Babu Moger <babu.moger@amd.com> wrote:
>>
>> Provide the interface to display the number of free monitoring counters
>> available for assignment in each doamin when mbm_cntr_assign is supported.
>>
>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>> ---
>> v9: New patch.
>> ---
>> Documentation/arch/x86/resctrl.rst | 4 ++++
>> arch/x86/kernel/cpu/resctrl/monitor.c | 1 +
>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 33 ++++++++++++++++++++++++++
>> 3 files changed, 38 insertions(+)
>>
>> diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
>> index 2f3a86278e84..2bc58d974934 100644
>> --- a/Documentation/arch/x86/resctrl.rst
>> +++ b/Documentation/arch/x86/resctrl.rst
>> @@ -302,6 +302,10 @@ with the following files:
>> memory bandwidth tracking to a single memory bandwidth event per
>> monitoring group.
>>
>> +"available_mbm_cntrs":
>> + The number of free monitoring counters available assignment in each domain
>> + when the architecture supports mbm_cntr_assign mode.
>
> It seems you need to clarify that counters are only available to a
> domain when they're available in all domains:
Yes. Makes sense.
>
> resctrl# for i in `seq 100`; do
>> mkdir mon_groups/m${i}
>> done
> resctrl# cat info/L3_MON/available_mbm_cntrs
> 0=0;1=0;2=0;3=0;4=0;5=0;6=0;7=0;8=0;9=0;10=0;11=0;12=0;16=0;17=0;18=0;19=0;20=0;21=0;22=0;23=0;24=0;25=0;26=0;27=0;28=0
>
> resctrl# cd info/L3_MON/
> L3_MON# echo '/m1/0=_' > mbm_assign_control
> L3_MON# cat available_mbm_cntrs
> 0=2;1=0;2=0;3=0;4=0;5=0;6=0;7=0;8=0;9=0;10=0;11=0;12=0;16=0;17=0;18=0;19=0;20=0;21=0;22=0;23=0;24=0;25=0;26=0;27=0;28=0
> L3_MON# echo '/m16/0+t' > mbm_assign_control
> -bash: echo: write error: Invalid argument
> L3_MON# cat ../last_cmd_status
> Out of MBM assignable counters
> Assign operation '+t' failed on the group /m16/
>
> L3_MON# rmdir ../../mon_groups/m1
> L3_MON# cat available_mbm_cntrs
> 0=2;1=2;2=2;3=2;4=2;5=2;6=2;7=2;8=2;9=2;10=2;11=2;12=2;16=2;17=2;18=2;19=2;20=2;21=2;22=2;23=2;24=2;25=2;26=2;27=2;28=2
> L3_MON# echo '/m16/0+t' > mbm_assign_control
> L3_MON#
>
The test case looks good to me. Thanks for trying it out.
--
Thanks
Babu Moger
> Provide the interface to display the number of free monitoring counters
> available for assignment in each doamin when mbm_cntr_assign is supported.

s/doamin/domain/

-Tony
On 10/29/24 18:57, Luck, Tony wrote:
>> Provide the interface to display the number of free monitoring counters
>> available for assignment in each doamin when mbm_cntr_assign is supported.
>
> s/doamin/domain/
>

Sure. Thanks

--
Thanks
Babu Moger