arch/x86/kernel/cpu/resctrl/core.c | 6 ++++-- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 +++--- arch/x86/kernel/cpu/resctrl/internal.h | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 6 +----- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 4 ++-- 6 files changed, 15 insertions(+), 17 deletions(-)
In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain
structure previously relied on the cacheinfo interface to store L3 cache
information (e.g., shared_cpu_map) for monitoring. However, this approach
introduces risks when CPUs go offline:
1. Inconsistency: The ci field in rdt_mon_domain was initialized using the
first online CPU of a NUMA node. If this CPU later goes offline, the
shared_cpu_map (managed by the cache subsystem) is cleared, leading to
incorrect or undefined behavior when accessed via rdt_mon_domain.
2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
by the cache subsystem, making it unsafe for resctrl to hold
long-term references.
To resolve these issues and align with design principles:
1. Replace direct cacheinfo references in struct rdt_mon_domain and struct
rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).
2. Use hdr.cpu_mask (already maintained by resctrl) to replace
shared_cpu_map logic for determining valid CPUs for RMID counter reads
via the MSR interface.
Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
---
arch/x86/kernel/cpu/resctrl/core.c | 6 ++++--
arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 +++---
arch/x86/kernel/cpu/resctrl/internal.h | 4 ++--
arch/x86/kernel/cpu/resctrl/monitor.c | 6 +-----
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++---
include/linux/resctrl.h | 4 ++--
6 files changed, 15 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index cf29681d01e04..a0dff742e9e93 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -516,6 +516,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
struct rdt_hw_mon_domain *hw_dom;
struct rdt_domain_hdr *hdr;
struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
int err;
lockdep_assert_held(&domain_list_lock);
@@ -543,12 +544,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
d = &hw_dom->d_resctrl;
d->hdr.id = id;
d->hdr.type = RESCTRL_MON_DOMAIN;
- d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!d->ci) {
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
mon_domain_free(hw_dom);
return;
}
+ d->ci_id = ci->id;
cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
arch_mon_domain_online(r, d);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 0a0ac5f6112ec..f9768669ce806 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -690,10 +690,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
* one that matches this cache id.
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->ci->id == domid) {
- rr.ci = d->ci;
+ if (d->ci_id == domid) {
+ rr.ci_id = d->ci_id;
mon_event_read(&rr, r, NULL, rdtgrp,
- &d->ci->shared_cpu_map, evtid, false);
+ &d->hdr.cpu_mask, evtid, false);
goto checkresult;
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index eaae99602b617..91e71db554a9c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -136,7 +136,7 @@ union mon_data_bits {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -150,7 +150,7 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- struct cacheinfo *ci;
+ unsigned int ci_id;
int err;
u64 val;
void *arch_mon_ctx;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index a93ed7d2a1602..bedccd62158c3 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -620,10 +620,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
return 0;
}
- /* Summing domains that share a cache, must be on a CPU for that cache. */
- if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
- return -EINVAL;
-
/*
* Legacy files must report the sum of an event across all
* domains that share the same L3 cache instance.
@@ -633,7 +629,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci->id != rr->ci->id)
+ if (d->ci_id != rr->ci_id)
continue;
err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
rr->evtid, &tval, rr->arch_mon_ctx);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index cc4a54145c83d..075fdca2080d8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -3146,7 +3146,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
char name[32];
snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
if (snc_mode)
sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
@@ -3171,7 +3171,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
return -EPERM;
priv.u.rid = r->rid;
- priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
+ priv.u.domid = do_sum ? d->ci_id : d->hdr.id;
priv.u.sum = do_sum;
list_for_each_entry(mevt, &r->evt_list, list) {
priv.u.evtid = mevt->evtid;
@@ -3198,7 +3198,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
lockdep_assert_held(&rdtgroup_mutex);
snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
kn = kernfs_find_and_get(parent_kn, name);
if (kn) {
/*
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 880351ca3dfcb..c990670d18c02 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -145,7 +145,7 @@ struct rdt_ctrl_domain {
/**
* struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
* @hdr: common header for different domain types
- * @ci: cache info for this domain
+ * @ci_id: cache info id for this domain
* @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold
* @mbm_total: saved state for MBM total bandwidth
* @mbm_local: saved state for MBM local bandwidth
@@ -156,7 +156,7 @@ struct rdt_ctrl_domain {
*/
struct rdt_mon_domain {
struct rdt_domain_hdr hdr;
- struct cacheinfo *ci;
+ unsigned int ci_id;
unsigned long *rmid_busy_llc;
struct mbm_state *mbm_total;
struct mbm_state *mbm_local;
--
2.43.5
Hi Qinyun, Thank you for catching this. On 5/26/25 12:37 AM, Qinyun Tan wrote: > In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain > structure previously relied on the cacheinfo interface to store L3 cache "previously relied" -> "relies" > information (e.g., shared_cpu_map) for monitoring. However, this approach > introduces risks when CPUs go offline: > > 1. Inconsistency: The ci field in rdt_mon_domain was initialized using the "was initialized" -> "is initialized" > first online CPU of a NUMA node. If this CPU later goes offline, the > shared_cpu_map (managed by the cache subsystem) is cleared, leading to "is cleared" sounds like the shared_cpu_map is empty. Looking at cache_shared_cpu_map_remove() it seems that the worst case is when the CPU goes offline the shared_cpu_map only contains the offline CPU itself. If there remains a reference to that shared_cpu_map then it will thus be to a cpumask with an offline CPU. Are you seeing other scenarios? > incorrect or undefined behavior when accessed via rdt_mon_domain. Could you please elaborate what "incorrect and undefined behavior" you encountered? It looks like reading the top level event would not be able to read an event from one (or more?) of the online domains since the shared_cpu_map will contain an offline CPU causing smp_call_on_cpu() to return with a failure that is not caught ... the symptom may this be that there are no errors but data is wrong? > > 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed > by the cache subsystem, making it unsafe for resctrl to hold > long-term references. This is not obvious to me. Could you please elaborate how resctrl could have a reference to a removed structure? > > To resolve these issues and align with design principles: > > 1. Replace direct cacheinfo references in struct rdt_mon_domain and struct > rmid_read with the cacheinfo ID (a unique identifier for the L3 cache). > > 2. 
Use hdr.cpu_mask (already maintained by resctrl) to replace > shared_cpu_map logic for determining valid CPUs for RMID counter reads > via the MSR interface. I think it will help to explain why it is ok now to use hdr.cpu_mask instead of the shared_cpu_map. In support of this you could mention that the original solution aimed to read the counters on any CPU associated with the L3 cache that the sub-numa domain forms part of, but this solution uses the cpumask of one of the sub-numa domains that is known to be associated with the L3 cache. This is a reduced set of CPUs from previously intended but known to be accurate. Alternative may be to build a local cpumask from the all the sub-numa domains but it is not clear to me how much this will improve things. Considering this I do not think the references are "unnecessary" as the subject claims since the solution does not replace the original cpumask with an identical one. > Needs a "Fixes:" tag. > Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com> > --- Reinette
Hi Reinette Chatre, On 5/28/25 7:36 AM, Reinette Chatre wrote: > Hi Qinyun, > > Thank you for catching this. > > On 5/26/25 12:37 AM, Qinyun Tan wrote: >> In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain >> structure previously relied on the cacheinfo interface to store L3 cache > > "previously relied" -> "relies" Thanks, it will be corrected in the next patch. >> information (e.g., shared_cpu_map) for monitoring. However, this approach >> introduces risks when CPUs go offline: >> >> 1. Inconsistency: The ci field in rdt_mon_domain was initialized using the > > "was initialized" -> "is initialized" The same as above.> >> first online CPU of a NUMA node. If this CPU later goes offline, the >> shared_cpu_map (managed by the cache subsystem) is cleared, leading to > > "is cleared" sounds like the shared_cpu_map is empty. Looking at > cache_shared_cpu_map_remove() it seems that the worst case is when the > CPU goes offline the shared_cpu_map only contains the offline CPU itself. > If there remains a reference to that shared_cpu_map then it will thus be > to a cpumask with an offline CPU. Are you seeing other scenarios? > Yes, you are right. Once this CPU goes offline, its shared_cpu_map will only include itself. If we then try to call this offline CPU using smp_call_on_cpu, it will result in a failure. >> incorrect or undefined behavior when accessed via rdt_mon_domain. > > Could you please elaborate what "incorrect and undefined behavior" you > encountered? > It looks like reading the top level event would not be able to read > an event from one (or more?) of the online domains since the shared_cpu_map > will contain an offline CPU causing smp_call_on_cpu() to return with a failure > that is not caught ... the symptom may this be that there are no errors but > data is wrong? Yes, there won't be any errors here, but when reading the top-level events, it may not retrieve any values. 
For example, in the SNC-3 mode, suppose cpus 0, 40, and 80 are the first online cpus on node0, node1, and node2 respectively. If cpus 0, 40, and 80 are all offline, reading "the top level event" at this point will result in a zero. Why hasn’t this issue been discovered earlier? The reason is that CPU0 is always the first online CPU on node0. The cacheinfo stored in the first rdt_mon_domain corresponds to CPU0. When rdtgroup_mondata_show reads the top-level event, it iterates through all rdt_mon_domain entries, using if (d->ci->id == domid) to find the first rdt_mon_domain that shares the resource. It then selects a CPU from the corresponding cacheinfo to perform the monitoring group data read operation. In a single-socket environment, all CPUs typically share the L3 Cache, which means this traversal action will usually lead directly to CPU0's cacheinfo. Additionally, since the mainline kernel no longer supports taking CPU0 offline, that cacheinfo remains valid indefinitely.
Use hdr.cpu_mask (already maintained by resctrl) to replace >> shared_cpu_map logic for determining valid CPUs for RMID counter reads >> via the MSR interface. > > I think it will help to explain why it is ok now to use hdr.cpu_mask instead > of the shared_cpu_map. In support of this you could mention that the original > solution aimed to read the counters on any CPU associated with the L3 cache > that the sub-numa domain forms part of, but this solution uses the > cpumask of one of the sub-numa domains that is known to be associated with > the L3 cache. This is a reduced set of CPUs from previously intended but > known to be accurate. Alternative may be to build a local cpumask from the > all the sub-numa domains but it is not clear to me how much this will > improve things. > > Considering this I do not think the references are "unnecessary" as the > subject claims since the solution does not replace the original cpumask with > an identical one. Thanks a lot, you are correct. hdr.cpu_mask is a subset of shared_cpu_map, and we can almost use hdr.cpu_mask to replace the functionality of shared_cpu_map. In fact, in resctrl, the only purpose of using cpu_mask is to select a CPU that shares the cache resource for performing monitoring group data reads. Therefore, I think there is no need to build a local cpumask from all the sub-NUMA domains in this context. I will provide a more detailed description in my commit log moving forward. > >> > > Needs a "Fixes:" tag. Ok,it will be added in the next patch.> >> Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com> >> --- > > Reinette -- Thanks Qinyun Tan
Hi Qinyun Tan, On 5/27/25 8:32 PM, qinyuntan wrote: > On 5/28/25 7:36 AM, Reinette Chatre wrote: >> On 5/26/25 12:37 AM, Qinyun Tan wrote: >>> first online CPU of a NUMA node. If this CPU later goes offline, the >>> shared_cpu_map (managed by the cache subsystem) is cleared, leading to >> >> "is cleared" sounds like the shared_cpu_map is empty. Looking at >> cache_shared_cpu_map_remove() it seems that the worst case is when the >> CPU goes offline the shared_cpu_map only contains the offline CPU itself. >> If there remains a reference to that shared_cpu_map then it will thus be >> to a cpumask with an offline CPU. Are you seeing other scenarios? >> > Yes, you are right. Once this CPU goes offline, its shared_cpu_map > will only include itself. If we then try to call this offline CPU > using smp_call_on_cpu, it will result in a failure. Thank you for confirming. Could you please update this changelog to reflect this detail? Doing so will remove confusion and make the change easier to consume by making it obvious to reader what the problem is. >>> incorrect or undefined behavior when accessed via rdt_mon_domain. >> >> Could you please elaborate what "incorrect and undefined behavior" you >> encountered? >> It looks like reading the top level event would not be able to read >> an event from one (or more?) of the online domains since the shared_cpu_map >> will contain an offline CPU causing smp_call_on_cpu() to return with a failure >> that is not caught ... the symptom may this be that there are no errors but >> data is wrong? > > Yes, there won't be any errors here, but when reading the top-level events, it may not retrieve any values. > > For example, in the SNC-3 mode, suppose cpus 0, 40, and 80 are the > firtst online cpus on node0, node1, and node2 respectively. If cpus > 0, 40, and 80 are all offline, At this point, reading "the top level > event" will result in a zero. This is SNC-3 mode with a single socket example where CPU 0 cannot go offline. 
I thus do not see this happening for the reason you provide below. I *can* see how this happens on a second socket when the first online CPU of the first node of that (the second) socket goes offline. > Why hasn’t this issue been discovered earlier? The reason is that CPU0 is always the first online CPU on node0. The cacheinfo stored in the first rdt_mon_domain corresponds to CPU0. When rdtgroup_mondata_show reads the top-level event, it iterates through all rdt_mon_domain entries, using if (d->ci->id == domid) to find the first rdt_mon_domain that shares the resource. It then selects a CPU from the corresponding cacheinfo to perform the monitoring group data read operation. In a single-socket environment, all CPUs typically share the L3 Cache, which means this traversal action will usually lead directly to CPU0's cacheinfo. Additionally, since the mainline kernel no longer supports taking CPU0 offline, that cacheinfo remains valid indefinitely. > >> >>> >>> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed >>> by the cache subsystem, making it unsafe for resctrl to hold >>> long-term references. >> >> This is not obvious to me. Could you please elaborate how resctrl could >> have a reference to a removed structure? > As mentioned above, although the cacheinfo of each CPU is not freed > in the latest mainline, the shared_cpu_map within the cacheinfo will > be modified as CPUs go online or offline. Each rdt_mon_domain > directly references the cacheinfo of the first online CPU in the > node, and the shared_cpu_map is used in various processes. This > situation is bound to lead to some problems. Claiming that it is unsafe for resctrl to hold a reference implies that resctrl uses an invalid pointer. This is not the case here. The pointer is valid, but the data pointed to by it does not support resctrl's usage. I thus do not believe that this "lifecycle dependency" point is a valid motivation for this change. 
>> >>> >>> To resolve these issues and align with design principles: >>> >>> 1. Replace direct cacheinfo references in struct rdt_mon_domain and struct >>> rmid_read with the cacheinfo ID (a unique identifier for the L3 cache). >>> >>> 2. Use hdr.cpu_mask (already maintained by resctrl) to replace >>> shared_cpu_map logic for determining valid CPUs for RMID counter reads >>> via the MSR interface. >> >> I think it will help to explain why it is ok now to use hdr.cpu_mask instead >> of the shared_cpu_map. In support of this you could mention that the original >> solution aimed to read the counters on any CPU associated with the L3 cache >> that the sub-numa domain forms part of, but this solution uses the >> cpumask of one of the sub-numa domains that is known to be associated with >> the L3 cache. This is a reduced set of CPUs from previously intended but >> known to be accurate. Alternative may be to build a local cpumask from the >> all the sub-numa domains but it is not clear to me how much this will >> improve things. >> >> Considering this I do not think the references are "unnecessary" as the >> subject claims since the solution does not replace the original cpumask with >> an identical one. > > Thanks a lot, you are correct. hdr.cpu_mask is a subset of > shared_cpu_map, and we can almost use hdr.cpu_mask to replace the > functionality of shared_cpu_map. > > In fact, in resctrl, the only purpose of using cpu_mask is to select > a CPU that shares the cache resource for performing monitoring group > data reads. Therefore, I think there is no need to build a local > cpumask from all the sub-NUMA domains in this context. One issue I can think of here is when there is a usage where the user does task isolation on the numa node boundary. Let's consider the SNC-3 example again with node3, node4, and node5 on the second socket, "L3 cache ID 1". 
If all CPUs on node3 are in tick_nohz_full_mask while none of the node4 and node5 CPUs are in tick_nohz_full_mask then one of node3's CPUs will get an unnecessary IPI. > > I will provide a more detailed description in my commit log moving > forward. Thank you very much. Reinette
Hi Reinette Chatre,
On 5/28/25 12:49 PM, Reinette Chatre wrote:
> Hi Qinyun Tan,
>
> On 5/27/25 8:32 PM, qinyuntan wrote:
>> On 5/28/25 7:36 AM, Reinette Chatre wrote:
>>> On 5/26/25 12:37 AM, Qinyun Tan wrote:
>
>
>>>> first online CPU of a NUMA node. If this CPU later goes offline, the
>>>> shared_cpu_map (managed by the cache subsystem) is cleared, leading to
>>>
>>> "is cleared" sounds like the shared_cpu_map is empty. Looking at
>>> cache_shared_cpu_map_remove() it seems that the worst case is when the
>>> CPU goes offline the shared_cpu_map only contains the offline CPU itself.
>>> If there remains a reference to that shared_cpu_map then it will thus be
>>> to a cpumask with an offline CPU. Are you seeing other scenarios?
>>>
>> Yes, you are right. Once this CPU goes offline, its shared_cpu_map
>> will only include itself. If we then try to call this offline CPU
>> using smp_call_on_cpu, it will result in a failure.
>
> Thank you for confirming. Could you please update this changelog to reflect
> this detail? Doing so will remove confusion and make the change easier to
> consume by making it obvious to reader what the problem is.
Thank you for your patient review, I will update this changelog in the
next patch.
>
>>>> incorrect or undefined behavior when accessed via rdt_mon_domain.
>>>
>>> Could you please elaborate what "incorrect and undefined behavior" you
>>> encountered?
>>> It looks like reading the top level event would not be able to read
>>> an event from one (or more?) of the online domains since the shared_cpu_map
>>> will contain an offline CPU causing smp_call_on_cpu() to return with a failure
>>> that is not caught ... the symptom may this be that there are no errors but
>>> data is wrong?
>>
>> Yes, there won't be any errors here, but when reading the top-level events, it may not retrieve any values.
>>
>> For example, in the SNC-3 mode, suppose cpus 0, 40, and 80 are the
>> firtst online cpus on node0, node1, and node2 respectively. If cpus
>> 0, 40, and 80 are all offline, At this point, reading "the top level
>> event" will result in a zero.
>
> This is SNC-3 mode with a single socket example where CPU 0 cannot
> go offline. I thus do not see this happening for the reason you provide below.
>
> I *can* see how this happens on a second socket when the first online CPU
> of the first node of that (the second) socket goes offline.
>
>> Why hasn’t this issue been discovered earlier? The reason is that CPU0 is always the first online CPU on node0. The cacheinfo stored in the first rdt_mon_domain corresponds to CPU0. When rdtgroup_mondata_show reads the top-level event, it iterates through all rdt_mon_domain entries, using if (d->ci->id == domid) to find the first rdt_mon_domain that shares the resource. It then selects a CPU from the corresponding cacheinfo to perform the monitoring group data read operation. In a single-socket environment, all CPUs typically share the L3 Cache, which means this traversal action will usually lead directly to CPU0's cacheinfo. Additionally, since the mainline kernel no longer supports taking CPU0 offline, that cacheinfo remains valid indefinitely.
>>
>>>
>>>>
>>>> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
>>>> by the cache subsystem, making it unsafe for resctrl to hold
>>>> long-term references.
>>>
>>> This is not obvious to me. Could you please elaborate how resctrl could
>>> have a reference to a removed structure?
>> As mentioned above, although the cacheinfo of each CPU is not freed
>> in the latest mainline, the shared_cpu_map within the cacheinfo will
>> be modified as CPUs go online or offline. Each rdt_mon_domain
>> directly references the cacheinfo of the first online CPU in the
>> node, and the shared_cpu_map is used in various processes. This
>> situation is bound to lead to some problems.
>
> Claiming that it is unsafe for resctrl to hold a reference implies that
> resctrl uses an invalid pointer. This is not the case here. The pointer
> is valid, but the data pointed to by it does not support resctrl's usage. I
> thus do not believe that this "lifecycle dependency" point is a valid
> motivation for this change.
My description is indeed inaccurate. I will adjust it in the next patch.
Thanks.
>
>>>
>>>>
>>>> To resolve these issues and align with design principles:
>>>>
>>>> 1. Replace direct cacheinfo references in struct rdt_mon_domain and struct
>>>> rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).
>>>>
>>>> 2. Use hdr.cpu_mask (already maintained by resctrl) to replace
>>>> shared_cpu_map logic for determining valid CPUs for RMID counter reads
>>>> via the MSR interface.
>>>
>>> I think it will help to explain why it is ok now to use hdr.cpu_mask instead
>>> of the shared_cpu_map. In support of this you could mention that the original
>>> solution aimed to read the counters on any CPU associated with the L3 cache
>>> that the sub-numa domain forms part of, but this solution uses the
>>> cpumask of one of the sub-numa domains that is known to be associated with
>>> the L3 cache. This is a reduced set of CPUs from previously intended but
>>> known to be accurate. Alternative may be to build a local cpumask from the
>>> all the sub-numa domains but it is not clear to me how much this will
>>> improve things.
>>>
>>> Considering this I do not think the references are "unnecessary" as the
>>> subject claims since the solution does not replace the original cpumask with
>>> an identical one.
>>
>> Thanks a lot, you are correct. hdr.cpu_mask is a subset of
>> shared_cpu_map, and we can almost use hdr.cpu_mask to replace the
>> functionality of shared_cpu_map.
>>
>> In fact, in resctrl, the only purpose of using cpu_mask is to select
>> a CPU that shares the cache resource for performing monitoring group
>> data reads. Therefore, I think there is no need to build a local
>> cpumask from all the sub-NUMA domains in this context.
>
> One issue I can think of here is when there is a usage where the user does
> task isolation on the numa node boundary. Let's consider the SNC-3 example
> again with node3, node4, and node5 on the second socket, "L3 cache ID 1".
> If all CPUs on node3 are in tick_nohz_full_mask while none of the node4 and
> node5 CPUs are in tick_nohz_full_mask then one of node3's CPUs will get
> an unnecessary IPI.
>
You are right, how about this? First, obtain any cpu in hdr.cpu_mask,
and then use the cacheinfo shared_cpu_map of this cpu:
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 9337787461d2d..d43f438465ad0 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -596,7 +596,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
struct mon_data *md;
- int domid, ret = 0;
+ struct cacheinfo *ci;
+ int domid, cpu, ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -625,8 +626,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
 		if (d->ci_id == domid) {
 			rr.ci_id = d->ci_id;
+			cpu = cpumask_any(&d->hdr.cpu_mask);
+			ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+			if (!ci)
+				continue;
 			mon_event_read(&rr, r, NULL, rdtgrp,
-				       &d->hdr.cpu_mask, evtid, false);
+				       &ci->shared_cpu_map, evtid, false);
 			goto checkresult;
 		}
 	}
>>
>> I will provide a more detailed description in my commit log moving
>> forward.
>
> Thank you very much.
>
> Reinette
Thanks
Qinyun Tan
Hi Qinyun Tan,
On 5/27/25 11:37 PM, qinyuntan wrote:
> On 5/28/25 12:49 PM, Reinette Chatre wrote:
...
>> One issue I can think of here is when there is a usage where the user does
>> task isolation on the numa node boundary. Let's consider the SNC-3 example
>> again with node3, node4, and node5 on the second socket, "L3 cache ID 1".
>> If all CPUs on node3 are in tick_nohz_full_mask while none of the node4 and
>> node5 CPUs are in tick_nohz_full_mask then one of node3's CPUs will get
>> an unnecessary IPI.
>>
> You are right, how about this? First, obtain any cpu in hdr.cpu_mask, and then use the cacheinfo shared_cpu_map of this cpu:
>
> diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> index 9337787461d2d..d43f438465ad0 100644
> --- a/fs/resctrl/ctrlmondata.c
> +++ b/fs/resctrl/ctrlmondata.c
> @@ -596,7 +596,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> struct rdtgroup *rdtgrp;
> struct rdt_resource *r;
> struct mon_data *md;
> - int domid, ret = 0;
> + struct cacheinfo *ci;
> + int domid, cpu, ret = 0;
>
> rdtgrp = rdtgroup_kn_lock_live(of->kn);
> if (!rdtgrp) {
> @@ -625,8 +626,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> list_for_each_entry(d, &r->mon_domains, hdr.list) {
> if (d->ci_id == domid) {
> rr.ci_id = d->ci_id;
> + cpu = cpumask_any(&d->hdr.cpu_mask)
> + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
> + if (!ci)
> + continue;
> mon_event_read(&rr, r, NULL, rdtgrp,
> - &d->hdr.cpu_mask, evtid, false);
> + &ci->shared_cpu_map, evtid, false);
> goto checkresult;
> }
> }
>
This looks good to me. Much better than what I was thinking about.
Apart from the items already mentioned I would like to add a couple of style comments:
- Please order variable declarations at beginning of function in reverse fir tree order.
For example, above snippet would look like:
struct rdtgroup *rdtgrp;
int domid, cpu, ret = 0;
struct rdt_resource *r;
struct cacheinfo *ci;
struct mon_data *md;
- Please align struct member names in tabular format (re. the struct rmid_read changes).
- Please ensure struct descriptions are aligned with the text surrounding it. (re.
the struct rmid_read and struct rdt_mon_domain changes).
Thank you very much.
Reinette
Hi Reinette Chatre,
On 5/28/25 2:37 PM, qinyuntan wrote:
> Hi Reinette Chatre,
>
> On 5/28/25 12:49 PM, Reinette Chatre wrote:
>> Hi Qinyun Tan,
>>
>> On 5/27/25 8:32 PM, qinyuntan wrote:
>>> On 5/28/25 7:36 AM, Reinette Chatre wrote:
>>>> On 5/26/25 12:37 AM, Qinyun Tan wrote:
>>
>>
>>>>> first online CPU of a NUMA node. If this CPU later goes offline, the
>>>>> shared_cpu_map (managed by the cache subsystem) is cleared, leading to
>>>>
>>>> "is cleared" sounds like the shared_cpu_map is empty. Looking at
>>>> cache_shared_cpu_map_remove() it seems that the worst case is when the
>>>> CPU goes offline the shared_cpu_map only contains the offline CPU
>>>> itself.
>>>> If there remains a reference to that shared_cpu_map then it will
>>>> thus be
>>>> to a cpumask with an offline CPU. Are you seeing other scenarios?
>>>>
>>> Yes, you are right. Once this CPU goes offline, its shared_cpu_map
>>> will only include itself. If we then try to call this offline CPU
>>> using smp_call_on_cpu, it will result in a failure.
>>
>> Thank you for confirming. Could you please update this changelog to
>> reflect
>> this detail? Doing so will remove confusion and make the change easier to
>> consume by making it obvious to reader what the problem is.
> Thank you for your patient review, I will update this changelog in the
> next patch.
>
>>>>> incorrect or undefined behavior when accessed via rdt_mon_domain.
>>>>
>>>> Could you please elaborate what "incorrect and undefined behavior" you
>>>> encountered?
>>>> It looks like reading the top level event would not be able to read
>>>> an event from one (or more?) of the online domains since the
>>>> shared_cpu_map
>>>> will contain an offline CPU causing smp_call_on_cpu() to return with
>>>> a failure
>>>> that is not caught ... the symptom may this be that there are no
>>>> errors but
>>>> data is wrong?
>>>
>>> Yes, there won't be any errors here, but when reading the top-level
>>> events, it may not retrieve any values.
>>>
>>> For example, in the SNC-3 mode, suppose cpus 0, 40, and 80 are the
>>> firtst online cpus on node0, node1, and node2 respectively. If cpus
>>> 0, 40, and 80 are all offline, At this point, reading "the top level
>>> event" will result in a zero.
>>
>> This is SNC-3 mode with a single socket example where CPU 0 cannot
>> go offline. I thus do not see this happening for the reason you
>> provide below.
>>
>> I *can* see how this happens on a second socket when the first online CPU
>> of the first node of that (the second) socket goes offline.
>
>>> Why hasn’t this issue been discovered earlier? The reason is that
>>> CPU0 is always the first online CPU on node0. The cacheinfo stored in
>>> the first rdt_mon_domain corresponds to CPU0. When
>>> rdtgroup_mondata_show reads the top-level event, it iterates through
>>> all rdt_mon_domain entries, using if (d->ci->id == domid) to find the
>>> first rdt_mon_domain that shares the resource. It then selects a CPU
>>> from the corresponding cacheinfo to perform the monitoring group data
>>> read operation. In a single-socket environment, all CPUs typically
>>> share the L3 Cache, which means this traversal action will usually
>>> lead directly to CPU0's cacheinfo. Additionally, since the mainline
>>> kernel no longer supports taking CPU0 offline, that cacheinfo remains
>>> valid indefinitely.
>>>
>>>>
>>>>>
>>>>> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is
>>>>> managed
>>>>> by the cache subsystem, making it unsafe for resctrl to hold
>>>>> long-term references.
>>>>
>>>> This is not obvious to me. Could you please elaborate how resctrl could
>>>> have a reference to a removed structure?
>>> As mentioned above, although the cacheinfo of each CPU is not freed
>>> in the latest mainline, the shared_cpu_map within the cacheinfo will
>>> be modified as CPUs go online or offline. Each rdt_mon_domain
>>> directly references the cacheinfo of the first online CPU in the
>>> node, and the shared_cpu_map is used in various processes. This
>>> situation is bound to lead to some problems.
>>
>> Claiming that it is unsafe for resctrl to hold a reference implies that
>> resctrl uses an invalid pointer. This is not the case here. The pointer
>> is valid, but the data pointed to by it does not support resctrl's
>> usage. I
>> thus do not believe that this "lifecycle dependency" point is a valid
>> motivation for this change.My description is indeed inaccurate. I will
>> adjust it in the next patch.
> Thanks.
Sorry, I accidentally pressed backspace twice. It should be:
'''
My description is indeed inaccurate. I will adjust it in the next patch.
Thanks.
'''
>
>>>>
>>>>>
>>>>> To resolve these issues and align with design principles:
>>>>>
>>>>> 1. Replace direct cacheinfo references in struct rdt_mon_domain and
>>>>> struct
>>>>> rmid_read with the cacheinfo ID (a unique identifier for the L3
>>>>> cache).
>>>>>
>>>>> 2. Use hdr.cpu_mask (already maintained by resctrl) to replace
>>>>> shared_cpu_map logic for determining valid CPUs for RMID counter reads
>>>>> via the MSR interface.
>>>>
>>>> I think it will help to explain why it is ok now to use hdr.cpu_mask
>>>> instead
>>>> of the shared_cpu_map. In support of this you could mention that the
>>>> original
>>>> solution aimed to read the counters on any CPU associated with the
>>>> L3 cache
>>>> that the sub-numa domain forms part of, but this solution uses the
>>>> cpumask of one of the sub-numa domains that is known to be
>>>> associated with
>>>> the L3 cache. This is a reduced set of CPUs from previously intended
>>>> but
>>>> known to be accurate. Alternative may be to build a local cpumask
>>>> from the
>>>> all the sub-numa domains but it is not clear to me how much this will
>>>> improve things.
>>>>
>>>> Considering this I do not think the references are "unnecessary" as the
>>>> subject claims since the solution does not replace the original
>>>> cpumask with
>>>> an identical one.
>>>
>>> Thanks a lot, you are correct. hdr.cpu_mask is a subset of
>>> shared_cpu_map, and we can almost use hdr.cpu_mask to replace the
>>> functionality of shared_cpu_map.
>>>
>>> In fact, in resctrl, the only purpose of using cpu_mask is to select
>>> a CPU that shares the cache resource for performing monitoring group
>>> data reads. Therefore, I think there is no need to build a local
>>> cpumask from all the sub-NUMA domains in this context.
>>
>> One issue I can think of here is when there is a usage where the user
>> does
>> task isolation on the numa node boundary. Let's consider the SNC-3
>> example
>> again with node3, node4, and node5 on the second socket, "L3 cache ID 1".
>> If all CPUs on node3 are in tick_nohz_full_mask while none of the
>> node4 and
>> node5 CPUs are in tick_nohz_full_mask then one of node3's CPUs will get
>> an unnecessary IPI.
>>
> You are right, how about this? First, obtain any cpu in hdr.cpu_mask,
> and then use the cacheinfo shared_cpu_map of this cpu:
>
> diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> index 9337787461d2d..d43f438465ad0 100644
> --- a/fs/resctrl/ctrlmondata.c
> +++ b/fs/resctrl/ctrlmondata.c
> @@ -596,7 +596,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void
> *arg)
> struct rdtgroup *rdtgrp;
> struct rdt_resource *r;
> struct mon_data *md;
> - int domid, ret = 0;
> + struct cacheinfo *ci;
> + int domid, cpu, ret = 0;
>
> rdtgrp = rdtgroup_kn_lock_live(of->kn);
> if (!rdtgrp) {
> @@ -625,8 +626,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void
> *arg)
> list_for_each_entry(d, &r->mon_domains, hdr.list) {
> if (d->ci_id == domid) {
> rr.ci_id = d->ci_id;
> +                        cpu = cpumask_any(&d->hdr.cpu_mask);
> + ci = get_cpu_cacheinfo_level(cpu,
> RESCTRL_L3_CACHE);
> + if (!ci)
> + continue;
> mon_event_read(&rr, r, NULL, rdtgrp,
> - &d->hdr.cpu_mask, evtid,
> false);
> + &ci->shared_cpu_map,
> evtid, false);
> goto checkresult;
> }
> }
>
>>>
>>> I will provide a more detailed description in my commit log moving
>>> forward.
>>
>> Thank you very much.
>>
>> Reinette
>
> Thanks
> Qinyun Tan
>
> > 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
> >    by the cache subsystem, making it unsafe for resctrl to hold
> >    long-term references.
>
> This is not obvious to me. Could you please elaborate how resctrl could
> have a reference to a removed structure?

get_cpu_cacheinfo_level() returns a pointer to a per-cpu structure.

While it appears that those don't get freed and re-used when a CPU is
taken offline, it does seem highly dubious to keep using one for an
offline CPU (which is what happens if the first CPU that comes online
in a domain is taken offline).

-Tony
Hi Tony,

On 5/27/25 5:14 PM, Luck, Tony wrote:
>>> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
>>> by the cache subsystem, making it unsafe for resctrl to hold
>>> long-term references.
>>
>> This is not obvious to me. Could you please elaborate how resctrl could
>> have a reference to a removed structure?
>
> get_cpu_cacheinfo_level() returns a pointer to a per-cpu structure.
>
> While it appears that those don't get freed and re-used when a CPU is
> taken offline, it does seem highly dubious to keep using one for an
> offline CPU (which is what happens if the first CPU that comes online
> in a domain is taken offline).

I do not understand what your goal is with this response. You seem to
both defend and refute this "lifecycle dependency" motivation/claim.
I am not disagreeing with this fix but I would like to ensure that I
understand all motivations for it.

Reinette
On Mon, May 26, 2025 at 03:37:44PM +0800, Qinyun Tan wrote:
Hi Qinyun Tan,
> In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain
> structure previously relied on the cacheinfo interface to store L3 cache
> information (e.g., shared_cpu_map) for monitoring. However, this approach
> introduces risks when CPUs go offline:
>
> 1. Inconsistency: The ci field in rdt_mon_domain was initialized using the
> first online CPU of a NUMA node. If this CPU later goes offline, the
> shared_cpu_map (managed by the cache subsystem) is cleared, leading to
> incorrect or undefined behavior when accessed via rdt_mon_domain.
>
> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
> by the cache subsystem, making it unsafe for resctrl to hold
> long-term references.
You are correct. Saving a pointer to the per-cpu cacheinfo leads to
the problems that you describe.
> To resolve these issues and align with design principles:
>
> 1. Replace direct cacheinfo references in struct rdt_mon_domain and struct
> rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).
>
> 2. Use hdr.cpu_mask (already maintained by resctrl) to replace
> shared_cpu_map logic for determining valid CPUs for RMID counter reads
> via the MSR interface.
>
> Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
> ---
> arch/x86/kernel/cpu/resctrl/core.c | 6 ++++--
> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 +++---
> arch/x86/kernel/cpu/resctrl/internal.h | 4 ++--
> arch/x86/kernel/cpu/resctrl/monitor.c | 6 +-----
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++---
> include/linux/resctrl.h | 4 ++--
> 6 files changed, 15 insertions(+), 17 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index cf29681d01e04..a0dff742e9e93 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -516,6 +516,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
> struct rdt_hw_mon_domain *hw_dom;
> struct rdt_domain_hdr *hdr;
> struct rdt_mon_domain *d;
> + struct cacheinfo *ci;
> int err;
>
> lockdep_assert_held(&domain_list_lock);
> @@ -543,12 +544,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
> d = &hw_dom->d_resctrl;
> d->hdr.id = id;
> d->hdr.type = RESCTRL_MON_DOMAIN;
> - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
> - if (!d->ci) {
> + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
> + if (!ci) {
> pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
> mon_domain_free(hw_dom);
> return;
> }
> + d->ci_id = ci->id;
> cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
>
> arch_mon_domain_online(r, d);
> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> index 0a0ac5f6112ec..f9768669ce806 100644
> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> @@ -690,10 +690,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> * one that matches this cache id.
> */
> list_for_each_entry(d, &r->mon_domains, hdr.list) {
> - if (d->ci->id == domid) {
> - rr.ci = d->ci;
> + if (d->ci_id == domid) {
> + rr.ci_id = d->ci_id;
> mon_event_read(&rr, r, NULL, rdtgrp,
> - &d->ci->shared_cpu_map, evtid, false);
> + &d->hdr.cpu_mask, evtid, false);
This change restricts choice of CPUs to execute the read to one of the
CPUs in the SNC domain, instead of any that share the L3 cache.
> goto checkresult;
> }
> }
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index eaae99602b617..91e71db554a9c 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -136,7 +136,7 @@ union mon_data_bits {
> * domains in @r sharing L3 @ci.id
> * @evtid: Which monitor event to read.
> * @first: Initialize MBM counter when true.
> - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
> + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
> * @err: Error encountered when reading counter.
> * @val: Returned value of event counter. If @rgrp is a parent resource group,
> * @val includes the sum of event counts from its child resource groups.
> @@ -150,7 +150,7 @@ struct rmid_read {
> struct rdt_mon_domain *d;
> enum resctrl_event_id evtid;
> bool first;
> - struct cacheinfo *ci;
> + unsigned int ci_id;
> int err;
> u64 val;
> void *arch_mon_ctx;
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index a93ed7d2a1602..bedccd62158c3 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -620,10 +620,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> return 0;
> }
>
> - /* Summing domains that share a cache, must be on a CPU for that cache. */
> - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
> - return -EINVAL;
> -
This sanity check that code is executing on a CPU that shares the L3
cache has gone. But I don't see any code to replace it based on checking
your new "ci_id" field.
Should it be something like:
struct cacheinfo *ci;
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci || ci->id != rr->ci_id)
return -EINVAL;
> /*
> * Legacy files must report the sum of an event across all
> * domains that share the same L3 cache instance.
> @@ -633,7 +629,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> */
> ret = -EINVAL;
> list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
> - if (d->ci->id != rr->ci->id)
> + if (d->ci_id != rr->ci_id)
> continue;
> err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
> rr->evtid, &tval, rr->arch_mon_ctx);
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index cc4a54145c83d..075fdca2080d8 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -3146,7 +3146,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
> char name[32];
>
> snc_mode = r->mon_scope == RESCTRL_L3_NODE;
> - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
> + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
> if (snc_mode)
> sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
>
> @@ -3171,7 +3171,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
> return -EPERM;
>
> priv.u.rid = r->rid;
> - priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
> + priv.u.domid = do_sum ? d->ci_id : d->hdr.id;
> priv.u.sum = do_sum;
> list_for_each_entry(mevt, &r->evt_list, list) {
> priv.u.evtid = mevt->evtid;
> @@ -3198,7 +3198,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
> lockdep_assert_held(&rdtgroup_mutex);
>
> snc_mode = r->mon_scope == RESCTRL_L3_NODE;
> - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
> + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
> kn = kernfs_find_and_get(parent_kn, name);
> if (kn) {
> /*
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 880351ca3dfcb..c990670d18c02 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -145,7 +145,7 @@ struct rdt_ctrl_domain {
> /**
> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
> * @hdr: common header for different domain types
> - * @ci: cache info for this domain
> + * @ci_id: cache info id for this domain
> * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold
> * @mbm_total: saved state for MBM total bandwidth
> * @mbm_local: saved state for MBM local bandwidth
> @@ -156,7 +156,7 @@ struct rdt_ctrl_domain {
> */
> struct rdt_mon_domain {
> struct rdt_domain_hdr hdr;
> - struct cacheinfo *ci;
> + unsigned int ci_id;
> unsigned long *rmid_busy_llc;
> struct mbm_state *mbm_total;
> struct mbm_state *mbm_local;
> --
> 2.43.5
>
One other note. Linus just[1] merged the patches that split the
architecture independent portions of resctrl into "fs/resctrl"
(moving just over 7000 lines out of arch/x86/kernel/cpu/resctrl).
Some parts of this patch touch code that moved. But it should be
fairly easy to track the new location as the function names did
not change in the move. Please base new version of the patch on
upstream.
-Tony
[1] After you wrote this patch, and about 4 hours before I'm writing
this reply!
On 5/28/25 5:25 AM, Luck, Tony wrote:
> On Mon, May 26, 2025 at 03:37:44PM +0800, Qinyun Tan wrote:
>
> Hi Qinyun Tan,
>
>> In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain
>> structure previously relied on the cacheinfo interface to store L3 cache
>> information (e.g., shared_cpu_map) for monitoring. However, this approach
>> introduces risks when CPUs go offline:
>>
>> 1. Inconsistency: The ci field in rdt_mon_domain was initialized using the
>> first online CPU of a NUMA node. If this CPU later goes offline, the
>> shared_cpu_map (managed by the cache subsystem) is cleared, leading to
>> incorrect or undefined behavior when accessed via rdt_mon_domain.
>>
>> 2. Lifecycle dependency: The cacheinfo structure's lifecycle is managed
>> by the cache subsystem, making it unsafe for resctrl to hold
>> long-term references.
>
> You are correct. Saving a pointer to the per-cpu cacheinfo leads to
> the problems that you describe.
>
>> To resolve these issues and align with design principles:
>>
>> 1. Replace direct cacheinfo references in struct rdt_mon_domain and struct
>> rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).
>>
>> 2. Use hdr.cpu_mask (already maintained by resctrl) to replace
>> shared_cpu_map logic for determining valid CPUs for RMID counter reads
>> via the MSR interface.
>>
>> Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
>> ---
>> arch/x86/kernel/cpu/resctrl/core.c | 6 ++++--
>> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 +++---
>> arch/x86/kernel/cpu/resctrl/internal.h | 4 ++--
>> arch/x86/kernel/cpu/resctrl/monitor.c | 6 +-----
>> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++---
>> include/linux/resctrl.h | 4 ++--
>> 6 files changed, 15 insertions(+), 17 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
>> index cf29681d01e04..a0dff742e9e93 100644
>> --- a/arch/x86/kernel/cpu/resctrl/core.c
>> +++ b/arch/x86/kernel/cpu/resctrl/core.c
>> @@ -516,6 +516,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
>> struct rdt_hw_mon_domain *hw_dom;
>> struct rdt_domain_hdr *hdr;
>> struct rdt_mon_domain *d;
>> + struct cacheinfo *ci;
>> int err;
>>
>> lockdep_assert_held(&domain_list_lock);
>> @@ -543,12 +544,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
>> d = &hw_dom->d_resctrl;
>> d->hdr.id = id;
>> d->hdr.type = RESCTRL_MON_DOMAIN;
>> - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
>> - if (!d->ci) {
>> + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
>> + if (!ci) {
>> pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
>> mon_domain_free(hw_dom);
>> return;
>> }
>> + d->ci_id = ci->id;
>> cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
>>
>> arch_mon_domain_online(r, d);
>> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
>> index 0a0ac5f6112ec..f9768669ce806 100644
>> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
>> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
>> @@ -690,10 +690,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
>> * one that matches this cache id.
>> */
>> list_for_each_entry(d, &r->mon_domains, hdr.list) {
>> - if (d->ci->id == domid) {
>> - rr.ci = d->ci;
>> + if (d->ci_id == domid) {
>> + rr.ci_id = d->ci_id;
>> mon_event_read(&rr, r, NULL, rdtgrp,
>> - &d->ci->shared_cpu_map, evtid, false);
>> + &d->hdr.cpu_mask, evtid, false);
>
> This change restricts choice of CPUs to execute the read to one of the
> CPUs in the SNC domain, instead of any that share the L3 cache.
hdr.cpu_mask is a subset of ci->shared_cpu_map, and using it should have
no side effects?
>> goto checkresult;
>> }
>> }
>> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
>> index eaae99602b617..91e71db554a9c 100644
>> --- a/arch/x86/kernel/cpu/resctrl/internal.h
>> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
>> @@ -136,7 +136,7 @@ union mon_data_bits {
>> * domains in @r sharing L3 @ci.id
>> * @evtid: Which monitor event to read.
>> * @first: Initialize MBM counter when true.
>> - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
>> + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
>> * @err: Error encountered when reading counter.
>> * @val: Returned value of event counter. If @rgrp is a parent resource group,
>> * @val includes the sum of event counts from its child resource groups.
>> @@ -150,7 +150,7 @@ struct rmid_read {
>> struct rdt_mon_domain *d;
>> enum resctrl_event_id evtid;
>> bool first;
>> - struct cacheinfo *ci;
>> + unsigned int ci_id;
>> int err;
>> u64 val;
>> void *arch_mon_ctx;
>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index a93ed7d2a1602..bedccd62158c3 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -620,10 +620,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
>> return 0;
>> }
>>
>> - /* Summing domains that share a cache, must be on a CPU for that cache. */
>> - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
>> - return -EINVAL;
>> -
>
> This sanity check that code is executing on a CPU that shares the L3
> cache has gone. But I don't see any code to replace it based on checking
> your new "ci_id" field.
You are right, I will add this check in the next patch.
> Should it be something like:
>
> struct cacheinfo *ci;
>
> ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
> if (!ci || ci->id != rr->ci_id)
> return -EINVAL;
>
>> /*
>> * Legacy files must report the sum of an event across all
>> * domains that share the same L3 cache instance.
>> @@ -633,7 +629,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
>> */
>> ret = -EINVAL;
>> list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
>> - if (d->ci->id != rr->ci->id)
>> + if (d->ci_id != rr->ci_id)
>> continue;
>> err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
>> rr->evtid, &tval, rr->arch_mon_ctx);
>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> index cc4a54145c83d..075fdca2080d8 100644
>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> @@ -3146,7 +3146,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
>> char name[32];
>>
>> snc_mode = r->mon_scope == RESCTRL_L3_NODE;
>> - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
>> + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
>> if (snc_mode)
>> sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
>>
>> @@ -3171,7 +3171,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
>> return -EPERM;
>>
>> priv.u.rid = r->rid;
>> - priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
>> + priv.u.domid = do_sum ? d->ci_id : d->hdr.id;
>> priv.u.sum = do_sum;
>> list_for_each_entry(mevt, &r->evt_list, list) {
>> priv.u.evtid = mevt->evtid;
>> @@ -3198,7 +3198,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
>> lockdep_assert_held(&rdtgroup_mutex);
>>
>> snc_mode = r->mon_scope == RESCTRL_L3_NODE;
>> - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
>> + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
>> kn = kernfs_find_and_get(parent_kn, name);
>> if (kn) {
>> /*
>> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
>> index 880351ca3dfcb..c990670d18c02 100644
>> --- a/include/linux/resctrl.h
>> +++ b/include/linux/resctrl.h
>> @@ -145,7 +145,7 @@ struct rdt_ctrl_domain {
>> /**
>> * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
>> * @hdr: common header for different domain types
>> - * @ci: cache info for this domain
>> + * @ci_id: cache info id for this domain
>> * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold
>> * @mbm_total: saved state for MBM total bandwidth
>> * @mbm_local: saved state for MBM local bandwidth
>> @@ -156,7 +156,7 @@ struct rdt_ctrl_domain {
>> */
>> struct rdt_mon_domain {
>> struct rdt_domain_hdr hdr;
>> - struct cacheinfo *ci;
>> + unsigned int ci_id;
>> unsigned long *rmid_busy_llc;
>> struct mbm_state *mbm_total;
>> struct mbm_state *mbm_local;
>> --
>> 2.43.5
>>
>
> One other note. Linus just[1] merged the patches that split the
> architecture independent portions of resctrl into "fs/resctrl"
> (moving just over 7000 lines out of arch/x86/kernel/cpu/resctrl).
>
> Some parts of this patch touch code that moved. But it should be
> fairly easy to track the new location as the function names did
> not change in the move. Please base new version of the patch on
> upstream.
>
> -Tony
>
> [1] After you wrote this patch, and about 4 hours before I'm writing
> this reply!

Ok. Soon I will rebase onto the upstream master and then resend this patch.
--
Thanks
Qinyun Tan
© 2016 - 2025 Red Hat, Inc.