[PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted

Posted by James Morse 2 years, 6 months ago
resctrl_arch_rmid_read() could be called by resctrl in process context,
and then called by the PMU driver from irq context on the same CPU.
This could cause struct arch_mbm_state's prev_msr value to go backwards,
leading to the chunks value being incremented multiple times.

The struct arch_mbm_state holds both the previous msr value, and a count
of the number of chunks. These two fields need to be updated atomically.
Similarly __rmid_read() must write to one MSR and read from another,
this must be proteted from re-entrance.

Read the prev_msr before accessing the hardware, and cmpxchg() the value
back. If the value has changed, the whole thing is re-attempted. To protect
the MSR, __rmid_read() will retry reads for QM_CTR if QM_EVTSEL has changed
from the selected value.

Signed-off-by: James Morse <james.morse@arm.com>

---
Changes since v4:
 * Added retry loop in __rmid_read() to protect the CPU MSRs.
---
 arch/x86/kernel/cpu/resctrl/internal.h |  5 +--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 45 ++++++++++++++++++++------
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a32d307292a1..7012f42a82ee 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_RESCTRL_INTERNAL_H
 #define _ASM_X86_RESCTRL_INTERNAL_H
 
+#include <linux/atomic.h>
 #include <linux/resctrl.h>
 #include <linux/sched.h>
 #include <linux/kernfs.h>
@@ -338,8 +339,8 @@ struct mbm_state {
  *		find this struct.
  */
 struct arch_mbm_state {
-	u64	chunks;
-	u64	prev_msr;
+	atomic64_t	chunks;
+	atomic64_t	prev_msr;
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f0670795b446..62350bbd23e0 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
 
@@ -24,6 +25,9 @@
 
 #include "internal.h"
 
+/* Sequence number for writes to IA32 QM_EVTSEL */
+static DEFINE_PER_CPU(u64, qm_evtsel_seq);
+
 struct rmid_entry {
 	/*
 	 * Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
@@ -178,7 +182,7 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
 
 static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
-	u64 msr_val;
+	u64 msr_val, seq;
 
 	/*
 	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -187,9 +191,16 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
 	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
 	 * are error bits.
+	 * A per-cpu sequence counter is incremented each time QM_EVTSEL is
+	 * written. This is used to detect if this function was interrupted by
+	 * another call without re-reading the MSRs. Retry the MSR read when
+	 * this happens as the QM_CTR value may belong to a different event.
 	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, msr_val);
+	do {
+		seq = this_cpu_inc_return(qm_evtsel_seq);
+		wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+		rdmsrl(MSR_IA32_QM_CTR, msr_val);
+	} while (seq != this_cpu_read(qm_evtsel_seq));
 
 	if (msr_val & RMID_VAL_ERROR)
 		return -EIO;
@@ -225,13 +236,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct arch_mbm_state *am;
+	u64 msr_val;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
 		memset(am, 0, sizeof(*am));
 
 		/* Record any initial, non-zero count value. */
-		__rmid_read(rmid, eventid, &am->prev_msr);
+		__rmid_read(rmid, eventid, &msr_val);
+		atomic64_set(&am->prev_msr, msr_val);
 	}
 }
 
@@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	u64 start_msr_val, old_msr_val, msr_val, chunks;
 	struct arch_mbm_state *am;
-	u64 msr_val, chunks;
-	int ret;
+	int ret = 0;
 
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
 		return -EINVAL;
 
+interrupted:
+	am = get_arch_mbm_state(hw_dom, rmid, eventid);
+	if (am)
+		start_msr_val = atomic64_read(&am->prev_msr);
+
 	ret = __rmid_read(rmid, eventid, &msr_val);
 	if (ret)
 		return ret;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
-		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
-						 hw_res->mbm_width);
-		chunks = get_corrected_mbm_count(rmid, am->chunks);
-		am->prev_msr = msr_val;
+		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
+					       msr_val);
+		if (old_msr_val != start_msr_val)
+			goto interrupted;
+
+		chunks = mbm_overflow_count(start_msr_val, msr_val,
+					    hw_res->mbm_width);
+		atomic64_add(chunks, &am->chunks);
+
+		chunks = get_corrected_mbm_count(rmid,
+						 atomic64_read(&am->chunks));
 	} else {
 		chunks = msr_val;
 	}
-- 
2.39.2
Re: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted
Posted by Reinette Chatre 2 years, 6 months ago
Hi James,

On 7/28/2023 9:42 AM, James Morse wrote:
> resctrl_arch_rmid_read() could be called by resctrl in process context,
> and then called by the PMU driver from irq context on the same CPU.

The changelog is written as a bug report of current behavior.
This does not seem to describe current but instead planned future behavior.


> This could cause struct arch_mbm_state's prev_msr value to go backwards,
> leading to the chunks value being incremented multiple times.
> 
> The struct arch_mbm_state holds both the previous msr value, and a count
> of the number of chunks. These two fields need to be updated atomically.
> Similarly __rmid_read() must write to one MSR and read from another,
> this must be proteted from re-entrance.

proteted -> protected

> 
> Read the prev_msr before accessing the hardware, and cmpxchg() the value
> back. If the value has changed, the whole thing is re-attempted. To protect
> the MSR, __rmid_read() will retry reads for QM_CTR if QM_EVTSEL has changed
> from the selected value.

The latter part of the sentence does not seem to match with what the
patch does.

> 
> Signed-off-by: James Morse <james.morse@arm.com>
> 
> ---
> Changes since v4:
>  * Added retry loop in __rmid_read() to protect the CPU MSRs.
> ---
>  arch/x86/kernel/cpu/resctrl/internal.h |  5 +--
>  arch/x86/kernel/cpu/resctrl/monitor.c  | 45 ++++++++++++++++++++------
>  2 files changed, 38 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index a32d307292a1..7012f42a82ee 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -2,6 +2,7 @@
>  #ifndef _ASM_X86_RESCTRL_INTERNAL_H
>  #define _ASM_X86_RESCTRL_INTERNAL_H
>  
> +#include <linux/atomic.h>
>  #include <linux/resctrl.h>
>  #include <linux/sched.h>
>  #include <linux/kernfs.h>
> @@ -338,8 +339,8 @@ struct mbm_state {
>   *		find this struct.
>   */
>  struct arch_mbm_state {
> -	u64	chunks;
> -	u64	prev_msr;
> +	atomic64_t	chunks;
> +	atomic64_t	prev_msr;
>  };
>  
>  /**
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index f0670795b446..62350bbd23e0 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -16,6 +16,7 @@
>   */
>  
>  #include <linux/module.h>
> +#include <linux/percpu.h>
>  #include <linux/sizes.h>
>  #include <linux/slab.h>
>  
> @@ -24,6 +25,9 @@
>  
>  #include "internal.h"
>  
> +/* Sequence number for writes to IA32 QM_EVTSEL */
> +static DEFINE_PER_CPU(u64, qm_evtsel_seq);
> +
>  struct rmid_entry {
>  	/*
>  	 * Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
> @@ -178,7 +182,7 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
>  
>  static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>  {
> -	u64 msr_val;
> +	u64 msr_val, seq;
>  
>  	/*
>  	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> @@ -187,9 +191,16 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>  	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
>  	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
>  	 * are error bits.
> +	 * A per-cpu sequence counter is incremented each time QM_EVTSEL is
> +	 * written. This is used to detect if this function was interrupted by
> +	 * another call without re-reading the MSRs. Retry the MSR read when
> +	 * this happens as the QM_CTR value may belong to a different event.
>  	 */
> -	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> -	rdmsrl(MSR_IA32_QM_CTR, msr_val);
> +	do {
> +		seq = this_cpu_inc_return(qm_evtsel_seq);
> +		wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> +		rdmsrl(MSR_IA32_QM_CTR, msr_val);
> +	} while (seq != this_cpu_read(qm_evtsel_seq));
>  
>  	if (msr_val & RMID_VAL_ERROR)
>  		return -EIO;
> @@ -225,13 +236,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
>  {
>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>  	struct arch_mbm_state *am;
> +	u64 msr_val;
>  
>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>  	if (am) {
>  		memset(am, 0, sizeof(*am));
>  
>  		/* Record any initial, non-zero count value. */
> -		__rmid_read(rmid, eventid, &am->prev_msr);
> +		__rmid_read(rmid, eventid, &msr_val);
> +		atomic64_set(&am->prev_msr, msr_val);
>  	}
>  }
>  
> @@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>  {
>  	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
> +	u64 start_msr_val, old_msr_val, msr_val, chunks;
>  	struct arch_mbm_state *am;
> -	u64 msr_val, chunks;
> -	int ret;
> +	int ret = 0;
>  
>  	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>  		return -EINVAL;
>  
> +interrupted:
> +	am = get_arch_mbm_state(hw_dom, rmid, eventid);
> +	if (am)
> +		start_msr_val = atomic64_read(&am->prev_msr);
> +
>  	ret = __rmid_read(rmid, eventid, &msr_val);
>  	if (ret)
>  		return ret;
>  
>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>  	if (am) {
> -		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
> -						 hw_res->mbm_width);
> -		chunks = get_corrected_mbm_count(rmid, am->chunks);
> -		am->prev_msr = msr_val;
> +		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
> +					       msr_val);
> +		if (old_msr_val != start_msr_val)
> +			goto interrupted;
> +

hmmm ... what if interruption occurs here? 

> +		chunks = mbm_overflow_count(start_msr_val, msr_val,
> +					    hw_res->mbm_width);
> +		atomic64_add(chunks, &am->chunks);
> +
> +		chunks = get_corrected_mbm_count(rmid,
> +						 atomic64_read(&am->chunks));
>  	} else {
>  		chunks = msr_val;
>  	}

Reinette
Re: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted
Posted by James Morse 2 years, 5 months ago
Hi Reinette,

On 09/08/2023 23:35, Reinette Chatre wrote:
> On 7/28/2023 9:42 AM, James Morse wrote:
>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>> and then called by the PMU driver from irq context on the same CPU.
> 
> The changelog is written as a bug report of current behavior.
> This does not seem to describe current but instead planned future behavior.

I pulled this patch from much later in the tree as it's about to be a problem in this
series. I haven't yet decided if it's an existing bug in resctrl....

... it doesn't look like this can affect the path through mon_event_read(), as
generic_exec_single() masks interrupts.
But an incoming IPI from mon_event_read can corrupt the values for the limbo worker, which
at the worst would result in early re-use. And the MBM overflow worker ... which would
corrupt the value seen by user-space.
free_rmid() is equally affected, the outcome for limbo is the same spurious delay or early
re-use.


I'll change the commit messages to describe that, and float this earlier in the series.
The backport will be a problem. This applies cleanly to v6.1.46, but for v5.15.127 there
are at least 13 dependencies ... it's probably not worth trying to fix as chances are
no-one is seeing this happen in reality.


>> This could cause struct arch_mbm_state's prev_msr value to go backwards,
>> leading to the chunks value being incremented multiple times.
>>
>> The struct arch_mbm_state holds both the previous msr value, and a count
>> of the number of chunks. These two fields need to be updated atomically.
>> Similarly __rmid_read() must write to one MSR and read from another,
>> this must be proteted from re-entrance.
> 
> proteted -> protected
> 
>>
>> Read the prev_msr before accessing the hardware, and cmpxchg() the value
>> back. If the value has changed, the whole thing is re-attempted. To protect
>> the MSR, __rmid_read() will retry reads for QM_CTR if QM_EVTSEL has changed
>> from the selected value.
> 
> The latter part of the sentence does not seem to match with what the
> patch does.


>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index f0670795b446..62350bbd23e0 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>>  {
>>  	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>> +	u64 start_msr_val, old_msr_val, msr_val, chunks;
>>  	struct arch_mbm_state *am;
>> -	u64 msr_val, chunks;
>> -	int ret;
>> +	int ret = 0;
>>  
>>  	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>>  		return -EINVAL;
>>  
>> +interrupted:
>> +	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>> +	if (am)
>> +		start_msr_val = atomic64_read(&am->prev_msr);
>> +
>>  	ret = __rmid_read(rmid, eventid, &msr_val);
>>  	if (ret)
>>  		return ret;
>>  
>>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>  	if (am) {
>> -		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>> -						 hw_res->mbm_width);
>> -		chunks = get_corrected_mbm_count(rmid, am->chunks);
>> -		am->prev_msr = msr_val;
>> +		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>> +					       msr_val);
>> +		if (old_msr_val != start_msr_val)
>> +			goto interrupted;
>> +

> hmmm ... what if interruption occurs here? 

This is after the MSR write/read, so this function can't get a torn value from the
hardware. (e.g. reads the wrong RMID). The operations on struct arch_mbm_state are atomic,
so are still safe if the function becomes re-entrant.

If the re-entrant call accessed the same RMID and the same counter, its atomic64_add()
would be based on the prev_msr value this call read - because the above cmpxchg succeeded.

(put another way:)
The interrupting call returns a lower value, consistent with the first call not having
finished yet. The interrupted call returns the correct value, which is larger than it
read, because it completed after the interrupting call.
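
For example, with made-up counter values (a sketch; mbm_width wrap-around and the
correction factor are ignored), starting from am->prev_msr == 100:

  task: start_msr_val = 100, __rmid_read() returns 150,
        atomic64_cmpxchg(&am->prev_msr, 100, 150) succeeds, <interrupted>
  irq:  start_msr_val = 150, __rmid_read() returns 160,
        atomic64_cmpxchg(&am->prev_msr, 150, 160) succeeds,
        atomic64_add(10, &am->chunks), reports a count missing the task's 50
  task: resumes, atomic64_add(50, &am->chunks), reports the full count

Both deltas land in am->chunks exactly once; only the value reported by the
interrupting call lags behind.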


>> +		chunks = mbm_overflow_count(start_msr_val, msr_val,
>> +					    hw_res->mbm_width);
>> +		atomic64_add(chunks, &am->chunks);
>> +
>> +		chunks = get_corrected_mbm_count(rmid,
>> +						 atomic64_read(&am->chunks));
>>  	} else {
>>  		chunks = msr_val;
>>  	}


Thanks,

James
Re: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted
Posted by Reinette Chatre 2 years, 5 months ago
Hi James,

On 8/24/2023 9:55 AM, James Morse wrote:
> Hi Reinette,
> 
> On 09/08/2023 23:35, Reinette Chatre wrote:
>> On 7/28/2023 9:42 AM, James Morse wrote:
>>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>>> and then called by the PMU driver from irq context on the same CPU.
>>
>> The changelog is written as a bug report of current behavior.
>> This does not seem to describe current but instead planned future behavior.
> 
> I pulled this patch from much later in the tree as it's about to be a problem in this
> series. I haven't yet decided if it's an existing bug in resctrl....
> 
> ... it doesn't look like this can affect the path through mon_event_read(), as
> generic_exec_single() masks interrupts.
> But an incoming IPI from mon_event_read can corrupt the values for the limbo worker, which
> at the worst would result in early re-use. And the MBM overflow worker ... which would
> corrupt the value seen by user-space.
> free_rmid() is equally affected, the outcome for limbo is the same spurious delay or early
> re-use.

Apologies but these races are not obvious to me. Let me take the first, where the
race could be between mon_event_read() and the limbo worker. From what I can tell
mon_event_read() can be called from user space when creating a new monitoring
group or when viewing data associated with a monitoring group. In both cases
rdtgroup_mutex is held from the time user space triggers the request until
all IPIs are completed. Compare that with the limbo worker, cqm_handle_limbo(),
that also obtains rdtgroup_mutex before it attempts to do its work.
Considering this example I am not able to see how an incoming IPI from
mon_event_read() can interfere with the limbo worker since both
holding rdtgroup_mutex prevents them from running concurrently.

Similarly, the MBM overflow worker takes rdtgroup_mutex, and free_rmid()
is run with rdtgroup_mutex held.
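
Put differently, every one of these paths brackets the counter read with the same
mutex (a rough sketch of the pattern, not the exact call chains):

	mutex_lock(&rdtgroup_mutex);
	/* mon_event_read() IPIs, cqm_handle_limbo(), the MBM overflow worker, free_rmid() */
	... resctrl_arch_rmid_read() ...
	mutex_unlock(&rdtgroup_mutex);

Since the IPIs are only sent while the sender holds rdtgroup_mutex, they cannot
arrive while one of the workers is between reading prev_msr and updating it.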

> I'll change the commit messages to describe that, and float this earlier in the series.
> The backport will be a problem. This applies cleanly to v6.1.46, but for v5.15.127 there
> are at least 13 dependencies ... it's probably not worth trying to fix as chances are
> no-one is seeing this happen in reality.
> 
> 
>>> This could cause struct arch_mbm_state's prev_msr value to go backwards,
>>> leading to the chunks value being incremented multiple times.
>>>
>>> The struct arch_mbm_state holds both the previous msr value, and a count
>>> of the number of chunks. These two fields need to be updated atomically.
>>> Similarly __rmid_read() must write to one MSR and read from another,
>>> this must be proteted from re-entrance.
>>
>> proteted -> protected
>>
>>>
>>> Read the prev_msr before accessing the hardware, and cmpxchg() the value
>>> back. If the value has changed, the whole thing is re-attempted. To protect
>>> the MSR, __rmid_read() will retry reads for QM_CTR if QM_EVTSEL has changed
>>> from the selected value.
>>
>> The latter part of the sentence does not seem to match with what the
>> patch does.
> 
> 
>>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>>> index f0670795b446..62350bbd23e0 100644
>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>> @@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>>>  {
>>>  	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>>>  	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>>> +	u64 start_msr_val, old_msr_val, msr_val, chunks;
>>>  	struct arch_mbm_state *am;
>>> -	u64 msr_val, chunks;
>>> -	int ret;
>>> +	int ret = 0;
>>>  
>>>  	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>>>  		return -EINVAL;
>>>  
>>> +interrupted:
>>> +	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>> +	if (am)
>>> +		start_msr_val = atomic64_read(&am->prev_msr);
>>> +
>>>  	ret = __rmid_read(rmid, eventid, &msr_val);
>>>  	if (ret)
>>>  		return ret;
>>>  
>>>  	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>  	if (am) {
>>> -		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>>> -						 hw_res->mbm_width);
>>> -		chunks = get_corrected_mbm_count(rmid, am->chunks);
>>> -		am->prev_msr = msr_val;
>>> +		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>>> +					       msr_val);
>>> +		if (old_msr_val != start_msr_val)
>>> +			goto interrupted;
>>> +
> 
>> hmmm ... what if interruption occurs here? 
> 
> This is after the MSR write/read, so this function can't get a torn value from the
> hardware. (e.g. reads the wrong RMID). The operations on struct arch_mbm_state are atomic,
> so are still safe if the function becomes re-entrant.
> 
> If the re-entrant call accessed the same RMID and the same counter, its atomic64_add()
> would be based on the prev_msr value this call read - because the above cmpxchg succeeded.
> 
> (put another way:)
> The interrupting call returns a lower value, consistent with the first call not having
> finished yet. The interrupted call returns the correct value, which is larger than it
> read, because it completed after the interrupting call.
> 

I see, thank you. If this does end up being needed for a future
concurrency issue, could you please add a comment describing
this behavior where a later call can return a lower value and
why that is ok? It looks to me, as accomplished with the use of
atomic64_add(), as though this scenario would
end with the correct arch_mbm_state even though the members
are not updated atomically as a unit. 

> 
>>> +		chunks = mbm_overflow_count(start_msr_val, msr_val,
>>> +					    hw_res->mbm_width);
>>> +		atomic64_add(chunks, &am->chunks);
>>> +
>>> +		chunks = get_corrected_mbm_count(rmid,
>>> +						 atomic64_read(&am->chunks));
>>>  	} else {
>>>  		chunks = msr_val;
>>>  	}
> 
> 

Reinette
Re: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted
Posted by James Morse 2 years, 5 months ago
Hi Reinette,

On 8/25/23 00:01, Reinette Chatre wrote:
> On 8/24/2023 9:55 AM, James Morse wrote:
>> On 09/08/2023 23:35, Reinette Chatre wrote:
>>> On 7/28/2023 9:42 AM, James Morse wrote:
>>>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>>>> and then called by the PMU driver from irq context on the same CPU.
>>>
>>> The changelog is written as a bug report of current behavior.
>>> This does not seem to describe current but instead planned future behavior.
>>
>> I pulled this patch from much later in the tree as it's about to be a problem in this
>> series. I haven't yet decided if it's an existing bug in resctrl....
>>
>> ... it doesn't look like this can affect the path through mon_event_read(), as
>> generic_exec_single() masks interrupts.
>> But an incoming IPI from mon_event_read can corrupt the values for the limbo worker, which
>> at the worst would result in early re-use. And the MBM overflow worker ... which would
>> corrupt the value seen by user-space.
>> free_rmid() is equally affected, the outcome for limbo is the same spurious delay or early
>> re-use.
> 
> Apologies but these races are not obvious to me. Let me take the first, where the
> race could be between mon_event_read() and the limbo worker. From what I can tell
> mon_event_read() can be called from user space when creating a new monitoring
> group or when viewing data associated with a monitoring group. In both cases
> rdtgroup_mutex is held from the time user space triggers the request until
> all IPIs are completed. Compare that with the limbo worker, cqm_handle_limbo(),
> that also obtains rdtgroup_mutex before it attempts to do its work.
> Considering this example I am not able to see how an incoming IPI from
> mon_event_read() can interfere with the limbo worker since both
> holding rdtgroup_mutex prevents them from running concurrently.
> 
> Similarly, the MBM overflow worker takes rdtgroup_mutex, and free_rmid()
> is run with rdtgroup_mutex held.

Yes, sorry - I'd forgotten about that! I'll need to dig into this again.

Part of the problem is I'm looking at this from a different angle - something I haven't
described properly: the resctrl_arch_ calls shouldn't depend on a lock that is private to resctrl.

This allows for multiple callers, (e.g. resctrl_pmu that I haven't posted yet), and allows MPAM's
overflow interrupt to eventually be something resctrl could support.
It also allows the resctrl_arch_ calls to have lockdep asserts for their dependencies.
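
For example (a sketch only, the asserted lock is hypothetical):

	int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, ...)
	{
		lockdep_assert_held(&arch_mon_lock);	/* hypothetical arch-private lock */
		...
	}

rather than silently relying on the caller happening to hold rdtgroup_mutex.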

Yes, the rdtgroup_mutex is what prevents this from being a problem today.
(I haven't yet looked at how Peter's series solves the same problem.)

... it may be possible to move this patch back of the 'fold' to live with the PMU code ...


>> I'll change the commit messages to describe that, and float this earlier in the series.
>> The backport will be a problem. This applies cleanly to v6.1.46, but for v5.15.127 there
>> are at least 13 dependencies ... it's probably not worth trying to fix as chances are
>> no-one is seeing this happen in reality.

I'll remove that wording around this and mention the mutex.

[...]

>>>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>>>> index f0670795b446..62350bbd23e0 100644
>>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>>> @@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>>>>   {
>>>>   	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>>>>   	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>>>> +	u64 start_msr_val, old_msr_val, msr_val, chunks;
>>>>   	struct arch_mbm_state *am;
>>>> -	u64 msr_val, chunks;
>>>> -	int ret;
>>>> +	int ret = 0;
>>>>   
>>>>   	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>>>>   		return -EINVAL;
>>>>   
>>>> +interrupted:
>>>> +	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>> +	if (am)
>>>> +		start_msr_val = atomic64_read(&am->prev_msr);
>>>> +
>>>>   	ret = __rmid_read(rmid, eventid, &msr_val);
>>>>   	if (ret)
>>>>   		return ret;
>>>>   
>>>>   	am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>>   	if (am) {
>>>> -		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>>>> -						 hw_res->mbm_width);
>>>> -		chunks = get_corrected_mbm_count(rmid, am->chunks);
>>>> -		am->prev_msr = msr_val;
>>>> +		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>>>> +					       msr_val);
>>>> +		if (old_msr_val != start_msr_val)
>>>> +			goto interrupted;
>>>> +
>>
>>> hmmm ... what if interruption occurs here?
>>
>> This is after the MSR write/read, so this function can't get a torn value from the
>> hardware. (e.g. reads the wrong RMID). The operations on struct arch_mbm_state are atomic,
>> so are still safe if the function becomes re-entrant.
>>
>> If the re-entrant call accessed the same RMID and the same counter, its atomic64_add()
>> would be based on the prev_msr value this call read - because the above cmpxchg succeeded.
>>
>> (put another way:)
>> The interrupting call returns a lower value, consistent with the first call not having
>> finished yet. The interrupted call returns the correct value, which is larger than it
>> read, because it completed after the interrupting call.

> I see, thank you. If this does end up being needed for a future
> concurrency issue, could you please add a comment describing
> this behavior where a later call can return a lower value and
> why that is ok? It looks to me, as accomplished with the use of
> atomic64_add(), as though this scenario would
> end with the correct arch_mbm_state even though the members
> are not updated atomically as a unit.

Sure my stab at that is:
/*
  * At this point the hardware values have been read without
  * being interrupted. Interrupts that occur later will read
  * the updated am->prev_msr and safely increment am->chunks
  * with the new data using atomic64_add().
  */


Thanks,

James
Re: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted
Posted by Reinette Chatre 2 years, 5 months ago
Hi James,

On 9/8/2023 8:58 AM, James Morse wrote:
> On 8/25/23 00:01, Reinette Chatre wrote:
>> On 8/24/2023 9:55 AM, James Morse wrote:
>>> On 09/08/2023 23:35, Reinette Chatre wrote:
>>>> On 7/28/2023 9:42 AM, James Morse wrote:
>>>>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>>>>> and then called by the PMU driver from irq context on the same CPU.
>>>>
>>>> The changelog is written as a bug report of current behavior.
>>>> This does not seem to describe current but instead planned future behavior.
>>>
>>> I pulled this patch from much later in the tree as it's about to be a problem in this
>>> series. I haven't yet decided if it's an existing bug in resctrl....
>>>
>>> ... it doesn't look like this can affect the path through mon_event_read(), as
>>> generic_exec_single() masks interrupts.
>>> But an incoming IPI from mon_event_read can corrupt the values for the limbo worker, which
>>> at the worst would result in early re-use. And the MBM overflow worker ... which would
>>> corrupt the value seen by user-space.
>>> free_rmid() is equally affected, the outcome for limbo is the same spurious delay or early
>>> re-use.
>>
>> Apologies but these races are not obvious to me. Let me take the first, where the
>> race could be between mon_event_read() and the limbo worker. From what I can tell
>> mon_event_read() can be called from user space when creating a new monitoring
>> group or when viewing data associated with a monitoring group. In both cases
>> rdtgroup_mutex is held from the time user space triggers the request until
>> all IPIs are completed. Compare that with the limbo worker, cqm_handle_limbo(),
>> that also obtains rdtgroup_mutex before it attempts to do its work.
>> Considering this example I am not able to see how an incoming IPI from
>> mon_event_read() can interfere with the limbo worker since both
>> holding rdtgroup_mutex prevents them from running concurrently.
>>
>> Similarly, the MBM overflow worker takes rdtgroup_mutex, and free_rmid()
>> is run with rdtgroup_mutex held.
> 
> Yes, sorry - I'd forgotten about that! I'll need to dig into this again.
> 
> Part of the problem is I'm looking at this from a different angle - something I haven't
> described properly: the resctrl_arch_ calls shouldn't depend on a lock that is private to resctrl.
> 
> This allows for multiple callers, (e.g. resctrl_pmu that I haven't posted yet), and allows MPAM's
> overflow interrupt to eventually be something resctrl could support.
> It also allows the resctrl_arch_ calls to have lockdep asserts for their dependencies.
> 
> Yes, the rdtgroup_mutex is what prevents this from being a problem today.
> (I haven't yet looked at how Peter's series solves the same problem.)
> 
> ... it may be possible to move this patch back of the 'fold' to live with the PMU code ...

In its current form this patch does appear to be out of place in 
this series.


>>> I'll change the commit messages to describe that, and float this earlier in the series.
>>> The backport will be a problem. This applies cleanly to v6.1.46, but for v5.15.127 there
>>> are at least 13 dependencies ... it's probably not worth trying to fix as chances are
>>> no-one is seeing this happen in reality.
> 
> I'll remove that wording around this and mention the mutex.
> 
> [...]
> 
>>>>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>>>>> index f0670795b446..62350bbd23e0 100644
>>>>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>>>>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>>>>> @@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>>>>>   {
>>>>>       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>>>>>       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>>>>> +    u64 start_msr_val, old_msr_val, msr_val, chunks;
>>>>>       struct arch_mbm_state *am;
>>>>> -    u64 msr_val, chunks;
>>>>> -    int ret;
>>>>> +    int ret = 0;
>>>>>         if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>>>>>           return -EINVAL;
>>>>>   +interrupted:
>>>>> +    am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>>> +    if (am)
>>>>> +        start_msr_val = atomic64_read(&am->prev_msr);
>>>>> +
>>>>>       ret = __rmid_read(rmid, eventid, &msr_val);
>>>>>       if (ret)
>>>>>           return ret;
>>>>>         am = get_arch_mbm_state(hw_dom, rmid, eventid);
>>>>>       if (am) {
>>>>> -        am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>>>>> -                         hw_res->mbm_width);
>>>>> -        chunks = get_corrected_mbm_count(rmid, am->chunks);
>>>>> -        am->prev_msr = msr_val;
>>>>> +        old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>>>>> +                           msr_val);
>>>>> +        if (old_msr_val != start_msr_val)
>>>>> +            goto interrupted;
>>>>> +
>>>
>>>> hmmm ... what if interruption occurs here?
>>>
>>> This is after the MSR write/read, so this function can't get a torn value from the
>>> hardware. (e.g. reads the wrong RMID). The operations on struct arch_mbm_state are atomic,
>>> so are still safe if the function becomes re-entrant.
>>>
>>> If the re-entrant call accessed the same RMID and the same counter, its atomic64_add()
>>> would be based on the prev_msr value this call read - because the above cmpxchg succeeded.
>>>
>>> (put another way:)
>>> The interrupting call returns a lower value, consistent with the first call not having
>>> finished yet. The interrupted call returns the correct value, which is larger than it
>>> read, because it completed after the interrupting call.
> 
>> I see, thank you. If this does end up being needed for a future
>> concurrency issue, could you please add a comment describing
>> this behavior where a later call can return a lower value and
>> why that is ok? It looks to me, as accomplished with the use of
>> atomic64_add(), as though this scenario would
>> end with the correct arch_mbm_state even though the members
>> are not updated atomically as a unit.
> 
> Sure my stab at that is:
> /*
>  * At this point the hardware values have been read without
>  * being interrupted. Interrupts that occur later will read
>  * the updated am->prev_msr and safely increment am->chunks
>  * with the new data using atomic64_add().
>  */

The comment is useful and appears to address that accurate
arch_mbm_state is maintained. My question was related to the
higher level behavior encountered by the callers. repeating my
question: "could you please add a comment describing this behavior where
a later call can return a lower value and why that is ok?"

Reinette