[PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by Zhang Zekun 1 year, 5 months ago
Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
always frees IOVAs from all CPUs on a single CPU. This is not
cache-friendly to the iova_rcache, because it first fills up that CPU's
cpu_rcache, then pushes magazines to the depot, and IOVAs in the depot
eventually end up back in the underlying rbtree once depot_size grows
beyond num_online_cpus(). Letting fq_flush_timeout() free IOVAs on the
CPUs that called the dma_unmap* APIs decreases the overall time spent in
fq_flush_timeout() by better utilizing the iova_rcache and by minimizing
contention on iova_rbtree_lock.
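
For reference, the path described above looks roughly as follows - a
simplified paraphrase of __iova_rcache_insert() and the depot handling in
iova.c, written to illustrate the argument rather than quoted verbatim:

	/* Sketch: where a freed IOVA pfn ends up on the CPU doing the freeing */
	if (!iova_magazine_full(cpu_rcache->loaded)) {
		/* 1) room in this CPU's loaded magazine: cheap and cache-friendly */
		can_insert = true;
	} else if (!iova_magazine_full(cpu_rcache->prev)) {
		/* 2) fall back to the second per-CPU magazine */
		swap(cpu_rcache->prev, cpu_rcache->loaded);
		can_insert = true;
	} else {
		/* 3) both magazines full: push one into the shared depot. Once
		 * the depot holds more than num_online_cpus() magazines, the
		 * delayed reclaim returns the excess to the rbtree via
		 * free_iova() - the slow path referred to above.
		 */
		iova_depot_push(rcache, cpu_rcache->loaded);
		cpu_rcache->loaded = iova_magazine_alloc(GFP_ATOMIC);
		can_insert = true;	/* allocation-failure handling omitted */
	}
	if (can_insert)
		iova_magazine_push(cpu_rcache->loaded, iova_pfn);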

Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
---
 drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
 drivers/iommu/iova.c      | 21 ++++++++++++++-------
 include/linux/iova.h      |  7 +++++++
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 43520e7275cc..217b7c70d06c 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
 	return idx;
 }
 
-static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+				int cpu)
 {
 	u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
 	unsigned int idx;
@@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
 			break;
 
 		iommu_put_pages_list(&fq->entries[idx].freelist);
-		free_iova_fast(&cookie->iovad,
+		free_iova_fast_cpu(&cookie->iovad,
 			       fq->entries[idx].iova_pfn,
-			       fq->entries[idx].pages);
+			       fq->entries[idx].pages, cpu);
 
 		fq->head = (fq->head + 1) & fq->mod_mask;
 	}
 }
 
-static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+			 int cpu)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&fq->lock, flags);
-	fq_ring_free_locked(cookie, fq);
+	fq_ring_free_locked(cookie, fq, cpu);
 	spin_unlock_irqrestore(&fq->lock, flags);
 }
 
@@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
 	fq_flush_iotlb(cookie);
 
 	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
-		fq_ring_free(cookie, cookie->single_fq);
+		cpu = smp_processor_id();
+		fq_ring_free(cookie, cookie->single_fq, cpu);
 	} else {
 		for_each_possible_cpu(cpu)
-			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
+			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
 	}
 }
 
@@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
 	struct iova_fq *fq;
 	unsigned long flags;
 	unsigned int idx;
+	int cpu = smp_processor_id();
 
 	/*
 	 * Order against the IOMMU driver's pagetable update from unmapping
@@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
 	 * flushed out on another CPU. This makes the fq_full() check below less
 	 * likely to be true.
 	 */
-	fq_ring_free_locked(cookie, fq);
+	fq_ring_free_locked(cookie, fq, cpu);
 
 	if (fq_full(fq)) {
 		fq_flush_iotlb(cookie);
-		fq_ring_free_locked(cookie, fq);
+		fq_ring_free_locked(cookie, fq, cpu);
 	}
 
 	idx = fq_ring_add(fq);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d59d0ea2fd21..46a2188c263b 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -20,7 +20,7 @@
 
 static bool iova_rcache_insert(struct iova_domain *iovad,
 			       unsigned long pfn,
-			       unsigned long size);
+			       unsigned long size, int cpu);
 static unsigned long iova_rcache_get(struct iova_domain *iovad,
 				     unsigned long size,
 				     unsigned long limit_pfn);
@@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
 void
 free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
 {
-	if (iova_rcache_insert(iovad, pfn, size))
+	free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
+void
+free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+		   unsigned long size, int cpu)
+{
+	if (iova_rcache_insert(iovad, pfn, size, cpu))
 		return;
 
 	free_iova(iovad, pfn);
 }
-EXPORT_SYMBOL_GPL(free_iova_fast);
 
 static void iova_domain_free_rcaches(struct iova_domain *iovad)
 {
@@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
  */
 static bool __iova_rcache_insert(struct iova_domain *iovad,
 				 struct iova_rcache *rcache,
-				 unsigned long iova_pfn)
+				 unsigned long iova_pfn, int cpu)
 {
 	struct iova_cpu_rcache *cpu_rcache;
 	bool can_insert = false;
 	unsigned long flags;
 
-	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+	cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
 	spin_lock_irqsave(&cpu_rcache->lock, flags);
 
 	if (!iova_magazine_full(cpu_rcache->loaded)) {
@@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 }
 
 static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
-			       unsigned long size)
+			       unsigned long size, int cpu)
 {
 	unsigned int log_size = order_base_2(size);
 
 	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
 		return false;
 
-	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
 }
 
 /*
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d2c4fd923efa..91e4e3d5bcb3 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 	bool size_aligned);
 void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
 		    unsigned long size);
+void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+			unsigned long size, int cpu);
 unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 			      unsigned long limit_pfn, bool flush_rcache);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
 {
 }
 
+static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+				      unsigned long size, int cpu)
+{
+}
+
 static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
 					    unsigned long size,
 					    unsigned long limit_pfn,
-- 
2.17.1
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by Robin Murphy 1 year, 5 months ago
On 2024-06-24 9:39 am, Zhang Zekun wrote:
> Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
> always frees IOVAs from all CPUs on a single CPU. This is not
> cache-friendly to the iova_rcache, because it first fills up that CPU's
> cpu_rcache, then pushes magazines to the depot, and IOVAs in the depot
> eventually end up back in the underlying rbtree once depot_size grows
> beyond num_online_cpus().

That is the design intent - if the excess magazines sit in the depot 
long enough to be reclaimed then other CPUs didn't want them either. 
We're trying to minimise the amount of unused cached IOVAs sitting 
around wasting memory, since IOVA memory consumption has proven to be 
quite significant on large systems.

As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY 
was just my arbitrary value of "long enough" to keep the initial 
implementation straightforward - I do expect that certain workloads 
might benefit from tuning it differently, but without proof of what they 
are and what they want, there's little justification for introducing 
extra complexity and potential user ABI yet.
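
(For readers following the thread: the reclaim being discussed is roughly
the delayed work below - a simplified paraphrase of the depot trimming
added by the rcache depot rework, not the exact upstream code.)

/* Every IOVA_DEPOT_DELAY (~100ms), trim the depot back towards
 * num_online_cpus() magazines, returning the trimmed IOVAs to the rbtree.
 */
static void iova_depot_work_sketch(struct work_struct *work)
{
	struct iova_rcache *rcache = container_of(work, struct iova_rcache, work.work);
	struct iova_magazine *mag = NULL;
	unsigned long flags;

	spin_lock_irqsave(&rcache->lock, flags);
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
	spin_unlock_irqrestore(&rcache->lock, flags);

	if (mag) {
		iova_magazine_free_pfns(mag, rcache->iovad);	/* back to the rbtree */
		iova_magazine_free(mag);
		/* keep trimming while the depot is still oversized */
		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
	}
}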

> Letting fq_flush_timeout() free IOVAs on the CPUs that called the
> dma_unmap* APIs decreases the overall time spent in fq_flush_timeout()
> by better utilizing the iova_rcache and by minimizing contention on
> iova_rbtree_lock.

I would have assumed that a single CPU simply throwing magazines into 
the depot list from its own percpu cache is quicker, or at least no 
slower, than doing the same while causing additional contention/sharing 
by interfering with other percpu caches as well. And where does the 
rbtree come into that either way? If an observable performance issue 
actually exists here, I'd like a more detailed breakdown to understand it.

Thanks,
Robin.

> Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
> ---
>   drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
>   drivers/iommu/iova.c      | 21 ++++++++++++++-------
>   include/linux/iova.h      |  7 +++++++
>   3 files changed, 34 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 43520e7275cc..217b7c70d06c 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
>   	return idx;
>   }
>   
> -static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> +				int cpu)
>   {
>   	u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
>   	unsigned int idx;
> @@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
>   			break;
>   
>   		iommu_put_pages_list(&fq->entries[idx].freelist);
> -		free_iova_fast(&cookie->iovad,
> +		free_iova_fast_cpu(&cookie->iovad,
>   			       fq->entries[idx].iova_pfn,
> -			       fq->entries[idx].pages);
> +			       fq->entries[idx].pages, cpu);
>   
>   		fq->head = (fq->head + 1) & fq->mod_mask;
>   	}
>   }
>   
> -static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> +			 int cpu)
>   {
>   	unsigned long flags;
>   
>   	spin_lock_irqsave(&fq->lock, flags);
> -	fq_ring_free_locked(cookie, fq);
> +	fq_ring_free_locked(cookie, fq, cpu);
>   	spin_unlock_irqrestore(&fq->lock, flags);
>   }
>   
> @@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
>   	fq_flush_iotlb(cookie);
>   
>   	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
> -		fq_ring_free(cookie, cookie->single_fq);
> +		cpu = smp_processor_id();
> +		fq_ring_free(cookie, cookie->single_fq, cpu);
>   	} else {
>   		for_each_possible_cpu(cpu)
> -			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
> +			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
>   	}
>   }
>   
> @@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
>   	struct iova_fq *fq;
>   	unsigned long flags;
>   	unsigned int idx;
> +	int cpu = smp_processor_id();
>   
>   	/*
>   	 * Order against the IOMMU driver's pagetable update from unmapping
> @@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
>   	 * flushed out on another CPU. This makes the fq_full() check below less
>   	 * likely to be true.
>   	 */
> -	fq_ring_free_locked(cookie, fq);
> +	fq_ring_free_locked(cookie, fq, cpu);
>   
>   	if (fq_full(fq)) {
>   		fq_flush_iotlb(cookie);
> -		fq_ring_free_locked(cookie, fq);
> +		fq_ring_free_locked(cookie, fq, cpu);
>   	}
>   
>   	idx = fq_ring_add(fq);
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index d59d0ea2fd21..46a2188c263b 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -20,7 +20,7 @@
>   
>   static bool iova_rcache_insert(struct iova_domain *iovad,
>   			       unsigned long pfn,
> -			       unsigned long size);
> +			       unsigned long size, int cpu);
>   static unsigned long iova_rcache_get(struct iova_domain *iovad,
>   				     unsigned long size,
>   				     unsigned long limit_pfn);
> @@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
>   void
>   free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
>   {
> -	if (iova_rcache_insert(iovad, pfn, size))
> +	free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
> +}
> +EXPORT_SYMBOL_GPL(free_iova_fast);
> +
> +void
> +free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> +		   unsigned long size, int cpu)
> +{
> +	if (iova_rcache_insert(iovad, pfn, size, cpu))
>   		return;
>   
>   	free_iova(iovad, pfn);
>   }
> -EXPORT_SYMBOL_GPL(free_iova_fast);
>   
>   static void iova_domain_free_rcaches(struct iova_domain *iovad)
>   {
> @@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
>    */
>   static bool __iova_rcache_insert(struct iova_domain *iovad,
>   				 struct iova_rcache *rcache,
> -				 unsigned long iova_pfn)
> +				 unsigned long iova_pfn, int cpu)
>   {
>   	struct iova_cpu_rcache *cpu_rcache;
>   	bool can_insert = false;
>   	unsigned long flags;
>   
> -	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
> +	cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
>   	spin_lock_irqsave(&cpu_rcache->lock, flags);
>   
>   	if (!iova_magazine_full(cpu_rcache->loaded)) {
> @@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   }
>   
>   static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
> -			       unsigned long size)
> +			       unsigned long size, int cpu)
>   {
>   	unsigned int log_size = order_base_2(size);
>   
>   	if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
>   		return false;
>   
> -	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
> +	return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
>   }
>   
>   /*
> diff --git a/include/linux/iova.h b/include/linux/iova.h
> index d2c4fd923efa..91e4e3d5bcb3 100644
> --- a/include/linux/iova.h
> +++ b/include/linux/iova.h
> @@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
>   	bool size_aligned);
>   void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
>   		    unsigned long size);
> +void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> +			unsigned long size, int cpu);
>   unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
>   			      unsigned long limit_pfn, bool flush_rcache);
>   struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
> @@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
>   {
>   }
>   
> +static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> +				      unsigned long size, int cpu)
> +{
> +}
> +
>   static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
>   					    unsigned long size,
>   					    unsigned long limit_pfn,
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by zhangzekun (A) 1 year, 5 months ago
On 2024/6/24 18:56, Robin Murphy wrote:
> On 2024-06-24 9:39 am, Zhang Zekun wrote:
>> Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
>> always frees IOVAs from all CPUs on a single CPU. This is not
>> cache-friendly to the iova_rcache, because it first fills up that CPU's
>> cpu_rcache, then pushes magazines to the depot, and IOVAs in the depot
>> eventually end up back in the underlying rbtree once depot_size grows
>> beyond num_online_cpus().
> 
> That is the design intent - if the excess magazines sit in the depot 
> long enough to be reclaimed then other CPUs didn't want them either. 
> We're trying to minimise the amount of unused cached IOVAs sitting 
> around wasting memory, since IOVA memory consumption has proven to be 
> quite significant on large systems.

Hi, Robin,

It does waste some memory after this change, but since we have been
freeing IOVAs on each CPU in strict mode for years, this change seems
reasonable in that it makes the IOVA freeing logic identical to strict
mode. This patch tries to make the speed of allocating and freeing IOVAs
roughly the same by better utilizing the iova_rcache; otherwise we are
more likely to enter the slowpath. The memory saving actually comes at
the cost of performance, and I am not sure we need such an optimization
for non-strict mode, which is mainly used for performance reasons.
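
(To illustrate the slowpath mentioned above, a condensed paraphrase of the
alloc_iova_fast() fallback - trimmed for brevity and not meant as exact
upstream code: on an rcache miss, allocation drops to the rbtree under
iova_rbtree_lock, and as a last resort drains every CPU's rcache before
retrying.)

	iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
	if (iova_pfn)
		return iova_pfn;			/* rcache hit: fast path */

	new_iova = alloc_iova(iovad, size, limit_pfn, true);	/* rbtree, slow path */
	if (!new_iova && flush_rcache) {
		/* last resort: drain all per-CPU caches and the depot, then retry */
		for_each_online_cpu(cpu)
			free_cpu_cached_iovas(cpu, iovad);
		free_global_cached_iovas(iovad);
		/* ... and retry alloc_iova() once more */
	}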

> 
> As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY 
> was just my arbitrary value of "long enough" to keep the initial 
> implementation straightforward - I do expect that certain workloads 
> might benefit from tuning it differently, but without proof of what they 
> are and what they want, there's little justification for introducing 
> extra complexity and potential user ABI yet.
> 
>> Letting fq_flush_timeout() free IOVAs on the CPUs that called the
>> dma_unmap* APIs decreases the overall time spent in fq_flush_timeout()
>> by better utilizing the iova_rcache and by minimizing contention on
>> iova_rbtree_lock.
> 
> I would have assumed that a single CPU simply throwing magazines into 
> the depot list from its own percpu cache is quicker, or at least no 
> slower, than doing the same while causing additional contention/sharing 
> by interfering with other percpu caches as well. And where does the 
> rbtree come into that either way? If an observable performance issue 
> actually exists here, I'd like a more detailed breakdown to understand it.
> 
> Thanks,
> Robin.
> 

This patch was originally intended to minimize the chance of the
softlockup issue in fq_flush_timeout(), which was described earlier in
[1] and has been applied in a commercial kernel [2] for years.

However, later tests show that this single patch is not enough to fix
the softlockup issue, since the root cause of the softlockup is the
underlying iova_rbtree_lock. In our softlockup scenarios, the average
time to acquire this spinlock is about 6ms.

[1] https://lkml.org/lkml/2023/2/16/124
[2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/

Thanks,
Zekun
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by Robin Murphy 1 year, 5 months ago
On 2024-06-24 2:13 pm, zhangzekun (A) wrote:
> 
> On 2024/6/24 18:56, Robin Murphy wrote:
>> On 2024-06-24 9:39 am, Zhang Zekun wrote:
>>> Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
>>> always frees IOVAs from all CPUs on a single CPU. This is not
>>> cache-friendly to the iova_rcache, because it first fills up that CPU's
>>> cpu_rcache, then pushes magazines to the depot, and IOVAs in the depot
>>> eventually end up back in the underlying rbtree once depot_size grows
>>> beyond num_online_cpus().
>>
>> That is the design intent - if the excess magazines sit in the depot 
>> long enough to be reclaimed then other CPUs didn't want them either. 
>> We're trying to minimise the amount of unused cached IOVAs sitting 
>> around wasting memory, since IOVA memory consumption has proven to be 
>> quite significant on large systems.
> 
> Hi, Robin,
> 
> It does waste some memory after this change, but since we have been
> freeing IOVAs on each CPU in strict mode for years, this change seems
> reasonable in that it makes the IOVA freeing logic identical to strict
> mode. This patch tries to make the speed of allocating and freeing IOVAs
> roughly the same by better utilizing the iova_rcache; otherwise we are
> more likely to enter the slowpath. The memory saving actually comes at
> the cost of performance, and I am not sure we need such an optimization
> for non-strict mode, which is mainly used for performance reasons.
> 
>>
>> As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY 
>> was just my arbitrary value of "long enough" to keep the initial 
>> implementation straightforward - I do expect that certain workloads 
>> might benefit from tuning it differently, but without proof of what 
>> they are and what they want, there's little justification for 
>> introducing extra complexity and potential user ABI yet.
>>
>>> Letting fq_flush_timeout() free IOVAs on the CPUs that called the
>>> dma_unmap* APIs decreases the overall time spent in fq_flush_timeout()
>>> by better utilizing the iova_rcache and by minimizing contention on
>>> iova_rbtree_lock.
>>
>> I would have assumed that a single CPU simply throwing magazines into 
>> the depot list from its own percpu cache is quicker, or at least no 
>> slower, than doing the same while causing additional 
>> contention/sharing by interfering with other percpu caches as well. 
>> And where does the rbtree come into that either way? If an observable 
>> performance issue actually exists here, I'd like a more detailed 
>> breakdown to understand it.
>>
>> Thanks,
>> Robin.
>>
> 
> This patch was originally intended to minimize the chance of the
> softlockup issue in fq_flush_timeout(), which was described earlier in
> [1] and has been applied in a commercial kernel [2] for years.
> 
> However, later tests show that this single patch is not enough to fix
> the softlockup issue, since the root cause of the softlockup is the
> underlying iova_rbtree_lock. In our softlockup scenarios, the average
> time to acquire this spinlock is about 6ms.

That should already be fixed, though. The only reason for 
fq_flush_timeout() to interact with the rbtree at all was due to the 
notion of a fixed-size depot which could become full. That no longer 
exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale 
better").

Thanks,
Robin.
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by zhangzekun (A) 1 year, 5 months ago

On 2024/6/24 21:32, Robin Murphy wrote:

>> This patch was originally intended to minimize the chance of the
>> softlockup issue in fq_flush_timeout(), which was described earlier in
>> [1] and has been applied in a commercial kernel [2] for years.
>>
>> However, later tests show that this single patch is not enough to fix
>> the softlockup issue, since the root cause of the softlockup is the
>> underlying iova_rbtree_lock. In our softlockup scenarios, the average
>> time to acquire this spinlock is about 6ms.
> 
> That should already be fixed, though. The only reason for 
> fq_flush_timeout() to interact with the rbtree at all was due to the 
> notion of a fixed-size depot which could become full. That no longer 
> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale 
> better").
> 
> Thanks,
> Robin.
> 
Hi, Robin,

The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
better") can reduce the risk of the softlockup, but cannot fix it
entirely. We did solve a softlockup issue [1] with that patch, and that
is why it has already been backported to our branch. The softlockup
issue we met recently occurred on a 5.10-based kernel with that patch
already backported, which can be found in [2].

On 6.7-rc4, which already includes 911aa1245da8 ("iommu/iova: Make the
rcache depot scale better"), we can also get the following call trace.
The issue still seems to exist, and we fixed it using the patch in [3]
when the call trace first came up.

rcu: INFO: rcu_preempt self-detected stall on CPU
rcu: 	85-....: (5249 ticks this GP) idle=4d04/1/0x4000000000000000 
softirq=428240/428240 fqs=1882
rcu: 	(t=5253 jiffies g=2005741 q=19446136 ncpus=128)
CPU: 85 PID: 2401130 Comm: kpktgend_85 Kdump: loaded Tainted: G 
  O       6.7.0-rc4+ #1

pc : _raw_spin_unlock_irqrestore+0x14/0x78
lr : fq_flush_timeout+0x94/0x118
sp : ffff8000802abdf0
x29: ffff8000802abdf0 x28: ffff20201142e480 x27: 0000000000000002
x26: ffffdeeb67e82b20 x25: 0000000000000000 x24: ffffdeeb67e87400
x23: ffffdeeb67e82ba0 x22: ffff202009db1c00 x21: ffffdeeb67e82f40
x20: 0000000000000000 x19: ffffdbe06f0140a8 x18: 0000000000040000
x17: 0000000000000001 x16: 0000000000000040 x15: ffffffffffffffff
x14: 0000000000001fff x13: 000000000003ffff x12: 0000000000000259
x11: 000000000007ffff x10: 000000000000000d x9 : ffffdeeb6549ec84
x8 : ffff0020886e0000 x7 : 000000000000964d x6 : 000001afa2986acb
x5 : 01ffffffffffffff x4 : 0000000000000000 x3 : ffff202006b57d70
x2 : 0000000000000015 x1 : 0000000000000000 x0 : ffffdbe06f0140a8
Call trace:
  _raw_spin_unlock_irqrestore+0x14/0x78
  call_timer_fn+0x3c/0x1d0
  expire_timers+0xcc/0x190
  run_timer_softirq+0xfc/0x268
  __do_softirq+0x128/0x3dc
  ____do_softirq+0x18/0x30
  call_on_irq_stack+0x24/0x30
  do_softirq_own_stack+0x24/0x38
  irq_exit_rcu+0xc0/0xe8
  el1_interrupt+0x48/0xc0
  el1h_64_irq_handler+0x18/0x28
  el1h_64_irq+0x78/0x80
  __schedule+0xf28/0x12a0
  schedule+0x3c/0x108
  schedule_timeout+0xa0/0x1d0
  pktgen_thread_worker+0x1180/0x15d0
  kthread+0x120/0x130
  ret_from_fork+0x10/0x20

[1] https://gitee.com/openeuler/kernel/issues/I8KS9A
[2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/
[3] https://lkml.org/lkml/2023/2/16/124

Thanks,
Zekun
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by Robin Murphy 1 year, 5 months ago
On 2024-06-25 2:29 am, zhangzekun (A) wrote:
> 
> 
> On 2024/6/24 21:32, Robin Murphy wrote:
> 
>>> This patch was originally intended to minimize the chance of the
>>> softlockup issue in fq_flush_timeout(), which was described earlier
>>> in [1] and has been applied in a commercial kernel [2] for years.
>>>
>>> However, later tests show that this single patch is not enough to
>>> fix the softlockup issue, since the root cause of the softlockup is
>>> the underlying iova_rbtree_lock. In our softlockup scenarios, the
>>> average time to acquire this spinlock is about 6ms.
>>
>> That should already be fixed, though. The only reason for 
>> fq_flush_timeout() to interact with the rbtree at all was due to the 
>> notion of a fixed-size depot which could become full. That no longer 
>> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale 
>> better").
>>
>> Thanks,
>> Robin.
>>
> Hi, Robin,
> 
> The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
> better") can reduce the risk of the softlockup, but cannot fix it
> entirely. We did solve a softlockup issue [1] with that patch, and that
> is why it has already been backported to our branch. The softlockup
> issue we met recently occurred on a 5.10-based kernel with that patch
> already backported, which can be found in [2].

Sorry, I was implying some context that I should have made clear - yes, 
the softlockup can still happen in general if the flush queues are full 
of IOVAs which are too large for the rcache mechanism at all, so are 
always freed directly to the rbtree, but then there's no way *this* 
patch could make any difference to that case either.
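
(Concretely, in terms of the existing code shown in the hunks above, that
case is the one where iova_rcache_insert() refuses the pfn:)

/* Sizes with order >= IOVA_RANGE_CACHE_MAX_SIZE never fit an rcache, so
 * iova_rcache_insert() returns false and every such free falls through to
 * the rbtree, taking iova_rbtree_lock each time.
 */
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
{
	if (iova_rcache_insert(iovad, pfn, size))
		return;			/* cached: no rbtree involved */

	free_iova(iovad, pfn);		/* rbtree slow path */
}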

Thanks,
Robin.

> On 6.7-rc4, which already includes 911aa1245da8 ("iommu/iova: Make the
> rcache depot scale better"), we can also get the following call trace.
> The issue still seems to exist, and we fixed it using the patch in [3]
> when the call trace first came up.
> 
> rcu: INFO: rcu_preempt self-detected stall on CPU
> rcu:     85-....: (5249 ticks this GP) idle=4d04/1/0x4000000000000000 
> softirq=428240/428240 fqs=1882
> rcu:     (t=5253 jiffies g=2005741 q=19446136 ncpus=128)
> CPU: 85 PID: 2401130 Comm: kpktgend_85 Kdump: loaded Tainted: G  O       
> 6.7.0-rc4+ #1
> 
> pc : _raw_spin_unlock_irqrestore+0x14/0x78
> lr : fq_flush_timeout+0x94/0x118
> sp : ffff8000802abdf0
> x29: ffff8000802abdf0 x28: ffff20201142e480 x27: 0000000000000002
> x26: ffffdeeb67e82b20 x25: 0000000000000000 x24: ffffdeeb67e87400
> x23: ffffdeeb67e82ba0 x22: ffff202009db1c00 x21: ffffdeeb67e82f40
> x20: 0000000000000000 x19: ffffdbe06f0140a8 x18: 0000000000040000
> x17: 0000000000000001 x16: 0000000000000040 x15: ffffffffffffffff
> x14: 0000000000001fff x13: 000000000003ffff x12: 0000000000000259
> x11: 000000000007ffff x10: 000000000000000d x9 : ffffdeeb6549ec84
> x8 : ffff0020886e0000 x7 : 000000000000964d x6 : 000001afa2986acb
> x5 : 01ffffffffffffff x4 : 0000000000000000 x3 : ffff202006b57d70
> x2 : 0000000000000015 x1 : 0000000000000000 x0 : ffffdbe06f0140a8
> Call trace:
>   _raw_spin_unlock_irqrestore+0x14/0x78
>   call_timer_fn+0x3c/0x1d0
>   expire_timers+0xcc/0x190
>   run_timer_softirq+0xfc/0x268
>   __do_softirq+0x128/0x3dc
>   ____do_softirq+0x18/0x30
>   call_on_irq_stack+0x24/0x30
>   do_softirq_own_stack+0x24/0x38
>   irq_exit_rcu+0xc0/0xe8
>   el1_interrupt+0x48/0xc0
>   el1h_64_irq_handler+0x18/0x28
>   el1h_64_irq+0x78/0x80
>   __schedule+0xf28/0x12a0
>   schedule+0x3c/0x108
>   schedule_timeout+0xa0/0x1d0
>   pktgen_thread_worker+0x1180/0x15d0
>   kthread+0x120/0x130
>   ret_from_fork+0x10/0x20
> 
> [1] https://gitee.com/openeuler/kernel/issues/I8KS9A
> [2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/
> [3] https://lkml.org/lkml/2023/2/16/124
> 
> Thanks,
> Zekun
Re: [PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode
Posted by zhangzekun (A) 1 year, 5 months ago

On 2024/6/26 2:03, Robin Murphy wrote:
> On 2024-06-25 2:29 am, zhangzekun (A) wrote:
>>
>>
>> On 2024/6/24 21:32, Robin Murphy wrote:
>>
>>>> This patch was originally intended to minimize the chance of the
>>>> softlockup issue in fq_flush_timeout(), which was described earlier
>>>> in [1] and has been applied in a commercial kernel [2] for years.
>>>>
>>>> However, later tests show that this single patch is not enough to
>>>> fix the softlockup issue, since the root cause of the softlockup is
>>>> the underlying iova_rbtree_lock. In our softlockup scenarios, the
>>>> average time to acquire this spinlock is about 6ms.
>>>
>>> That should already be fixed, though. The only reason for 
>>> fq_flush_timeout() to interact with the rbtree at all was due to the 
>>> notion of a fixed-size depot which could become full. That no longer 
>>> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale 
>>> better").
>>>
>>> Thanks,
>>> Robin.
>>>
>> Hi, Robin,
>>
>> The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
>> better") can reduce the risk of the softlockup, but cannot fix it
>> entirely. We did solve a softlockup issue [1] with that patch, and that
>> is why it has already been backported to our branch. The softlockup
>> issue we met recently occurred on a 5.10-based kernel with that patch
>> already backported, which can be found in [2].
> 
> Sorry, I was implying some context that I should have made clear - yes, 
> the softlockup can still happen in general if the flush queues are full 
> of IOVAs which are too large for the rcache mechanism at all, so are 
> always freed directly to the rbtree, but then there's no way *this* 
> patch could make any difference to that case either.
> 
> Thanks,
> Robin.
> 

Yes, this patch can't fix the softlockup issue in that case. For such a
case, it would be better to move the IOVA freeing logic in
fq_flush_timeout() into a kthread and add a cond_resched() there.
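
(A rough illustration of that idea only - fq_flush_work and
fq_flush_work_fn() are hypothetical names, the single-queue case is
omitted, and this is a sketch rather than a tested patch:)

/* Defer the freeing out of the timer (softirq) context into process
 * context, where cond_resched() is allowed, so a long drain no longer
 * trips the softlockup/RCU-stall detectors.
 */
static void fq_flush_work_fn(struct work_struct *work)
{
	struct iommu_dma_cookie *cookie =
		container_of(work, struct iommu_dma_cookie, fq_flush_work);
	int cpu;

	fq_flush_iotlb(cookie);
	for_each_possible_cpu(cpu) {
		fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
		cond_resched();
	}
}

static void fq_flush_timeout(struct timer_list *t)
{
	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);

	atomic_set(&cookie->fq_timer_on, 0);
	schedule_work(&cookie->fq_flush_work);	/* hypothetical new member */
}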

Thanks,
Zekun