Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
always tries to free IOVAs on a single CPU. Freeing the IOVAs from all
CPUs on one CPU is not cache-friendly to the iova_rcache: it first
fills up that CPU's cpu_rcache and then pushes IOVAs to the depot, and
IOVAs in the depot eventually go to the underlying rbtree once
depot_size exceeds num_online_cpus(). Letting fq_flush_timeout() free
IOVAs on the CPUs that called the dma_unmap* APIs decreases the overall
time spent in fq_flush_timeout() by better utilizing the iova_rcache
and minimizing contention on the iova_rbtree_lock.
Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
---
drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
drivers/iommu/iova.c | 21 ++++++++++++++-------
include/linux/iova.h | 7 +++++++
3 files changed, 34 insertions(+), 16 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 43520e7275cc..217b7c70d06c 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
return idx;
}
-static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+ int cpu)
{
u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
unsigned int idx;
@@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
break;
iommu_put_pages_list(&fq->entries[idx].freelist);
- free_iova_fast(&cookie->iovad,
+ free_iova_fast_cpu(&cookie->iovad,
fq->entries[idx].iova_pfn,
- fq->entries[idx].pages);
+ fq->entries[idx].pages, cpu);
fq->head = (fq->head + 1) & fq->mod_mask;
}
}
-static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+ int cpu)
{
unsigned long flags;
spin_lock_irqsave(&fq->lock, flags);
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);
spin_unlock_irqrestore(&fq->lock, flags);
}
@@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
fq_flush_iotlb(cookie);
if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
- fq_ring_free(cookie, cookie->single_fq);
+ cpu = smp_processor_id();
+ fq_ring_free(cookie, cookie->single_fq, cpu);
} else {
for_each_possible_cpu(cpu)
- fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
+ fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
}
}
@@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
struct iova_fq *fq;
unsigned long flags;
unsigned int idx;
+ int cpu = smp_processor_id();
/*
* Order against the IOMMU driver's pagetable update from unmapping
@@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
* flushed out on another CPU. This makes the fq_full() check below less
* likely to be true.
*/
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);
if (fq_full(fq)) {
fq_flush_iotlb(cookie);
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);
}
idx = fq_ring_add(fq);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d59d0ea2fd21..46a2188c263b 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -20,7 +20,7 @@
static bool iova_rcache_insert(struct iova_domain *iovad,
unsigned long pfn,
- unsigned long size);
+ unsigned long size, int cpu);
static unsigned long iova_rcache_get(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn);
@@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
void
free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
{
- if (iova_rcache_insert(iovad, pfn, size))
+ free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
+void
+free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu)
+{
+ if (iova_rcache_insert(iovad, pfn, size, cpu))
return;
free_iova(iovad, pfn);
}
-EXPORT_SYMBOL_GPL(free_iova_fast);
static void iova_domain_free_rcaches(struct iova_domain *iovad)
{
@@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
*/
static bool __iova_rcache_insert(struct iova_domain *iovad,
struct iova_rcache *rcache,
- unsigned long iova_pfn)
+ unsigned long iova_pfn, int cpu)
{
struct iova_cpu_rcache *cpu_rcache;
bool can_insert = false;
unsigned long flags;
- cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+ cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
spin_lock_irqsave(&cpu_rcache->lock, flags);
if (!iova_magazine_full(cpu_rcache->loaded)) {
@@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
}
static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
- unsigned long size)
+ unsigned long size, int cpu)
{
unsigned int log_size = order_base_2(size);
if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
return false;
- return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+ return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
}
/*
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d2c4fd923efa..91e4e3d5bcb3 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
bool size_aligned);
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
+void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu);
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn, bool flush_rcache);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
{
}
+static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu)
+{
+}
+
static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
--
2.17.1
On 2024-06-24 9:39 am, Zhang Zekun wrote:
> Currently, when iommu working in no-strict mode, fq_flush_timeout()
> will always try to free iovas on one cpu. Freeing the iovas from all
> cpus on one cpu is not cache-friendly to iova_rcache, because it will
> first filling up the cpu_rcache and then pushing iovas to the depot,
> iovas in the depot will finally goto the underlying rbtree if the
> depot_size is greater than num_online_cpus().
That is the design intent - if the excess magazines sit in the depot
long enough to be reclaimed then other CPUs didn't want them either.
We're trying to minimise the amount of unused cached IOVAs sitting
around wasting memory, since IOVA memory consumption has proven to be
quite significant on large systems.
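For reference, a simplified sketch of the depot reclaim behaviour
described here (helper and field names approximate the iova.c
internals rather than quoting them verbatim): a delayed work
periodically trims magazines beyond what the online CPUs could reuse
and returns those IOVAs to the rbtree.

/*
 * Simplified sketch, not verbatim iova.c: a delayed work trims one
 * excess depot magazine at a time, frees its IOVAs back to the rbtree,
 * and reschedules itself while excess remains.
 */
static void iova_depot_work_func(struct work_struct *work)
{
        struct iova_rcache *rcache = container_of(work, struct iova_rcache,
                                                  work.work);
        struct iova_magazine *mag = NULL;
        unsigned long flags;

        spin_lock_irqsave(&rcache->lock, flags);
        /* Only magazines beyond one-per-online-CPU count as "excess" */
        if (rcache->depot_size > num_online_cpus())
                mag = iova_depot_pop(rcache);
        spin_unlock_irqrestore(&rcache->lock, flags);

        if (mag) {
                iova_magazine_free_pfns(mag, rcache->iovad); /* back to rbtree */
                iova_magazine_free(mag);
                schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
        }
}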
As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY
was just my arbitrary value of "long enough" to keep the initial
implementation straightforward - I do expect that certain workloads
might benefit from tuning it differently, but without proof of what they
are and what they want, there's little justification for introducing
extra complexity and potential user ABI yet.
> Let fq_flush_timeout()
> freeing iovas on cpus who call dma_unmap* APIs, can decrease the overall
> time caused by fq_flush_timeout() by better utilizing the iova_rcache,
> and minimizing the competition for the iova_rbtree_lock().
I would have assumed that a single CPU simply throwing magazines into
the depot list from its own percpu cache is quicker, or at least no
slower, than doing the same while causing additional contention/sharing
by interfering with other percpu caches as well. And where does the
rbtree come into that either way? If an observable performance issue
actually exists here, I'd like a more detailed breakdown to understand it.
Thanks,
Robin.
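For reference, the free-fast path being discussed, condensed from the
hunks above (pre-patch signature); it shows where the rbtree enters the
picture: only when the rcache declines the IOVA.

/*
 * Condensed from the hunks above: the rbtree (free_iova(), taken under
 * iova_rbtree_lock) is only reached when iova_rcache_insert() declines
 * the IOVA, i.e. it is of order >= IOVA_RANGE_CACHE_MAX_SIZE or no
 * magazine space could be found for it.
 */
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
                    unsigned long size)
{
        if (iova_rcache_insert(iovad, pfn, size))
                return;                 /* stays in per-CPU magazines / depot */

        free_iova(iovad, pfn);          /* slow path: rbtree */
}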
> Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
> ---
> drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
> drivers/iommu/iova.c | 21 ++++++++++++++-------
> include/linux/iova.h | 7 +++++++
> 3 files changed, 34 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 43520e7275cc..217b7c70d06c 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
> return idx;
> }
>
> -static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> + int cpu)
> {
> u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
> unsigned int idx;
> @@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
> break;
>
> iommu_put_pages_list(&fq->entries[idx].freelist);
> - free_iova_fast(&cookie->iovad,
> + free_iova_fast_cpu(&cookie->iovad,
> fq->entries[idx].iova_pfn,
> - fq->entries[idx].pages);
> + fq->entries[idx].pages, cpu);
>
> fq->head = (fq->head + 1) & fq->mod_mask;
> }
> }
>
> -static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> + int cpu)
> {
> unsigned long flags;
>
> spin_lock_irqsave(&fq->lock, flags);
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
> spin_unlock_irqrestore(&fq->lock, flags);
> }
>
> @@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
> fq_flush_iotlb(cookie);
>
> if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
> - fq_ring_free(cookie, cookie->single_fq);
> + cpu = smp_processor_id();
> + fq_ring_free(cookie, cookie->single_fq, cpu);
> } else {
> for_each_possible_cpu(cpu)
> - fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
> + fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
> }
> }
>
> @@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> struct iova_fq *fq;
> unsigned long flags;
> unsigned int idx;
> + int cpu = smp_processor_id();
>
> /*
> * Order against the IOMMU driver's pagetable update from unmapping
> @@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> * flushed out on another CPU. This makes the fq_full() check below less
> * likely to be true.
> */
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
>
> if (fq_full(fq)) {
> fq_flush_iotlb(cookie);
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
> }
>
> idx = fq_ring_add(fq);
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index d59d0ea2fd21..46a2188c263b 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -20,7 +20,7 @@
>
> static bool iova_rcache_insert(struct iova_domain *iovad,
> unsigned long pfn,
> - unsigned long size);
> + unsigned long size, int cpu);
> static unsigned long iova_rcache_get(struct iova_domain *iovad,
> unsigned long size,
> unsigned long limit_pfn);
> @@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
> void
> free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
> {
> - if (iova_rcache_insert(iovad, pfn, size))
> + free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
> +}
> +EXPORT_SYMBOL_GPL(free_iova_fast);
> +
> +void
> +free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu)
> +{
> + if (iova_rcache_insert(iovad, pfn, size, cpu))
> return;
>
> free_iova(iovad, pfn);
> }
> -EXPORT_SYMBOL_GPL(free_iova_fast);
>
> static void iova_domain_free_rcaches(struct iova_domain *iovad)
> {
> @@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
> */
> static bool __iova_rcache_insert(struct iova_domain *iovad,
> struct iova_rcache *rcache,
> - unsigned long iova_pfn)
> + unsigned long iova_pfn, int cpu)
> {
> struct iova_cpu_rcache *cpu_rcache;
> bool can_insert = false;
> unsigned long flags;
>
> - cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
> + cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
> spin_lock_irqsave(&cpu_rcache->lock, flags);
>
> if (!iova_magazine_full(cpu_rcache->loaded)) {
> @@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
> }
>
> static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
> - unsigned long size)
> + unsigned long size, int cpu)
> {
> unsigned int log_size = order_base_2(size);
>
> if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
> return false;
>
> - return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
> + return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
> }
>
> /*
> diff --git a/include/linux/iova.h b/include/linux/iova.h
> index d2c4fd923efa..91e4e3d5bcb3 100644
> --- a/include/linux/iova.h
> +++ b/include/linux/iova.h
> @@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
> bool size_aligned);
> void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
> unsigned long size);
> +void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu);
> unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
> unsigned long limit_pfn, bool flush_rcache);
> struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
> @@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
> {
> }
>
> +static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu);
> +{
> +}
> +
> static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
> unsigned long size,
> unsigned long limit_pfn,
On 2024/6/24 18:56, Robin Murphy wrote:
> On 2024-06-24 9:39 am, Zhang Zekun wrote:
>> Currently, when iommu working in no-strict mode, fq_flush_timeout()
>> will always try to free iovas on one cpu. Freeing the iovas from all
>> cpus on one cpu is not cache-friendly to iova_rcache, because it will
>> first filling up the cpu_rcache and then pushing iovas to the depot,
>> iovas in the depot will finally goto the underlying rbtree if the
>> depot_size is greater than num_online_cpus().
>
> That is the design intent - if the excess magazines sit in the depot
> long enough to be reclaimed then other CPUs didn't want them either.
> We're trying to minimise the amount of unused cached IOVAs sitting
> around wasting memory, since IOVA memory consumption has proven to be
> quite significant on large systems.

Hi, Robin,

It does waste some memory after this change, but since we have been
freeing the IOVAs on each CPU in strict mode for years, I think this
change seems reasonable, as it makes the IOVA free logic identical to
strict mode. This patch tries to make the speed of allocating and
freeing IOVAs roughly the same by better utilizing the iova_rcache;
otherwise we are more likely to enter the slow path. The saving in
memory consumption actually comes at the cost of performance, and I am
not sure we need such an optimization for non-strict mode, which is
mainly used for performance reasons.

> As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY
> was just my arbitrary value of "long enough" to keep the initial
> implementation straightforward - I do expect that certain workloads
> might benefit from tuning it differently, but without proof of what
> they are and what they want, there's little justification for
> introducing extra complexity and potential user ABI yet.
>
>> Let fq_flush_timeout()
>> freeing iovas on cpus who call dma_unmap* APIs, can decrease the overall
>> time caused by fq_flush_timeout() by better utilizing the iova_rcache,
>> and minimizing the competition for the iova_rbtree_lock().
>
> I would have assumed that a single CPU simply throwing magazines into
> the depot list from its own percpu cache is quicker, or at least no
> slower, than doing the same while causing additional contention/sharing
> by interfering with other percpu caches as well. And where does the
> rbtree come into that either way? If an observable performance issue
> actually exists here, I'd like a more detailed breakdown to understand it.
>
> Thanks,
> Robin.

This patch was firstly intended to minimize the chance of the softlockup
issue in fq_flush_timeout(), which was already described earlier in [1]
and has been applied in a commercial kernel [2] for years.

However, later tests show that this single patch is not enough to fix
the softlockup issue, since the root cause of the softlockup is the
underlying iova_rbtree_lock. In our softlockup scenarios, the average
time to acquire this spinlock is about 6ms.

[1] https://lkml.org/lkml/2023/2/16/124
[2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/

Thanks,
Zekun
On 2024-06-24 2:13 pm, zhangzekun (A) wrote:
>
> On 2024/6/24 18:56, Robin Murphy wrote:
>> On 2024-06-24 9:39 am, Zhang Zekun wrote:
>>> Currently, when iommu working in no-strict mode, fq_flush_timeout()
>>> will always try to free iovas on one cpu. Freeing the iovas from all
>>> cpus on one cpu is not cache-friendly to iova_rcache, because it will
>>> first filling up the cpu_rcache and then pushing iovas to the depot,
>>> iovas in the depot will finally goto the underlying rbtree if the
>>> depot_size is greater than num_online_cpus().
>>
>> That is the design intent - if the excess magazines sit in the depot
>> long enough to be reclaimed then other CPUs didn't want them either.
>> We're trying to minimise the amount of unused cached IOVAs sitting
>> around wasting memory, since IOVA memory consumption has proven to be
>> quite significant on large systems.
>
> Hi, Robin,
>
> It does waste some memory after this change, but since we have been
> freeing iova on each cpu in strict-mode for years, I think this change
> seems reasonable to make the iova free logic identical to strict-mode.
> This patch try to make the speed of allcating and freeing iova roughly
> same by better utilizing the iova_rcache, or we will more likely enter
> the slowpath. The save of memory consumption is actually at the cost of
> performance, I am not sure if we need such a optimization for no-strict
> mode which is mainly used for performance consideration.
>
>>
>> As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY
>> was just my arbitrary value of "long enough" to keep the initial
>> implementation straightforward - I do expect that certain workloads
>> might benefit from tuning it differently, but without proof of what
>> they are and what they want, there's little justification for
>> introducing extra complexity and potential user ABI yet.
>>
>>> Let fq_flush_timeout()
>>> freeing iovas on cpus who call dma_unmap* APIs, can decrease the overall
>>> time caused by fq_flush_timeout() by better utilizing the iova_rcache,
>>> and minimizing the competition for the iova_rbtree_lock().
>>
>> I would have assumed that a single CPU simply throwing magazines into
>> the depot list from its own percpu cache is quicker, or at least no
>> slower, than doing the same while causing additional
>> contention/sharing by interfering with other percpu caches as well.
>> And where does the rbtree come into that either way? If an observable
>> performance issue actually exists here, I'd like a more detailed
>> breakdown to understand it.
>>
>> Thanks,
>> Robin.
>>
>
> This patch is firstly intent to minimize the chance of softlock issue in
> fq_flush_timeout(), which is already dicribed erarlier in [1], which has
> beed applied in a commercial kernel[2] for years.
>
> However, the later tests show that this single patch is not enough to
> fix the softlockup issue, since the root cause of softlockup is the
> underlying iova_rbtree_lock. In our softlockup scenarios, the average
> time cost to get this spinlock is about 6ms.
That should already be fixed, though. The only reason for
fq_flush_timeout() to interact with the rbtree at all was due to the
notion of a fixed-size depot which could become full. That no longer
exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale
better").
Thanks,
Robin.
On 2024/6/24 21:32, Robin Murphy wrote:
>> This patch is firstly intent to minimize the chance of softlock issue
>> in fq_flush_timeout(), which is already dicribed erarlier in [1],
>> which has beed applied in a commercial kernel[2] for years.
>>
>> However, the later tests show that this single patch is not enough to
>> fix the softlockup issue, since the root cause of softlockup is the
>> underlying iova_rbtree_lock. In our softlockup scenarios, the average
>> time cost to get this spinlock is about 6ms.
>
> That should already be fixed, though. The only reason for
> fq_flush_timeout() to interact with the rbtree at all was due to the
> notion of a fixed-size depot which could become full. That no longer
> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale
> better").
>
> Thanks,
> Robin.
>
Hi, Robin,
The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
better") can reduce the risk of the softlockup, but cannot fix it
entirely. We did solve a softlockup issue [1] with that patch, which is
why it has already been backported to our branch. The softlockup issue
we met recently is on a 5.10-based kernel with that patch already
backported, which can be found in [2].
On 6.7-rc4, which already has 911aa1245da8 ("iommu/iova: Make the
rcache depot scale better") applied, we can also get the following call
trace. The issue seems to still exist, and we fixed it using the patch
in [3] when the call trace first came up.
rcu: INFO: rcu_preempt self-detected stall on CPU
rcu: 85-....: (5249 ticks this GP) idle=4d04/1/0x4000000000000000
softirq=428240/428240 fqs=1882
rcu: (t=5253 jiffies g=2005741 q=19446136 ncpus=128)
CPU: 85 PID: 2401130 Comm: kpktgend_85 Kdump: loaded Tainted: G
O 6.7.0-rc4+ #1
pc : _raw_spin_unlock_irqrestore+0x14/0x78
lr : fq_flush_timeout+0x94/0x118
sp : ffff8000802abdf0
x29: ffff8000802abdf0 x28: ffff20201142e480 x27: 0000000000000002
x26: ffffdeeb67e82b20 x25: 0000000000000000 x24: ffffdeeb67e87400
x23: ffffdeeb67e82ba0 x22: ffff202009db1c00 x21: ffffdeeb67e82f40
x20: 0000000000000000 x19: ffffdbe06f0140a8 x18: 0000000000040000
x17: 0000000000000001 x16: 0000000000000040 x15: ffffffffffffffff
x14: 0000000000001fff x13: 000000000003ffff x12: 0000000000000259
x11: 000000000007ffff x10: 000000000000000d x9 : ffffdeeb6549ec84
x8 : ffff0020886e0000 x7 : 000000000000964d x6 : 000001afa2986acb
x5 : 01ffffffffffffff x4 : 0000000000000000 x3 : ffff202006b57d70
x2 : 0000000000000015 x1 : 0000000000000000 x0 : ffffdbe06f0140a8
Call trace:
_raw_spin_unlock_irqrestore+0x14/0x78
call_timer_fn+0x3c/0x1d0
expire_timers+0xcc/0x190
run_timer_softirq+0xfc/0x268
__do_softirq+0x128/0x3dc
____do_softirq+0x18/0x30
call_on_irq_stack+0x24/0x30
do_softirq_own_stack+0x24/0x38
irq_exit_rcu+0xc0/0xe8
el1_interrupt+0x48/0xc0
el1h_64_irq_handler+0x18/0x28
el1h_64_irq+0x78/0x80
__schedule+0xf28/0x12a0
schedule+0x3c/0x108
schedule_timeout+0xa0/0x1d0
pktgen_thread_worker+0x1180/0x15d0
kthread+0x120/0x130
ret_from_fork+0x10/0x20
[1] https://gitee.com/openeuler/kernel/issues/I8KS9A
[2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/
[3] https://lkml.org/lkml/2023/2/16/124
Thanks,
Zekun
On 2024-06-25 2:29 am, zhangzekun (A) wrote:
>
>
> On 2024/6/24 21:32, Robin Murphy wrote:
>
>>> This patch is firstly intent to minimize the chance of softlock issue
>>> in fq_flush_timeout(), which is already dicribed erarlier in [1],
>>> which has beed applied in a commercial kernel[2] for years.
>>>
>>> However, the later tests show that this single patch is not enough to
>>> fix the softlockup issue, since the root cause of softlockup is the
>>> underlying iova_rbtree_lock. In our softlockup scenarios, the average
>>> time cost to get this spinlock is about 6ms.
>>
>> That should already be fixed, though. The only reason for
>> fq_flush_timeout() to interact with the rbtree at all was due to the
>> notion of a fixed-size depot which could become full. That no longer
>> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale
>> better").
>>
>> Thanks,
>> Robin.
>>
> Hi, Robin,
>
> The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
> better") can reduce the risks of softlockup, but can not fix it
> entirely. We do solve a softlockup issue[1] with that patch, and that is
> why it has aleady been backported in our branch. The softlockup issue
> which we met recently is a 5.10-based kernel with that patch already
> backported, which can be found in [2].
Sorry, I was implying some context that I should have made clear - yes,
the softlockup can still happen in general if the flush queues are full
of IOVAs which are too large for the rcache mechanism at all, so are
always freed directly to the rbtree, but then there's no way *this*
patch could make any difference to that case either.
Thanks,
Robin.
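For reference, the case described here, condensed from the
iova_rcache_insert() hunk in the patch above (pre-patch signature):
oversized IOVAs bypass the rcache entirely and are always freed
straight to the rbtree.

/*
 * Condensed from the iova_rcache_insert() hunk above: IOVAs of order
 * >= IOVA_RANGE_CACHE_MAX_SIZE never enter the rcache, so a flush queue
 * full of such entries makes every free from fq_flush_timeout() take
 * free_iova() and the rbtree lock directly.
 */
static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
                               unsigned long size)
{
        unsigned int log_size = order_base_2(size);

        if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
                return false;           /* caller falls back to free_iova() */

        return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
}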
> In 6.7-rc4, which has already applied patch 911aa1245da8 ("iommu/iova:
> Make the rcache depot scale better"). We can also get the following
> call trace. It seems still existing the issue, and we fix this issue
> using the patch in [3], when the call trace first came up.
>
> rcu: INFO: rcu_preempt self-detected stall on CPU
> rcu: 85-....: (5249 ticks this GP) idle=4d04/1/0x4000000000000000
> softirq=428240/428240 fqs=1882
> rcu: (t=5253 jiffies g=2005741 q=19446136 ncpus=128)
> CPU: 85 PID: 2401130 Comm: kpktgend_85 Kdump: loaded Tainted: G O
> 6.7.0-rc4+ #1
>
> pc : _raw_spin_unlock_irqrestore+0x14/0x78
> lr : fq_flush_timeout+0x94/0x118
> sp : ffff8000802abdf0
> x29: ffff8000802abdf0 x28: ffff20201142e480 x27: 0000000000000002
> x26: ffffdeeb67e82b20 x25: 0000000000000000 x24: ffffdeeb67e87400
> x23: ffffdeeb67e82ba0 x22: ffff202009db1c00 x21: ffffdeeb67e82f40
> x20: 0000000000000000 x19: ffffdbe06f0140a8 x18: 0000000000040000
> x17: 0000000000000001 x16: 0000000000000040 x15: ffffffffffffffff
> x14: 0000000000001fff x13: 000000000003ffff x12: 0000000000000259
> x11: 000000000007ffff x10: 000000000000000d x9 : ffffdeeb6549ec84
> x8 : ffff0020886e0000 x7 : 000000000000964d x6 : 000001afa2986acb
> x5 : 01ffffffffffffff x4 : 0000000000000000 x3 : ffff202006b57d70
> x2 : 0000000000000015 x1 : 0000000000000000 x0 : ffffdbe06f0140a8
> Call trace:
> _raw_spin_unlock_irqrestore+0x14/0x78
> call_timer_fn+0x3c/0x1d0
> expire_timers+0xcc/0x190
> run_timer_softirq+0xfc/0x268
> __do_softirq+0x128/0x3dc
> ____do_softirq+0x18/0x30
> call_on_irq_stack+0x24/0x30
> do_softirq_own_stack+0x24/0x38
> irq_exit_rcu+0xc0/0xe8
> el1_interrupt+0x48/0xc0
> el1h_64_irq_handler+0x18/0x28
> el1h_64_irq+0x78/0x80
> __schedule+0xf28/0x12a0
> schedule+0x3c/0x108
> schedule_timeout+0xa0/0x1d0
> pktgen_thread_worker+0x1180/0x15d0
> kthread+0x120/0x130
> ret_from_fork+0x10/0x20
>
> [1] https://gitee.com/openeuler/kernel/issues/I8KS9A
> [2] https://gitee.com/openeuler/kernel/tree/OLK-5.10/
> [3] https://lkml.org/lkml/2023/2/16/124
>
> Thanks,
> Zekun
On 2024/6/26 2:03, Robin Murphy wrote:
> On 2024-06-25 2:29 am, zhangzekun (A) wrote:
>>
>>
>> On 2024/6/24 21:32, Robin Murphy wrote:
>>
>>>> This patch is firstly intent to minimize the chance of softlock
>>>> issue in fq_flush_timeout(), which is already dicribed erarlier in
>>>> [1], which has beed applied in a commercial kernel[2] for years.
>>>>
>>>> However, the later tests show that this single patch is not enough
>>>> to fix the softlockup issue, since the root cause of softlockup is
>>>> the underlying iova_rbtree_lock. In our softlockup scenarios, the
>>>> average
>>>> time cost to get this spinlock is about 6ms.
>>>
>>> That should already be fixed, though. The only reason for
>>> fq_flush_timeout() to interact with the rbtree at all was due to the
>>> notion of a fixed-size depot which could become full. That no longer
>>> exists since 911aa1245da8 ("iommu/iova: Make the rcache depot scale
>>> better").
>>>
>>> Thanks,
>>> Robin.
>>>
>> Hi, Robin,
>>
>> The commit 911aa1245da8 ("iommu/iova: Make the rcache depot scale
>> better") can reduce the risks of softlockup, but can not fix it
>> entirely. We do solve a softlockup issue[1] with that patch, and that is
>> why it has aleady been backported in our branch. The softlockup issue
>> which we met recently is a 5.10-based kernel with that patch already
>> backported, which can be found in [2].
>
> Sorry, I was implying some context that I should have made clear - yes,
> the softlockup can still happen in general if the flush queues are full
> of IOVAs which are too large for the rcache mechanism at all, so are
> always freed directly to the rbtree, but then there's no way *this*
> patch could make any difference to that case either.
>
> Thanks,
> Robin.
>
Yes, this patch can't fix the softlockup issue in that case. There, it
would be better to move the IOVA-freeing logic in fq_flush_timeout()
into a kthread and add a cond_resched() in it.
Thanks,
Zekun
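To make the suggestion above concrete, here is a rough, untested sketch
of that direction. fq_flush_work is a hypothetical new member of struct
iommu_dma_cookie (it would need INIT_WORK() during cookie setup); the
point is simply that a worker runs in process context, where it may
call cond_resched() between per-CPU rings, which a timer callback
cannot.

/*
 * Hypothetical, untested sketch (percpu-queue case only, for brevity):
 * the timer merely kicks a worker, and the worker drains the rings in
 * process context where it is allowed to reschedule.
 */
static void fq_flush_work_func(struct work_struct *work)
{
        struct iommu_dma_cookie *cookie =
                container_of(work, struct iommu_dma_cookie, fq_flush_work);
        int cpu;

        fq_flush_iotlb(cookie);

        for_each_possible_cpu(cpu) {
                fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
                cond_resched();         /* avoid hogging the CPU / softlockup */
        }
}

static void fq_flush_timeout(struct timer_list *t)
{
        struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);

        atomic_set(&cookie->fq_timer_on, 0);
        schedule_work(&cookie->fq_flush_work);  /* hypothetical new member */
}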