drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++
drivers/infiniband/sw/rxe/rxe_req.c | 3 +++
2 files changed, 6 insertions(+)
I encountered the following warning:
WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0
...
libsha1 [last unloaded: ip6_udp_tunnel]
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT
Tainted: [C]=CRAP
Hardware name: Raspberry Pi 4 Model B Rev 1.2
Call trace:
rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P)
retransmit_timer+0x130/0x188 [rdma_rxe]
call_timer_fn+0x68/0x4d0
__run_timers+0x630/0x888
...
WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0
...
WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400
...
refcount_t: underflow; use-after-free.
WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400
The issue is caused by a race condition between retransmit_timer() and
rxe_destroy_qp(), leading to the Queue Pair's (QP) reference count dropping
to zero during timer handler execution.
It seems this warning is harmless because rxe_qp_do_cleanup() will flush
all pending timers and requests.
Example of flow causing the issue:
CPU0                                 CPU1
retransmit_timer() {
    spin_lock_irqsave
                                     rxe_destroy_qp()
                                      __rxe_cleanup()
                                        __rxe_put() // qp->ref_count decreases to 0
                                      rxe_qp_do_cleanup() {
    if (qp->valid) {
        rxe_sched_task() {
            WARN_ON(rxe_read(task->qp) <= 0);
        }
    }
    spin_unlock_irqrestore
}
                                        spin_lock_irqsave
                                        qp->valid = 0
                                        spin_unlock_irqrestore
                                      }
Ensure the QP's reference count is maintained and its validity is checked
within the timer callbacks by adding calls to rxe_get(qp) and corresponding
rxe_put(qp) after use.
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++
drivers/infiniband/sw/rxe/rxe_req.c | 3 +++
2 files changed, 6 insertions(+)
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index a5b2b62f596b..1390e861bd1d 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -119,12 +119,15 @@ void retransmit_timer(struct timer_list *t)
rxe_dbg_qp(qp, "retransmit timer fired\n");
+ if (!rxe_get(qp))
+ return;
spin_lock_irqsave(&qp->state_lock, flags);
if (qp->valid) {
qp->comp.timeout = 1;
rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
+ rxe_put(qp);
}
void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 373b03f223be..12d03f390b09 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -102,6 +102,8 @@ void rnr_nak_timer(struct timer_list *t)
rxe_dbg_qp(qp, "nak timer fired\n");
+ if (!rxe_get(qp))
+ return;
spin_lock_irqsave(&qp->state_lock, flags);
if (qp->valid) {
/* request a send queue retry */
@@ -110,6 +112,7 @@ void rnr_nak_timer(struct timer_list *t)
rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
+ rxe_put(qp);
}
static void req_check_sq_drain_done(struct rxe_qp *qp)
--
2.41.0
On Tue, 20 Jan 2026 15:44:37 +0800, Li Zhijian wrote:
> I encontered the following warning:
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> libsha1 [last unloaded: ip6_udp_tunnel]
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT
> Tainted: [C]=CRAP
> Hardware name: Raspberry Pi 4 Model B Rev 1.2
> Call trace:
> rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P)
> retransmit_timer+0x130/0x188 [rdma_rxe]
> call_timer_fn+0x68/0x4d0
> __run_timers+0x630/0x888
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400
> ...
> refcount_t: underflow; use-after-free.
> WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400
>
> [...]
Applied, thanks!
[1/1] RDMA/rxe: Fix race condition in QP timer handlers
https://git.kernel.org/rdma/rdma/c/87bf646921430e
Best regards,
--
Leon Romanovsky <leon@kernel.org>
On Tue, Jan 20, 2026 at 03:44:37PM +0800, Li Zhijian wrote:
> I encontered the following warning:
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> libsha1 [last unloaded: ip6_udp_tunnel]
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT
> Tainted: [C]=CRAP
> Hardware name: Raspberry Pi 4 Model B Rev 1.2
> Call trace:
> rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P)
> retransmit_timer+0x130/0x188 [rdma_rxe]
> call_timer_fn+0x68/0x4d0
> __run_timers+0x630/0x888
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400
> ...
> refcount_t: underflow; use-after-free.
> WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400
>
> The issue is caused by a race condition between retransmit_timer() and
> rxe_destroy_qp, leading to the Queue Pair's (QP) reference count dropping
> to zero during timer handler execution.
>
> It seems this warning is harmless because rxe_qp_do_cleanup() will flush
> all pending timers and requests.
>
> Example of flow causing the issue:
>
> CPU0 CPU1
> retransmit_timer() {
> spin_lock_irqsave
> rxe_destroy_qp()
> __rxe_cleanup()
> __rxe_put() // qp->ref_count decrease to 0
> rxe_qp_do_cleanup() {
> if (qp->valid) {
> rxe_sched_task() {
> WARN_ON(rxe_read(task->qp) <= 0);
> }
> }
> spin_unlock_irqrestore
> }
> spin_lock_irqsave
> qp->valid = 0
> spin_unlock_irqrestore
> }
>
> Ensure the QP's reference count is maintained and its validity is checked
> within the timer callbacks by adding calls to rxe_get(qp) and corresponding
> rxe_put(qp) after use.
>
> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Fixes line?
Thanks
> ---
> drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++
> drivers/infiniband/sw/rxe/rxe_req.c | 3 +++
> 2 files changed, 6 insertions(+)
On 25/01/2026 22:08, Leon Romanovsky wrote:
>> Ensure the QP's reference count is maintained and its validity is checked
>> within the timer callbacks by adding calls to rxe_get(qp) and corresponding
>> rxe_put(qp) after use.
>>
>> Signed-off-by: Li Zhijian<lizhijian@fujitsu.com>
> Fixes line?
I believe the following `Fixes` tag is appropriate, as this commit introduced
the WARN_ON that now triggers:
Fixes: d94671632572 ("RDMA/rxe: Rewrite rxe_task.c")
However, I'm not entirely certain if this race condition also existed
before this commit, as it involved a significant rewrite.
Thanks
Zhijian
>
> Thanks
On Tue, Jan 27, 2026 at 09:27:27AM +0000, Zhijian Li (Fujitsu) wrote:
>
>
> On 25/01/2026 22:08, Leon Romanovsky wrote:
> >> Ensure the QP's reference count is maintained and its validity is checked
> >> within the timer callbacks by adding calls to rxe_get(qp) and corresponding
> >> rxe_put(qp) after use.
> >>
> >> Signed-off-by: Li Zhijian<lizhijian@fujitsu.com>
> > Fixes line?
>
> I believe the following `Fixes` tag is appropriate, as this commit introduced
> the WARN_ON that now triggers:
>
> Fixes: d94671632572 ("RDMA/rxe: Rewrite rxe_task.c")
>
> However, I'm not entirely certain if this race condition also existed
> before this commit, as it involved a significant rewrite.
Let's include the most recent fix. It is unlikely that anyone is still
running such an old version of RXE.
Thanks
>
> Thanks
> Zhijian
>
> >
> > Thanks
在 2026/1/25 6:08, Leon Romanovsky 写道:
> On Tue, Jan 20, 2026 at 03:44:37PM +0800, Li Zhijian wrote:
>> I encontered the following warning:
>> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0
>> ...
>> libsha1 [last unloaded: ip6_udp_tunnel]
>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT
>> Tainted: [C]=CRAP
>> Hardware name: Raspberry Pi 4 Model B Rev 1.2
>> Call trace:
>> rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P)
>> retransmit_timer+0x130/0x188 [rdma_rxe]
>> call_timer_fn+0x68/0x4d0
>> __run_timers+0x630/0x888
>> ...
>> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0
>> ...
>> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400
>> ...
>> refcount_t: underflow; use-after-free.
>> WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400
>>
>> The issue is caused by a race condition between retransmit_timer() and
>> rxe_destroy_qp, leading to the Queue Pair's (QP) reference count dropping
>> to zero during timer handler execution.
>>
>> It seems this warning is harmless because rxe_qp_do_cleanup() will flush
>> all pending timers and requests.
>>
>> Example of flow causing the issue:
>>
>> CPU0 CPU1
>> retransmit_timer() {
>> spin_lock_irqsave
>> rxe_destroy_qp()
>> __rxe_cleanup()
>> __rxe_put() // qp->ref_count decrease to 0
>> rxe_qp_do_cleanup() {
>> if (qp->valid) {
>> rxe_sched_task() {
>> WARN_ON(rxe_read(task->qp) <= 0);
>> }
>> }
>> spin_unlock_irqrestore
>> }
>> spin_lock_irqsave
>> qp->valid = 0
>> spin_unlock_irqrestore
>> }
>>
>> Ensure the QP's reference count is maintained and its validity is checked
>> within the timer callbacks by adding calls to rxe_get(qp) and corresponding
>> rxe_put(qp) after use.
>>
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>
> Fixes line?
Should the Fixes line be the following?
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Best Regards,
Zhu Yanjun
>
> Thanks
>
>> ---
>> drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++
>> drivers/infiniband/sw/rxe/rxe_req.c | 3 +++
>> 2 files changed, 6 insertions(+)
在 2026/1/19 23:44, Li Zhijian 写道:
> I encontered the following warning:
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> libsha1 [last unloaded: ip6_udp_tunnel]
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT
> Tainted: [C]=CRAP
> Hardware name: Raspberry Pi 4 Model B Rev 1.2
> Call trace:
> rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P)
> retransmit_timer+0x130/0x188 [rdma_rxe]
> call_timer_fn+0x68/0x4d0
> __run_timers+0x630/0x888
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0
> ...
> WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400
> ...
> refcount_t: underflow; use-after-free.
> WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400
>
> The issue is caused by a race condition between retransmit_timer() and
> rxe_destroy_qp, leading to the Queue Pair's (QP) reference count dropping
> to zero during timer handler execution.
>
> It seems this warning is harmless because rxe_qp_do_cleanup() will flush
> all pending timers and requests.
>
> Example of flow causing the issue:
>
> CPU0 CPU1
> retransmit_timer() {
> spin_lock_irqsave
> rxe_destroy_qp()
> __rxe_cleanup()
> __rxe_put() // qp->ref_count decrease to 0
In __rxe_cleanup(), __rxe_put() decreases qp->ref_count to 0.
Then the timer functions retransmit_timer() and rnr_nak_timer() will
check the qp and resend the packets. IMO, using the function rxe_get()
to check whether ref_count is 0 may be a solution.
I am fine with it.
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Thanks,
Zhu Yanjun
> rxe_qp_do_cleanup() {
> if (qp->valid) {
> rxe_sched_task() {
> WARN_ON(rxe_read(task->qp) <= 0);
> }
> }
> spin_unlock_irqrestore
> }
> spin_lock_irqsave
> qp->valid = 0
> spin_unlock_irqrestore
> }
>
> Ensure the QP's reference count is maintained and its validity is checked
> within the timer callbacks by adding calls to rxe_get(qp) and corresponding
> rxe_put(qp) after use.
>
> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> ---
> drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++
> drivers/infiniband/sw/rxe/rxe_req.c | 3 +++
> 2 files changed, 6 insertions(+)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index a5b2b62f596b..1390e861bd1d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -119,12 +119,15 @@ void retransmit_timer(struct timer_list *t)
>
> rxe_dbg_qp(qp, "retransmit timer fired\n");
>
> + if (!rxe_get(qp))
> + return;
> spin_lock_irqsave(&qp->state_lock, flags);
> if (qp->valid) {
> qp->comp.timeout = 1;
> rxe_sched_task(&qp->send_task);
> }
> spin_unlock_irqrestore(&qp->state_lock, flags);
> + rxe_put(qp);
> }
>
> void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index 373b03f223be..12d03f390b09 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -102,6 +102,8 @@ void rnr_nak_timer(struct timer_list *t)
>
> rxe_dbg_qp(qp, "nak timer fired\n");
>
> + if (!rxe_get(qp))
> + return;
> spin_lock_irqsave(&qp->state_lock, flags);
> if (qp->valid) {
> /* request a send queue retry */
> @@ -110,6 +112,7 @@ void rnr_nak_timer(struct timer_list *t)
> rxe_sched_task(&qp->send_task);
> }
> spin_unlock_irqrestore(&qp->state_lock, flags);
> + rxe_put(qp);
> }
>
> static void req_check_sq_drain_done(struct rxe_qp *qp)
© 2016 - 2026 Red Hat, Inc.