[PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq

Richard Genoud posted 1 patch 1 month, 2 weeks ago
drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
[PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Richard Genoud 1 month, 2 weeks ago
When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
fq_table[fq->idx] state and freeing/allocating from the pool and
WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.

Indeed, we can have:
         Thread A                             Thread B
    qman_destroy_fq()                    qman_create_fq()
      qman_release_fqid()
        qman_shutdown_fq()
        gen_pool_free()
           -- At this point, the fqid is available again --
                                           qman_alloc_fqid()
           -- so, we can get the just-freed fqid in thread B --
                                           fq->fqid = fqid;
                                           fq->idx = fqid * 2;
                                           WARN_ON(fq_table[fq->idx]);
                                           fq_table[fq->idx] = fq;
     fq_table[fq->idx] = NULL;

And adding some logs between qman_release_fqid() and
fq_table[fq->idx] = NULL makes the WARN_ON() trigger a lot more.

To prevent that, ensure that fq_table[fq->idx] is set to NULL before
gen_pool_free() is called by using smp_wmb().

Fixes: c535e923bb97 ("soc/fsl: Introduce DPAA 1.x QMan device driver")
Signed-off-by: Richard Genoud <richard.genoud@bootlin.com>
---
 drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

NB: I'm not 100% sure of the need of a barrier here, since even without
it, the WARN_ON() wasn't triggered any more.

diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index 6b392b3ad4b1..39a3e7aab6ff 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -1827,6 +1827,8 @@ EXPORT_SYMBOL(qman_create_fq);
 
 void qman_destroy_fq(struct qman_fq *fq)
 {
+	int leaked;
+
 	/*
 	 * We don't need to lock the FQ as it is a pre-condition that the FQ be
 	 * quiesced. Instead, run some checks.
@@ -1834,11 +1836,29 @@ void qman_destroy_fq(struct qman_fq *fq)
 	switch (fq->state) {
 	case qman_fq_state_parked:
 	case qman_fq_state_oos:
-		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID))
-			qman_release_fqid(fq->fqid);
+		/*
+		 * There's a race condition here on releasing the fqid,
+		 * setting the fq_table to NULL, and freeing the fqid.
+		 * To prevent it, this order should be respected:
+		 */
+		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) {
+			leaked = qman_shutdown_fq(fq->fqid);
+			if (leaked)
+				pr_debug("FQID %d leaked\n", fq->fqid);
+		}
 
 		DPAA_ASSERT(fq_table[fq->idx]);
 		fq_table[fq->idx] = NULL;
+
+		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID) && !leaked) {
+			/*
+			 * fq_table[fq->idx] should be set to null before
+			 * freeing fq->fqid otherwise it could by allocated by
+			 * qman_alloc_fqid() while still being !NULL
+			 */
+			smp_wmb();
+			gen_pool_free(qm_fqalloc, fq->fqid | DPAA_GENALLOC_OFF, 1);
+		}
 		return;
 	default:
 		break;

base-commit: 9448598b22c50c8a5bb77a9103e2d49f134c9578
-- 
2.47.3
Re: [PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Christophe Leroy (CS GROUP) 1 day, 10 hours ago
On Tue, 23 Dec 2025 08:25:49 +0100, Richard Genoud wrote:
> When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
> fq_table[fq->idx] state and freeing/allocating from the pool and
> WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.
> 
> Indeed, we can have:
>          Thread A                             Thread B
>     qman_destroy_fq()                    qman_create_fq()
>       qman_release_fqid()
>         qman_shutdown_fq()
>         gen_pool_free()
>            -- At this point, the fqid is available again --
>                                            qman_alloc_fqid()
>            -- so, we can get the just-freed fqid in thread B --
>                                            fq->fqid = fqid;
>                                            fq->idx = fqid * 2;
>                                            WARN_ON(fq_table[fq->idx]);
>                                            fq_table[fq->idx] = fq;
>      fq_table[fq->idx] = NULL;
> 
> [...]

Applied, thanks!

[1/1] soc: fsl: qbman: fix race condition in qman_destroy_fq
      (no commit info)

Best regards,
-- 
Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Re: [PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Richard GENOUD 5 days, 4 hours ago
Le 23/12/2025 à 08:25, Richard Genoud a écrit :
> When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
> fq_table[fq->idx] state and freeing/allocating from the pool and
> WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.
> 
> Indeed, we can have:
>           Thread A                             Thread B
>      qman_destroy_fq()                    qman_create_fq()
>        qman_release_fqid()
>          qman_shutdown_fq()
>          gen_pool_free()
>             -- At this point, the fqid is available again --
>                                             qman_alloc_fqid()
>             -- so, we can get the just-freed fqid in thread B --
>                                             fq->fqid = fqid;
>                                             fq->idx = fqid * 2;
>                                             WARN_ON(fq_table[fq->idx]);
>                                             fq_table[fq->idx] = fq;
>       fq_table[fq->idx] = NULL;
> 
> And adding some logs between qman_release_fqid() and
> fq_table[fq->idx] = NULL makes the WARN_ON() trigger a lot more.
> 
> To prevent that, ensure that fq_table[fq->idx] is set to NULL before
> gen_pool_free() is called by using smp_wmb().
> 

Tested on a LS1046A based board.
With this patch, the warning is not triggered anymore.

Tested-by: CHAMPSEIX Thomas <thomas.champseix@alstomgroup.com>

> Fixes: c535e923bb97 ("soc/fsl: Introduce DPAA 1.x QMan device driver")
> Signed-off-by: Richard Genoud <richard.genoud@bootlin.com>
> ---
>   drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
>   1 file changed, 22 insertions(+), 2 deletions(-)
> 
> NB: I'm not 100% sure of the need of a barrier here, since even without
> it, the WARN_ON() wasn't triggered any more.
> 
> diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
> index 6b392b3ad4b1..39a3e7aab6ff 100644
> --- a/drivers/soc/fsl/qbman/qman.c
> +++ b/drivers/soc/fsl/qbman/qman.c
> @@ -1827,6 +1827,8 @@ EXPORT_SYMBOL(qman_create_fq);
>   
>   void qman_destroy_fq(struct qman_fq *fq)
>   {
> +	int leaked;
> +
>   	/*
>   	 * We don't need to lock the FQ as it is a pre-condition that the FQ be
>   	 * quiesced. Instead, run some checks.
> @@ -1834,11 +1836,29 @@ void qman_destroy_fq(struct qman_fq *fq)
>   	switch (fq->state) {
>   	case qman_fq_state_parked:
>   	case qman_fq_state_oos:
> -		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID))
> -			qman_release_fqid(fq->fqid);
> +		/*
> +		 * There's a race condition here on releasing the fqid,
> +		 * setting the fq_table to NULL, and freeing the fqid.
> +		 * To prevent it, this order should be respected:
> +		 */
> +		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) {
> +			leaked = qman_shutdown_fq(fq->fqid);
> +			if (leaked)
> +				pr_debug("FQID %d leaked\n", fq->fqid);
> +		}
>   
>   		DPAA_ASSERT(fq_table[fq->idx]);
>   		fq_table[fq->idx] = NULL;
> +
> +		if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID) && !leaked) {
> +			/*
> +			 * fq_table[fq->idx] should be set to null before
> +			 * freeing fq->fqid otherwise it could by allocated by
> +			 * qman_alloc_fqid() while still being !NULL
> +			 */
> +			smp_wmb();
> +			gen_pool_free(qm_fqalloc, fq->fqid | DPAA_GENALLOC_OFF, 1);
> +		}
>   		return;
>   	default:
>   		break;
> 
> base-commit: 9448598b22c50c8a5bb77a9103e2d49f134c9578


-- 
Richard Genoud, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
Re: [PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Christophe Leroy (CS GROUP) 1 day, 10 hours ago
Hi,

Le 02/02/2026 à 13:54, Richard GENOUD a écrit :
> Le 23/12/2025 à 08:25, Richard Genoud a écrit :
>> When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
>> fq_table[fq->idx] state and freeing/allocating from the pool and
>> WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.
>>
>> Indeed, we can have:
>>           Thread A                             Thread B
>>      qman_destroy_fq()                    qman_create_fq()
>>        qman_release_fqid()
>>          qman_shutdown_fq()
>>          gen_pool_free()
>>             -- At this point, the fqid is available again --
>>                                             qman_alloc_fqid()
>>             -- so, we can get the just-freed fqid in thread B --
>>                                             fq->fqid = fqid;
>>                                             fq->idx = fqid * 2;
>>                                             WARN_ON(fq_table[fq->idx]);
>>                                             fq_table[fq->idx] = fq;
>>       fq_table[fq->idx] = NULL;
>>
>> And adding some logs between qman_release_fqid() and
>> fq_table[fq->idx] = NULL makes the WARN_ON() trigger a lot more.
>>
>> To prevent that, ensure that fq_table[fq->idx] is set to NULL before
>> gen_pool_free() is called by using smp_wmb().
>>
> 
> Tested on a LS1046A based board.
> With this patch, the warning is not triggered anymore.
> 
> Tested-by: CHAMPSEIX Thomas <thomas.champseix@alstomgroup.com>

This fix is now in linux-next. If everything goes well I will send a 
pull request for this fix in rc2 or rc3.

> 
>> Fixes: c535e923bb97 ("soc/fsl: Introduce DPAA 1.x QMan device driver")
>> Signed-off-by: Richard Genoud <richard.genoud@bootlin.com>
>> ---
>>   drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
>>   1 file changed, 22 insertions(+), 2 deletions(-)
>>
>> NB: I'm not 100% sure of the need of a barrier here, since even without
>> it, the WARN_ON() wasn't triggered any more.
>>
>> diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
>> index 6b392b3ad4b1..39a3e7aab6ff 100644
>> --- a/drivers/soc/fsl/qbman/qman.c
>> +++ b/drivers/soc/fsl/qbman/qman.c
>> @@ -1827,6 +1827,8 @@ EXPORT_SYMBOL(qman_create_fq);
>>   void qman_destroy_fq(struct qman_fq *fq)
>>   {
>> +    int leaked;
>> +
>>       /*
>>        * We don't need to lock the FQ as it is a pre-condition that 
>> the FQ be
>>        * quiesced. Instead, run some checks.
>> @@ -1834,11 +1836,29 @@ void qman_destroy_fq(struct qman_fq *fq)
>>       switch (fq->state) {
>>       case qman_fq_state_parked:
>>       case qman_fq_state_oos:
>> -        if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID))
>> -            qman_release_fqid(fq->fqid);
>> +        /*
>> +         * There's a race condition here on releasing the fqid,
>> +         * setting the fq_table to NULL, and freeing the fqid.
>> +         * To prevent it, this order should be respected:
>> +         */
>> +        if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) {
>> +            leaked = qman_shutdown_fq(fq->fqid);
>> +            if (leaked)
>> +                pr_debug("FQID %d leaked\n", fq->fqid);
>> +        }
>>           DPAA_ASSERT(fq_table[fq->idx]);
>>           fq_table[fq->idx] = NULL;
>> +
>> +        if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID) && !leaked) {
>> +            /*
>> +             * fq_table[fq->idx] should be set to null before
>> +             * freeing fq->fqid otherwise it could by allocated by
>> +             * qman_alloc_fqid() while still being !NULL
>> +             */
>> +            smp_wmb();
>> +            gen_pool_free(qm_fqalloc, fq->fqid | DPAA_GENALLOC_OFF, 1);
>> +        }
>>           return;
>>       default:
>>           break;
>>
>> base-commit: 9448598b22c50c8a5bb77a9103e2d49f134c9578
> 
> 

Re: [PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Christophe Leroy (CS GROUP) 1 month ago

Le 23/12/2025 à 08:25, Richard Genoud a écrit :
> [Vous ne recevez pas souvent de courriers de richard.genoud@bootlin.com. Découvrez pourquoi ceci est important à https://aka.ms/LearnAboutSenderIdentification ]
> 
> When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
> fq_table[fq->idx] state and freeing/allocating from the pool and
> WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.
> 
> Indeed, we can have:
>           Thread A                             Thread B
>      qman_destroy_fq()                    qman_create_fq()
>        qman_release_fqid()
>          qman_shutdown_fq()
>          gen_pool_free()
>             -- At this point, the fqid is available again --
>                                             qman_alloc_fqid()
>             -- so, we can get the just-freed fqid in thread B --
>                                             fq->fqid = fqid;
>                                             fq->idx = fqid * 2;
>                                             WARN_ON(fq_table[fq->idx]);
>                                             fq_table[fq->idx] = fq;
>       fq_table[fq->idx] = NULL;
> 
> And adding some logs between qman_release_fqid() and
> fq_table[fq->idx] = NULL makes the WARN_ON() trigger a lot more.
> 
> To prevent that, ensure that fq_table[fq->idx] is set to NULL before
> gen_pool_free() is called by using smp_wmb().

You dismantle/reimplement qman_release_fqid(). Is that the only possible 
approach ?
Isn't it possible to just clear fq_table[fq->idx] _before_ calling 
qman_release_fqid() ?

> 
> Fixes: c535e923bb97 ("soc/fsl: Introduce DPAA 1.x QMan device driver")
> Signed-off-by: Richard Genoud <richard.genoud@bootlin.com>
> ---
>   drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
>   1 file changed, 22 insertions(+), 2 deletions(-)
> 
> NB: I'm not 100% sure of the need of a barrier here, since even without
> it, the WARN_ON() wasn't triggered any more.
> 
> diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
> index 6b392b3ad4b1..39a3e7aab6ff 100644
> --- a/drivers/soc/fsl/qbman/qman.c
> +++ b/drivers/soc/fsl/qbman/qman.c
> @@ -1827,6 +1827,8 @@ EXPORT_SYMBOL(qman_create_fq);
> 
>   void qman_destroy_fq(struct qman_fq *fq)
>   {
> +       int leaked;
> +
>          /*
>           * We don't need to lock the FQ as it is a pre-condition that the FQ be
>           * quiesced. Instead, run some checks.
> @@ -1834,11 +1836,29 @@ void qman_destroy_fq(struct qman_fq *fq)
>          switch (fq->state) {
>          case qman_fq_state_parked:
>          case qman_fq_state_oos:
> -               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID))
> -                       qman_release_fqid(fq->fqid);
> +               /*
> +                * There's a race condition here on releasing the fqid,
> +                * setting the fq_table to NULL, and freeing the fqid.
> +                * To prevent it, this order should be respected:
> +                */
> +               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) {
> +                       leaked = qman_shutdown_fq(fq->fqid);
> +                       if (leaked)
> +                               pr_debug("FQID %d leaked\n", fq->fqid);
> +               }
> 
>                  DPAA_ASSERT(fq_table[fq->idx]);
>                  fq_table[fq->idx] = NULL;
> +
> +               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID) && !leaked) {
> +                       /*
> +                        * fq_table[fq->idx] should be set to null before
> +                        * freeing fq->fqid otherwise it could by allocated by
> +                        * qman_alloc_fqid() while still being !NULL
> +                        */
> +                       smp_wmb();
> +                       gen_pool_free(qm_fqalloc, fq->fqid | DPAA_GENALLOC_OFF, 1);
> +               }
>                  return;
>          default:
>                  break;
> 
> base-commit: 9448598b22c50c8a5bb77a9103e2d49f134c9578
> --
> 2.47.3
> 

Re: [PATCH] soc: fsl: qbman: fix race condition in qman_destroy_fq
Posted by Richard GENOUD 1 month ago
Hi Christophe,

Le 06/01/2026 à 19:29, Christophe Leroy (CS GROUP) a écrit :
> 
> 
> Le 23/12/2025 à 08:25, Richard Genoud a écrit :
>> [Vous ne recevez pas souvent de courriers de 
>> richard.genoud@bootlin.com. Découvrez pourquoi ceci est important à 
>> https://aka.ms/LearnAboutSenderIdentification ]
>>
>> When QMAN_FQ_FLAG_DYNAMIC_FQID is set, there's a race condition between
>> fq_table[fq->idx] state and freeing/allocating from the pool and
>> WARN_ON(fq_table[fq->idx]) in qman_create_fq() gets triggered.
>>
>> Indeed, we can have:
>>           Thread A                             Thread B
>>      qman_destroy_fq()                    qman_create_fq()
>>        qman_release_fqid()
>>          qman_shutdown_fq()
>>          gen_pool_free()
>>             -- At this point, the fqid is available again --
>>                                             qman_alloc_fqid()
>>             -- so, we can get the just-freed fqid in thread B --
>>                                             fq->fqid = fqid;
>>                                             fq->idx = fqid * 2;
>>                                             WARN_ON(fq_table[fq->idx]);
>>                                             fq_table[fq->idx] = fq;
>>       fq_table[fq->idx] = NULL;
>>
>> And adding some logs between qman_release_fqid() and
>> fq_table[fq->idx] = NULL makes the WARN_ON() trigger a lot more.
>>
>> To prevent that, ensure that fq_table[fq->idx] is set to NULL before
>> gen_pool_free() is called by using smp_wmb().
> 
> You dismantle/reimplement qman_release_fqid(). Is that the only possible 
> approach ?
> Isn't it possible to just clear fq_table[fq->idx] _before_ calling 
> qman_release_fqid() ?
> 
I'm afraid that clearing fq_table[fq->idx] before calling
qman_release_fqid() will create another race condition:
In qm_mr_process_task() we have:
			case QM_MR_VERB_FQRN:
			case QM_MR_VERB_FQRL:
				/* Lookup in the retirement table */
				fq = fqid_to_fq(qm_fqid_get(&msg->fq));
				if (WARN_ON(!fq))
					break;
				fq_state_change(p, fq, msg, verb);
				if (fq->cb.fqs)
					fq->cb.fqs(p, fq, msg);
				break;
https://elixir.bootlin.com/linux/v6.19-rc4/source/drivers/soc/fsl/qbman/qman.c#L1509-L1518
with fqid_to_fq(fqid) expanding to fq_table[fqid * 2];

Is it possible for those "cases" (QM_MR_VERB_FQRN/QM_MR_VERB_FQRL) to
happen after fq_table[fq->idx] is cleared and before qman_shutdown_fq()
is called?
In doubt, I chose the safe side, but I'm not 100% sure it can happen.


Thanks!

Regards,
Richard
>>
>> Fixes: c535e923bb97 ("soc/fsl: Introduce DPAA 1.x QMan device driver")
>> Signed-off-by: Richard Genoud <richard.genoud@bootlin.com>
>> ---
>>   drivers/soc/fsl/qbman/qman.c | 24 ++++++++++++++++++++++--
>>   1 file changed, 22 insertions(+), 2 deletions(-)
>>
>> NB: I'm not 100% sure of the need of a barrier here, since even without
>> it, the WARN_ON() wasn't triggered any more.
>>
>> diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
>> index 6b392b3ad4b1..39a3e7aab6ff 100644
>> --- a/drivers/soc/fsl/qbman/qman.c
>> +++ b/drivers/soc/fsl/qbman/qman.c
>> @@ -1827,6 +1827,8 @@ EXPORT_SYMBOL(qman_create_fq);
>>
>>   void qman_destroy_fq(struct qman_fq *fq)
>>   {
>> +       int leaked;
>> +
>>          /*
>>           * We don't need to lock the FQ as it is a pre-condition that 
>> the FQ be
>>           * quiesced. Instead, run some checks.
>> @@ -1834,11 +1836,29 @@ void qman_destroy_fq(struct qman_fq *fq)
>>          switch (fq->state) {
>>          case qman_fq_state_parked:
>>          case qman_fq_state_oos:
>> -               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID))
>> -                       qman_release_fqid(fq->fqid);
>> +               /*
>> +                * There's a race condition here on releasing the fqid,
>> +                * setting the fq_table to NULL, and freeing the fqid.
>> +                * To prevent it, this order should be respected:
>> +                */
>> +               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID)) {
>> +                       leaked = qman_shutdown_fq(fq->fqid);
>> +                       if (leaked)
>> +                               pr_debug("FQID %d leaked\n", fq->fqid);
>> +               }
>>
>>                  DPAA_ASSERT(fq_table[fq->idx]);
>>                  fq_table[fq->idx] = NULL;
>> +
>> +               if (fq_isset(fq, QMAN_FQ_FLAG_DYNAMIC_FQID) && !leaked) {
>> +                       /*
>> +                        * fq_table[fq->idx] should be set to null before
>> +                        * freeing fq->fqid otherwise it could by 
>> allocated by
>> +                        * qman_alloc_fqid() while still being !NULL
>> +                        */
>> +                       smp_wmb();
>> +                       gen_pool_free(qm_fqalloc, fq->fqid | 
>> DPAA_GENALLOC_OFF, 1);
>> +               }
>>                  return;
>>          default:
>>                  break;
>>
>> base-commit: 9448598b22c50c8a5bb77a9103e2d49f134c9578
>> -- 
>> 2.47.3
>>
> 


-- 
Richard Genoud, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com