[PATCH] nvme-tcp: fix usage of page_frag_cache
Posted by Dmitry Bogdanov 2 days, 10 hours ago
nvme-tcp uses a page_frag_cache to preallocate the PDU for each
preallocated request of a block device. Block devices are created in
parallel threads; consequently, the page_frag_cache is used in a
non-thread-safe manner. That leads to incorrect refcounting of the
backing pages and their premature freeing.
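
The fast path of the page_frag allocator is a plain read-modify-write
on the cache, with nothing locked or atomic (simplified sketch; the
slow path is elided):

static void *frag_alloc_sketch(struct page_frag_cache *nc,
			       unsigned int fragsz)
{
	int offset = nc->offset - fragsz;

	if (unlikely(offset < 0)) {
		/* slow path elided: drop the pagecnt_bias references,
		 * refill the cache with a fresh page */
		return NULL;
	}
	nc->pagecnt_bias--;	/* plain decrement, not atomic */
	nc->offset = offset;	/* plain store */
	return nc->va + offset;
}

Two threads running this concurrently on one cache can read the same
offset, hand out overlapping fragments, and skew pagecnt_bias, after
which the page refcount no longer matches the fragments still in
flight.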

That can be caught by the !sendpage_ok check inside the network stack:

WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
	tcp_sendmsg_locked+0x782/0xce0
	tcp_sendmsg+0x27/0x40
	sock_sendmsg+0x8b/0xa0
	nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
A random panic may then occur.

Fix that by serializing the usage of page_frag_cache.

Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
 drivers/nvme/host/tcp.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..823e07759e0d3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -145,6 +145,7 @@ struct nvme_tcp_queue {
 
 	struct mutex		queue_lock;
 	struct mutex		send_mutex;
+	struct mutex		pf_cache_lock;
 	struct llist_head	req_list;
 	struct list_head	send_list;
 
@@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	req->pdu = page_frag_alloc(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!req->pdu)
 		return -ENOMEM;
 
@@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	async->pdu = page_frag_alloc(&queue->pf_cache,
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!async->pdu)
 		return -ENOMEM;
 
@@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	kfree(queue->pdu);
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 }
 
 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	INIT_LIST_HEAD(&queue->send_list);
 	mutex_init(&queue->send_mutex);
 	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+	mutex_init(&queue->pf_cache_lock);
 
 	if (qid > 0)
 		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
@@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 err_destroy_mutex:
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 	return ret;
 }
 
-- 
2.25.1
Re: [PATCH] nvme-tcp: fix usage of page_frag_cache
Posted by Chris Leech 15 hours ago
On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> nvme-tcp uses a page_frag_cache to preallocate the PDU for each
> preallocated request of a block device. Block devices are created in
> parallel threads; consequently, the page_frag_cache is used in a
> non-thread-safe manner. That leads to incorrect refcounting of the
> backing pages and their premature freeing.
> 
> That can be caught by the !sendpage_ok check inside the network stack:
> 
> WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
> 	tcp_sendmsg_locked+0x782/0xce0
> 	tcp_sendmsg+0x27/0x40
> 	sock_sendmsg+0x8b/0xa0
> 	nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> A random panic may then occur.
> 
> Fix that by serializing the usage of page_frag_cache.

Thank you for reporting this. I think we can fix it without blocking the
async namespace scanning with a mutex, by switching from a per-queue
page_frag_cache to a per-CPU one. There shouldn't be a need to keep the
page_frag allocations isolated by queue anyway.

It would be great if you could test the patch which I'll send after
this.

- Chris
 
> Cc: stable@vger.kernel.org # 6.12
> Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
> Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> ---
>  drivers/nvme/host/tcp.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 1413788ca7d52..823e07759e0d3 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -145,6 +145,7 @@ struct nvme_tcp_queue {
>  
>  	struct mutex		queue_lock;
>  	struct mutex		send_mutex;
> +	struct mutex		pf_cache_lock;
>  	struct llist_head	req_list;
>  	struct list_head	send_list;
>  
> @@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
>  	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  
> +	mutex_lock(&queue->pf_cache_lock);
>  	req->pdu = page_frag_alloc(&queue->pf_cache,
>  		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  		GFP_KERNEL | __GFP_ZERO);
> +	mutex_unlock(&queue->pf_cache_lock);
>  	if (!req->pdu)
>  		return -ENOMEM;
>  
> @@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
>  	struct nvme_tcp_request *async = &ctrl->async_req;
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  
> +	mutex_lock(&queue->pf_cache_lock);
>  	async->pdu = page_frag_alloc(&queue->pf_cache,
>  		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  		GFP_KERNEL | __GFP_ZERO);
> +	mutex_unlock(&queue->pf_cache_lock);
>  	if (!async->pdu)
>  		return -ENOMEM;
>  
> @@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
>  	kfree(queue->pdu);
>  	mutex_destroy(&queue->send_mutex);
>  	mutex_destroy(&queue->queue_lock);
> +	mutex_destroy(&queue->pf_cache_lock);
>  }
>  
>  static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
> @@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>  	INIT_LIST_HEAD(&queue->send_list);
>  	mutex_init(&queue->send_mutex);
>  	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
> +	mutex_init(&queue->pf_cache_lock);
>  
>  	if (qid > 0)
>  		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
> @@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>  err_destroy_mutex:
>  	mutex_destroy(&queue->send_mutex);
>  	mutex_destroy(&queue->queue_lock);
> +	mutex_destroy(&queue->pf_cache_lock);
>  	return ret;
>  }
>  
> -- 
> 2.25.1
> 
>
Re: [PATCH] nvme-tcp: fix usage of page_frag_cache
Posted by Dmitry Bogdanov 4 hours ago
On Tue, Sep 30, 2025 at 11:31:26PM -0700, Chris Leech wrote:
> 
> On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> > nvme-tcp uses a page_frag_cache to preallocate the PDU for each
> > preallocated request of a block device. Block devices are created in
> > parallel threads; consequently, the page_frag_cache is used in a
> > non-thread-safe manner. That leads to incorrect refcounting of the
> > backing pages and their premature freeing.
> >
> > That can be caught by the !sendpage_ok check inside the network stack:
> >
> > WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
> >       tcp_sendmsg_locked+0x782/0xce0
> >       tcp_sendmsg+0x27/0x40
> >       sock_sendmsg+0x8b/0xa0
> >       nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> > A random panic may then occur.
> >
> > Fix that by serializing the usage of page_frag_cache.
> 
> Thank you for reporting this. I think we can fix it without blocking the
> async namespace scanning with a mutex, by switching from a per-queue
> page_frag_cache to a per-CPU one. There shouldn't be a need to keep the
> page_frag allocations isolated by queue anyway.
> 
> It would be great if you could test the patch which I'll send after
> this.
> 

As I commented on your patch, a naive per-CPU cache solution is
error-prone, and a complete one would be unnecessarily difficult.
Block device creation is not the data plane, it is the control plane,
so there is no sense in using lockless algorithms there.
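
To illustrate (a hypothetical sketch only, all names invented, not a
proposal): a complete per-CPU variant would need something like a
local_lock_t to serialize tasks that share a CPU, and since
local_lock() disables preemption on non-RT kernels, the sleeping
GFP_KERNEL allocation would have to be weakened as well:

#include <linux/gfp.h>
#include <linux/local_lock.h>
#include <linux/percpu.h>

struct nvme_tcp_pf_cache {
	local_lock_t		lock;
	struct page_frag_cache	cache;
};

static DEFINE_PER_CPU(struct nvme_tcp_pf_cache, nvme_tcp_pf_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void *nvme_tcp_pdu_alloc(size_t len)
{
	void *pdu;

	local_lock(&nvme_tcp_pf_cache.lock);
	/* cannot sleep under local_lock(), so GFP_KERNEL is out */
	pdu = page_frag_alloc(this_cpu_ptr(&nvme_tcp_pf_cache.cache),
			      len, GFP_ATOMIC | __GFP_ZERO);
	local_unlock(&nvme_tcp_pf_cache.lock);
	return pdu;
}

That is the kind of complexity I mean.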

My patch is simple and error-proof already.
So I insist on this solution.

BR,
 Dmitry
[PATCH] nvme-tcp: switch to per-cpu page_frag_cache
Posted by Chris Leech 14 hours ago
nvme-tcp uses a page_frag_cache to preallocate the PDU for each
preallocated request of a block device. Block devices are created in
parallel threads; consequently, the page_frag_cache is used in a
non-thread-safe manner. That leads to incorrect refcounting of the
backing pages and their premature freeing.

That can be caught by the !sendpage_ok check inside the network stack:

WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
        tcp_sendmsg_locked+0x782/0xce0
        tcp_sendmsg+0x27/0x40
        sock_sendmsg+0x8b/0xa0
        nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
A random panic may then occur.

Fix that by switching from a per-queue page_frag_cache to a per-CPU
page_frag_cache.

Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Chris Leech <cleech@redhat.com>
---
 drivers/nvme/host/tcp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..a4c4ace5be0f4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,7 +174,6 @@ struct nvme_tcp_queue {
 	__le32			recv_ddgst;
 	struct completion       tls_complete;
 	int                     tls_err;
-	struct page_frag_cache	pf_cache;
 
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
@@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
 static struct workqueue_struct *nvme_tcp_wq;
 static const struct blk_mq_ops nvme_tcp_mq_ops;
 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
@@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
+	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
@@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
+	async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
@@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
-	page_frag_cache_drain(&queue->pf_cache);
+	page_frag_cache_drain(this_cpu_ptr(&pf_cache));
 
 	noreclaim_flag = memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */
-- 
2.50.1
Re: [PATCH] nvme-tcp: switch to per-cpu page_frag_cache
Posted by Dmitry Bogdanov 5 hours ago
On Tue, Sep 30, 2025 at 11:34:14PM -0700, Chris Leech wrote:
> 
> nvme-tcp uses a page_frag_cache to preallocate the PDU for each
> preallocated request of a block device. Block devices are created in
> parallel threads; consequently, the page_frag_cache is used in a
> non-thread-safe manner. That leads to incorrect refcounting of the
> backing pages and their premature freeing.
> 
> That can be caught by the !sendpage_ok check inside the network stack:
> 
> WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
>         tcp_sendmsg_locked+0x782/0xce0
>         tcp_sendmsg+0x27/0x40
>         sock_sendmsg+0x8b/0xa0
>         nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> A random panic may then occur.
> 
> Fix that by switching from a per-queue page_frag_cache to a per-CPU
> page_frag_cache.
> 
> Cc: stable@vger.kernel.org # 6.12
> Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
> Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> Signed-off-by: Chris Leech <cleech@redhat.com>
> ---
>  drivers/nvme/host/tcp.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 1413788ca7d52..a4c4ace5be0f4 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -174,7 +174,6 @@ struct nvme_tcp_queue {
>         __le32                  recv_ddgst;
>         struct completion       tls_complete;
>         int                     tls_err;
> -       struct page_frag_cache  pf_cache;
> 
>         void (*state_change)(struct sock *);
>         void (*data_ready)(struct sock *);
> @@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
> 
>  static LIST_HEAD(nvme_tcp_ctrl_list);
>  static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
> +static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
>  static struct workqueue_struct *nvme_tcp_wq;
>  static const struct blk_mq_ops nvme_tcp_mq_ops;
>  static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
> @@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
>         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
>         u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> -       req->pdu = page_frag_alloc(&queue->pf_cache,
> +       req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
>                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>                 GFP_KERNEL | __GFP_ZERO);

I am not an expert in the scheduler subsystem, but as far as I
understand, workqueues may execute their work items in parallel, up to
max_active work items on the same CPU. That means this solution does
not fix the issue of parallel use of the same variable.
Can anyone comment on this?
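
For illustration only (a hypothetical test module, all names invented):
with a bound workqueue and max_active > 1, as soon as the first work
item sleeps (say, inside a GFP_KERNEL allocation), the pool starts the
second one on the same CPU in another kworker, so this_cpu_ptr() alone
provides no mutual exclusion between them:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_w1, demo_w2;

static void demo_fn(struct work_struct *work)
{
	/* both instances may run concurrently on CPU 0; anything done
	 * here through this_cpu_ptr(&pf_cache) would race */
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo", 0, 2);	/* max_active = 2 */
	if (!demo_wq)
		return -ENOMEM;
	INIT_WORK(&demo_w1, demo_fn);
	INIT_WORK(&demo_w2, demo_fn);
	queue_work_on(0, demo_wq, &demo_w1);
	queue_work_on(0, demo_wq, &demo_w2);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");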

>         if (!req->pdu)
> @@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
>         struct nvme_tcp_request *async = &ctrl->async_req;
>         u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> -       async->pdu = page_frag_alloc(&queue->pf_cache,
> +       async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
>                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>                 GFP_KERNEL | __GFP_ZERO);

This line is executed in a different (parallel) context compared to
nvme_tcp_init_request.

>         if (!async->pdu)
> @@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
>         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
>                 return;
> 
> -       page_frag_cache_drain(&queue->pf_cache);
> +       page_frag_cache_drain(this_cpu_ptr(&pf_cache));

This line is also definitely executed in another (parallel) context
compared to nvme_tcp_init_request, and it frees pages that are still
in use by other queues.

>         noreclaim_flag = memalloc_noreclaim_save();
>         /* ->sock will be released by fput() */
> --
> 2.50.1
> 

All in all, my patch with a mutex looks more appropriate and more
error-proof.

BR,
 Dmitry