nvme-tcp uses a page_frag_cache to preallocate a PDU for each preallocated
request of a block device. Block devices are created in parallel threads;
consequently, the page_frag_cache is used in a non-thread-safe manner.
That leads to incorrect refcounting of the backing pages and premature freeing.
This can be caught by the !sendpage_ok check inside the network stack:
WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
tcp_sendmsg_locked+0x782/0xce0
tcp_sendmsg+0x27/0x40
sock_sendmsg+0x8b/0xa0
nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
Random panics may then occur.
Fix this by switching from a per-queue page_frag_cache to a per-cpu
page_frag_cache.
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Chris Leech <cleech@redhat.com>
---
drivers/nvme/host/tcp.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..a4c4ace5be0f4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,7 +174,6 @@ struct nvme_tcp_queue {
__le32 recv_ddgst;
struct completion tls_complete;
int tls_err;
- struct page_frag_cache pf_cache;
void (*state_change)(struct sock *);
void (*data_ready)(struct sock *);
@@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
static struct workqueue_struct *nvme_tcp_wq;
static const struct blk_mq_ops nvme_tcp_mq_ops;
static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
@@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
u8 hdgst = nvme_tcp_hdgst_len(queue);
- req->pdu = page_frag_alloc(&queue->pf_cache,
+ req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!req->pdu)
@@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
struct nvme_tcp_request *async = &ctrl->async_req;
u8 hdgst = nvme_tcp_hdgst_len(queue);
- async->pdu = page_frag_alloc(&queue->pf_cache,
+ async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!async->pdu)
@@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
- page_frag_cache_drain(&queue->pf_cache);
+ page_frag_cache_drain(this_cpu_ptr(&pf_cache));
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
--
2.50.1
On Tue, Sep 30, 2025 at 11:34:14PM -0700, Chris Leech wrote:
>
> nvme-tcp uses a page_frag_cache to preallocate a PDU for each preallocated
> request of a block device. Block devices are created in parallel threads;
> consequently, the page_frag_cache is used in a non-thread-safe manner.
> That leads to incorrect refcounting of the backing pages and premature freeing.
>
> This can be caught by the !sendpage_ok check inside the network stack:
>
> WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
> tcp_sendmsg_locked+0x782/0xce0
> tcp_sendmsg+0x27/0x40
> sock_sendmsg+0x8b/0xa0
> nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> Random panics may then occur.
>
> Fix this by switching from a per-queue page_frag_cache to a per-cpu
> page_frag_cache.
>
> Cc: stable@vger.kernel.org # 6.12
> Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
> Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> Signed-off-by: Chris Leech <cleech@redhat.com>
> ---
> drivers/nvme/host/tcp.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 1413788ca7d52..a4c4ace5be0f4 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -174,7 +174,6 @@ struct nvme_tcp_queue {
> __le32 recv_ddgst;
> struct completion tls_complete;
> int tls_err;
> - struct page_frag_cache pf_cache;
>
> void (*state_change)(struct sock *);
> void (*data_ready)(struct sock *);
> @@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
>
> static LIST_HEAD(nvme_tcp_ctrl_list);
> static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
> +static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
> static struct workqueue_struct *nvme_tcp_wq;
> static const struct blk_mq_ops nvme_tcp_mq_ops;
> static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
> @@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
> struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
> u8 hdgst = nvme_tcp_hdgst_len(queue);
>
> - req->pdu = page_frag_alloc(&queue->pf_cache,
> + req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
> sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
> GFP_KERNEL | __GFP_ZERO);
I am not well versed in the scheduler subsystem, but as far as I understand,
a workqueue may execute up to max_active of its work items in parallel on
the same CPU. That means this solution does not fix the issue of the same
variable being used in parallel.
Can anyone comment on this?
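A hedged sketch of the interleaving I have in mind (illustrative only; it
assumes page_frag_alloc() may sleep when called with GFP_KERNEL, so two work
items dispatched to the same CPU can still overlap on the same per-cpu cache):

	/* work item A (e.g. nvme_tcp_init_request) on CPU N */
	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),	/* no lock taken */
				   sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
				   GFP_KERNEL | __GFP_ZERO);	/* may sleep mid-refill */

	/* work item B, run on the same CPU while A sleeps */
	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),	/* same cache, no exclusion */
				   sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
				   GFP_KERNEL | __GFP_ZERO);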
> if (!req->pdu)
> @@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
> struct nvme_tcp_request *async = &ctrl->async_req;
> u8 hdgst = nvme_tcp_hdgst_len(queue);
>
> - async->pdu = page_frag_alloc(&queue->pf_cache,
> + async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
> sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
> GFP_KERNEL | __GFP_ZERO);
This line is executed in a different (parallel) context compared to
nvme_tcp_init_request.
> if (!async->pdu)
> @@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
> if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> return;
>
> - page_frag_cache_drain(&queue->pf_cache);
> + page_frag_cache_drain(this_cpu_ptr(&pf_cache));
This line is also definitely executed in another (parallel) context compared
to nvme_tcp_init_request, and it frees pages that are still in use by other queues.
> noreclaim_flag = memalloc_noreclaim_save();
> /* ->sock will be released by fput() */
> --
> 2.50.1
>
All in all, my patch with a mutex looks more appropriate and more robust.
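For illustration, one possible shape of a mutex-serialized cache (a sketch
with made-up names, not necessarily what my patch actually does):

	/* nvme_tcp_pdu_cache / nvme_tcp_pdu_lock are illustrative names */
	static struct page_frag_cache nvme_tcp_pdu_cache;
	static DEFINE_MUTEX(nvme_tcp_pdu_lock);

	static void *nvme_tcp_alloc_pdu(size_t size)
	{
		void *pdu;

		/* serialize every PDU allocation from the shared cache */
		mutex_lock(&nvme_tcp_pdu_lock);
		pdu = page_frag_alloc(&nvme_tcp_pdu_cache, size,
				      GFP_KERNEL | __GFP_ZERO);
		mutex_unlock(&nvme_tcp_pdu_lock);

		return pdu;
	}

With a single shared cache the drain would then belong at module unload
rather than in nvme_tcp_free_queue(), which also sidesteps the problem noted
above.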
BR,
Dmitry