nvme-tcp uses page_frag_cache to preallocate a PDU for each preallocated
request of a block device. Block devices are created in parallel threads, so
the page_frag_cache is used in a non-thread-safe manner.
That leads to incorrect refcounting of the backing pages and a premature free.
That can be caught by !sendpage_ok inside the network stack:
WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
tcp_sendmsg_locked+0x782/0xce0
tcp_sendmsg+0x27/0x40
sock_sendmsg+0x8b/0xa0
nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
A random panic may then occur.
Fix that by serializing the usage of page_frag_cache.
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
drivers/nvme/host/tcp.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..823e07759e0d3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -145,6 +145,7 @@ struct nvme_tcp_queue {
struct mutex queue_lock;
struct mutex send_mutex;
+ struct mutex pf_cache_lock;
struct llist_head req_list;
struct list_head send_list;
@@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
u8 hdgst = nvme_tcp_hdgst_len(queue);
+ mutex_lock(&queue->pf_cache_lock);
req->pdu = page_frag_alloc(&queue->pf_cache,
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
+ mutex_unlock(&queue->pf_cache_lock);
if (!req->pdu)
return -ENOMEM;
@@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
struct nvme_tcp_request *async = &ctrl->async_req;
u8 hdgst = nvme_tcp_hdgst_len(queue);
+ mutex_lock(&queue->pf_cache_lock);
async->pdu = page_frag_alloc(&queue->pf_cache,
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
+ mutex_unlock(&queue->pf_cache_lock);
if (!async->pdu)
return -ENOMEM;
@@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
kfree(queue->pdu);
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
+ mutex_destroy(&queue->pf_cache_lock);
}
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
INIT_LIST_HEAD(&queue->send_list);
mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+ mutex_init(&queue->pf_cache_lock);
if (qid > 0)
queue->cmnd_capsule_len = nctrl->ioccsz * 16;
@@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
+ mutex_destroy(&queue->pf_cache_lock);
return ret;
}
--
2.25.1
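For context on why an unsynchronized page_frag_cache misbehaves, here is a deliberately simplified userspace model. It is a sketch, not the kernel code, and it omits the pagecnt_bias refcount bookkeeping that the real cache also updates without locking: a fixed buffer with an unlocked offset loosely stands in for the cache, and two threads stand in for the parallel scan workers. Lost updates to the offset hand the same region out twice, which is the class of race the mutex in the patch above serializes against.

/* Simplified userspace model of an unsynchronized fragment cache.
 * This is NOT the kernel's page_frag_cache; it only mimics the unlocked
 * "read offset, then update it" pattern that races when two threads
 * allocate from the same cache concurrently.
 * Build: gcc -O2 -pthread frag_race.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct frag_cache {
	char   *buf;     /* backing "page" */
	size_t  size;
	size_t  offset;  /* next free byte, updated without locking */
};

static void *frag_alloc(struct frag_cache *c, size_t len)
{
	size_t off = c->offset;          /* read */
	if (off + len > c->size)
		return NULL;
	/* another thread may read the same 'offset' here */
	c->offset = off + len;           /* write: lost update => overlap */
	return c->buf + off;
}

static struct frag_cache cache;

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		char *p = frag_alloc(&cache, 64);
		if (p)
			p[0] = 1;        /* may scribble on a fragment handed to the other thread */
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	size_t expected = 2UL * 100000 * 64;

	cache.size = expected;
	cache.buf = calloc(1, cache.size);
	if (!cache.buf)
		return 1;

	pthread_create(&a, NULL, worker, NULL);
	pthread_create(&b, NULL, worker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With lost updates the final offset falls short of 'expected',
	 * i.e. some regions were handed out twice. */
	printf("final offset %zu, expected %zu\n", cache.offset, expected);
	free(cache.buf);
	return 0;
}

Running this under the thread sanitizer (gcc -fsanitize=thread -pthread) reports the data race on offset; in the kernel cache the analogous unsynchronized state is the offset and pagecnt_bias fields.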
On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> nvme-tcp uses page_frag_cache to preallocate a PDU for each preallocated
> request of a block device. Block devices are created in parallel threads, so
> the page_frag_cache is used in a non-thread-safe manner.
> That leads to incorrect refcounting of the backing pages and a premature free.
>
> [...]
>
> Fix that by serializing the usage of page_frag_cache.

Thank you for reporting this. I think we can fix it without blocking the
async namespace scanning with a mutex, by switching from a per-queue
page_frag_cache to per-cpu. There shouldn't be a need to keep the
page_frag allocations isolated by queue anyway.

It would be great if you could test the patch which I'll send after
this.

- Chris
On Tue, Sep 30, 2025 at 11:31:26PM -0700, Chris Leech wrote:
> On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> > [...]
> > Fix that by serializing the usage of page_frag_cache.
>
> Thank you for reporting this. I think we can fix it without blocking the
> async namespace scanning with a mutex, by switching from a per-queue
> page_frag_cache to per-cpu. There shouldn't be a need to keep the
> page_frag allocations isolated by queue anyway.
>
> It would be great if you could test the patch which I'll send after
> this.

As I commented on your patch, a naive per-cpu cache solution is error-prone,
and a complete solution would be unnecessarily difficult. Block device
creation is not a data plane, it is a control plane, so there is no point in
using lockless algorithms there. My patch is simple and already error-proof.
So I insist on this solution.

BR,
Dmitry
nvme-tcp uses page_frag_cache to preallocate a PDU for each preallocated
request of a block device. Block devices are created in parallel threads, so
the page_frag_cache is used in a non-thread-safe manner.
That leads to incorrect refcounting of the backing pages and a premature free.
That can be caught by !sendpage_ok inside the network stack:
WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310.
tcp_sendmsg_locked+0x782/0xce0
tcp_sendmsg+0x27/0x40
sock_sendmsg+0x8b/0xa0
nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
A random panic may then occur.
Fix that by switching from a per-queue page_frag_cache to a per-cpu
page_frag_cache.
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Chris Leech <cleech@redhat.com>
---
drivers/nvme/host/tcp.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..a4c4ace5be0f4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,7 +174,6 @@ struct nvme_tcp_queue {
__le32 recv_ddgst;
struct completion tls_complete;
int tls_err;
- struct page_frag_cache pf_cache;
void (*state_change)(struct sock *);
void (*data_ready)(struct sock *);
@@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
static struct workqueue_struct *nvme_tcp_wq;
static const struct blk_mq_ops nvme_tcp_mq_ops;
static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
@@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
u8 hdgst = nvme_tcp_hdgst_len(queue);
- req->pdu = page_frag_alloc(&queue->pf_cache,
+ req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!req->pdu)
@@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
struct nvme_tcp_request *async = &ctrl->async_req;
u8 hdgst = nvme_tcp_hdgst_len(queue);
- async->pdu = page_frag_alloc(&queue->pf_cache,
+ async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!async->pdu)
@@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
- page_frag_cache_drain(&queue->pf_cache);
+ page_frag_cache_drain(this_cpu_ptr(&pf_cache));
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
--
2.50.1
On Tue, Sep 30, 2025 at 11:34:14PM -0700, Chris Leech wrote:
> [...]
> Fix that by switching from a per-queue page_frag_cache to a per-cpu
> page_frag_cache.
> [...]
> @@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
> struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
> u8 hdgst = nvme_tcp_hdgst_len(queue);
>
> - req->pdu = page_frag_alloc(&queue->pf_cache,
> + req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
> sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
> GFP_KERNEL | __GFP_ZERO);

I am not an expert on the scheduler subsystem, but as far as I understand,
workqueues may execute their work items in parallel, up to max_active work
items on the same CPU. That means this solution does not fix the issue of
parallel usage of the same variable. Can anyone comment on this?

> if (!req->pdu)
> @@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
> struct nvme_tcp_request *async = &ctrl->async_req;
> u8 hdgst = nvme_tcp_hdgst_len(queue);
>
> - async->pdu = page_frag_alloc(&queue->pf_cache,
> + async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
> sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
> GFP_KERNEL | __GFP_ZERO);

This line is executed in a different (parallel) context compared to
nvme_tcp_init_request.

> if (!async->pdu)
> @@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
> if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> return;
>
> - page_frag_cache_drain(&queue->pf_cache);
> + page_frag_cache_drain(this_cpu_ptr(&pf_cache));

This line is also definitely processed in another (parallel) context compared
to nvme_tcp_init_request, and it frees pages that are still in use by other
queues.

> noreclaim_flag = memalloc_noreclaim_save();
> /* ->sock will be released by fput() */

In total, my patch with a mutex looks more appropriate and more error-proof.

BR,
Dmitry
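To illustrate the trade-off being argued here, below is a hypothetical sketch of a locked per-CPU variant. It is from neither patch; the struct and function names (nvme_tcp_pdu_cache, nvme_tcp_alloc_pdu) are invented, and the per-CPU mutexes would still need one-time initialization at module load. Because GFP_KERNEL allocations may sleep and the task may migrate to another CPU, and because several works can run on the same CPU, each per-CPU instance would still need its own lock, so the per-CPU split alone does not remove the need for serialization.

/*
 * Hypothetical sketch, not from either patch: even with a per-CPU
 * page_frag_cache, GFP_KERNEL allocations may sleep and the task may be
 * migrated, and several works can run on the same CPU, so each per-CPU
 * instance still needs its own lock.  Names are invented for illustration;
 * the per-CPU mutexes would have to be initialized once at module load.
 */
struct nvme_tcp_pdu_cache {
	struct page_frag_cache	cache;
	struct mutex		lock;	/* serializes page_frag_alloc() */
};

static DEFINE_PER_CPU(struct nvme_tcp_pdu_cache, nvme_tcp_pdu_cache);

static void *nvme_tcp_alloc_pdu(size_t size)
{
	/* Pick the local CPU's instance, but do not assume we stay there. */
	struct nvme_tcp_pdu_cache *c = raw_cpu_ptr(&nvme_tcp_pdu_cache);
	void *pdu;

	mutex_lock(&c->lock);
	pdu = page_frag_alloc(&c->cache, size, GFP_KERNEL | __GFP_ZERO);
	mutex_unlock(&c->lock);
	return pdu;
}

Since every correct variant ends up taking a lock around page_frag_alloc() anyway, the per-queue mutex in the first patch gets the same serialization without introducing new global state, which is essentially the argument made above.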