[RFC v2 06/11] nvme-pci: add support for dmabuf reggistration

Pavel Begunkov posted 11 patches 2 months, 2 weeks ago
[RFC v2 06/11] nvme-pci: add support for dmabuf reggistration
Posted by Pavel Begunkov 2 months, 2 weeks ago
Implement dma-token related callbacks for nvme block devices.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 drivers/nvme/host/pci.c | 95 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e5ca8301bb8b..63e03c3dc044 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -27,6 +27,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/sed-opal.h>
+#include <linux/blk-mq-dma-token.h>
 
 #include "trace.h"
 #include "nvme.h"
@@ -482,6 +483,92 @@ static void nvme_release_descriptor_pools(struct nvme_dev *dev)
 	}
 }
 
+static void nvme_dmabuf_move_notify(struct dma_buf_attachment *attach)
+{
+	blk_mq_dma_map_move_notify(attach->importer_priv);
+}
+
+const struct dma_buf_attach_ops nvme_dmabuf_importer_ops = {
+	.move_notify = nvme_dmabuf_move_notify,
+	.allow_peer2peer = true,
+};
+
+static int nvme_init_dma_token(struct request_queue *q,
+				struct blk_mq_dma_token *token)
+{
+	struct dma_buf_attachment *attach;
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct dma_buf *dmabuf = token->dmabuf;
+
+	if (dmabuf->size % NVME_CTRL_PAGE_SIZE)
+		return -EINVAL;
+
+	attach = dma_buf_dynamic_attach(dmabuf, dev->dev,
+					&nvme_dmabuf_importer_ops, token);
+	if (IS_ERR(attach))
+		return PTR_ERR(attach);
+
+	token->private = attach;
+	return 0;
+}
+
+static void nvme_clean_dma_token(struct request_queue *q,
+				 struct blk_mq_dma_token *token)
+{
+	struct dma_buf_attachment *attach = token->private;
+
+	dma_buf_detach(token->dmabuf, attach);
+}
+
+static int nvme_dma_map(struct request_queue *q, struct blk_mq_dma_map *map)
+{
+	struct blk_mq_dma_token *token = map->token;
+	struct dma_buf_attachment *attach = token->private;
+	unsigned nr_entries;
+	unsigned long tmp, i = 0;
+	struct scatterlist *sg;
+	struct sg_table *sgt;
+	dma_addr_t *dma_list;
+
+	nr_entries = token->dmabuf->size / NVME_CTRL_PAGE_SIZE;
+	dma_list = kmalloc_array(nr_entries, sizeof(dma_list[0]), GFP_KERNEL);
+	if (!dma_list)
+		return -ENOMEM;
+
+	sgt = dma_buf_map_attachment(attach, token->dir);
+	if (IS_ERR(sgt)) {
+		kfree(dma_list);
+		return PTR_ERR(sgt);
+	}
+	map->sgt = sgt;
+
+	for_each_sgtable_dma_sg(sgt, sg, tmp) {
+		dma_addr_t dma = sg_dma_address(sg);
+		unsigned long sg_len = sg_dma_len(sg);
+
+		while (sg_len) {
+			dma_list[i++] = dma;
+			dma += NVME_CTRL_PAGE_SIZE;
+			sg_len -= NVME_CTRL_PAGE_SIZE;
+		}
+	}
+
+	map->private = dma_list;
+	return 0;
+}
+
+static void nvme_dma_unmap(struct request_queue *q, struct blk_mq_dma_map *map)
+{
+	struct blk_mq_dma_token *token = map->token;
+	struct dma_buf_attachment *attach = token->private;
+	dma_addr_t *dma_list = map->private;
+
+	dma_buf_unmap_attachment_unlocked(attach, map->sgt, token->dir);
+	map->sgt = NULL;
+	kfree(dma_list);
+}
+
 static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
 		unsigned qid)
 {
@@ -1067,6 +1154,9 @@ static blk_status_t nvme_map_data(struct request *req)
 	struct blk_dma_iter iter;
 	blk_status_t ret;
 
+	if (req->bio && bio_flagged(req->bio, BIO_DMA_TOKEN))
+		return BLK_STS_RESOURCE;
+
 	/*
 	 * Try to skip the DMA iterator for single segment requests, as that
 	 * significantly improves performances for small I/O sizes.
@@ -2093,6 +2183,11 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
+
+	.dma_map	= nvme_dma_map,
+	.dma_unmap 	= nvme_dma_unmap,
+	.init_dma_token =  nvme_init_dma_token,
+	.clean_dma_token = nvme_clean_dma_token,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
-- 
2.52.0
Re: [RFC v2 06/11] nvme-pci: add support for dmabuf reggistration
Posted by Christoph Hellwig 2 months ago
Splitting this trivial stub from the substantial parts in the next patch
feels odd.  Please merge them.

(and better commit logs and comments really would be useful for others
to understand what you've done).

> +const struct dma_buf_attach_ops nvme_dmabuf_importer_ops = {
> +	.move_notify = nvme_dmabuf_move_notify,
> +	.allow_peer2peer = true,
> +};

Tab-align the =, please.

> +static int nvme_init_dma_token(struct request_queue *q,
> +				struct blk_mq_dma_token *token)
> +{
> +	struct dma_buf_attachment *attach;
> +	struct nvme_ns *ns = q->queuedata;
> +	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
> +	struct dma_buf *dmabuf = token->dmabuf;
> +
> +	if (dmabuf->size % NVME_CTRL_PAGE_SIZE)
> +		return -EINVAL;

Why do you care about alignment to the controller page size?

> +	for_each_sgtable_dma_sg(sgt, sg, tmp) {
> +		dma_addr_t dma = sg_dma_address(sg);
> +		unsigned long sg_len = sg_dma_len(sg);
> +
> +		while (sg_len) {
> +			dma_list[i++] = dma;
> +			dma += NVME_CTRL_PAGE_SIZE;
> +			sg_len -= NVME_CTRL_PAGE_SIZE;
> +		}
> +	}

Why does this build controller pages sized chunks?
Re: [RFC v2 06/11] nvme-pci: add support for dmabuf reggistration
Posted by Keith Busch 2 months ago
On Thu, Dec 04, 2025 at 03:00:02AM -0800, Christoph Hellwig wrote:
> Why do you care about alignment to the controller page size?
> 
> > +	for_each_sgtable_dma_sg(sgt, sg, tmp) {
> > +		dma_addr_t dma = sg_dma_address(sg);
> > +		unsigned long sg_len = sg_dma_len(sg);
> > +
> > +		while (sg_len) {
> > +			dma_list[i++] = dma;
> > +			dma += NVME_CTRL_PAGE_SIZE;
> > +			sg_len -= NVME_CTRL_PAGE_SIZE;
> > +		}
> > +	}
> 
> Why does this build controller pages sized chunks?

I think the idea behind having fixed size entries aligned to the
device's PRP unit is that it's efficient to jump to the correct index
for any given offset. A vector of mixed sizes would require you to walk
the list to find the correct starting point, which we want to avoid.

This is similar to the way io_uring registered memory is set up, though
io_uring has extra logic to use the largest common contiguous segment
size, or even just one segment if it coalesces. We could probably do
that too.

Anyway, that representation naturally translates to the PRP format, but
this could be done in the SGL format too.
Re: [RFC v2 06/11] nvme-pci: add support for dmabuf reggistration
Posted by Anuj gupta 2 months, 2 weeks ago
nit:
s/reggistration/registration/ in subject

Also, a MODULE_IMPORT_NS("DMA_BUF") needs to be added, since the driver
now uses symbols from the DMA_BUF namespace; otherwise we get a build error.