[RFC v2 05/11] block: add infra to handle dmabuf tokens

Posted by Pavel Begunkov 2 months, 2 weeks ago
Add blk-mq infrastructure to handle dmabuf tokens. There are two main
objects. The first is struct blk_mq_dma_token, which is an extension of
struct dma_token and passed in an iterator. The second is struct
blk_mq_dma_map, which keeps the actual mapping and unlike the token, can
be ejected (e.g. by move_notify) and recreated.

The token keeps an rcu protected pointer to the mapping, so when the block
layer resolves a token into a mapping to pass to a request, it does an rcu
protected lookup and takes a percpu reference to the mapping.

If there is no current mapping attached to a token, it'll need to be
created by calling the driver (e.g. nvme) via a new callback. That
requires waiting, therefore it can't be done for nowait requests and can't
happen deeper in the stack, e.g. during nvme request submission.

The structure split is needed because move_notify can request to
invalidate the dma mapping at any moment, and we need a way to
concurrently remove it and wait for the inflight requests using the
previous mapping to complete.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/Makefile                   |   1 +
 block/bdev.c                     |  14 ++
 block/blk-mq-dma-token.c         | 236 +++++++++++++++++++++++++++++++
 block/blk-mq.c                   |  20 +++
 block/fops.c                     |   1 +
 include/linux/blk-mq-dma-token.h |  60 ++++++++
 include/linux/blk-mq.h           |  21 +++
 include/linux/blkdev.h           |   3 +
 8 files changed, 356 insertions(+)
 create mode 100644 block/blk-mq-dma-token.c
 create mode 100644 include/linux/blk-mq-dma-token.h

diff --git a/block/Makefile b/block/Makefile
index c65f4da93702..0190e5aa9f00 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o \
 					   blk-crypto-sysfs.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
+obj-$(CONFIG_DMA_SHARED_BUFFER) += blk-mq-dma-token.o
diff --git a/block/bdev.c b/block/bdev.c
index 810707cca970..da89d20f33f3 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -28,6 +28,7 @@
 #include <linux/part_stat.h>
 #include <linux/uaccess.h>
 #include <linux/stat.h>
+#include <linux/blk-mq-dma-token.h>
 #include "../fs/internal.h"
 #include "blk.h"
 
@@ -61,6 +62,19 @@ struct block_device *file_bdev(struct file *bdev_file)
 }
 EXPORT_SYMBOL(file_bdev);
 
+struct dma_token *blkdev_dma_map(struct file *file,
+				 struct dma_token_params *params)
+{
+	struct request_queue *q = bdev_get_queue(file_bdev(file));
+
+	if (!(file->f_flags & O_DIRECT))
+		return ERR_PTR(-EINVAL);
+	if (!q->mq_ops)
+		return ERR_PTR(-EINVAL);
+
+	return blk_mq_dma_map(q, params);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
 	struct inode *inode = BD_INODE(bdev);
diff --git a/block/blk-mq-dma-token.c b/block/blk-mq-dma-token.c
new file mode 100644
index 000000000000..cd62c4d09422
--- /dev/null
+++ b/block/blk-mq-dma-token.c
@@ -0,0 +1,236 @@
+#include <linux/blk-mq-dma-token.h>
+#include <linux/dma-resv.h>
+
+struct blk_mq_dma_fence {
+	struct dma_fence base;
+	spinlock_t lock;
+};
+
+static const char *blk_mq_fence_drv_name(struct dma_fence *fence)
+{
+	return "blk-mq";
+}
+
+const struct dma_fence_ops blk_mq_dma_fence_ops = {
+	.get_driver_name = blk_mq_fence_drv_name,
+	.get_timeline_name = blk_mq_fence_drv_name,
+};
+
+static void blk_mq_dma_token_free(struct blk_mq_dma_token *token)
+{
+	token->q->mq_ops->clean_dma_token(token->q, token);
+	dma_buf_put(token->dmabuf);
+	kfree(token);
+}
+
+static inline void blk_mq_dma_token_put(struct blk_mq_dma_token *token)
+{
+	if (refcount_dec_and_test(&token->refs))
+		blk_mq_dma_token_free(token);
+}
+
+static void blk_mq_dma_mapping_free(struct blk_mq_dma_map *map)
+{
+	struct blk_mq_dma_token *token = map->token;
+
+	if (map->sgt)
+		token->q->mq_ops->dma_unmap(token->q, map);
+
+	dma_fence_put(&map->fence->base);
+	percpu_ref_exit(&map->refs);
+	kfree(map);
+	blk_mq_dma_token_put(token);
+}
+
+static void blk_mq_dma_map_work_free(struct work_struct *work)
+{
+	struct blk_mq_dma_map *map = container_of(work, struct blk_mq_dma_map,
+						free_work);
+
+	dma_fence_signal(&map->fence->base);
+	blk_mq_dma_mapping_free(map);
+}
+
+static void blk_mq_dma_map_refs_free(struct percpu_ref *ref)
+{
+	struct blk_mq_dma_map *map = container_of(ref, struct blk_mq_dma_map, refs);
+
+	INIT_WORK(&map->free_work, blk_mq_dma_map_work_free);
+	queue_work(system_wq, &map->free_work);
+}
+
+static struct blk_mq_dma_map *blk_mq_alloc_dma_mapping(struct blk_mq_dma_token *token)
+{
+	struct blk_mq_dma_fence *fence = NULL;
+	struct blk_mq_dma_map *map;
+	int ret = -ENOMEM;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence)
+		goto err;
+
+	ret = percpu_ref_init(&map->refs, blk_mq_dma_map_refs_free, 0,
+			      GFP_KERNEL);
+	if (ret)
+		goto err;
+
+	dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
+			token->fence_ctx, atomic_inc_return(&token->fence_seq));
+	spin_lock_init(&fence->lock);
+	map->fence = fence;
+	map->token = token;
+	refcount_inc(&token->refs);
+	return map;
+err:
+	kfree(map);
+	kfree(fence);
+	return ERR_PTR(ret);
+}
+
+static inline
+struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)
+{
+	struct blk_mq_dma_map *map;
+
+	guard(rcu)();
+
+	map = rcu_dereference(token->map);
+	if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
+		return NULL;
+	return map;
+}
+
+static struct blk_mq_dma_map *
+blk_mq_create_dma_map(struct blk_mq_dma_token *token)
+{
+	struct dma_buf *dmabuf = token->dmabuf;
+	struct blk_mq_dma_map *map;
+	long ret;
+
+	guard(mutex)(&token->mapping_lock);
+
+	map = blk_mq_get_token_map(token);
+	if (map)
+		return map;
+
+	map = blk_mq_alloc_dma_mapping(token);
+	if (IS_ERR(map))
+		return NULL;
+
+	dma_resv_lock(dmabuf->resv, NULL);
+	ret = dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
+				    true, MAX_SCHEDULE_TIMEOUT);
+	ret = ret ? ret : -ETIME;
+	if (ret > 0)
+		ret = token->q->mq_ops->dma_map(token->q, map);
+	dma_resv_unlock(dmabuf->resv);
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	percpu_ref_get(&map->refs);
+	rcu_assign_pointer(token->map, map);
+	return map;
+}
+
+static void blk_mq_dma_map_remove(struct blk_mq_dma_token *token)
+{
+	struct dma_buf *dmabuf = token->dmabuf;
+	struct blk_mq_dma_map *map;
+	int ret;
+
+	dma_resv_assert_held(dmabuf->resv);
+
+	ret = dma_resv_reserve_fences(dmabuf->resv, 1);
+	if (WARN_ON_ONCE(ret))
+		return;
+
+	map = rcu_dereference_protected(token->map,
+					dma_resv_held(dmabuf->resv));
+	if (!map)
+		return;
+	rcu_assign_pointer(token->map, NULL);
+
+	dma_resv_add_fence(dmabuf->resv, &map->fence->base,
+			   DMA_RESV_USAGE_KERNEL);
+	percpu_ref_kill(&map->refs);
+}
+
+blk_status_t blk_rq_assign_dma_map(struct request *rq,
+				   struct blk_mq_dma_token *token)
+{
+	struct blk_mq_dma_map *map;
+
+	map = blk_mq_get_token_map(token);
+	if (map)
+		goto complete;
+
+	if (rq->cmd_flags & REQ_NOWAIT)
+		return BLK_STS_AGAIN;
+
+	map = blk_mq_create_dma_map(token);
+	if (IS_ERR(map))
+		return BLK_STS_RESOURCE;
+complete:
+	rq->dma_map = map;
+	return BLK_STS_OK;
+}
+
+void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
+{
+	blk_mq_dma_map_remove(token);
+}
+
+static void blk_mq_release_dma_mapping(struct dma_token *base_token)
+{
+	struct blk_mq_dma_token *token = dma_token_to_blk_mq(base_token);
+	struct dma_buf *dmabuf = token->dmabuf;
+
+	dma_resv_lock(dmabuf->resv, NULL);
+	blk_mq_dma_map_remove(token);
+	dma_resv_unlock(dmabuf->resv);
+
+	blk_mq_dma_token_put(token);
+}
+
+struct dma_token *blk_mq_dma_map(struct request_queue *q,
+				  struct dma_token_params *params)
+{
+	struct dma_buf *dmabuf = params->dmabuf;
+	struct blk_mq_dma_token *token;
+	int ret;
+
+	if (!q->mq_ops->dma_map || !q->mq_ops->dma_unmap ||
+	    !q->mq_ops->init_dma_token || !q->mq_ops->clean_dma_token)
+		return ERR_PTR(-EINVAL);
+
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
+	if (!token)
+		return ERR_PTR(-ENOMEM);
+
+	get_dma_buf(dmabuf);
+	token->fence_ctx = dma_fence_context_alloc(1);
+	token->dmabuf = dmabuf;
+	token->dir = params->dir;
+	token->base.release = blk_mq_release_dma_mapping;
+	token->q = q;
+	refcount_set(&token->refs, 1);
+	mutex_init(&token->mapping_lock);
+
+	if (!blk_get_queue(q)) {
+		kfree(token);
+		return ERR_PTR(-EFAULT);
+	}
+
+	ret = token->q->mq_ops->init_dma_token(token->q, token);
+	if (ret) {
+		kfree(token);
+		blk_put_queue(q);
+		return ERR_PTR(ret);
+	}
+	return &token->base;
+}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2650c97a75e..1ff3a7e3191b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
 #include <linux/blk-crypto.h>
 #include <linux/part_stat.h>
 #include <linux/sched/isolation.h>
+#include <linux/blk-mq-dma-token.h>
 
 #include <trace/events/block.h>
 
@@ -439,6 +440,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->nr_integrity_segments = 0;
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
+	rq->dma_map = NULL;
 
 	blk_crypto_rq_set_defaults(rq);
 	INIT_LIST_HEAD(&rq->queuelist);
@@ -794,6 +796,7 @@ static void __blk_mq_free_request(struct request *rq)
 	blk_pm_mark_last_busy(rq);
 	rq->mq_hctx = NULL;
 
+	blk_rq_drop_dma_map(rq);
 	if (rq->tag != BLK_MQ_NO_TAG) {
 		blk_mq_dec_active_requests(hctx);
 		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
@@ -3214,6 +3217,23 @@ void blk_mq_submit_bio(struct bio *bio)
 
 	blk_mq_bio_to_request(rq, bio, nr_segs);
 
+	if (bio_flagged(bio, BIO_DMA_TOKEN)) {
+		struct blk_mq_dma_token *token;
+		blk_status_t ret;
+
+		token = dma_token_to_blk_mq(bio->dma_token);
+		ret = blk_rq_assign_dma_map(rq, token);
+		if (ret) {
+			if (ret == BLK_STS_AGAIN) {
+				bio_wouldblock_error(bio);
+			} else {
+				bio->bi_status = BLK_STS_RESOURCE;
+				bio_endio(bio);
+			}
+			goto queue_exit;
+		}
+	}
+
 	ret = blk_crypto_rq_get_keyslot(rq);
 	if (ret != BLK_STS_OK) {
 		bio->bi_status = ret;
diff --git a/block/fops.c b/block/fops.c
index 41f8795874a9..ac52fe1a4b8d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -973,6 +973,7 @@ const struct file_operations def_blk_fops = {
 	.fallocate	= blkdev_fallocate,
 	.uring_cmd	= blkdev_uring_cmd,
 	.fop_flags	= FOP_BUFFER_RASYNC,
+	.dma_map	= blkdev_dma_map,
 };
 
 static __init int blkdev_init(void)
diff --git a/include/linux/blk-mq-dma-token.h b/include/linux/blk-mq-dma-token.h
new file mode 100644
index 000000000000..4a8d84addc06
--- /dev/null
+++ b/include/linux/blk-mq-dma-token.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BLK_MQ_DMA_TOKEN_H
+#define BLK_MQ_DMA_TOKEN_H
+
+#include <linux/blk-mq.h>
+#include <linux/dma_token.h>
+#include <linux/percpu-refcount.h>
+
+struct blk_mq_dma_token;
+struct blk_mq_dma_fence;
+
+struct blk_mq_dma_map {
+	void				*private;
+
+	struct percpu_ref		refs;
+	struct sg_table			*sgt;
+	struct blk_mq_dma_token		*token;
+	struct blk_mq_dma_fence		*fence;
+	struct work_struct		free_work;
+};
+
+struct blk_mq_dma_token {
+	struct dma_token		base;
+	enum dma_data_direction		dir;
+
+	void				*private;
+
+	struct dma_buf			*dmabuf;
+	struct blk_mq_dma_map __rcu	*map;
+	struct request_queue		*q;
+
+	struct mutex			mapping_lock;
+	refcount_t			refs;
+
+	atomic_t			fence_seq;
+	u64				fence_ctx;
+};
+
+static inline
+struct blk_mq_dma_token *dma_token_to_blk_mq(struct dma_token *token)
+{
+	return container_of(token, struct blk_mq_dma_token, base);
+}
+
+blk_status_t blk_rq_assign_dma_map(struct request *req,
+				   struct blk_mq_dma_token *token);
+
+static inline void blk_rq_drop_dma_map(struct request *rq)
+{
+	if (rq->dma_map) {
+		percpu_ref_put(&rq->dma_map->refs);
+		rq->dma_map = NULL;
+	}
+}
+
+void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token);
+struct dma_token *blk_mq_dma_map(struct request_queue *q,
+				 struct dma_token_params *params);
+
+#endif /* BLK_MQ_DMA_TOKEN_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b54506b3b76d..4745d1e183f2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -94,6 +94,9 @@ enum mq_rq_state {
 	MQ_RQ_COMPLETE		= 2,
 };
 
+struct blk_mq_dma_map;
+struct blk_mq_dma_token;
+
 /*
  * Try to put the fields that are referenced together in the same cacheline.
  *
@@ -170,6 +173,8 @@ struct request {
 
 	unsigned long deadline;
 
+	struct blk_mq_dma_map	*dma_map;
+
 	/*
 	 * The hash is used inside the scheduler, and killed once the
 	 * request reaches the dispatch list. The ipi_list is only used
@@ -675,6 +680,21 @@ struct blk_mq_ops {
 	 */
 	void (*map_queues)(struct blk_mq_tag_set *set);
 
+	/**
+	 * @map_dmabuf: Allows drivers to pre-map a dmabuf. The resulting driver
+	 * specific mapping will be wrapped into dma_token and passed to the
+	 * read / write path in an iterator.
+	 */
+	int (*dma_map)(struct request_queue *q, struct blk_mq_dma_map *);
+	void (*dma_unmap)(struct request_queue *q, struct blk_mq_dma_map *);
+	int (*init_dma_token)(struct request_queue *q,
+			      struct blk_mq_dma_token *token);
+	void (*clean_dma_token)(struct request_queue *q,
+				struct blk_mq_dma_token *token);
+
+	struct dma_buf_attachment *(*dma_attach)(struct request_queue *q,
+					struct dma_token_params *params);
+
 #ifdef CONFIG_BLK_DEBUG_FS
 	/**
 	 * @show_rq: Used by the debugfs implementation to show driver-specific
@@ -946,6 +966,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
 void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
 void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
+
 static inline unsigned int __must_check
 blk_mq_freeze_queue(struct request_queue *q)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cb4ba09959ee..dec75348f8dc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1777,6 +1777,9 @@ struct block_device *file_bdev(struct file *bdev_file);
 bool disk_live(struct gendisk *disk);
 unsigned int block_size(struct block_device *bdev);
 
+struct dma_token *blkdev_dma_map(struct file *file,
+				 struct dma_token_params *params);
+
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
 int sync_blockdev(struct block_device *bdev);
-- 
2.52.0
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Anuj gupta 1 day, 23 hours ago
> +
> +       dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
> +                       token->fence_ctx, atomic_inc_return(&token->fence_seq));
> +       spin_lock_init(&fence->lock);

nit: the lock should be initialized before handing its address to
dma_fence_init()
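
e.g. just swapping the two calls from the patch (untested):

	spin_lock_init(&fence->lock);
	dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
			token->fence_ctx, atomic_inc_return(&token->fence_seq));
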
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Pavel Begunkov 1 day, 20 hours ago
On 2/6/26 15:08, Anuj gupta wrote:
>> +
>> +       dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
>> +                       token->fence_ctx, atomic_inc_return(&token->fence_seq));
>> +       spin_lock_init(&fence->lock);
> 
> nit lock should be initialized before handing its address to
> dma_fence_init()

Good catch, thanks, I'll apply that and other suggestions. And I still
need to address bits Christoph pointed out during review.

-- 
Pavel Begunkov
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Nitesh Shetty 2 weeks, 4 days ago
On 23/11/25 10:51PM, Pavel Begunkov wrote:
>Add blk-mq infrastructure to handle dmabuf tokens. There are two main
>objects. The first is struct blk_mq_dma_token, which is an extension of
>struct dma_token and passed in an iterator. The second is struct
>blk_mq_dma_map, which keeps the actual mapping and unlike the token, can
>be ejected (e.g. by move_notify) and recreated.
>
>The token keeps an rcu protected pointer to the mapping, so when it
>resolves a token into a mapping to pass it to a request, it'll do an rcu
>protected lookup and get a percpu reference to the mapping.
>
>If there is no current mapping attached to a token, it'll need to be
>created by calling the driver (e.g. nvme) via a new callback. It
>requires waiting, therefore can't be done for nowait requests and couldn't
>happen deeper in the stack, e.g. during nvme request submission.
>
>The structure split is needed because move_notify can request to
>invalidate the dma mapping at any moment, and we need a way to
>concurrently remove it and wait for the inflight requests using the
>previous mapping to complete.
>
>Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>---
> block/Makefile                   |   1 +
> block/bdev.c                     |  14 ++
> block/blk-mq-dma-token.c         | 236 +++++++++++++++++++++++++++++++
> block/blk-mq.c                   |  20 +++
> block/fops.c                     |   1 +
> include/linux/blk-mq-dma-token.h |  60 ++++++++
> include/linux/blk-mq.h           |  21 +++
> include/linux/blkdev.h           |   3 +
> 8 files changed, 356 insertions(+)
> create mode 100644 block/blk-mq-dma-token.c
> create mode 100644 include/linux/blk-mq-dma-token.h
>
>diff --git a/block/Makefile b/block/Makefile
>index c65f4da93702..0190e5aa9f00 100644
>--- a/block/Makefile
>+++ b/block/Makefile
>@@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o \
> 					   blk-crypto-sysfs.o
> obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
> obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
>+obj-$(CONFIG_DMA_SHARED_BUFFER) += blk-mq-dma-token.o
>diff --git a/block/bdev.c b/block/bdev.c
>index 810707cca970..da89d20f33f3 100644
>--- a/block/bdev.c
>+++ b/block/bdev.c
>@@ -28,6 +28,7 @@
> #include <linux/part_stat.h>
> #include <linux/uaccess.h>
> #include <linux/stat.h>
>+#include <linux/blk-mq-dma-token.h>
> #include "../fs/internal.h"
> #include "blk.h"
>
>@@ -61,6 +62,19 @@ struct block_device *file_bdev(struct file *bdev_file)
> }
> EXPORT_SYMBOL(file_bdev);
>
>+struct dma_token *blkdev_dma_map(struct file *file,
>+				 struct dma_token_params *params)
>+{
>+	struct request_queue *q = bdev_get_queue(file_bdev(file));
>+
>+	if (!(file->f_flags & O_DIRECT))
>+		return ERR_PTR(-EINVAL);
>+	if (!q->mq_ops)
>+		return ERR_PTR(-EINVAL);
>+
>+	return blk_mq_dma_map(q, params);
>+}
>+
> static void bdev_write_inode(struct block_device *bdev)
> {
> 	struct inode *inode = BD_INODE(bdev);
>diff --git a/block/blk-mq-dma-token.c b/block/blk-mq-dma-token.c
>new file mode 100644
>index 000000000000..cd62c4d09422
>--- /dev/null
>+++ b/block/blk-mq-dma-token.c
>@@ -0,0 +1,236 @@
>+#include <linux/blk-mq-dma-token.h>
>+#include <linux/dma-resv.h>
>+
>+struct blk_mq_dma_fence {
>+	struct dma_fence base;
>+	spinlock_t lock;
>+};
>+
>+static const char *blk_mq_fence_drv_name(struct dma_fence *fence)
>+{
>+	return "blk-mq";
>+}
>+
>+const struct dma_fence_ops blk_mq_dma_fence_ops = {
>+	.get_driver_name = blk_mq_fence_drv_name,
>+	.get_timeline_name = blk_mq_fence_drv_name,
>+};
>+
>+static void blk_mq_dma_token_free(struct blk_mq_dma_token *token)
>+{
>+	token->q->mq_ops->clean_dma_token(token->q, token);
>+	dma_buf_put(token->dmabuf);
>+	kfree(token);
>+}
>+
>+static inline void blk_mq_dma_token_put(struct blk_mq_dma_token *token)
>+{
>+	if (refcount_dec_and_test(&token->refs))
>+		blk_mq_dma_token_free(token);
>+}
>+
>+static void blk_mq_dma_mapping_free(struct blk_mq_dma_map *map)
>+{
>+	struct blk_mq_dma_token *token = map->token;
>+
>+	if (map->sgt)
>+		token->q->mq_ops->dma_unmap(token->q, map);
>+
>+	dma_fence_put(&map->fence->base);
>+	percpu_ref_exit(&map->refs);
>+	kfree(map);
>+	blk_mq_dma_token_put(token);
>+}
>+
>+static void blk_mq_dma_map_work_free(struct work_struct *work)
>+{
>+	struct blk_mq_dma_map *map = container_of(work, struct blk_mq_dma_map,
>+						free_work);
>+
>+	dma_fence_signal(&map->fence->base);
>+	blk_mq_dma_mapping_free(map);
>+}
>+
>+static void blk_mq_dma_map_refs_free(struct percpu_ref *ref)
>+{
>+	struct blk_mq_dma_map *map = container_of(ref, struct blk_mq_dma_map, refs);
>+
>+	INIT_WORK(&map->free_work, blk_mq_dma_map_work_free);
>+	queue_work(system_wq, &map->free_work);
>+}
>+
>+static struct blk_mq_dma_map *blk_mq_alloc_dma_mapping(struct blk_mq_dma_token *token)
>+{
>+	struct blk_mq_dma_fence *fence = NULL;
>+	struct blk_mq_dma_map *map;
>+	int ret = -ENOMEM;
>+
>+	map = kzalloc(sizeof(*map), GFP_KERNEL);
>+	if (!map)
>+		return ERR_PTR(-ENOMEM);
>+
>+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
>+	if (!fence)
>+		goto err;
>+
>+	ret = percpu_ref_init(&map->refs, blk_mq_dma_map_refs_free, 0,
>+			      GFP_KERNEL);
>+	if (ret)
>+		goto err;
>+
>+	dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
>+			token->fence_ctx, atomic_inc_return(&token->fence_seq));
>+	spin_lock_init(&fence->lock);
>+	map->fence = fence;
>+	map->token = token;
>+	refcount_inc(&token->refs);
>+	return map;
>+err:
>+	kfree(map);
>+	kfree(fence);
>+	return ERR_PTR(ret);
>+}
>+
>+static inline
>+struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)
>+{
>+	struct blk_mq_dma_map *map;
>+
>+	guard(rcu)();
>+
>+	map = rcu_dereference(token->map);
>+	if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
>+		return NULL;
>+	return map;
>+}
>+
>+static struct blk_mq_dma_map *
>+blk_mq_create_dma_map(struct blk_mq_dma_token *token)
>+{
>+	struct dma_buf *dmabuf = token->dmabuf;
>+	struct blk_mq_dma_map *map;
>+	long ret;
>+
>+	guard(mutex)(&token->mapping_lock);
>+
>+	map = blk_mq_get_token_map(token);
>+	if (map)
>+		return map;
>+
>+	map = blk_mq_alloc_dma_mapping(token);
>+	if (IS_ERR(map))
>+		return NULL;
>+
>+	dma_resv_lock(dmabuf->resv, NULL);
>+	ret = dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
>+				    true, MAX_SCHEDULE_TIMEOUT);
>+	ret = ret ? ret : -ETIME;
>+	if (ret > 0)
>+		ret = token->q->mq_ops->dma_map(token->q, map);
>+	dma_resv_unlock(dmabuf->resv);
>+
>+	if (ret)
>+		return ERR_PTR(ret);
>+
>+	percpu_ref_get(&map->refs);
>+	rcu_assign_pointer(token->map, map);
>+	return map;
>+}
>+
>+static void blk_mq_dma_map_remove(struct blk_mq_dma_token *token)
>+{
>+	struct dma_buf *dmabuf = token->dmabuf;
>+	struct blk_mq_dma_map *map;
>+	int ret;
>+
>+	dma_resv_assert_held(dmabuf->resv);
>+
>+	ret = dma_resv_reserve_fences(dmabuf->resv, 1);
>+	if (WARN_ON_ONCE(ret))
>+		return;
>+
>+	map = rcu_dereference_protected(token->map,
>+					dma_resv_held(dmabuf->resv));
>+	if (!map)
>+		return;
>+	rcu_assign_pointer(token->map, NULL);
>+
>+	dma_resv_add_fence(dmabuf->resv, &map->fence->base,
>+			   DMA_RESV_USAGE_KERNEL);
>+	percpu_ref_kill(&map->refs);
>+}
>+
>+blk_status_t blk_rq_assign_dma_map(struct request *rq,
>+				   struct blk_mq_dma_token *token)
>+{
>+	struct blk_mq_dma_map *map;
>+
>+	map = blk_mq_get_token_map(token);
>+	if (map)
>+		goto complete;
>+
>+	if (rq->cmd_flags & REQ_NOWAIT)
>+		return BLK_STS_AGAIN;
>+
>+	map = blk_mq_create_dma_map(token);
>+	if (IS_ERR(map))
>+		return BLK_STS_RESOURCE;
>+complete:
>+	rq->dma_map = map;
>+	return BLK_STS_OK;
>+}
>+
>+void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
>+{
>+	blk_mq_dma_map_remove(token);
>+}
>+
>+static void blk_mq_release_dma_mapping(struct dma_token *base_token)
>+{
>+	struct blk_mq_dma_token *token = dma_token_to_blk_mq(base_token);
>+	struct dma_buf *dmabuf = token->dmabuf;
>+
>+	dma_resv_lock(dmabuf->resv, NULL);
>+	blk_mq_dma_map_remove(token);
>+	dma_resv_unlock(dmabuf->resv);
>+
>+	blk_mq_dma_token_put(token);
>+}
>+
>+struct dma_token *blk_mq_dma_map(struct request_queue *q,
>+				  struct dma_token_params *params)
>+{
>+	struct dma_buf *dmabuf = params->dmabuf;
>+	struct blk_mq_dma_token *token;
>+	int ret;
>+
>+	if (!q->mq_ops->dma_map || !q->mq_ops->dma_unmap ||
>+	    !q->mq_ops->init_dma_token || !q->mq_ops->clean_dma_token)
>+		return ERR_PTR(-EINVAL);
>+
>+	token = kzalloc(sizeof(*token), GFP_KERNEL);
>+	if (!token)
>+		return ERR_PTR(-ENOMEM);
>+
>+	get_dma_buf(dmabuf);
>+	token->fence_ctx = dma_fence_context_alloc(1);
>+	token->dmabuf = dmabuf;
>+	token->dir = params->dir;
>+	token->base.release = blk_mq_release_dma_mapping;
>+	token->q = q;
>+	refcount_set(&token->refs, 1);
>+	mutex_init(&token->mapping_lock);
>+
>+	if (!blk_get_queue(q)) {
>+		kfree(token);
>+		return ERR_PTR(-EFAULT);
>+	}
>+
>+	ret = token->q->mq_ops->init_dma_token(token->q, token);
>+	if (ret) {
>+		kfree(token);
>+		blk_put_queue(q);
>+		return ERR_PTR(ret);
>+	}
>+	return &token->base;
>+}
>diff --git a/block/blk-mq.c b/block/blk-mq.c
>index f2650c97a75e..1ff3a7e3191b 100644
>--- a/block/blk-mq.c
>+++ b/block/blk-mq.c
>@@ -29,6 +29,7 @@
> #include <linux/blk-crypto.h>
> #include <linux/part_stat.h>
> #include <linux/sched/isolation.h>
>+#include <linux/blk-mq-dma-token.h>
>
> #include <trace/events/block.h>
>
>@@ -439,6 +440,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
> 	rq->nr_integrity_segments = 0;
> 	rq->end_io = NULL;
> 	rq->end_io_data = NULL;
>+	rq->dma_map = NULL;
>
> 	blk_crypto_rq_set_defaults(rq);
> 	INIT_LIST_HEAD(&rq->queuelist);
>@@ -794,6 +796,7 @@ static void __blk_mq_free_request(struct request *rq)
> 	blk_pm_mark_last_busy(rq);
> 	rq->mq_hctx = NULL;
>
>+	blk_rq_drop_dma_map(rq);
blk_rq_drop_dma_map(rq) needs to be added in blk_mq_end_request_batch()
as well [1]; otherwise I am seeing that we leave an elevated reference
count on the dma-buf exporter side.

Thanks,
Nitesh

[1]
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1214,6 +1214,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)

                  blk_crypto_free_request(rq);
                  blk_pm_mark_last_busy(rq);
+               blk_rq_drop_dma_map(rq);
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Pavel Begunkov 2 weeks, 3 days ago
On 1/21/26 07:37, Nitesh Shetty wrote:
> On 23/11/25 10:51PM, Pavel Begunkov wrote:
>> Add blk-mq infrastructure to handle dmabuf tokens. There are two main
>> objects. The first is struct blk_mq_dma_token, which is an extension of
>> struct dma_token and passed in an iterator. The second is struct
>> blk_mq_dma_map, which keeps the actual mapping and unlike the token, can
>> be ejected (e.g. by move_notify) and recreated.
>>
>> The token keeps an rcu protected pointer to the mapping, so when it
>> resolves a token into a mapping to pass it to a request, it'll do an rcu
>> protected lookup and get a percpu reference to the mapping.
>>
>> If there is no current mapping attached to a token, it'll need to be
>> created by calling the driver (e.g. nvme) via a new callback. It
>> requires waiting, therefore can't be done for nowait requests and couldn't
>> happen deeper in the stack, e.g. during nvme request submission.
>>
>> The structure split is needed because move_notify can request to
>> invalidate the dma mapping at any moment, and we need a way to
>> concurrently remove it and wait for the inflight requests using the
>> previous mapping to complete.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>> block/Makefile                   |   1 +
>> block/bdev.c                     |  14 ++
>> block/blk-mq-dma-token.c         | 236 +++++++++++++++++++++++++++++++
>> block/blk-mq.c                   |  20 +++
>> block/fops.c                     |   1 +
>> include/linux/blk-mq-dma-token.h |  60 ++++++++
>> include/linux/blk-mq.h           |  21 +++
>> include/linux/blkdev.h           |   3 +
>> 8 files changed, 356 insertions(+)
>> create mode 100644 block/blk-mq-dma-token.c
>> create mode 100644 include/linux/blk-mq-dma-token.h
>>
>> diff --git a/block/Makefile b/block/Makefile
>> index c65f4da93702..0190e5aa9f00 100644
...
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index f2650c97a75e..1ff3a7e3191b 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -29,6 +29,7 @@
>> #include <linux/blk-crypto.h>
>> #include <linux/part_stat.h>
>> #include <linux/sched/isolation.h>
>> +#include <linux/blk-mq-dma-token.h>
>>
>> #include <trace/events/block.h>
>>
>> @@ -439,6 +440,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>>     rq->nr_integrity_segments = 0;
>>     rq->end_io = NULL;
>>     rq->end_io_data = NULL;
>> +    rq->dma_map = NULL;
>>
>>     blk_crypto_rq_set_defaults(rq);
>>     INIT_LIST_HEAD(&rq->queuelist);
>> @@ -794,6 +796,7 @@ static void __blk_mq_free_request(struct request *rq)
>>     blk_pm_mark_last_busy(rq);
>>     rq->mq_hctx = NULL;
>>
>> +    blk_rq_drop_dma_map(rq);
> blk_rq_drop_dma_map(rq), needs to be added in blk_mq_end_request_batch
> as well[1], otherwise I am seeing we leave with increased reference
> count in dma-buf exporter side.
> 
> Thanks,
> Nitesh
> 
> [1]
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1214,6 +1214,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
> 
>                   blk_crypto_free_request(rq);
>                   blk_pm_mark_last_busy(rq);
> +               blk_rq_drop_dma_map(rq);

Ah yes, thanks Nitesh

-- 
Pavel Begunkov

Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Christoph Hellwig 2 months ago
On Sun, Nov 23, 2025 at 10:51:25PM +0000, Pavel Begunkov wrote:
> +struct dma_token *blkdev_dma_map(struct file *file,
> +				 struct dma_token_params *params)

Given that this is a direct file operation instance it should be
in block/fops.c.  If we do want a generic helper below it, it
should take a struct block_device instead.  But we can probably
defer that until a user for that shows up.
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Christoph Hellwig 2 months ago
On Sun, Nov 23, 2025 at 10:51:25PM +0000, Pavel Begunkov wrote:
> Add blk-mq infrastructure to handle dmabuf tokens. There are two main

Please spell out infrastructure in the subject as well.

> +struct dma_token *blkdev_dma_map(struct file *file,
> +				 struct dma_token_params *params)
> +{
> +	struct request_queue *q = bdev_get_queue(file_bdev(file));
> +
> +	if (!(file->f_flags & O_DIRECT))
> +		return ERR_PTR(-EINVAL);

Shouldn't the O_DIRECT check be in the caller?

> +++ b/block/blk-mq-dma-token.c

Missing SPDX and Copyright statement.

> @@ -0,0 +1,236 @@
> +#include <linux/blk-mq-dma-token.h>
> +#include <linux/dma-resv.h>
> +
> +struct blk_mq_dma_fence {
> +	struct dma_fence base;
> +	spinlock_t lock;
> +};

And a high-level comment explaining the fencing logic would be nice
as well.

> +	struct blk_mq_dma_map *map = container_of(ref, struct blk_mq_dma_map, refs);

Overly long line.

> +static struct blk_mq_dma_map *blk_mq_alloc_dma_mapping(struct blk_mq_dma_token *token)

Another one.  Also kinda inconsistent between _map in the data structure
and _mapping in the function name.

> +static inline
> +struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)

Really odd return value / scope formatting.

> +{
> +	struct blk_mq_dma_map *map;
> +
> +	guard(rcu)();
> +
> +	map = rcu_dereference(token->map);
> +	if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
> +		return NULL;
> +	return map;

Please use good old rcu_read_unlock to make this readable.

> +	guard(mutex)(&token->mapping_lock);

Same.

> +
> +	map = blk_mq_get_token_map(token);
> +	if (map)
> +		return map;
> +
> +	map = blk_mq_alloc_dma_mapping(token);
> +	if (IS_ERR(map))
> +		return NULL;
> +
> +	dma_resv_lock(dmabuf->resv, NULL);
> +	ret = dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
> +				    true, MAX_SCHEDULE_TIMEOUT);
> +	ret = ret ? ret : -ETIME;

	if (!ret)
		ret = -ETIME;

> +blk_status_t blk_rq_assign_dma_map(struct request *rq,
> +				   struct blk_mq_dma_token *token)
> +{
> +	struct blk_mq_dma_map *map;
> +
> +	map = blk_mq_get_token_map(token);
> +	if (map)
> +		goto complete;
> +
> +	if (rq->cmd_flags & REQ_NOWAIT)
> +		return BLK_STS_AGAIN;
> +
> +	map = blk_mq_create_dma_map(token);
> +	if (IS_ERR(map))
> +		return BLK_STS_RESOURCE;

Having a few comments that say this is creating the map lazily
would probably help the reader.  Also why not keep the !map
case in the branch, as the map case should be the fast path and
thus usually be straight line in the function?
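Something like this, perhaps (untested, just reshuffling the existing code):

	map = blk_mq_get_token_map(token);
	if (unlikely(!map)) {
		/* no cached mapping; creating one may block */
		if (rq->cmd_flags & REQ_NOWAIT)
			return BLK_STS_AGAIN;

		map = blk_mq_create_dma_map(token);
		if (IS_ERR(map))
			return BLK_STS_RESOURCE;
	}
	rq->dma_map = map;
	return BLK_STS_OK;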

> +void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
> +{
> +	blk_mq_dma_map_remove(token);
> +}

Is there a good reason for having this blk_mq_dma_map_move_notify
wrapper?

> +	if (bio_flagged(bio, BIO_DMA_TOKEN)) {
> +		struct blk_mq_dma_token *token;
> +		blk_status_t ret;
> +
> +		token = dma_token_to_blk_mq(bio->dma_token);
> +		ret = blk_rq_assign_dma_map(rq, token);
> +		if (ret) {
> +			if (ret == BLK_STS_AGAIN) {
> +				bio_wouldblock_error(bio);
> +			} else {
> +				bio->bi_status = BLK_STS_RESOURCE;
> +				bio_endio(bio);
> +			}
> +			goto queue_exit;
> +		}
> +	}

Any reason to not just keep the dma_token_to_blk_mq?  Also why is this
overriding non-BLK_STS_AGAIN errors with BLK_STS_RESOURCE?

(I really wish we could make all BLK_STS_AGAIN errors be quiet without
the explicit setting of BIO_QUIET, which is a bit annoying, but that's
not for this patch).

> +static inline
> +struct blk_mq_dma_token *dma_token_to_blk_mq(struct dma_token *token)

More odd formatting.
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Pavel Begunkov 1 month, 4 weeks ago
On 12/4/25 10:56, Christoph Hellwig wrote:
> On Sun, Nov 23, 2025 at 10:51:25PM +0000, Pavel Begunkov wrote:
...
>> +	struct request_queue *q = bdev_get_queue(file_bdev(file));
>> +
>> +	if (!(file->f_flags & O_DIRECT))
>> +		return ERR_PTR(-EINVAL);
> 
> Shouldn't the O_DIRECT check be in the caller?

If the interface gets implemented for e.g. net at some point, it
won't be O_DIRECT. If you want some extra safety for filesystems
implementing it, I can add something like the below in the common path:

if (reg_or_block_file(file))
	// check O_DIRECT

> And a high-level comment explaining the fencing logic would be nice
> as well.

I'll add some comments around
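Roughly along these lines, to be refined (just describing what the code
currently does):

	/*
	 * Fencing: each mapping carries a dma_fence. When the mapping is
	 * ejected (move_notify or token release), the fence is added to the
	 * dmabuf's reservation object and the mapping's percpu ref is
	 * killed. Once all inflight requests holding that ref complete, the
	 * fence is signalled from a workqueue, so the exporter can wait for
	 * the old mapping to quiesce before moving the buffer.
	 */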

...
>> +static inline
>> +struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)
> 
> Really odd return value / scope formatting.

static inline struct blk_mq_dma_map
*blk_mq_get_token_map(...)

Do you prefer this? It's too long to fit sanely either way.
Though I didn't have this problem in v3.

  
>> +{
>> +	struct blk_mq_dma_map *map;
>> +
>> +	guard(rcu)();
>> +
>> +	map = rcu_dereference(token->map);
>> +	if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
>> +		return NULL;
>> +	return map;
> 
> Please use good old rcu_read_unlock to make this readable.

Come on, it's pretty readable and less error prone, especially
for longer functions. Maybe you prefer scoped guards?

scoped_guard(rcu) {
	map = token->map;
	if (!map)
		return;
}

...
>> +blk_status_t blk_rq_assign_dma_map(struct request *rq,
>> +				   struct blk_mq_dma_token *token)
>> +{
>> +	struct blk_mq_dma_map *map;
>> +
>> +	map = blk_mq_get_token_map(token);
>> +	if (map)
>> +		goto complete;
>> +
>> +	if (rq->cmd_flags & REQ_NOWAIT)
>> +		return BLK_STS_AGAIN;
>> +
>> +	map = blk_mq_create_dma_map(token);
>> +	if (IS_ERR(map))
>> +		return BLK_STS_RESOURCE;
> 
> Having a few comments, that say this is creating the map lazily
> would probably helper the reader.  Also why not keep the !map
> case in the branch, as the map case should be the fast path and
> thus usually be straight line in the function?
> 
>> +void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
>> +{
>> +	blk_mq_dma_map_remove(token);
>> +}
> 
> Is there a good reason for having this blk_mq_dma_map_move_notify
> wrapper?

I reused it before and am reusing it in the next iteration, maybe
v2 just doesn't for some reason.

> 
>> +	if (bio_flagged(bio, BIO_DMA_TOKEN)) {
>> +		struct blk_mq_dma_token *token;
>> +		blk_status_t ret;
>> +
>> +		token = dma_token_to_blk_mq(bio->dma_token);
>> +		ret = blk_rq_assign_dma_map(rq, token);
>> +		if (ret) {
>> +			if (ret == BLK_STS_AGAIN) {
>> +				bio_wouldblock_error(bio);
>> +			} else {
>> +				bio->bi_status = BLK_STS_RESOURCE;
>> +				bio_endio(bio);
>> +			}
>> +			goto queue_exit;
>> +		}
>> +	}
> 
> Any reason to not just keep the dma_token_to_blk_mq?  Also why is this
> overriding non-BLK_STS_AGAIN errors with BLK_STS_RESOURCE?

Yeah, it should've been errno_to_blk_status()
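I.e. roughly, with blk_rq_assign_dma_map() returning
errno_to_blk_status(PTR_ERR(map)) on failure and the submit path just
propagating it (untested):

	ret = blk_rq_assign_dma_map(rq, token);
	if (ret) {
		if (ret == BLK_STS_AGAIN) {
			bio_wouldblock_error(bio);
		} else {
			bio->bi_status = ret;
			bio_endio(bio);
		}
		goto queue_exit;
	}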

-- 
Pavel Begunkov
Re: [RFC v2 05/11] block: add infra to handle dmabuf tokens
Posted by Anuj gupta 2 months, 2 weeks ago
> +void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
> +{
> +       blk_mq_dma_map_remove(token);
> +}
This needs to be exported as it is referenced from the nvme-pci driver;
otherwise we get a build error.
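E.g. (assuming the usual GPL-only export for new blk-mq helpers):

	EXPORT_SYMBOL_GPL(blk_mq_dma_map_move_notify);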