Add a file callback that maps a dmabuf for the given file and returns
an opaque token of type struct dma_token representing the mapping. The
implementation details are hidden from the caller, and the implementors
are normally expected to extend the structure.

Callers of the callback will be able to pass the token with an IO
request, which is implemented in following patches as a new iterator
type. The user should release the token once it's no longer needed by
calling the provided release callback via the appropriate helpers.
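
For illustration, a minimal caller-side sketch of the intended flow;
the two wrapper functions are hypothetical and only demonstrate the
calling convention:

struct dma_token *register_dmabuf_for_io(struct file *file,
					 struct dma_buf *dmabuf,
					 enum dma_data_direction dir)
{
	struct dma_token_params params = {
		.dmabuf	= dmabuf,
		.dir	= dir,
	};

	/* ERR_PTR(-EOPNOTSUPP) if the file doesn't implement ->dma_map */
	return dma_token_create(file, &params);
}

void unregister_dmabuf_for_io(struct dma_token *token)
{
	/* invokes the implementation's ->release() */
	dma_token_release(token);
}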
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/dma_token.h | 35 +++++++++++++++++++++++++++++++++++
 include/linux/fs.h        |  4 ++++
 2 files changed, 39 insertions(+)
 create mode 100644 include/linux/dma_token.h

diff --git a/include/linux/dma_token.h b/include/linux/dma_token.h
new file mode 100644
index 000000000000..9194b34282c2
--- /dev/null
+++ b/include/linux/dma_token.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_DMA_TOKEN_H
+#define _LINUX_DMA_TOKEN_H
+
+#include <linux/dma-buf.h>
+
+struct dma_token_params {
+	struct dma_buf *dmabuf;
+	enum dma_data_direction dir;
+};
+
+struct dma_token {
+	void (*release)(struct dma_token *);
+};
+
+static inline void dma_token_release(struct dma_token *token)
+{
+	token->release(token);
+}
+
+static inline struct dma_token *
+dma_token_create(struct file *file, struct dma_token_params *params)
+{
+	struct dma_token *res;
+
+	if (!file->f_op->dma_map)
+		return ERR_PTR(-EOPNOTSUPP);
+	res = file->f_op->dma_map(file, params);
+
+	WARN_ON_ONCE(!IS_ERR(res) && !res->release);
+
+	return res;
+}
+
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..0ce9a53fabec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2262,6 +2262,8 @@ struct dir_context {
 struct iov_iter;
 struct io_uring_cmd;
 struct offset_ctx;
+struct dma_token;
+struct dma_token_params;
 
 typedef unsigned int __bitwise fop_flags_t;
 
@@ -2309,6 +2311,8 @@ struct file_operations {
 	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
 				unsigned int poll_flags);
 	int (*mmap_prepare)(struct vm_area_desc *);
+	struct dma_token *(*dma_map)(struct file *,
+				     struct dma_token_params *);
 } __randomize_layout;
 
 /* Supports async buffered reads */
--
2.52.0
On 11/23/25 23:51, Pavel Begunkov wrote:
> Add a file callback that maps a dmabuf for the given file and returns
> an opaque token of type struct dma_token representing the mapping.

I'm really scratching my head what you mean with that?

And why the heck would we need to pass a DMA-buf to a struct file?

Regards,
Christian.
On Thu, Dec 04, 2025 at 11:46:45AM +0100, Christian König wrote:
> On 11/23/25 23:51, Pavel Begunkov wrote:
> > Add a file callback that maps a dmabuf for the given file and returns
> > an opaque token of type struct dma_token representing the mapping.
>
> I'm really scratching my head what you mean with that?
>
> And why the heck would we need to pass a DMA-buf to a struct file?

I find the naming pretty confusing as well. But what this does is to
tell the file system/driver that it should expect a future
read_iter/write_iter operation that takes data from / puts data into
the dmabuf passed to this operation.
On 12/4/25 12:07, Christoph Hellwig wrote:
> On Thu, Dec 04, 2025 at 11:46:45AM +0100, Christian König wrote:
>> On 11/23/25 23:51, Pavel Begunkov wrote:
>>> Add a file callback that maps a dmabuf for the given file and returns
>>> an opaque token of type struct dma_token representing the mapping.
>>
>> I'm really scratching my head what you mean with that?
>>
>> And why the heck would we need to pass a DMA-buf to a struct file?
>
> I find the naming pretty confusing as well. But what this does is to
> tell the file system/driver that it should expect a future
> read_iter/write_iter operation that takes data from / puts data into
> the dmabuf passed to this operation.

That explanation makes much more sense.

The remaining question is why does the underlying file system / driver
need to know that it will get addresses from a DMA-buf?

Regards,
Christian.
On Thu, Dec 04, 2025 at 12:09:46PM +0100, Christian König wrote:
> > I find the naming pretty confusing as well. But what this does is to
> > tell the file system/driver that it should expect a future
> > read_iter/write_iter operation that takes data from / puts data into
> > the dmabuf passed to this operation.
>
> That explanation makes much more sense.
>
> The remaining question is why does the underlying file system / driver
> need to know that it will get addresses from a DMA-buf?

This eventually ends up calling dma_buf_dynamic_attach and provides
a way to find the dma_buf_attachment later in the I/O path.
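
To make that attachment flow concrete, here is a hypothetical
driver-side sketch of a ->dma_map() implementation. The foo_* names and
the device lookup helper are made up for illustration, and it uses plain
dma_buf_attach() where a real importer would use dma_buf_dynamic_attach()
with proper move_notify handling:

struct foo_dma_token {
	struct dma_token base;
	struct dma_buf *dmabuf;
	struct dma_buf_attachment *attach;
};

static void foo_dma_token_release(struct dma_token *token)
{
	struct foo_dma_token *ft =
		container_of(token, struct foo_dma_token, base);

	dma_buf_detach(ft->dmabuf, ft->attach);
	kfree(ft);
}

static struct dma_token *foo_dma_map(struct file *file,
				     struct dma_token_params *params)
{
	struct foo_dma_token *ft;
	struct dma_buf_attachment *attach;

	ft = kzalloc(sizeof(*ft), GFP_KERNEL);
	if (!ft)
		return ERR_PTR(-ENOMEM);

	/* foo_file_to_dev() stands in for the driver's device lookup */
	attach = dma_buf_attach(params->dmabuf, foo_file_to_dev(file));
	if (IS_ERR(attach)) {
		kfree(ft);
		return ERR_CAST(attach);
	}

	ft->dmabuf = params->dmabuf;
	ft->attach = attach;
	ft->base.release = foo_dma_token_release;
	return &ft->base;
}

The release callback is what dma_token_release() ends up invoking, which
keeps the token lifetime entirely in the hands of the implementation.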
On Thu, Dec 04, 2025 at 02:10:25PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 04, 2025 at 12:09:46PM +0100, Christian König wrote:
> > > I find the naming pretty confusing as well. But what this does is to
> > > tell the file system/driver that it should expect a future
> > > read_iter/write_iter operation that takes data from / puts data into
> > > the dmabuf passed to this operation.
> >
> > That explanation makes much more sense.
> >
> > The remaining question is why does the underlying file system / driver
> > need to know that it will get addresses from a DMA-buf?
>
> This eventually ends up calling dma_buf_dynamic_attach and provides
> a way to find the dma_buf_attachment later in the I/O path.

Maybe it can be named ->dma_buf_attach()? For wiring up the dma-buf and
the importer side (nvme).

But I am wondering why not make it a subsystem interface, such as an nvme
ioctl, then the whole implementation can be simplified a lot. It is
reasonable because the subsystem is exactly the side consuming/importing
the dma-buf.

Thanks,
Ming
On 1/4/26 02:42, Ming Lei wrote:
> On Thu, Dec 04, 2025 at 02:10:25PM +0100, Christoph Hellwig wrote:
>> On Thu, Dec 04, 2025 at 12:09:46PM +0100, Christian König wrote:
>>>> I find the naming pretty confusing as well. But what this does is to
>>>> tell the file system/driver that it should expect a future
>>>> read_iter/write_iter operation that takes data from / puts data into
>>>> the dmabuf passed to this operation.
>>>
>>> That explanation makes much more sense.
>>>
>>> The remaining question is why does the underlying file system / driver
>>> need to know that it will get addresses from a DMA-buf?
>>
>> This eventually ends up calling dma_buf_dynamic_attach and provides
>> a way to find the dma_buf_attachment later in the I/O path.
>
> Maybe it can be named ->dma_buf_attach()? For wiring up the dma-buf and
> the importer side (nvme).

Yeah, that would make it much cleaner. Also, some higher level
documentation would certainly help.

> But I am wondering why not make it a subsystem interface, such as an nvme
> ioctl, then the whole implementation can be simplified a lot. It is
> reasonable because the subsystem is exactly the side consuming/importing
> the dma-buf.

Yeah, it came to me as well that it might be better if it were more nvme
specific.

Regards,
Christian.
On Wed, Jan 07, 2026 at 04:56:05PM +0100, Christian König wrote:
> > But I am wondering why not make it a subsystem interface, such as an nvme
> > ioctl, then the whole implementation can be simplified a lot. It is
> > reasonable because the subsystem is exactly the side consuming/importing
> > the dma-buf.
>
> Yeah, it came to me as well that it might be better if it were more nvme
> specific.

The feature is in no way nvme specific. nvme is just the initial
underlying driver. It makes total sense to support this for any high
performance block device, and to pass it through file systems.
On Wed, Jan 07, 2026 at 05:01:51PM +0100, Christoph Hellwig wrote:
> On Wed, Jan 07, 2026 at 04:56:05PM +0100, Christian König wrote:
> > > But I am wondering why not make it a subsystem interface, such as an
> > > nvme ioctl, then the whole implementation can be simplified a lot. It
> > > is reasonable because the subsystem is exactly the side
> > > consuming/importing the dma-buf.
> >
> > Yeah, it came to me as well that it might be better if it were more nvme
> > specific.
>
> The feature is in no way nvme specific. nvme is just the initial
> underlying driver. It makes total sense to support this for any high
> performance block device, and to pass it through file systems.

But why does the FS care about the dma buffer attachment? Since the high
performance host controller is exactly the dma buffer attachment point.

If the callback is added in `struct file_operations` for wiring up the
dma buffer and the importer (host controller), you will see it is hard to
let it cross device mapper/raid or other stackable block devices.

Thanks,
Ming
On Thu, Jan 08, 2026 at 10:19:18AM +0800, Ming Lei wrote:
> > The feature is in no way nvme specific. nvme is just the initial
> > underlying driver. It makes total sense to support this for any high
> > performance block device, and to pass it through file systems.
>
> But why does the FS care about the dma buffer attachment? Since the high
> performance host controller is exactly the dma buffer attachment point.

I can't parse what you're trying to say here.

> If the callback is added in `struct file_operations` for wiring up the
> dma buffer and the importer (host controller), you will see it is hard to
> let it cross device mapper/raid or other stackable block devices.

Why?

But even when not stacking, the registration still needs to go
through the file system even for a single device, never mind multiple
controlled by the file system.
On Thu, Jan 08, 2026 at 11:17:03AM +0100, Christoph Hellwig wrote:
> On Thu, Jan 08, 2026 at 10:19:18AM +0800, Ming Lei wrote:
> > > The feature is in no way nvme specific. nvme is just the initial
> > > underlying driver. It makes total sense to support this for any high
> > > performance block device, and to pass it through file systems.
> >
> > But why does the FS care about the dma buffer attachment? Since the
> > high performance host controller is exactly the dma buffer attachment
> > point.
>
> I can't parse what you're trying to say here.

The dma buffer attachment is simply none of the FS's business.

> > If the callback is added in `struct file_operations` for wiring up the
> > dma buffer and the importer (host controller), you will see it is hard
> > to let it cross device mapper/raid or other stackable block devices.
>
> Why?
>
> But even when not stacking, the registration still needs to go
> through the file system even for a single device, never mind multiple
> controlled by the file system.

A dma_buf can have multiple importers, so why does it have to go through
the FS for a single device only?

If the registered buffer is attached to a single device before going
through the FS, it can not support stacking block devices, and it can't
(or can't easily) be used for multiple block devices, no matter whether
they are behind the same host controller or multiple ones.

Thanks,
Ming
On Fri, Jan 09, 2026 at 10:10:57AM +0800, Ming Lei wrote:
> On Thu, Jan 08, 2026 at 11:17:03AM +0100, Christoph Hellwig wrote:
> > On Thu, Jan 08, 2026 at 10:19:18AM +0800, Ming Lei wrote:
> > > > The feature is in no way nvme specific. nvme is just the initial
> > > > underlying driver. It makes total sense to support this for any high
> > > > performance block device, and to pass it through file systems.
> > >
> > > But why does the FS care about the dma buffer attachment? Since the
> > > high performance host controller is exactly the dma buffer attachment
> > > point.
> >
> > I can't parse what you're trying to say here.
>
> The dma buffer attachment is simply none of the FS's business.

The file system should indeed never do a dma buffer attachment itself,
but that's not the point.

> > But even when not stacking, the registration still needs to go
> > through the file system even for a single device, never mind multiple
> > controlled by the file system.
>
> A dma_buf can have multiple importers, so why does it have to go through
> the FS for a single device only?
>
> If the registered buffer is attached to a single device before going
> through the FS, it can not support stacking block devices, and it can't
> (or can't easily) be used for multiple block devices, no matter whether
> they are behind the same host controller or multiple ones.

Because the file system, or the file_operations instance to be more
specific, is the only entity that knows what block device(s) or other
DMA capable device(s) like an (R)NIC a file maps to.
On 1/4/26 01:42, Ming Lei wrote:
> On Thu, Dec 04, 2025 at 02:10:25PM +0100, Christoph Hellwig wrote:
>> On Thu, Dec 04, 2025 at 12:09:46PM +0100, Christian König wrote:
>>>> I find the naming pretty confusing as well. But what this does is to
>>>> tell the file system/driver that it should expect a future
>>>> read_iter/write_iter operation that takes data from / puts data into
>>>> the dmabuf passed to this operation.
>>>
>>> That explanation makes much more sense.
>>>
>>> The remaining question is why does the underlying file system / driver
>>> need to know that it will get addresses from a DMA-buf?
>>
>> This eventually ends up calling dma_buf_dynamic_attach and provides
>> a way to find the dma_buf_attachment later in the I/O path.
>
> Maybe it can be named ->dma_buf_attach()? For wiring up the dma-buf and
> the importer side (nvme).
>
> But I am wondering why not make it a subsystem interface, such as an nvme
> ioctl, then the whole implementation can be simplified a lot. It is
> reasonable because the subsystem is exactly the side consuming/importing
> the dma-buf.

It's not an nvme specific interface, and so a file op was much more
convenient. And ioctls for registering it into io_uring would also be
problematic. I simplified some of the layering for the next version, but
most of the complexity comes from the handling in blk-mq-dma-token.h, and
it'd be the same even if it were made nvme specific. In fact, I first had
it all in nvme but then had to move it to block/ because of sleeping.

--
Pavel Begunkov
On Tue, Jan 06, 2026 at 07:51:12PM +0000, Pavel Begunkov wrote:
>> But I am wondering why not make it a subsystem interface, such as an nvme
>> ioctl, then the whole implementation can be simplified a lot. It is
>> reasonable because the subsystem is exactly the side consuming/importing
>> the dma-buf.
>
> It's not an nvme specific interface, and so a file op was much more
> convenient.

It is the much better abstraction. Also, the nvme subsystem is not an
actor, and registering things to the subsystem does not work. The nvme
controller is the entity that does the dma mapping, and this interface
works very well for that.
On Sun, Nov 23, 2025 at 10:51:21PM +0000, Pavel Begunkov wrote:
> +static inline struct dma_token *
> +dma_token_create(struct file *file, struct dma_token_params *params)
> +{
> +	struct dma_token *res;
> +
> +	if (!file->f_op->dma_map)
> +		return ERR_PTR(-EOPNOTSUPP);
> +	res = file->f_op->dma_map(file, params);

Calling the file operation ->dma_map feels really misleading.
create_token as in the function name is already much better, but
it really is not just dma, but dmabuf related, and that should really
be encoded in the name.

Also why not pass the dmabuf and direction directly instead of wrapping
it in the odd params struct making the whole thing hard to follow?
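
For reference, the alternative being suggested would make the file op
look roughly like this (hypothetical, not what the patch currently does):

	struct dma_token *(*dma_map)(struct file *file, struct dma_buf *dmabuf,
				     enum dma_data_direction dir);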
On 12/4/25 10:42, Christoph Hellwig wrote:
> On Sun, Nov 23, 2025 at 10:51:21PM +0000, Pavel Begunkov wrote:
>> +static inline struct dma_token *
>> +dma_token_create(struct file *file, struct dma_token_params *params)
>> +{
>> +	struct dma_token *res;
>> +
>> +	if (!file->f_op->dma_map)
>> +		return ERR_PTR(-EOPNOTSUPP);
>> +	res = file->f_op->dma_map(file, params);
>
> Calling the file operation ->dma_map feels really misleading.

agreed

> create_token as in the function name is already much better, but
> it really is not just dma, but dmabuf related, and that should really
> be encoded in the name.
>
> Also why not pass the dmabuf and direction directly instead of wrapping
> it in the odd params struct making the whole thing hard to follow?

I added it after I forgot about the direction and had to plumb
it through all the layers. In the draft of v3 I have, I've already
removed it, as the dmabuf is passed to drivers together with the token.
--
Pavel Begunkov