From: Leon Romanovsky <leonro@nvidia.com>
Make sure that the CPU is not synced and the IOMMU is configured to take
the MMIO path by providing the newly introduced DMA_ATTR_MMIO attribute.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
block/blk-mq-dma.c | 13 +++++++++++--
include/linux/blk-mq-dma.h | 6 +++++-
include/linux/blk_types.h | 2 ++
3 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 37e2142be4f7..d415088ed9fd 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -87,8 +87,13 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
struct blk_dma_iter *iter, struct phys_vec *vec)
{
+ unsigned int attrs = 0;
+
+ if (req->cmd_flags & REQ_MMIO)
+ attrs = DMA_ATTR_MMIO;
+
iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
- rq_dma_dir(req), 0);
+ rq_dma_dir(req), attrs);
if (dma_mapping_error(dma_dev, iter->addr)) {
iter->status = BLK_STS_RESOURCE;
return false;
@@ -103,14 +108,17 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
{
enum dma_data_direction dir = rq_dma_dir(req);
unsigned int mapped = 0;
+ unsigned int attrs = 0;
int error;
iter->addr = state->addr;
iter->len = dma_iova_size(state);
+ if (req->cmd_flags & REQ_MMIO)
+ attrs = DMA_ATTR_MMIO;
do {
error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
- vec->len, dir, 0);
+ vec->len, dir, attrs);
if (error)
break;
mapped += vec->len;
@@ -176,6 +184,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
* same as non-P2P transfers below and during unmap.
*/
req->cmd_flags &= ~REQ_P2PDMA;
+ req->cmd_flags |= REQ_MMIO;
break;
default:
iter->status = BLK_STS_INVAL;
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c26a01aeae00..6c55f5e58511 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -48,12 +48,16 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
struct dma_iova_state *state, size_t mapped_len)
{
+ unsigned int attrs = 0;
+
if (req->cmd_flags & REQ_P2PDMA)
return true;
if (dma_use_iova(state)) {
+ if (req->cmd_flags & REQ_MMIO)
+ attrs = DMA_ATTR_MMIO;
dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
- 0);
+ attrs);
return true;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09b99d52fd36..283058bcb5b1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -387,6 +387,7 @@ enum req_flag_bits {
__REQ_FS_PRIVATE, /* for file system (submitter) use */
__REQ_ATOMIC, /* for atomic write operations */
__REQ_P2PDMA, /* contains P2P DMA pages */
+ __REQ_MMIO, /* contains MMIO memory */
/*
* Command specific flags, keep last:
*/
@@ -420,6 +421,7 @@ enum req_flag_bits {
#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC)
#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA)
+#define REQ_MMIO (__force blk_opf_t)(1ULL << __REQ_MMIO)
#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
--
2.50.1
On Tue, Aug 19, 2025 at 08:36:59PM +0300, Leon Romanovsky wrote: > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index 09b99d52fd36..283058bcb5b1 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -387,6 +387,7 @@ enum req_flag_bits { > __REQ_FS_PRIVATE, /* for file system (submitter) use */ > __REQ_ATOMIC, /* for atomic write operations */ > __REQ_P2PDMA, /* contains P2P DMA pages */ > + __REQ_MMIO, /* contains MMIO memory */ > /* > * Command specific flags, keep last: > */ > @@ -420,6 +421,7 @@ enum req_flag_bits { > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > #define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) > +#define REQ_MMIO (__force blk_opf_t)(1ULL << __REQ_MMIO) Now that my integrity metadata DMA series is staged, I don't think we can use REQ flags like this because data and metadata may have different mapping types. I think we should add a flags field to the dma_iova_state instead.
On Thu, Aug 28, 2025 at 09:19:20AM -0600, Keith Busch wrote: > On Tue, Aug 19, 2025 at 08:36:59PM +0300, Leon Romanovsky wrote: > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > > index 09b99d52fd36..283058bcb5b1 100644 > > --- a/include/linux/blk_types.h > > +++ b/include/linux/blk_types.h > > @@ -387,6 +387,7 @@ enum req_flag_bits { > > __REQ_FS_PRIVATE, /* for file system (submitter) use */ > > __REQ_ATOMIC, /* for atomic write operations */ > > __REQ_P2PDMA, /* contains P2P DMA pages */ > > + __REQ_MMIO, /* contains MMIO memory */ > > /* > > * Command specific flags, keep last: > > */ > > @@ -420,6 +421,7 @@ enum req_flag_bits { > > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > > #define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) > > +#define REQ_MMIO (__force blk_opf_t)(1ULL << __REQ_MMIO) > > Now that my integrity metadata DMA series is staged, I don't think we > can use REQ flags like this because data and metadata may have different > mapping types. I think we should add a flags field to the dma_iova_state > instead. Before integrity metadata code was merged, the assumption was that request is only one type or p2p or host. Is it still holding now? And we can't store in dma_iova_state() as HMM/RDMA code works in page-based granularity and one dma_iova_state() can mix different types. Thanks >
On Thu, Aug 28, 2025 at 07:54:27PM +0300, Leon Romanovsky wrote: > On Thu, Aug 28, 2025 at 09:19:20AM -0600, Keith Busch wrote: > > On Tue, Aug 19, 2025 at 08:36:59PM +0300, Leon Romanovsky wrote: > > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > > > index 09b99d52fd36..283058bcb5b1 100644 > > > --- a/include/linux/blk_types.h > > > +++ b/include/linux/blk_types.h > > > @@ -387,6 +387,7 @@ enum req_flag_bits { > > > __REQ_FS_PRIVATE, /* for file system (submitter) use */ > > > __REQ_ATOMIC, /* for atomic write operations */ > > > __REQ_P2PDMA, /* contains P2P DMA pages */ > > > + __REQ_MMIO, /* contains MMIO memory */ > > > /* > > > * Command specific flags, keep last: > > > */ > > > @@ -420,6 +421,7 @@ enum req_flag_bits { > > > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > > > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > > > #define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) > > > +#define REQ_MMIO (__force blk_opf_t)(1ULL << __REQ_MMIO) > > > > Now that my integrity metadata DMA series is staged, I don't think we > > can use REQ flags like this because data and metadata may have different > > mapping types. I think we should add a flags field to the dma_iova_state > > instead. > > Before integrity metadata code was merged, the assumption was that request is > only one type or p2p or host. Is it still holding now? I don't think that was ever the case. Metadata is allocated independently of the data payload, usually by the kernel in bio_integrity_prep() just before dispatching the request. The bio may have a p2p data payload, but the integrity metadata is just a kmalloc buf in that path. > And we can't store in dma_iova_state() as HMM/RDMA code works in page-based > granularity and one dma_iova_state() can mix different types. I see.
On Thu, Aug 28, 2025 at 11:15:20AM -0600, Keith Busch wrote: > On Thu, Aug 28, 2025 at 07:54:27PM +0300, Leon Romanovsky wrote: > > On Thu, Aug 28, 2025 at 09:19:20AM -0600, Keith Busch wrote: > > > On Tue, Aug 19, 2025 at 08:36:59PM +0300, Leon Romanovsky wrote: > > > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > > > > index 09b99d52fd36..283058bcb5b1 100644 > > > > --- a/include/linux/blk_types.h > > > > +++ b/include/linux/blk_types.h > > > > @@ -387,6 +387,7 @@ enum req_flag_bits { > > > > __REQ_FS_PRIVATE, /* for file system (submitter) use */ > > > > __REQ_ATOMIC, /* for atomic write operations */ > > > > __REQ_P2PDMA, /* contains P2P DMA pages */ > > > > + __REQ_MMIO, /* contains MMIO memory */ > > > > /* > > > > * Command specific flags, keep last: > > > > */ > > > > @@ -420,6 +421,7 @@ enum req_flag_bits { > > > > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > > > > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > > > > #define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) > > > > +#define REQ_MMIO (__force blk_opf_t)(1ULL << __REQ_MMIO) > > > > > > Now that my integrity metadata DMA series is staged, I don't think we > > > can use REQ flags like this because data and metadata may have different > > > mapping types. I think we should add a flags field to the dma_iova_state > > > instead. > > > > Before integrity metadata code was merged, the assumption was that request is > > only one type or p2p or host. Is it still holding now? > > I don't think that was ever the case. Metadata is allocated > independently of the data payload, usually by the kernel in > bio_integrity_prep() just before dispatching the request. The bio may > have a p2p data payload, but the integrity metadata is just a kmalloc > buf in that path. Then you should do two dma mapping operations today, that is how the API was built. You shouldn't mix P2P and non P2P within a single operation right now.. Jason
On Thu, Aug 28, 2025 at 03:41:15PM -0300, Jason Gunthorpe wrote: > On Thu, Aug 28, 2025 at 11:15:20AM -0600, Keith Busch wrote: > > > > I don't think that was ever the case. Metadata is allocated > > independently of the data payload, usually by the kernel in > > bio_integrity_prep() just before dispatching the request. The bio may > > have a p2p data payload, but the integrity metadata is just a kmalloc > > buf in that path. > > Then you should do two dma mapping operations today, that is how the > API was built. You shouldn't mix P2P and non P2P within a single > operation right now.. Data and metadata are mapped as separate operations. They're just different parts of one blk-mq request.
On Thu, Aug 28, 2025 at 01:10:32PM -0600, Keith Busch wrote: > On Thu, Aug 28, 2025 at 03:41:15PM -0300, Jason Gunthorpe wrote: > > On Thu, Aug 28, 2025 at 11:15:20AM -0600, Keith Busch wrote: > > > > > > I don't think that was ever the case. Metadata is allocated > > > independently of the data payload, usually by the kernel in > > > bio_integrity_prep() just before dispatching the request. The bio may > > > have a p2p data payload, but the integrity metadata is just a kmalloc > > > buf in that path. > > > > Then you should do two dma mapping operations today, that is how the > > API was built. You shouldn't mix P2P and non P2P within a single > > operation right now.. > > Data and metadata are mapped as separate operations. They're just > different parts of one blk-mq request. In that case the new bit leon proposes should only be used for the unmap of the data pages and the metadata unmap should always be unmapped as CPU? Jason
On Thu, Aug 28, 2025 at 04:18:20PM -0300, Jason Gunthorpe wrote: > On Thu, Aug 28, 2025 at 01:10:32PM -0600, Keith Busch wrote: > > > > Data and metadata are mapped as separate operations. They're just > > different parts of one blk-mq request. > > In that case the new bit leon proposes should only be used for the > unmap of the data pages and the metadata unmap should always be > unmapped as CPU? The common path uses host allocated memory to attach integrity metadata, but that isn't the only path. A user can attach their own metadata with nvme passthrough or the recent io_uring application metadata, and that could have been allocated from anywhere. In truth though, I hadn't tried p2p metadata before today, and it looks like bio_integrity_map_user() is missing the P2P extraction flags to make that work. Just added this patch below, now I can set p2p or host memory independently for data and integrity payloads: --- diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6b077ca937f6b..cf45603e378d5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -265,6 +265,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + iov_iter_extraction_t extraction_flags = 0; size_t offset, bytes = iter->count; unsigned int nr_bvecs; int ret, nr_vecs; @@ -286,7 +287,12 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) } copy = !iov_iter_is_aligned(iter, align, align); - ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); + + if (blk_queue_pci_p2pdma(q)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, + extraction_flags, &offset); if (unlikely(ret < 0)) goto free_bvec; --
On Thu, Aug 28, 2025 at 02:54:35PM -0600, Keith Busch wrote: > In truth though, I hadn't tried p2p metadata before today, and it looks > like bio_integrity_map_user() is missing the P2P extraction flags to > make that work. Just added this patch below, now I can set p2p or host > memory independently for data and integrity payloads: I think it is a bit more than that, you have to make sure all the meta data is the same, either all p2p or all cpu and then record this somehow so the DMA mapping knows what kind it is. Once that is all done then the above should still be OK, the dma unmap of the data can follow Leon's new flag and the dma unmap of the integrity can follow however integrity kept track (in the bio_integrity_payload perhaps?) ?? Jason
On Thu, Aug 28, 2025 at 08:45:42PM -0300, Jason Gunthorpe wrote: > On Thu, Aug 28, 2025 at 02:54:35PM -0600, Keith Busch wrote: > > > In truth though, I hadn't tried p2p metadata before today, and it looks > > like bio_integrity_map_user() is missing the P2P extraction flags to > > make that work. Just added this patch below, now I can set p2p or host > > memory independently for data and integrity payloads: > > I think it is a bit more than that, you have to make sure all the meta > data is the same, either all p2p or all cpu and then record this > somehow so the DMA mapping knows what kind it is. Sure, I can get all that added in for the real patch. > Once that is all done then the above should still be OK, the dma unmap > of the data can follow Leon's new flag and the dma unmap of the > integrity can follow however integrity kept track (in the > bio_integrity_payload perhaps?) ?? We have available bits in the bio_integrity_payload bip_flags, so that sounds doable. I think we'll need to rearrange some things so we can reuse the important code for data and metadata mapping/unmapping, but doesn't look too bad. I'll get started on that.
On Tue, Aug 19, 2025 at 08:36:59PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Make sure that CPU is not synced and IOMMU is configured to take > MMIO path by providing newly introduced DMA_ATTR_MMIO attribute. We may have a minor patch conflict here with my unmerged dma metadata series, but not a big deal. Looks good. Reviewed-by: Keith Busch <kbusch@kernel.org>
© 2016 - 2025 Red Hat, Inc.