From: Leon Romanovsky <leonro@nvidia.com>
Make sure that CPU is not synced and IOMMU is configured to take
MMIO path by providing newly introduced DMA_ATTR_MMIO attribute.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
block/blk-mq-dma.c | 10 ++++++++--
include/linux/bio-integrity.h | 1 +
include/linux/blk-integrity.h | 3 ++-
include/linux/blk-mq-dma.h | 14 +++++++++++---
include/linux/blk_types.h | 2 ++
5 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 4ba7b0323da4..e1f460da95d7 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -94,7 +94,7 @@ static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
 		struct blk_dma_iter *iter, struct phys_vec *vec)
 {
 	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
-			rq_dma_dir(req), 0);
+			rq_dma_dir(req), iter->iter.attrs);
 	if (dma_mapping_error(dma_dev, iter->addr)) {
 		iter->status = BLK_STS_RESOURCE;
 		return false;
@@ -116,7 +116,7 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	do {
 		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
-				vec->len, dir, 0);
+				vec->len, dir, iter->iter.attrs);
 		if (error)
 			break;
 		mapped += vec->len;
@@ -184,6 +184,12 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		 * P2P transfers through the host bridge are treated the
 		 * same as non-P2P transfers below and during unmap.
 		 */
+		if (iter->iter.is_integrity)
+			bio_integrity(req->bio)->bip_flags |= BIP_MMIO;
+		else
+			req->cmd_flags |= REQ_MMIO;
+		iter->iter.attrs |= DMA_ATTR_MMIO;
+		fallthrough;
 	case PCI_P2PDMA_MAP_NONE:
 		break;
 	default:
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index 851254f36eb3..b77b2cfb7b0f 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -14,6 +14,7 @@ enum bip_flags {
 	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
 	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
 	BIP_P2P_DMA		= 1 << 8, /* using P2P address */
+	BIP_MMIO		= 1 << 9, /* contains MMIO memory */
 };
 
 struct bio_integrity_payload {
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index b659373788f6..34648d6c14d7 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -33,7 +33,8 @@ static inline bool blk_rq_integrity_dma_unmap(struct request *req,
 		size_t mapped_len)
 {
 	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA);
+			bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA,
+			bio_integrity(req->bio)->bip_flags & BIP_MMIO);
 }
 
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 51829958d872..916ca1deaf2c 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -10,6 +10,7 @@ struct blk_map_iter {
 	struct bio *bio;
 	struct bio_vec *bvecs;
 	bool is_integrity;
+	unsigned int attrs;
 };
 
 struct blk_dma_iter {
@@ -49,19 +50,25 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
  * @state: DMA IOVA state
  * @mapped_len: number of bytes to unmap
  * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
+ * @is_mmio: true if mapped with PCI_P2PDMA_MAP_THRU_HOST_BRIDGE
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
  */
 static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
+		struct dma_iova_state *state, size_t mapped_len, bool is_p2p,
+		bool is_mmio)
 {
 	if (is_p2p)
 		return true;
 
 	if (dma_use_iova(state)) {
+		unsigned int attrs = 0;
+
+		if (is_mmio)
+			attrs = DMA_ATTR_MMIO;
 		dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
-				0);
+				attrs);
 		return true;
 	}
@@ -72,7 +79,8 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, size_t mapped_len)
 {
 	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			req->cmd_flags & REQ_P2PDMA);
+			req->cmd_flags & REQ_P2PDMA,
+			req->cmd_flags & REQ_MMIO);
 }
 
 #endif /* BLK_MQ_DMA_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8e8d1cc8b06c..9affa3b2d047 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -382,6 +382,7 @@ enum req_flag_bits {
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
 	__REQ_P2PDMA,		/* contains P2P DMA pages */
+	__REQ_MMIO,		/* contains MMIO memory */
 
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -415,6 +416,7 @@ enum req_flag_bits {
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
 #define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
+#define REQ_MMIO	(__force blk_opf_t)(1ULL << __REQ_MMIO)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
--
2.51.0
On Fri, Oct 17, 2025 at 08:32:00AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Make sure that CPU is not synced and IOMMU is configured to take
> MMIO path by providing newly introduced DMA_ATTR_MMIO attribute.

Please write a commit log that explains this. Where was DMA_ATTR_MMIO
recently introduced? Why? What does this actually fix or improve?

> @@ -184,6 +184,12 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
>  		 * P2P transfers through the host bridge are treated the
>  		 * same as non-P2P transfers below and during unmap.
>  		 */
> +		if (iter->iter.is_integrity)
> +			bio_integrity(req->bio)->bip_flags |= BIP_MMIO;
> +		else
> +			req->cmd_flags |= REQ_MMIO;
> +		iter->iter.attrs |= DMA_ATTR_MMIO;

REQ_MMIO / BIP_MMIO is not block layer state, but driver state resulting
from the dma mapping. Reflecting it in block layer data structures
is not a good idea. This is really something that just needs to be
communicated outward and recorded in the driver. For nvme I suspect
two new flags in nvme_iod_flags would be the right place, assuming
we actually need it. But do we need it? If REQ_/BIP_P2PDMA is set,
these are always true.
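For reference, a rough sketch of the driver-side recording suggested
here; the flag names match what the v2 code further down ends up using,
while the placement and bit values are illustrative, not taken from the
nvme driver:

/* presumably in drivers/nvme/host/pci.c, next to the existing iod flags */
enum nvme_iod_flags {
	/* ... existing flags ... */
	IOD_DATA_MMIO	= 1U << 6,	/* data mapped with DMA_ATTR_MMIO */
	IOD_META_MMIO	= 1U << 7,	/* metadata mapped with DMA_ATTR_MMIO */
};

The driver would set these from the mapping result and feed
DMA_ATTR_MMIO back in at unmap time, instead of parking the state in
req->cmd_flags or bip_flags.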
On Fri, Oct 17, 2025 at 08:25:19AM +0200, Christoph Hellwig wrote:
> On Fri, Oct 17, 2025 at 08:32:00AM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Make sure that CPU is not synced and IOMMU is configured to take
> > MMIO path by providing newly introduced DMA_ATTR_MMIO attribute.

<...>

> > +		if (iter->iter.is_integrity)
> > +			bio_integrity(req->bio)->bip_flags |= BIP_MMIO;
> > +		else
> > +			req->cmd_flags |= REQ_MMIO;
> > +		iter->iter.attrs |= DMA_ATTR_MMIO;
>
> REQ_MMIO / BIP_MMIO is not block layer state, but driver state resulting
> from the dma mapping. Reflecting it in block layer data structures
> is not a good idea. This is really something that just needs to be
> communicated outward and recorded in the driver. For nvme I suspect
> two new flags in nvme_iod_flags would be the right place, assuming
> we actually need it. But do we need it? If REQ_/BIP_P2PDMA is set,
> these are always true.

We have three different flows:

1. Regular one, backed by struct page, e.g. dma_map_page()
2. PCI_P2PDMA_MAP_BUS_ADDR - non-DMA flow
3. PCI_P2PDMA_MAP_THRU_HOST_BRIDGE - DMA without struct page,
   e.g. dma_map_resource()

There is a need for two bits to represent them.

Thanks
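For reference, a minimal sketch of how the three flows differ at the
mapping call, in terms of the phys-based API used in the patch above
(the helper itself and its argument names are illustrative):

static dma_addr_t map_one_vec(struct device *dev, phys_addr_t paddr,
		size_t len, enum dma_data_direction dir,
		struct pci_p2pdma_map_state *p2p,
		enum pci_p2pdma_map_type type)
{
	switch (type) {
	case PCI_P2PDMA_MAP_NONE:
		/* 1: regular struct-page memory, plain DMA mapping */
		return dma_map_phys(dev, paddr, len, dir, 0);
	case PCI_P2PDMA_MAP_BUS_ADDR:
		/* 2: peer under the same PCI switch; use the bus address,
		 * no DMA mapping and nothing to unmap later
		 */
		return pci_p2pdma_bus_addr_map(p2p, paddr);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/* 3: MMIO without struct page; skip CPU cache sync and
		 * take the IOMMU MMIO path
		 */
		return dma_map_phys(dev, paddr, len, dir, DMA_ATTR_MMIO);
	default:
		return DMA_MAPPING_ERROR;
	}
}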
On Fri, Oct 17, 2025 at 08:25:19AM +0200, Christoph Hellwig wrote:
> On Fri, Oct 17, 2025 at 08:32:00AM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Make sure that CPU is not synced and IOMMU is configured to take
> > MMIO path by providing newly introduced DMA_ATTR_MMIO attribute.
>
> Please write a commit log that explains this. Where was DMA_ATTR_MMIO
> recently introduced? Why? What does this actually fix or improve?
What about this commit message?
Author: Leon Romanovsky <leonro@nvidia.com>
Date: Mon Oct 13 18:34:12 2025 +0300
block-dma: properly take MMIO path
In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to
indicate MMIO memory"), the DMA_ATTR_MMIO attribute was added to describe
MMIO addresses, which require avoiding any memory cache flushing, as an
outcome of the discussion referenced in the Link tag below.

In the case of a PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, the blk-mq-dma
logic treated it as a regular page and relied on the "struct page" DMA
flow. That flow performs CPU cache flushing, which shouldn't be done
here, and doesn't set the IOMMU_MMIO flag in the DMA-IOMMU case.
Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
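For context, a condensed sketch of the two effects the attribute has in
the mapping layer (paraphrased, not verbatim dma-mapping core code;
dma_info_to_prot() is the DMA-IOMMU helper that builds the IOMMU
protection flags):

/* dma-direct: CPU cache maintenance is skipped for MMIO addresses */
if (!(attrs & DMA_ATTR_MMIO) && !dev_is_dma_coherent(dev))
	arch_sync_dma_for_device(paddr, size, dir);

/* DMA-IOMMU: the page protection flags gain IOMMU_MMIO */
int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);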
On Mon, Oct 20, 2025 at 11:52:31AM +0300, Leon Romanovsky wrote:
> What about this commit message?

Much better. Btw, what is the plan for getting rid of the
"automatic" p2p handling, which would be the logical conclusion from
this?
On Mon, Oct 20, 2025 at 02:30:27PM +0200, Christoph Hellwig wrote:
> On Mon, Oct 20, 2025 at 11:52:31AM +0300, Leon Romanovsky wrote:
> > What about this commit message?
>
> Much better. Btw, what is the plan for getting rid of the
> "automatic" p2p handling, which would be the logical conclusion from
> this?
I continued with the "automatic" p2p code and think that it is
structured pretty well. Why do you want to remove it?
The code in v2 looks like this:
@@ -184,6 +184,8 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		 * P2P transfers through the host bridge are treated the
 		 * same as non-P2P transfers below and during unmap.
 		 */
+		iter->attrs |= DMA_ATTR_MMIO;
+		fallthrough;
 	case PCI_P2PDMA_MAP_NONE:
 		break;
 	default:
...
@@ -1038,6 +1051,9 @@ static blk_status_t nvme_map_data(struct request *req)
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
+	if (iter.attrs & DMA_ATTR_MMIO)
+		iod->flags |= IOD_DATA_MMIO;
+
 	if (use_sgl == SGL_FORCED ||
 	    (use_sgl == SGL_SUPPORTED &&
 	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
@@ -1060,6 +1076,9 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
 			&iod->meta_dma_state, &iter))
 		return iter.status;
 
+	if (iter.attrs & DMA_ATTR_MMIO)
+		iod->flags |= IOD_META_MMIO;
+
 	if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
 		entries = 1;
...
@@ -733,8 +739,11 @@ static void nvme_unmap_metadata(struct request *req)
 		return;
 	}
 
+	if (iod->flags & IOD_META_MMIO)
+		attrs |= DMA_ATTR_MMIO;
+
 	if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
-			iod->meta_total_len)) {
+			iod->meta_total_len, attrs)) {
 		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
 			nvme_free_sgls(req, sge, &sge[1], attrs);
 		else
The code is here (still waiting for kbuild results):
https://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git/log/?h=block-with-mmio-v2
Thanks