From: Barry Song <baohua@kernel.org>
Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
always wait for the completion of each DMA buffer. That is,
issuing the DMA sync and waiting for completion is done in a
single API call.
For scatter-gather lists with multiple entries, this means
issuing and waiting is repeated for each entry, which can hurt
performance. Architectures like ARM64 may be able to issue all
DMA sync operations for all entries first and then wait for
completion together.
To address this, arch_sync_dma_for_* now issues DMA operations in
batch, followed by a flush. On ARM64, the flush is implemented
using a dsb instruction within arch_sync_dma_flush().
For now, add arch_sync_dma_flush() after each
arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
no-op on all architectures except arm64, so this patch does not
change existing behavior. Subsequent patches will introduce true
batching for SG DMA buffers.
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Signed-off-by: Barry Song <baohua@kernel.org>
---
arch/arm64/include/asm/cache.h | 6 ++++++
arch/arm64/mm/dma-mapping.c | 4 ++--
drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
include/linux/dma-map-ops.h | 6 ++++++
kernel/dma/direct.c | 8 ++++++--
kernel/dma/direct.h | 9 +++++++--
kernel/dma/swiotlb.c | 4 +++-
8 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index dd2c8586a725..487fb7c355ed 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -87,6 +87,12 @@ int cache_line_size(void);
#define dma_get_cache_alignment cache_line_size
+static inline void arch_sync_dma_flush(void)
+{
+ dsb(sy);
+}
+#define arch_sync_dma_flush arch_sync_dma_flush
+
/* Compress a u64 MPIDR value into 32 bits. */
static inline u64 arch_compact_of_hwid(u64 id)
{
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b2b5792b2caa..ae1ae0280eef 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
{
unsigned long start = (unsigned long)phys_to_virt(paddr);
- dcache_clean_poc(start, start + size);
+ dcache_clean_poc_nosync(start, start + size);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
if (dir == DMA_TO_DEVICE)
return;
- dcache_inval_poc(start, start + size);
+ dcache_inval_poc_nosync(start, start + size);
}
void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c92088855450..6827763a3877 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1095,8 +1095,10 @@ void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
return;
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(phys, size, dir);
+ arch_sync_dma_flush();
+ }
swiotlb_sync_single_for_cpu(dev, phys, size, dir);
}
@@ -1112,8 +1114,10 @@ void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
swiotlb_sync_single_for_device(dev, phys, size, dir);
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_flush();
+ }
}
void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
@@ -1122,13 +1126,16 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
struct scatterlist *sg;
int i;
- if (sg_dma_is_swiotlb(sgl))
+ if (sg_dma_is_swiotlb(sgl)) {
for_each_sg(sgl, sg, nelems, i)
iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
sg->length, dir);
- else if (!dev_is_dma_coherent(dev))
- for_each_sg(sgl, sg, nelems, i)
+ } else if (!dev_is_dma_coherent(dev)) {
+ for_each_sg(sgl, sg, nelems, i) {
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+ arch_sync_dma_flush();
+ }
+ }
}
void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
@@ -1143,8 +1150,10 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
sg_dma_address(sg),
sg->length, dir);
else if (!dev_is_dma_coherent(dev))
- for_each_sg(sgl, sg, nelems, i)
+ for_each_sg(sgl, sg, nelems, i) {
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+ arch_sync_dma_flush();
+ }
}
static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
@@ -1219,8 +1228,10 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
return DMA_MAPPING_ERROR;
}
- if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_flush();
+ }
iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO))
@@ -1242,8 +1253,10 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle,
if (WARN_ON(!phys))
return;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev))
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(phys, size, dir);
+ arch_sync_dma_flush();
+ }
__iommu_dma_unmap(dev, dma_handle, size);
@@ -1836,8 +1849,10 @@ static int __dma_iova_link(struct device *dev, dma_addr_t addr,
bool coherent = dev_is_dma_coherent(dev);
int prot = dma_info_to_prot(dir, coherent, attrs);
- if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_flush();
+ }
return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size,
prot, GFP_ATOMIC);
@@ -2008,8 +2023,10 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
end - addr, iovad->granule - iova_start_pad);
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_cpu(phys, len, dir);
+ arch_sync_dma_flush();
+ }
swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index ccf25027bec1..b79917e785a5 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -262,10 +262,12 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
done:
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
- if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr))))
+ if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) {
arch_sync_dma_for_device(phys, size, dir);
- else
+ arch_sync_dma_flush();
+ } else {
xen_dma_sync_for_device(dev, dev_addr, size, dir);
+ }
}
return dev_addr;
}
@@ -287,10 +289,12 @@ static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
- if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
+ if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) {
arch_sync_dma_for_cpu(paddr, size, dir);
- else
+ arch_sync_dma_flush();
+ } else {
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
+ }
}
/* NOTE: We use dev_addr here, not paddr! */
@@ -308,10 +312,12 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
struct io_tlb_pool *pool;
if (!dev_is_dma_coherent(dev)) {
- if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+ if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
arch_sync_dma_for_cpu(paddr, size, dir);
- else
+ arch_sync_dma_flush();
+ } else {
xen_dma_sync_for_cpu(dev, dma_addr, size, dir);
+ }
}
pool = xen_swiotlb_find_pool(dev, dma_addr);
@@ -331,10 +337,12 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
__swiotlb_sync_single_for_device(dev, paddr, size, dir, pool);
if (!dev_is_dma_coherent(dev)) {
- if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+ if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
arch_sync_dma_for_device(paddr, size, dir);
- else
+ arch_sync_dma_flush();
+ } else {
xen_dma_sync_for_device(dev, dma_addr, size, dir);
+ }
}
}
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4809204c674c..e7dd8a63b40e 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
+#ifndef arch_sync_dma_flush
+static inline void arch_sync_dma_flush(void)
+{
+}
+#endif
+
#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(void);
#else
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..a219911c7b90 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -402,9 +402,11 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(paddr, sg->length,
dir);
+ arch_sync_dma_flush();
+ }
}
}
#endif
@@ -421,8 +423,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_flush();
+ }
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..a69326eed266 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -60,8 +60,10 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, size, dir);
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(paddr, size, dir);
+ arch_sync_dma_flush();
+ }
}
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
@@ -71,6 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_flush();
arch_sync_dma_for_cpu_all();
}
@@ -109,8 +112,10 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_flush();
+ }
return dma_addr;
err_overflow:
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a547c7693135..7cdbfcdfef86 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1595,8 +1595,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
return DMA_MAPPING_ERROR;
}
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
arch_sync_dma_for_device(swiotlb_addr, size, dir);
+ arch_sync_dma_flush();
+ }
return dma_addr;
}
--
2.43.0
On 26.12.25 23:52, Barry Song wrote:
> From: Barry Song <baohua@kernel.org>
>
> Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> always wait for the completion of each DMA buffer. That is,
> issuing the DMA sync and waiting for completion is done in a
> single API call.
>
> For scatter-gather lists with multiple entries, this means
> issuing and waiting is repeated for each entry, which can hurt
> performance. Architectures like ARM64 may be able to issue all
> DMA sync operations for all entries first and then wait for
> completion together.
>
> To address this, arch_sync_dma_for_* now issues DMA operations in
> batch, followed by a flush. On ARM64, the flush is implemented
> using a dsb instruction within arch_sync_dma_flush().
>
> For now, add arch_sync_dma_flush() after each
> arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> no-op on all architectures except arm64, so this patch does not
> change existing behavior. Subsequent patches will introduce true
> batching for SG DMA buffers.
>
> Cc: Leon Romanovsky <leon@kernel.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Barry Song <baohua@kernel.org>

Reviewed-by: Juergen Gross <jgross@suse.com> # drivers/xen/swiotlb-xen.c

Juergen
On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
> From: Barry Song <baohua@kernel.org>
>
> Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> always wait for the completion of each DMA buffer. That is,
> issuing the DMA sync and waiting for completion is done in a
> single API call.
>
> For scatter-gather lists with multiple entries, this means
> issuing and waiting is repeated for each entry, which can hurt
> performance. Architectures like ARM64 may be able to issue all
> DMA sync operations for all entries first and then wait for
> completion together.
>
> To address this, arch_sync_dma_for_* now issues DMA operations in
> batch, followed by a flush. On ARM64, the flush is implemented
> using a dsb instruction within arch_sync_dma_flush().
>
> For now, add arch_sync_dma_flush() after each
> arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> no-op on all architectures except arm64, so this patch does not
> change existing behavior. Subsequent patches will introduce true
> batching for SG DMA buffers.
>
> Cc: Leon Romanovsky <leon@kernel.org>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Barry Song <baohua@kernel.org>
> ---
> arch/arm64/include/asm/cache.h | 6 ++++++
> arch/arm64/mm/dma-mapping.c | 4 ++--
> drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
> drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
> include/linux/dma-map-ops.h | 6 ++++++
> kernel/dma/direct.c | 8 ++++++--
> kernel/dma/direct.h | 9 +++++++--
> kernel/dma/swiotlb.c | 4 +++-
> 8 files changed, 73 insertions(+), 25 deletions(-)
<...>
> +#ifndef arch_sync_dma_flush
> +static inline void arch_sync_dma_flush(void)
> +{
> +}
> +#endif
Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
options: they make it straightforward to inspect the entire DMA path simply
by looking at the .config.
Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
On Sun, Dec 28, 2025 at 9:07 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
> > From: Barry Song <baohua@kernel.org>
> >
> > Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> > always wait for the completion of each DMA buffer. That is,
> > issuing the DMA sync and waiting for completion is done in a
> > single API call.
> >
> > For scatter-gather lists with multiple entries, this means
> > issuing and waiting is repeated for each entry, which can hurt
> > performance. Architectures like ARM64 may be able to issue all
> > DMA sync operations for all entries first and then wait for
> > completion together.
> >
> > To address this, arch_sync_dma_for_* now issues DMA operations in
> > batch, followed by a flush. On ARM64, the flush is implemented
> > using a dsb instruction within arch_sync_dma_flush().
> >
> > For now, add arch_sync_dma_flush() after each
> > arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> > no-op on all architectures except arm64, so this patch does not
> > change existing behavior. Subsequent patches will introduce true
> > batching for SG DMA buffers.
> >
> > Cc: Leon Romanovsky <leon@kernel.org>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> > Cc: Robin Murphy <robin.murphy@arm.com>
> > Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> > Cc: Ard Biesheuvel <ardb@kernel.org>
> > Cc: Marc Zyngier <maz@kernel.org>
> > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Juergen Gross <jgross@suse.com>
> > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > Signed-off-by: Barry Song <baohua@kernel.org>
> > ---
> > arch/arm64/include/asm/cache.h | 6 ++++++
> > arch/arm64/mm/dma-mapping.c | 4 ++--
> > drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
> > drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
> > include/linux/dma-map-ops.h | 6 ++++++
> > kernel/dma/direct.c | 8 ++++++--
> > kernel/dma/direct.h | 9 +++++++--
> > kernel/dma/swiotlb.c | 4 +++-
> > 8 files changed, 73 insertions(+), 25 deletions(-)
>
> <...>
>
> > +#ifndef arch_sync_dma_flush
> > +static inline void arch_sync_dma_flush(void)
> > +{
> > +}
> > +#endif
>
> Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
> options: they make it straightforward to inspect the entire DMA path simply
> by looking at the .config.
I am not quite sure how much this benefits users, as the same
information could also be obtained by grepping for
#define arch_sync_dma_flush in the source code.
>
> Thanks,
> Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Thanks very much, Leon, for reviewing this over the weekend. One thing
you might have missed is that I place arch_sync_dma_flush() after all
arch_sync_dma_for_*() calls, for both single and sg cases. I also
used a Python script to scan the code and verify that every
arch_sync_dma_for_*() is followed by arch_sync_dma_flush(), to ensure
that no call is left out.
In the subsequent patches, for sg cases, the per-entry flush is
replaced by a single flush of the entire sg. Each sg case has
different characteristics: some are straightforward, while others
can be tricky and involve additional contexts.
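To make the intent concrete, here is a rough sketch (illustrative only,
not the exact code in the later patches) of how a batched sg sync could
look, using dma_direct_sync_sg_for_device() as the example:

	void dma_direct_sync_sg_for_device(struct device *dev,
			struct scatterlist *sgl, int nents, enum dma_data_direction dir)
	{
		struct scatterlist *sg;
		int i;

		for_each_sg(sgl, sg, nents, i) {
			phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));

			swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);

			/* issue the cache maintenance, but do not wait yet */
			if (!dev_is_dma_coherent(dev))
				arch_sync_dma_for_device(paddr, sg->length, dir);
		}

		/* one flush (dsb on arm64) for the whole sg list */
		if (!dev_is_dma_coherent(dev))
			arch_sync_dma_flush();
	}

The per-entry barrier in today's code is exactly what this series
tries to avoid for the sg paths.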
Thanks
Barry
On Sun, Dec 28, 2025 at 10:45:13AM +1300, Barry Song wrote:
> On Sun, Dec 28, 2025 at 9:07 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
> > > From: Barry Song <baohua@kernel.org>
> > >
> > > Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> > > always wait for the completion of each DMA buffer. That is,
> > > issuing the DMA sync and waiting for completion is done in a
> > > single API call.
> > >
> > > For scatter-gather lists with multiple entries, this means
> > > issuing and waiting is repeated for each entry, which can hurt
> > > performance. Architectures like ARM64 may be able to issue all
> > > DMA sync operations for all entries first and then wait for
> > > completion together.
> > >
> > > To address this, arch_sync_dma_for_* now issues DMA operations in
> > > batch, followed by a flush. On ARM64, the flush is implemented
> > > using a dsb instruction within arch_sync_dma_flush().
> > >
> > > For now, add arch_sync_dma_flush() after each
> > > arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> > > no-op on all architectures except arm64, so this patch does not
> > > change existing behavior. Subsequent patches will introduce true
> > > batching for SG DMA buffers.
> > >
> > > Cc: Leon Romanovsky <leon@kernel.org>
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: Will Deacon <will@kernel.org>
> > > Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> > > Cc: Robin Murphy <robin.murphy@arm.com>
> > > Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> > > Cc: Ard Biesheuvel <ardb@kernel.org>
> > > Cc: Marc Zyngier <maz@kernel.org>
> > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > Cc: Joerg Roedel <joro@8bytes.org>
> > > Cc: Juergen Gross <jgross@suse.com>
> > > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > > Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > > Signed-off-by: Barry Song <baohua@kernel.org>
> > > ---
> > > arch/arm64/include/asm/cache.h | 6 ++++++
> > > arch/arm64/mm/dma-mapping.c | 4 ++--
> > > drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
> > > drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
> > > include/linux/dma-map-ops.h | 6 ++++++
> > > kernel/dma/direct.c | 8 ++++++--
> > > kernel/dma/direct.h | 9 +++++++--
> > > kernel/dma/swiotlb.c | 4 +++-
> > > 8 files changed, 73 insertions(+), 25 deletions(-)
> >
> > <...>
> >
> > > +#ifndef arch_sync_dma_flush
> > > +static inline void arch_sync_dma_flush(void)
> > > +{
> > > +}
> > > +#endif
> >
> > Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
> > options: they make it straightforward to inspect the entire DMA path simply
> > by looking at the .config.
>
> I am not quite sure how much this benefits users, as the same
> information could also be obtained by grepping for
> #define arch_sync_dma_flush in the source code.
It differs slightly. Users no longer need to grep around or guess whether this
platform used the arch_sync_dma_flush path. A simple grep for ARCH_HAVE_ in
/proc/config.gz provides the answer.
>
> >
> > Thanks,
> > Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
>
> Thanks very much, Leon, for reviewing this over the weekend. One thing
> you might have missed is that I place arch_sync_dma_flush() after all
> arch_sync_dma_for_*() calls, for both single and sg cases. I also
> used a Python script to scan the code and verify that every
> arch_sync_dma_for_*() is followed by arch_sync_dma_flush(), to ensure
> that no call is left out.
>
> In the subsequent patches, for sg cases, the per-entry flush is
> replaced by a single flush of the entire sg. Each sg case has
> different characteristics: some are straightforward, while others
> can be tricky and involve additional contexts.
I didn't overlook it, and I understand your rationale. However, this is
not how kernel patches should be structured. You should not introduce
code in patch X and then move it elsewhere in patch X + Y.
Place the code in the correct location from the start. Your patches are
small enough to review as is.
Thanks"
>
> Thanks
> Barry
On Mon, Dec 29, 2025 at 3:49 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Sun, Dec 28, 2025 at 10:45:13AM +1300, Barry Song wrote:
> > On Sun, Dec 28, 2025 at 9:07 AM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
> > > > From: Barry Song <baohua@kernel.org>
> > > >
> > > > Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> > > > always wait for the completion of each DMA buffer. That is,
> > > > issuing the DMA sync and waiting for completion is done in a
> > > > single API call.
> > > >
> > > > For scatter-gather lists with multiple entries, this means
> > > > issuing and waiting is repeated for each entry, which can hurt
> > > > performance. Architectures like ARM64 may be able to issue all
> > > > DMA sync operations for all entries first and then wait for
> > > > completion together.
> > > >
> > > > To address this, arch_sync_dma_for_* now issues DMA operations in
> > > > batch, followed by a flush. On ARM64, the flush is implemented
> > > > using a dsb instruction within arch_sync_dma_flush().
> > > >
> > > > For now, add arch_sync_dma_flush() after each
> > > > arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> > > > no-op on all architectures except arm64, so this patch does not
> > > > change existing behavior. Subsequent patches will introduce true
> > > > batching for SG DMA buffers.
> > > >
> > > > Cc: Leon Romanovsky <leon@kernel.org>
> > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > Cc: Will Deacon <will@kernel.org>
> > > > Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> > > > Cc: Robin Murphy <robin.murphy@arm.com>
> > > > Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> > > > Cc: Ard Biesheuvel <ardb@kernel.org>
> > > > Cc: Marc Zyngier <maz@kernel.org>
> > > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > > Cc: Joerg Roedel <joro@8bytes.org>
> > > > Cc: Juergen Gross <jgross@suse.com>
> > > > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > > > Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > > > Signed-off-by: Barry Song <baohua@kernel.org>
> > > > ---
> > > > arch/arm64/include/asm/cache.h | 6 ++++++
> > > > arch/arm64/mm/dma-mapping.c | 4 ++--
> > > > drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
> > > > drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
> > > > include/linux/dma-map-ops.h | 6 ++++++
> > > > kernel/dma/direct.c | 8 ++++++--
> > > > kernel/dma/direct.h | 9 +++++++--
> > > > kernel/dma/swiotlb.c | 4 +++-
> > > > 8 files changed, 73 insertions(+), 25 deletions(-)
> > >
> > > <...>
> > >
> > > > +#ifndef arch_sync_dma_flush
> > > > +static inline void arch_sync_dma_flush(void)
> > > > +{
> > > > +}
> > > > +#endif
> > >
> > > Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
> > > options: they make it straightforward to inspect the entire DMA path simply
> > > by looking at the .config.
> >
> > I am not quite sure how much this benefits users, as the same
> > information could also be obtained by grepping for
> > #define arch_sync_dma_flush in the source code.
>
> It differs slightly. Users no longer need to grep around or guess whether this
> platform used the arch_sync_dma_flush path. A simple grep for ARCH_HAVE_ in
> /proc/config.gz provides the answer.
In any case, it is only two or three lines of code, so I am fine with
either approach. Perhaps Marek, Robin, and others have a point here?
>
> >
> > >
> > > Thanks,
> > > Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
> >
> > Thanks very much, Leon, for reviewing this over the weekend. One thing
> > you might have missed is that I place arch_sync_dma_flush() after all
> > arch_sync_dma_for_*() calls, for both single and sg cases. I also
> > used a Python script to scan the code and verify that every
> > arch_sync_dma_for_*() is followed by arch_sync_dma_flush(), to ensure
> > that no call is left out.
> >
> > In the subsequent patches, for sg cases, the per-entry flush is
> > replaced by a single flush of the entire sg. Each sg case has
> > different characteristics: some are straightforward, while others
> > can be tricky and involve additional contexts.
>
> I didn't overlook it, and I understand your rationale. However, this is
> not how kernel patches should be structured. You should not introduce
> code in patch X and then move it elsewhere in patch X + Y.
I am not quite convinced by this concern. This patch only
separates DMA sync issuing from completion waiting, and it
reflects that the development is done step by step.
>
> Place the code in the correct location from the start. Your patches are
> small enough to review as is.
My point is that this patch places the code in the correct locations
from the start. It splits arch_sync_dma_for_*() into
arch_sync_dma_for_*() plus arch_sync_dma_flush() everywhere, without
introducing any functional changes from the outset.
The subsequent patches clearly show which parts are truly batched.
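>
> To illustrate the split on the single-buffer path (just a sketch of
> what this patch does, using the arm64 implementation as the example):
>
> 	if (!dev_is_dma_coherent(dev)) {
> 		/* dcache_clean_poc_nosync() on arm64: clean to PoC, no barrier */
> 		arch_sync_dma_for_device(paddr, size, dir);
> 		/* dsb(sy) on arm64, a no-op on every other architecture */
> 		arch_sync_dma_flush();
> 	}
>
> which should be equivalent to what the old dcache_clean_poc()-based
> arch_sync_dma_for_device() did, so this patch introduces no functional
> change; only the sg paths change behavior later in the series.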
In the meantime, I do not have a strong preference here. If you think
it is better to move some of the straightforward batching code here,
I can follow that approach. Perhaps I could move patch 5, patch 8,
and the iommu_dma_iova_unlink_range_slow change from patch 7 here,
while keeping
[PATCH 6] dma-mapping: Support batch mode for
dma_direct_{map,unmap}_sg
and the IOVA link part from patch 7 as separate patches, since that
part is not straightforward. The IOVA link changes affect both
__dma_iova_link() and dma_iova_sync(), which are two separate
functions and require a deeper understanding of the contexts to
determine correctness. That part also lacks testing.
Would that be okay with you?
Thanks
Barry
On 28.12.2025 22:38, Barry Song wrote:
> On Mon, Dec 29, 2025 at 3:49 AM Leon Romanovsky <leon@kernel.org> wrote:
>> On Sun, Dec 28, 2025 at 10:45:13AM +1300, Barry Song wrote:
>>> On Sun, Dec 28, 2025 at 9:07 AM Leon Romanovsky <leon@kernel.org> wrote:
>>>> On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
>>>>> From: Barry Song <baohua@kernel.org>
>>>>>
>>>>> Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
>>>>> always wait for the completion of each DMA buffer. That is,
>>>>> issuing the DMA sync and waiting for completion is done in a
>>>>> single API call.
>>>>>
>>>>> For scatter-gather lists with multiple entries, this means
>>>>> issuing and waiting is repeated for each entry, which can hurt
>>>>> performance. Architectures like ARM64 may be able to issue all
>>>>> DMA sync operations for all entries first and then wait for
>>>>> completion together.
>>>>>
>>>>> To address this, arch_sync_dma_for_* now issues DMA operations in
>>>>> batch, followed by a flush. On ARM64, the flush is implemented
>>>>> using a dsb instruction within arch_sync_dma_flush().
>>>>>
>>>>> For now, add arch_sync_dma_flush() after each
>>>>> arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
>>>>> no-op on all architectures except arm64, so this patch does not
>>>>> change existing behavior. Subsequent patches will introduce true
>>>>> batching for SG DMA buffers.
>>>>>
>>>>> Cc: Leon Romanovsky <leon@kernel.org>
>>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>>> Cc: Will Deacon <will@kernel.org>
>>>>> Cc: Marek Szyprowski <m.szyprowski@samsung.com>
>>>>> Cc: Robin Murphy <robin.murphy@arm.com>
>>>>> Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
>>>>> Cc: Ard Biesheuvel <ardb@kernel.org>
>>>>> Cc: Marc Zyngier <maz@kernel.org>
>>>>> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>>> Cc: Suren Baghdasaryan <surenb@google.com>
>>>>> Cc: Joerg Roedel <joro@8bytes.org>
>>>>> Cc: Juergen Gross <jgross@suse.com>
>>>>> Cc: Stefano Stabellini <sstabellini@kernel.org>
>>>>> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
>>>>> Signed-off-by: Barry Song <baohua@kernel.org>
>>>>> ---
>>>>> arch/arm64/include/asm/cache.h | 6 ++++++
>>>>> arch/arm64/mm/dma-mapping.c | 4 ++--
>>>>> drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
>>>>> drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
>>>>> include/linux/dma-map-ops.h | 6 ++++++
>>>>> kernel/dma/direct.c | 8 ++++++--
>>>>> kernel/dma/direct.h | 9 +++++++--
>>>>> kernel/dma/swiotlb.c | 4 +++-
>>>>> 8 files changed, 73 insertions(+), 25 deletions(-)
>>>> <...>
>>>>
>>>>> +#ifndef arch_sync_dma_flush
>>>>> +static inline void arch_sync_dma_flush(void)
>>>>> +{
>>>>> +}
>>>>> +#endif
>>>> Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
>>>> options: they make it straightforward to inspect the entire DMA path simply
>>>> by looking at the .config.
>>> I am not quite sure how much this benefits users, as the same
>>> information could also be obtained by grepping for
>>> #define arch_sync_dma_flush in the source code.
>> It differs slightly. Users no longer need to grep around or guess whether this
>> platform used the arch_sync_dma_flush path. A simple grep for ARCH_HAVE_ in
>> /proc/config.gz provides the answer.
> In any case, it is only two or three lines of code, so I am fine with
> either approach. Perhaps Marek, Robin, and others have a point here?
If possible I would suggest following the style already used in the
given code, even if it means a slightly larger patch.
>>>> Thanks,
>>>> Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
>>> Thanks very much, Leon, for reviewing this over the weekend. One thing
>>> you might have missed is that I place arch_sync_dma_flush() after all
>>> arch_sync_dma_for_*() calls, for both single and sg cases. I also
>>> used a Python script to scan the code and verify that every
>>> arch_sync_dma_for_*() is followed by arch_sync_dma_flush(), to ensure
>>> that no call is left out.
>>>
>>> In the subsequent patches, for sg cases, the per-entry flush is
>>> replaced by a single flush of the entire sg. Each sg case has
>>> different characteristics: some are straightforward, while others
>>> can be tricky and involve additional contexts.
>> I didn't overlook it, and I understand your rationale. However, this is
>> not how kernel patches should be structured. You should not introduce
>> code in patch X and then move it elsewhere in patch X + Y.
> I am not quite convinced by this concern. This patch only
> separates DMA sync issuing from completion waiting, and it
> reflects that the development is done step by step.
>
>> Place the code in the correct location from the start. Your patches are
>> small enough to review as is.
> My point is that this patch places the code in the correct locations
> from the start. It splits arch_sync_dma_for_*() into
> arch_sync_dma_for_*() plus arch_sync_dma_flush() everywhere, without
> introducing any functional changes from the outset.
> The subsequent patches clearly show which parts are truly batched.
>
> In the meantime, I do not have a strong preference here. If you think
> it is better to move some of the straightforward batching code here,
> I can follow that approach. Perhaps I could move patch 5, patch 8,
> and the iommu_dma_iova_unlink_range_slow change from patch 7 here,
> while keeping
>
> [PATCH 6] dma-mapping: Support batch mode for
> dma_direct_{map,unmap}_sg
>
> and the IOVA link part from patch 7 as separate patches, since that
> part is not straightforward. The IOVA link changes affect both
> __dma_iova_link() and dma_iova_sync(), which are two separate
> functions and require a deeper understanding of the contexts to
> determine correctness. That part also lacks testing.
>
> Would that be okay with you?
Yes, this will be okay. The changes are easy to understand, so we don't
need to proceed there in such very small steps.
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
On Mon, Dec 29, 2025 at 10:38:26AM +1300, Barry Song wrote:
> On Mon, Dec 29, 2025 at 3:49 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Sun, Dec 28, 2025 at 10:45:13AM +1300, Barry Song wrote:
> > > On Sun, Dec 28, 2025 at 9:07 AM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Sat, Dec 27, 2025 at 11:52:44AM +1300, Barry Song wrote:
> > > > > From: Barry Song <baohua@kernel.org>
> > > > >
> > > > > Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device
> > > > > always wait for the completion of each DMA buffer. That is,
> > > > > issuing the DMA sync and waiting for completion is done in a
> > > > > single API call.
> > > > >
> > > > > For scatter-gather lists with multiple entries, this means
> > > > > issuing and waiting is repeated for each entry, which can hurt
> > > > > performance. Architectures like ARM64 may be able to issue all
> > > > > DMA sync operations for all entries first and then wait for
> > > > > completion together.
> > > > >
> > > > > To address this, arch_sync_dma_for_* now issues DMA operations in
> > > > > batch, followed by a flush. On ARM64, the flush is implemented
> > > > > using a dsb instruction within arch_sync_dma_flush().
> > > > >
> > > > > For now, add arch_sync_dma_flush() after each
> > > > > arch_sync_dma_for_*() call. arch_sync_dma_flush() is defined as a
> > > > > no-op on all architectures except arm64, so this patch does not
> > > > > change existing behavior. Subsequent patches will introduce true
> > > > > batching for SG DMA buffers.
> > > > >
> > > > > Cc: Leon Romanovsky <leon@kernel.org>
> > > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > > Cc: Will Deacon <will@kernel.org>
> > > > > Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> > > > > Cc: Robin Murphy <robin.murphy@arm.com>
> > > > > Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> > > > > Cc: Ard Biesheuvel <ardb@kernel.org>
> > > > > Cc: Marc Zyngier <maz@kernel.org>
> > > > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > > > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > > > Cc: Joerg Roedel <joro@8bytes.org>
> > > > > Cc: Juergen Gross <jgross@suse.com>
> > > > > Cc: Stefano Stabellini <sstabellini@kernel.org>
> > > > > Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > > > > Signed-off-by: Barry Song <baohua@kernel.org>
> > > > > ---
> > > > > arch/arm64/include/asm/cache.h | 6 ++++++
> > > > > arch/arm64/mm/dma-mapping.c | 4 ++--
> > > > > drivers/iommu/dma-iommu.c | 37 +++++++++++++++++++++++++---------
> > > > > drivers/xen/swiotlb-xen.c | 24 ++++++++++++++--------
> > > > > include/linux/dma-map-ops.h | 6 ++++++
> > > > > kernel/dma/direct.c | 8 ++++++--
> > > > > kernel/dma/direct.h | 9 +++++++--
> > > > > kernel/dma/swiotlb.c | 4 +++-
> > > > > 8 files changed, 73 insertions(+), 25 deletions(-)
> > > >
> > > > <...>
> > > >
> > > > > +#ifndef arch_sync_dma_flush
> > > > > +static inline void arch_sync_dma_flush(void)
> > > > > +{
> > > > > +}
> > > > > +#endif
> > > >
> > > > Over the weekend I realized a useful advantage of the ARCH_HAVE_* config
> > > > options: they make it straightforward to inspect the entire DMA path simply
> > > > by looking at the .config.
> > >
> > > I am not quite sure how much this benefits users, as the same
> > > information could also be obtained by grepping for
> > > #define arch_sync_dma_flush in the source code.
> >
> > It differs slightly. Users no longer need to grep around or guess whether this
> > platform used the arch_sync_dma_flush path. A simple grep for ARCH_HAVE_ in
> > /proc/config.gz provides the answer.
>
> In any case, it is only two or three lines of code, so I am fine with
> either approach. Perhaps Marek, Robin, and others have a point here?
>
> >
> > >
> > > >
> > > > Thanks,
> > > > Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
> > >
> > > Thanks very much, Leon, for reviewing this over the weekend. One thing
> > > you might have missed is that I place arch_sync_dma_flush() after all
> > > arch_sync_dma_for_*() calls, for both single and sg cases. I also
> > > used a Python script to scan the code and verify that every
> > > arch_sync_dma_for_*() is followed by arch_sync_dma_flush(), to ensure
> > > that no call is left out.
> > >
> > > In the subsequent patches, for sg cases, the per-entry flush is
> > > replaced by a single flush of the entire sg. Each sg case has
> > > different characteristics: some are straightforward, while others
> > > can be tricky and involve additional contexts.
> >
> > I didn't overlook it, and I understand your rationale. However, this is
> > not how kernel patches should be structured. You should not introduce
> > code in patch X and then move it elsewhere in patch X + Y.
>
> I am not quite convinced by this concern. This patch only
> separates DMA sync issuing from completion waiting, and it
> reflects that the development is done step by step.
>
> >
> > Place the code in the correct location from the start. Your patches are
> > small enough to review as is.
>
> My point is that this patch places the code in the correct locations
> from the start. It splits arch_sync_dma_for_*() into
> arch_sync_dma_for_*() plus arch_sync_dma_flush() everywhere, without
> introducing any functional changes from the outset.
> The subsequent patches clearly show which parts are truly batched.
>
> In the meantime, I do not have a strong preference here. If you think
> it is better to move some of the straightforward batching code here,
> I can follow that approach. Perhaps I could move patch 5, patch 8,
> and the iommu_dma_iova_unlink_range_slow change from patch 7 here,
> while keeping
>
> [PATCH 6] dma-mapping: Support batch mode for
> dma_direct_{map,unmap}_sg
>
> and the IOVA link part from patch 7 as separate patches, since that
> part is not straightforward. The IOVA link changes affect both
> __dma_iova_link() and dma_iova_sync(), which are two separate
> functions and require a deeper understanding of the contexts to
> determine correctness. That part also lacks testing.
Don't worry about testing. NVME, RDMA and GPU are using this path
and someone will test it.
>
> Would that be okay with you?
I don't know, need to see the code.
Thanks
>
> Thanks
> Barry