[RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch

Barry Song posted 5 patches 3 months, 1 week ago
There is a newer version of this series
Posted by Barry Song 3 months, 1 week ago
From: Barry Song <v-songbaohua@oppo.com>

Allow dma_direct_sync_sg_for_device(), dma_direct_sync_sg_for_cpu(),
dma_direct_map_sg() and dma_direct_unmap_sg() to use batched DMA sync
operations when the architecture supports them. This significantly
improves performance for devices that are not DMA-coherent.

Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on a
MediaTek Dimensity 9500 phone platform. The tests pinned the task to
CPU7 with the CPU frequency fixed at 2.6 GHz, ran dma_map_sg() and
dma_unmap_sg() on 10 MB buffers (2560 sg entries of 4 KB each per
buffer) for 200 iterations, and averaged the results.
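
For reference, the measurement loop was roughly the sketch below
(illustrative only, not the actual test code; the function and variable
names are made up):

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/*
 * One 10 MB buffer described by 2560 scatterlist entries of 4 KB each,
 * mapped and unmapped on a non-coherent device; this pair of calls is
 * what the 200 timed iterations exercised.
 */
static void map_unmap_once(struct device *dev, struct sg_table *sgt)
{
	int nents;

	/* per-entry cache maintenance; the barriers are what this series batches */
	nents = dma_map_sg(dev, sgt->sgl, sgt->orig_nents, DMA_TO_DEVICE);
	if (!nents)
		return;

	/* ... hand the buffer to the device ... */

	/* unmap with the same nents that was passed to dma_map_sg() */
	dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, DMA_TO_DEVICE);
}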

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: iommu@lists.linux.dev
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 kernel/dma/direct.c | 53 +++++++++++++++++++++++++---
 kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..a0b45f84a91f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
 
 		if (!dev_is_dma_coherent(dev))
-			arch_sync_dma_for_device(paddr, sg->length,
-					dir);
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
+#else
+			arch_sync_dma_for_device(paddr, sg->length, dir);
+#endif
 	}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_batch_flush();
+#endif
 }
 #endif
 
@@ -422,7 +429,11 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
 		if (!dev_is_dma_coherent(dev))
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+			arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
+#else
 			arch_sync_dma_for_cpu(paddr, sg->length, dir);
+#endif
 
 		swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
 
@@ -430,8 +441,12 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 			arch_dma_mark_clean(paddr, sg->length);
 	}
 
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu_all();
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+		arch_sync_dma_batch_flush();
+#endif
+	}
 }
 
 /*
@@ -443,14 +458,29 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 {
 	struct scatterlist *sg;
 	int i;
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+	bool need_sync = false;
+#endif
 
 	for_each_sg(sgl,  sg, nents, i) {
-		if (sg_dma_is_bus_address(sg))
+		if (sg_dma_is_bus_address(sg)) {
 			sg_dma_unmark_bus_address(sg);
-		else
+		} else {
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+			need_sync = true;
+			dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
+					      sg_dma_len(sg), dir, attrs);
+
+#else
 			dma_direct_unmap_phys(dev, sg->dma_address,
 					      sg_dma_len(sg), dir, attrs);
+#endif
+		}
 	}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+	if (need_sync && !dev_is_dma_coherent(dev))
+		arch_sync_dma_batch_flush();
+#endif
 }
 #endif
 
@@ -460,6 +490,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct pci_p2pdma_map_state p2pdma_state = {};
 	struct scatterlist *sg;
 	int i, ret;
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+	bool need_sync = false;
+#endif
 
 	for_each_sg(sgl, sg, nents, i) {
 		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,8 +504,14 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 			 */
 			break;
 		case PCI_P2PDMA_MAP_NONE:
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+			need_sync = true;
+			sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
+					sg->length, dir, attrs);
+#else
 			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
 					sg->length, dir, attrs);
+#endif
 			if (sg->dma_address == DMA_MAPPING_ERROR) {
 				ret = -EIO;
 				goto out_unmap;
@@ -490,6 +529,10 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		sg_dma_len(sg) = sg->length;
 	}
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+	if (need_sync && !dev_is_dma_coherent(dev))
+		arch_sync_dma_batch_flush();
+#endif
 	return nents;
 
 out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..a211bab26478 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,15 +64,11 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
 		arch_sync_dma_for_device(paddr, size, dir);
 }
 
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+		phys_addr_t paddr, size_t size, enum dma_data_direction dir)
 {
-	phys_addr_t paddr = dma_to_phys(dev, addr);
-
-	if (!dev_is_dma_coherent(dev)) {
-		arch_sync_dma_for_cpu(paddr, size, dir);
+	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu_all();
-	}
 
 	swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 
@@ -80,7 +76,31 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 		arch_dma_mark_clean(paddr, size);
 }
 
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+
+	__dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+#endif
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_for_cpu(paddr, size, dir);
+
+	__dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
 		phys_addr_t phys, size_t size, enum dma_data_direction dir,
 		unsigned long attrs)
 {
@@ -108,9 +128,6 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 		}
 	}
 
-	if (!dev_is_dma_coherent(dev) &&
-	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
-		arch_sync_dma_for_device(phys, size, dir);
 	return dma_addr;
 
 err_overflow:
@@ -121,6 +138,53 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 	return DMA_MAPPING_ERROR;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+		phys_addr_t phys, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+	if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+		arch_sync_dma_for_device_batch_add(phys, size, dir);
+
+	return dma_addr;
+}
+#endif
+
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+		phys_addr_t phys, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+	if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+		arch_sync_dma_for_device(phys, size, dir);
+
+	return dma_addr;
+}
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	phys_addr_t phys;
+
+	if (attrs & DMA_ATTR_MMIO)
+		/* nothing to do: uncached and no swiotlb */
+		return;
+
+	phys = dma_to_phys(dev, addr);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+
+	swiotlb_tbl_unmap_single(dev, phys, size, dir,
+					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif
+
 static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-- 
2.39.3 (Apple Git-146)
Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Posted by Catalin Marinas 2 months, 3 weeks ago
On Wed, Oct 29, 2025 at 10:31:15AM +0800, Barry Song wrote:
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 1f9ee9759426..a0b45f84a91f 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>  		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
>  
>  		if (!dev_is_dma_coherent(dev))
> -			arch_sync_dma_for_device(paddr, sg->length,
> -					dir);
> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> +			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
> +#else
> +			arch_sync_dma_for_device(paddr, sg->length, dir);
> +#endif
>  	}
> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> +	if (!dev_is_dma_coherent(dev))
> +		arch_sync_dma_batch_flush();
> +#endif
>  }
>  #endif

Just a high-level comment for now. I'm not opposed to the idea of
batching the DSB barriers, we do this for ptes. However, the way it's
implemented in the generic files, with lots of #ifdefs, makes the code
pretty unreadable.

Can we have something like arch_sync_dma_begin/end() and let the arch
code handle the barriers as they see fit?

-- 
Catalin
Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Posted by Barry Song 2 months, 3 weeks ago
On Fri, Nov 14, 2025 at 2:19 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, Oct 29, 2025 at 10:31:15AM +0800, Barry Song wrote:
> > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > index 1f9ee9759426..a0b45f84a91f 100644
> > --- a/kernel/dma/direct.c
> > +++ b/kernel/dma/direct.c
> > @@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
> >               swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
> >
> >               if (!dev_is_dma_coherent(dev))
> > -                     arch_sync_dma_for_device(paddr, sg->length,
> > -                                     dir);
> > +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> > +                     arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
> > +#else
> > +                     arch_sync_dma_for_device(paddr, sg->length, dir);
> > +#endif
> >       }
> > +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> > +     if (!dev_is_dma_coherent(dev))
> > +             arch_sync_dma_batch_flush();
> > +#endif
> >  }
> >  #endif
>
> Just a high-level comment for now. I'm not opposed to the idea of
> batching the DSB barriers, we do this for ptes. However, the way it's


Thanks, Catalin. I agree we need batching: phones and embedded systems can
use many DMA buffers while some SoCs lack hardware DMA coherency.


> implemented in the generic files, with lots of #ifdefs, makes the code
> pretty unreadable.
>
> Can we have something like arch_sync_dma_begin/end() and let the arch
> code handle the barriers as they see fit?


I guess I can refactor it as below and then remove the #ifdef/#else/#endif blocks.

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 8fcd0a9c1f39..73bca4d7149d 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -373,6 +373,20 @@ void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
 void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
 void arch_sync_dma_batch_flush(void);
+#else
+static inline void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_device(paddr, size, dir);
+}
+static inline void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_cpu(paddr, size, dir);
+}
+static inline void arch_sync_dma_batch_flush(void)
+{
+}
 #endif
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a0b45f84a91f..69b14b0c0501 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,16 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
 
 		if (!dev_is_dma_coherent(dev))
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
-#else
-			arch_sync_dma_for_device(paddr, sg->length, dir);
-#endif
 	}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif
 }
 #endif
 
@@ -429,11 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
 		if (!dev_is_dma_coherent(dev))
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
-#else
-			arch_sync_dma_for_cpu(paddr, sg->length, dir);
-#endif
 
 		swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
 
@@ -443,9 +433,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 
 	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu_all();
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 		arch_sync_dma_batch_flush();
-#endif
 	}
 }
 
@@ -458,29 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 {
 	struct scatterlist *sg;
 	int i;
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	bool need_sync = false;
-#endif
 
 	for_each_sg(sgl,  sg, nents, i) {
 		if (sg_dma_is_bus_address(sg)) {
 			sg_dma_unmark_bus_address(sg);
 		} else {
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			need_sync = true;
 			dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
 					      sg_dma_len(sg), dir, attrs);
-
-#else
-			dma_direct_unmap_phys(dev, sg->dma_address,
-					      sg_dma_len(sg), dir, attrs);
-#endif
 		}
 	}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (need_sync && !dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif
 }
 #endif
 
@@ -490,9 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct pci_p2pdma_map_state p2pdma_state = {};
 	struct scatterlist *sg;
 	int i, ret;
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	bool need_sync = false;
-#endif
 
 	for_each_sg(sgl, sg, nents, i) {
 		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -504,14 +480,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 			 */
 			break;
 		case PCI_P2PDMA_MAP_NONE:
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			need_sync = true;
 			sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
 					sg->length, dir, attrs);
-#else
-			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
-					sg->length, dir, attrs);
-#endif
 			if (sg->dma_address == DMA_MAPPING_ERROR) {
 				ret = -EIO;
 				goto out_unmap;
@@ -529,10 +500,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		sg_dma_len(sg) = sg->length;
 	}
 
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (need_sync && !dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif
 	return nents;
 
 out_unmap:
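
For context, the arch side that pairs with these fallbacks could look roughly
like the sketch below on arm64 (illustrative only; dcache_clean_poc_nodsb()
and dcache_inval_poc_nodsb() are placeholder names for the no-barrier cache
maintenance loops, the real implementation lives in the earlier patches of
this series):

void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	/* clean to PoC, but skip the per-call DSB */
	dcache_clean_poc_nodsb(start, start + size);
}

void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	if (dir == DMA_TO_DEVICE)
		return;

	/* invalidate to PoC, again without the per-call DSB */
	dcache_inval_poc_nodsb(start, start + size);
}

void arch_sync_dma_batch_flush(void)
{
	/* one barrier covers everything queued since the last flush */
	dsb(sy);
}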


Thanks
Barry
Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Posted by Marek Szyprowski 2 months, 2 weeks ago
Hi Barry,

On 17.11.2025 22:12, Barry Song wrote:
> On Fri, Nov 14, 2025 at 2:19 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>> On Wed, Oct 29, 2025 at 10:31:15AM +0800, Barry Song wrote:
>>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>>> index 1f9ee9759426..a0b45f84a91f 100644
>>> --- a/kernel/dma/direct.c
>>> +++ b/kernel/dma/direct.c
>>> @@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>>>                swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
>>>
>>>                if (!dev_is_dma_coherent(dev))
>>> -                     arch_sync_dma_for_device(paddr, sg->length,
>>> -                                     dir);
>>> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>>> +                     arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
>>> +#else
>>> +                     arch_sync_dma_for_device(paddr, sg->length, dir);
>>> +#endif
>>>        }
>>> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>>> +     if (!dev_is_dma_coherent(dev))
>>> +             arch_sync_dma_batch_flush();
>>> +#endif
>>>   }
>>>   #endif
>> Just a high-level comment for now. I'm not opposed to the idea of
>> batching the DSB barriers, we do this for ptes. However, the way it's
>
> Thanks, Catalin. I agree we need batching, as phones and embedded systems
> could use many DMA buffers while some chips lack DMA-coherency.
>
>
>> implemented in the generic files, with lots of #ifdefs, makes the code
>> pretty unreadable.
>>
>> Can we have something like arch_sync_dma_begin/end() and let the arch
>> code handle the barriers as they see fit?
>
> I guess I can refactor it as below and then remove the #ifdef/#else/#endif blocks.
>
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 8fcd0a9c1f39..73bca4d7149d 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -373,6 +373,20 @@ void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
>   void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
>   		enum dma_data_direction dir);
>   void arch_sync_dma_batch_flush(void);
> +#else
> +static inline void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	arch_sync_dma_for_device(paddr, size, dir);
> +}
> +static inline void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	arch_sync_dma_for_cpu(paddr, size, dir);
> +}
> +static inline void arch_sync_dma_batch_flush(void)
> +{
> +}
>   #endif
>   
>   #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index a0b45f84a91f..69b14b0c0501 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -403,16 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>   		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
>   
>   		if (!dev_is_dma_coherent(dev))
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
> -#else
> -			arch_sync_dma_for_device(paddr, sg->length, dir);
> -#endif
>   	}
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (!dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   }
>   #endif
>   
> @@ -429,11 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
>   		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
>   
>   		if (!dev_is_dma_coherent(dev))
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> -#else
> -			arch_sync_dma_for_cpu(paddr, sg->length, dir);
> -#endif
>   
>   		swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
>   
> @@ -443,9 +433,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
>   
>   	if (!dev_is_dma_coherent(dev)) {
>   		arch_sync_dma_for_cpu_all();
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   		arch_sync_dma_batch_flush();
> -#endif
>   	}
>   }
>   
> @@ -458,29 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
>   {
>   	struct scatterlist *sg;
>   	int i;
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	bool need_sync = false;
> -#endif
>   
>   	for_each_sg(sgl,  sg, nents, i) {
>   		if (sg_dma_is_bus_address(sg)) {
>   			sg_dma_unmark_bus_address(sg);
>   		} else {
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			need_sync = true;
>   			dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
>   					      sg_dma_len(sg), dir, attrs);
> -
> -#else
> -			dma_direct_unmap_phys(dev, sg->dma_address,
> -					      sg_dma_len(sg), dir, attrs);
> -#endif
>   		}
>   	}
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (need_sync && !dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   }
>   #endif
>   
> @@ -490,9 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   	struct pci_p2pdma_map_state p2pdma_state = {};
>   	struct scatterlist *sg;
>   	int i, ret;
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	bool need_sync = false;
> -#endif
>   
>   	for_each_sg(sgl, sg, nents, i) {
>   		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
> @@ -504,14 +480,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   			 */
>   			break;
>   		case PCI_P2PDMA_MAP_NONE:
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			need_sync = true;
>   			sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
>   					sg->length, dir, attrs);
> -#else
> -			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
> -					sg->length, dir, attrs);
> -#endif
>   			if (sg->dma_address == DMA_MAPPING_ERROR) {
>   				ret = -EIO;
>   				goto out_unmap;
> @@ -529,10 +500,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   		sg_dma_len(sg) = sg->length;
>   	}
>   
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (need_sync && !dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   	return nents;
>   
>   out_unmap:
>
>
This version looks a bit better to me. Similar batching could also be added
to the dma_iova_link()/dma_iova_sync() paths.

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland

Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Posted by Barry Song 2 months, 2 weeks ago
On Sat, Nov 22, 2025 at 12:09 AM Marek Szyprowski <m.szyprowski@samsung.com> wrote:
>
> Hi Barry,
>
[...]
> This version looks a bit better to me. Similar batching could be added
> also to dma_iova_link()/dma_iova_sync() paths.

Thanks, Marek. I will respin a new version. For dma_iova, I assume you meant
something like the following?

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7944a3af4545..7bb6ed663236 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1837,7 +1837,7 @@ static int __dma_iova_link(struct device *dev, dma_addr_t addr,
 	int prot = dma_info_to_prot(dir, coherent, attrs);
 
 	if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
-		arch_sync_dma_for_device(phys, size, dir);
+		arch_sync_dma_for_device_batch_add(phys, size, dir);
 
 	return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size,
 			prot, GFP_ATOMIC);
@@ -1980,6 +1980,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
 	dma_addr_t addr = state->addr + offset;
 	size_t iova_start_pad = iova_offset(iovad, addr);
 
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_batch_flush();
 	return iommu_sync_map(domain, addr - iova_start_pad,
 		      iova_align(iovad, size + iova_start_pad));
 }

If so, I don't really have the hardware to test this path. Could I add it as
patch 6/6 when respinning and still mark it as RFC v2? Or should I leave it
as is and let someone with the hardware test and send it?

Thanks
Barry
Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Posted by Marek Szyprowski 2 months, 2 weeks ago
On 22.11.2025 00:28, Barry Song wrote:
> On Sat, Nov 22, 2025 at 12:09 AM Marek Szyprowski <m.szyprowski@samsung.com> wrote:
> [...]
>> This version looks a bit better to me. Similar batching could be added
>> also to dma_iova_link()/dma_iova_sync() paths.
> Thanks, Marek. I will respin a new version. For dma_iova, I assume you meant
> something like the following?
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 7944a3af4545..7bb6ed663236 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1837,7 +1837,7 @@ static int __dma_iova_link(struct device *dev, dma_addr_t addr,
>   	int prot = dma_info_to_prot(dir, coherent, attrs);
>   
>   	if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
> -		arch_sync_dma_for_device(phys, size, dir);
> +		arch_sync_dma_for_device_batch_add(phys, size, dir);
>   
>   	return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size,
>   			prot, GFP_ATOMIC);
> @@ -1980,6 +1980,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
>   	dma_addr_t addr = state->addr + offset;
>   	size_t iova_start_pad = iova_offset(iovad, addr);
>   
> +	if (!dev_is_dma_coherent(dev))
> +		arch_sync_dma_batch_flush();
>   	return iommu_sync_map(domain, addr - iova_start_pad,
>   		      iova_align(iovad, size + iova_start_pad));
>   }
>
> If so, I don't really have such hardware to test. I wonder if I can make it as
> patch 6/6 when respinning, and still mark it as RFC v2. Or should I leave it as
> is and expect someone with the hardware to test and send it?

Yes, I meant something like the above diff, and the same for dma_iova_unlink().
It can be an additional, 6th patch with an RFC tag, assuming you have no way
to test it.

Please note that I've just sent a patch touching similar code paths:

https://lore.kernel.org/all/20251124170955.3884351-1-m.szyprowski@samsung.com/

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland