From: Barry Song <v-songbaohua@oppo.com>
This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
operations when possible. This significantly improves performance on
devices without hardware cache coherence.
Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
phone platform (MediaTek Dimensity 9500). The tests were performed by
pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
sg entries per buffer) for 200 iterations and then averaging the
results.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.c | 28 ++++++++++-----
kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
2 files changed, 95 insertions(+), 19 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..ed2339b0c5e7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
- dir);
+ arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -422,7 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +431,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_batch_flush();
+ }
}
/*
@@ -443,14 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
- dma_direct_unmap_phys(dev, sg->dma_address,
+ } else {
+ need_sync = true;
+ dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -460,6 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,7 +480,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
- sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ need_sync = true;
+ sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
@@ -491,6 +501,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..a211bab26478 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,15 +64,11 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, size, dir);
}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+ phys_addr_t paddr, size_t size, enum dma_data_direction dir)
{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
- }
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
@@ -80,7 +76,31 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+#endif
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -108,9 +128,6 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
}
- if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
err_overflow:
@@ -121,6 +138,53 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+
+ return dma_addr;
+}
+#endif
+
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device(phys, size, dir);
+
+ return dma_addr;
+}
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+
+ phys = dma_to_phys(dev, addr);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif
+
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
--
2.39.3 (Apple Git-146)
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on next-20251219]
[cannot apply to arm64/for-next/core v6.16-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20251220/202512201836.f6KX6WMH-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251220/202512201836.f6KX6WMH-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512201836.f6KX6WMH-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^
kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
| ^
>> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^
2 errors generated.
vim +/dma_direct_unmap_phys_batch_add +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
>
> All errors (new ones prefixed by >>):
>
> >> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
> | ^
> kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
> kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
> 188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
> | ^
> >> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
> | ^
> 2 errors generated.
>
>
Thanks very much for the report.
Can you please check if the diff below fixes the build issue?
From 5541aa1efa19777e435c9f3cca7cd2c6a490d9f1 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sun, 21 Dec 2025 13:09:36 +0800
Subject: [PATCH] kernel/dma: Fix build errors for dma_direct_map_phys
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512201836.f6KX6WMH-lkp@intel.com/
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.h | 38 ++++++++++++++++++++++++++------------
1 file changed, 26 insertions(+), 12 deletions(-)
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index a211bab26478..bcc398b5aa6b 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -138,8 +138,7 @@ static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
-static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -147,13 +146,13 @@ static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device_batch_add(phys, size, dir);
+ arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
}
-#endif
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -161,13 +160,20 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
return dma_addr;
}
+#else
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return dma_direct_map_phys(dev, phys, size, dir, attrs);
+}
+#endif
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
-static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys;
@@ -178,14 +184,14 @@ static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
-#endif
-static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys;
@@ -196,9 +202,17 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+#else
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+}
+#endif
+
#endif /* _KERNEL_DMA_DIRECT_H */
--
2.39.3 (Apple Git-146)
Thanks
Barry
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc1 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251221/202512211320.LaiSSLAc-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251221/202512211320.LaiSSLAc-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512211320.LaiSSLAc-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/dma/direct.c: In function 'dma_direct_unmap_sg':
>> kernel/dma/direct.c:456:25: error: implicit declaration of function 'dma_direct_unmap_phys_batch_add'; did you mean 'dma_direct_unmap_phys'? [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_unmap_phys
kernel/dma/direct.c: In function 'dma_direct_map_sg':
>> kernel/dma/direct.c:484:43: error: implicit declaration of function 'dma_direct_map_phys_batch_add'; did you mean 'dma_direct_map_phys'? [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_map_phys
vim +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc2 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: i386-buildonly-randconfig-006-20251222 (https://download.01.org/0day-ci/archive/20251222/202512222029.Dd6Vs1Eg-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251222/202512222029.Dd6Vs1Eg-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512222029.Dd6Vs1Eg-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^
kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
| ^
>> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^
2 errors generated.
vim +/dma_direct_unmap_phys_batch_add +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc2 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-randconfig-161-20251222 (https://download.01.org/0day-ci/archive/20251222/202512222137.rpXOEE5p-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251222/202512222137.rpXOEE5p-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512222137.rpXOEE5p-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/dma/direct.c: In function 'dma_direct_unmap_sg':
>> kernel/dma/direct.c:456:25: error: implicit declaration of function 'dma_direct_unmap_phys_batch_add'; did you mean 'dma_direct_unmap_phys'? [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_unmap_phys
kernel/dma/direct.c: In function 'dma_direct_map_sg':
>> kernel/dma/direct.c:484:43: error: implicit declaration of function 'dma_direct_map_phys_batch_add'; did you mean 'dma_direct_map_phys'? [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_map_phys
vim +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Fri, Dec 19, 2025 at 01:36:57PM +0800, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
>
> This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
> dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
> operations when possible. This significantly improves performance on
> devices without hardware cache coherence.
>
> Tangquan's initial results show that batched synchronization can reduce
> dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
> phone platform (MediaTek Dimensity 9500). The tests were performed by
> pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
> running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
> sg entries per buffer) for 200 iterations and then averaging the
> results.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> kernel/dma/direct.c | 28 ++++++++++-----
> kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
> 2 files changed, 95 insertions(+), 19 deletions(-)
<...>
> if (!dev_is_dma_coherent(dev))
> - arch_sync_dma_for_device(paddr, sg->length,
> - dir);
> + arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
<...>
> -static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> +static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> + phys_addr_t paddr = dma_to_phys(dev, addr);
> +
> + if (!dev_is_dma_coherent(dev))
> + arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
> +
> + __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
> +}
> +#endif
> +
> +static inline void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> + phys_addr_t paddr = dma_to_phys(dev, addr);
> +
> + if (!dev_is_dma_coherent(dev))
> + arch_sync_dma_for_cpu(paddr, size, dir);
> +
> + __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
> +}
> +
I'm wondering why you don't implement this batch‑sync support inside the
arch_sync_dma_*() functions. Doing so would minimize changes to the generic
kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
Thanks.
On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
[...]
> > +
>
> I'm wondering why you don't implement this batch‑sync support inside the
> arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
>
There are two cases: mapping an sg list and mapping a single
buffer. The former can be batched with
arch_sync_dma_*_batch_add() and flushed via
arch_sync_dma_batch_flush(), while the latter requires all work to
be done inside arch_sync_dma_*(). Therefore,
arch_sync_dma_*() cannot always batch and flush.
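Roughly, the two patterns look like this (just a sketch built on the
helpers proposed in this series; the sg loop is simplified):

/* sg path: queue per-entry maintenance, then one flush for the whole list */
for_each_sg(sgl, sg, nents, i)
	arch_sync_dma_for_device_batch_add(sg_phys(sg), sg->length, dir);
arch_sync_dma_batch_flush();

/*
 * single-buffer path: the plain helper has to stay self-contained,
 * i.e. it performs both the maintenance and the barrier itself.
 */
arch_sync_dma_for_device(paddr, size, dir);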
But yes, I can drop the ifdef in this patch. I have rewritten the entire
patch as shown below, and it will be tested today prior to
resending v2. Before I send v2, you are very welcome to comment.
From c03aae12c608b25fc1a84931ce78dbe3ef0f1ebe Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Wed, 29 Oct 2025 10:31:15 +0800
Subject: [PATCH v2 FOR DISCUSSION 5/6] dma-mapping: Allow batched DMA sync operations
This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
operations when possible. This significantly improves performance on
devices without hardware cache coherence.
Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
phone platform (MediaTek Dimensity 9500). The tests were performed by
pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
sg entries per buffer) for 200 iterations and then averaging the
results.
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.c | 28 +++++++++++++++------
kernel/dma/direct.h | 59 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 69 insertions(+), 18 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..ed2339b0c5e7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
- dir);
+ arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -422,7 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +431,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_batch_flush();
+ }
}
/*
@@ -443,14 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
- dma_direct_unmap_phys(dev, sg->dma_address,
+ } else {
+ need_sync = true;
+ dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -460,6 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,7 +480,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
- sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ need_sync = true;
+ sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
@@ -491,6 +501,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..2e25af887204 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,13 +64,16 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, size, dir);
}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ bool flush)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+ if (flush)
+ arch_sync_dma_batch_flush();
arch_sync_dma_for_cpu_all();
}
@@ -80,9 +83,15 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ __dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ unsigned long attrs, bool flush)
{
dma_addr_t dma_addr;
@@ -109,8 +118,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+ if (flush)
+ arch_sync_dma_batch_flush();
+ }
return dma_addr;
err_overflow:
@@ -121,8 +133,23 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
-static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
+}
+
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
+}
+
+static inline void __dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs,
+ bool flush)
{
phys_addr_t phys;
@@ -132,9 +159,21 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ __dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ __dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
+}
+
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ __dma_direct_unmap_phys(dev, addr, size, dir, attrs, false);
+}
#endif /* _KERNEL_DMA_DIRECT_H */
--
2.39.3 (Apple Git-146)
On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> [...]
> > > +
> >
> > I'm wondering why you don't implement this batch‑sync support inside the
> > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> >
>
> There are two cases: mapping an sg list and mapping a single
> buffer. The former can be batched with
> arch_sync_dma_*_batch_add() and flushed via
> arch_sync_dma_batch_flush(), while the latter requires all work to
> be done inside arch_sync_dma_*(). Therefore,
> arch_sync_dma_*() cannot always batch and flush.
Probably in all cases you can call the _batch_ variant, followed by _flush_,
even when handling a single page. This keeps the code consistent across all
paths. On platforms that do not support _batch_, the _flush_ operation will be
a NOP anyway.
I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
You can also minimize changes in dma_direct_map_phys() by extending
its signature to indicate whether a flush is needed or not.
dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
static inline dma_addr_t dma_direct_map_phys(...., bool flush)
{
....
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
{
arch_sync_dma_for_device(phys, size, dir);
if (flush)
arch_sync_dma_flush();
}
}
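Callers would then pass the flag explicitly. Roughly (a sketch only, not
the exact call sites):

/* sg path in dma_direct_map_sg(): defer the flush until after the loop */
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), sg->length,
				      dir, attrs, false);

if (need_sync && !dev_is_dma_coherent(dev))
	arch_sync_dma_flush();

/* single-buffer path: flush immediately */
dma_addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);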
Thanks
On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > [...]
> > > > +
> > >
> > > I'm wondering why you don't implement this batch‑sync support inside the
> > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > >
> >
> > There are two cases: mapping an sg list and mapping a single
> > buffer. The former can be batched with
> > arch_sync_dma_*_batch_add() and flushed via
> > arch_sync_dma_batch_flush(), while the latter requires all work to
> > be done inside arch_sync_dma_*(). Therefore,
> > arch_sync_dma_*() cannot always batch and flush.
>
> Probably in all cases you can call the _batch_ variant, followed by _flush_,
> even when handling a single page. This keeps the code consistent across all
> paths. On platforms that do not support _batch_, the _flush_ operation will be
> a NOP anyway.
We have a lot of code outside kernel/dma that also calls
arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
I guess we don’t want to modify so many things?
for kernel/dma, we have two "single" callers only:
kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
straightforward:
static inline void dma_direct_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
}
I guess moving to arch_sync_dma_for_device_batch + flush
doesn’t really look much better, does it?
>
> I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
Sure.
>
> You can also minimize changes in dma_direct_map_phys() by extending
> its signature to indicate whether a flush is needed or not.
Yes. I have
static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs, bool flush)
and two wrappers:
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
}
static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
}
If you prefer exposing "flush" directly in dma_direct_map_phys()
and updating its callers with flush=true, I think that’s fine.
It could also be true for dma_direct_sync_single_for_device().
>
> dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
>
> static inline dma_addr_t dma_direct_map_phys(...., bool flush)
> {
> ....
>
> if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
> !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
> {
> arch_sync_dma_for_device(phys, size, dir);
> if (flush)
> arch_sync_dma_flush();
> }
> }
>
Thanks
Barry
On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > [...]
> > > > > +
> > > >
> > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > >
> > >
> > > There are two cases: mapping an sg list and mapping a single
> > > buffer. The former can be batched with
> > > arch_sync_dma_*_batch_add() and flushed via
> > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > be done inside arch_sync_dma_*(). Therefore,
> > > arch_sync_dma_*() cannot always batch and flush.
> >
> > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > even when handling a single page. This keeps the code consistent across all
> > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > a NOP anyway.
>
> We have a lot of code outside kernel/dma that also calls
> arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> I guess we don’t want to modify so many things?
Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
>
> for kernel/dma, we have two "single" callers only:
> kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> straightforward:
>
> static inline void dma_direct_sync_single_for_device(struct device *dev,
> dma_addr_t addr, size_t size, enum dma_data_direction dir)
> {
> phys_addr_t paddr = dma_to_phys(dev, addr);
>
> swiotlb_sync_single_for_device(dev, paddr, size, dir);
>
> if (!dev_is_dma_coherent(dev))
> arch_sync_dma_for_device(paddr, size, dir);
> }
>
> I guess moving to arch_sync_dma_for_device_batch + flush
> doesn’t really look much better, does it?
>
> >
> > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
>
> Sure.
>
> >
> > You can also minimize changes in dma_direct_map_phys() by extending
> > its signature to indicate whether a flush is needed or not.
>
> Yes. I have
>
> static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs, bool flush)
My suggestion is to use it directly, without wrappers.
>
> and two wrappers:
> static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> }
>
> static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> }
>
> If you prefer exposing "flush" directly in dma_direct_map_phys()
> and updating its callers with flush=true, I think that’s fine.
Yes
>
> It could also be true for dma_direct_sync_single_for_device().
>
> >
> > dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
> >
> > static inline dma_addr_t dma_direct_map_phys(...., bool flush)
> > {
> > ....
> >
> > if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
> > !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
> > {
> > arch_sync_dma_for_device(phys, size, dir);
> > if (flush)
> > arch_sync_dma_flush();
> > }
> > }
> >
>
> Thanks
> Barry
>
On Wed, Dec 24, 2025 at 3:14 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> > On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > [...]
> > > > > > +
> > > > >
> > > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > > >
> > > >
> > > > There are two cases: mapping an sg list and mapping a single
> > > > buffer. The former can be batched with
> > > > arch_sync_dma_*_batch_add() and flushed via
> > > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > > be done inside arch_sync_dma_*(). Therefore,
> > > > arch_sync_dma_*() cannot always batch and flush.
> > >
> > > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > > even when handling a single page. This keeps the code consistent across all
> > > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > > a NOP anyway.
> >
> > We have a lot of code outside kernel/dma that also calls
> > arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> > I guess we don’t want to modify so many things?
>
> Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
for arch/arm, arch/mips, they are arch-specific implementations.
xen is an exception:
static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr);
struct io_tlb_pool *pool;
BUG_ON(dir == DMA_NONE);
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
}
/* NOTE: We use dev_addr here, not paddr! */
pool = xen_swiotlb_find_pool(hwdev, dev_addr);
if (pool)
__swiotlb_tbl_unmap_single(hwdev, paddr, size, dir,
attrs, pool);
}
>
> >
> > for kernel/dma, we have two "single" callers only:
> > kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> > straightforward:
> >
> > static inline void dma_direct_sync_single_for_device(struct device *dev,
> > dma_addr_t addr, size_t size, enum dma_data_direction dir)
> > {
> > phys_addr_t paddr = dma_to_phys(dev, addr);
> >
> > swiotlb_sync_single_for_device(dev, paddr, size, dir);
> >
> > if (!dev_is_dma_coherent(dev))
> > arch_sync_dma_for_device(paddr, size, dir);
> > }
> >
> > I guess moving to arch_sync_dma_for_device_batch + flush
> > doesn’t really look much better, does it?
> >
> > >
> > > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
> >
> > Sure.
> >
> > >
> > > You can also minimize changes in dma_direct_map_phys() by extending
> > > its signature to indicate whether a flush is needed or not.
> >
> > Yes. I have
> >
> > static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs, bool flush)
>
> My suggestion is to use it directly, without wrappers.
>
> >
> > and two wrappers:
> > static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs)
> > {
> > return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> > }
> >
> > static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs)
> > {
> > return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> > }
> >
> > If you prefer exposing "flush" directly in dma_direct_map_phys()
> > and updating its callers with flush=true, I think that’s fine.
>
> Yes
>
OK. Could you take a look at [1] and see if any further
improvements are needed before I send v2?
[1] https://lore.kernel.org/lkml/20251223023648.31614-1-21cnbao@gmail.com/
Thanks
Barry
On Wed, Dec 24, 2025 at 02:29:13PM +1300, Barry Song wrote:
> On Wed, Dec 24, 2025 at 3:14 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> > > On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > [...]
> > > > > > > +
> > > > > >
> > > > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > > > >
> > > > >
> > > > > There are two cases: mapping an sg list and mapping a single
> > > > > buffer. The former can be batched with
> > > > > arch_sync_dma_*_batch_add() and flushed via
> > > > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > > > be done inside arch_sync_dma_*(). Therefore,
> > > > > arch_sync_dma_*() cannot always batch and flush.
> > > >
> > > > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > > > even when handling a single page. This keeps the code consistent across all
> > > > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > > > a NOP anyway.
> > >
> > > We have a lot of code outside kernel/dma that also calls
> > > arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> > > I guess we don’t want to modify so many things?
> >
> > Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
>
> for arch/arm, arch/mips, they are arch-specific implementations.
> xen is an exception:
Right, and this is the only location outside of kernel/dma where you need to
invoke arch_sync_dma_flush().
>
> static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
> size_t size, enum dma_data_direction dir, unsigned long attrs)
> {
> phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr);
> struct io_tlb_pool *pool;
>
> BUG_ON(dir == DMA_NONE);
>
> if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
> if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
> arch_sync_dma_for_cpu(paddr, size, dir);
> else
> xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
> }
>
> /* NOTE: We use dev_addr here, not paddr! */
> pool = xen_swiotlb_find_pool(hwdev, dev_addr);
> if (pool)
> __swiotlb_tbl_unmap_single(hwdev, paddr, size, dir,
> attrs, pool);
> }
>
> >
> > >
> > > for kernel/dma, we have two "single" callers only:
> > > kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> > > straightforward:
> > >
> > > static inline void dma_direct_sync_single_for_device(struct device *dev,
> > > dma_addr_t addr, size_t size, enum dma_data_direction dir)
> > > {
> > > phys_addr_t paddr = dma_to_phys(dev, addr);
> > >
> > > swiotlb_sync_single_for_device(dev, paddr, size, dir);
> > >
> > > if (!dev_is_dma_coherent(dev))
> > > arch_sync_dma_for_device(paddr, size, dir);
> > > }
> > >
> > > I guess moving to arch_sync_dma_for_device_batch + flush
> > > doesn’t really look much better, does it?
> > >
> > > >
> > > > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
> > >
> > > Sure.
> > >
> > > >
> > > > You can also minimize changes in dma_direct_map_phys() by extending
> > > > its signature to indicate whether a flush is needed or not.
> > >
> > > Yes. I have
> > >
> > > static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs, bool flush)
> >
> > My suggestion is to use it directly, without wrappers.
> >
> > >
> > > and two wrappers:
> > > static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs)
> > > {
> > > return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> > > }
> > >
> > > static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs)
> > > {
> > > return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> > > }
> > >
> > > If you prefer exposing "flush" directly in dma_direct_map_phys()
> > > and updating its callers with flush=true, I think that’s fine.
> >
> > Yes
> >
>
> OK. Could you take a look at [1] and see if any further
> improvements are needed before I send v2?
Everything looks ok, except these renames:
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
Thanks
>
> [1] https://lore.kernel.org/lkml/20251223023648.31614-1-21cnbao@gmail.com/
>
> Thanks
> Barry
>
> > >
> >
> > OK. Could you take a look at [1] and see if any further
> > improvements are needed before I send v2?
>
> Everything looks ok, except these renames:
> - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
Thanks!
I'm happy to drop the rename as outlined below; feedback welcome :-)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index dd2c8586a725..487fb7c355ed 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -87,6 +87,12 @@ int cache_line_size(void);
#define dma_get_cache_alignment cache_line_size
+static inline void arch_sync_dma_flush(void)
+{
+ dsb(sy);
+}
+#define arch_sync_dma_flush arch_sync_dma_flush
+
/* Compress a u64 MPIDR value into 32 bits. */
static inline u64 arch_compact_of_hwid(u64 id)
{
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b2b5792b2caa..ae1ae0280eef 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
{
unsigned long start = (unsigned long)phys_to_virt(paddr);
- dcache_clean_poc(start, start + size);
+ dcache_clean_poc_nosync(start, start + size);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
if (dir == DMA_TO_DEVICE)
return;
- dcache_inval_poc(start, start + size);
+ dcache_inval_poc_nosync(start, start + size);
}
void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4809204c674c..e7dd8a63b40e 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
+#ifndef arch_sync_dma_flush
+static inline void arch_sync_dma_flush(void)
+{
+}
+#endif
+
#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(void);
#else
On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > >
> > >
> > > OK. Could you take a look at [1] and see if any further
> > > improvements are needed before I send v2?
> >
> > Everything looks ok, except these renames:
> > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
>
> Thanks!
> I'm happy to drop the rename as outlined below-feedback welcome :-)
>
> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> index dd2c8586a725..487fb7c355ed 100644
> --- a/arch/arm64/include/asm/cache.h
> +++ b/arch/arm64/include/asm/cache.h
> @@ -87,6 +87,12 @@ int cache_line_size(void);
>
> #define dma_get_cache_alignment cache_line_size
>
> +static inline void arch_sync_dma_flush(void)
> +{
> + dsb(sy);
> +}
> +#define arch_sync_dma_flush arch_sync_dma_flush
> +
> /* Compress a u64 MPIDR value into 32 bits. */
> static inline u64 arch_compact_of_hwid(u64 id)
> {
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index b2b5792b2caa..ae1ae0280eef 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> {
> unsigned long start = (unsigned long)phys_to_virt(paddr);
>
> - dcache_clean_poc(start, start + size);
> + dcache_clean_poc_nosync(start, start + size);
> }
>
> void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> if (dir == DMA_TO_DEVICE)
> return;
>
> - dcache_inval_poc(start, start + size);
> + dcache_inval_poc_nosync(start, start + size);
> }
>
> void arch_dma_prep_coherent(struct page *page, size_t size)
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 4809204c674c..e7dd8a63b40e 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> }
> #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
>
> +#ifndef arch_sync_dma_flush
You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
as done in the surrounding code.
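I.e. roughly along these lines, following the ARCH_HAS_SYNC_DMA_FOR_CPU
pattern above (sketch only):

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH
void arch_sync_dma_flush(void);
#else
static inline void arch_sync_dma_flush(void)
{
}
#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FLUSH */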
Thanks
> +static inline void arch_sync_dma_flush(void)
> +{
> +}
> +#endif
> +
> #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> void arch_sync_dma_for_cpu_all(void);
> #else
>
On Fri, Dec 26, 2025 at 1:36 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > > >
> > > >
> > > > OK. Could you take a look at [1] and see if any further
> > > > improvements are needed before I send v2?
> > >
> > > Everything looks ok, except these renames:
> > > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> >
> > Thanks!
> > I'm happy to drop the rename as outlined below-feedback welcome :-)
> >
> > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> > index dd2c8586a725..487fb7c355ed 100644
> > --- a/arch/arm64/include/asm/cache.h
> > +++ b/arch/arm64/include/asm/cache.h
> > @@ -87,6 +87,12 @@ int cache_line_size(void);
> >
> > #define dma_get_cache_alignment cache_line_size
> >
> > +static inline void arch_sync_dma_flush(void)
> > +{
> > + dsb(sy);
> > +}
> > +#define arch_sync_dma_flush arch_sync_dma_flush
> > +
> > /* Compress a u64 MPIDR value into 32 bits. */
> > static inline u64 arch_compact_of_hwid(u64 id)
> > {
> > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > index b2b5792b2caa..ae1ae0280eef 100644
> > --- a/arch/arm64/mm/dma-mapping.c
> > +++ b/arch/arm64/mm/dma-mapping.c
> > @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> > {
> > unsigned long start = (unsigned long)phys_to_virt(paddr);
> >
> > - dcache_clean_poc(start, start + size);
> > + dcache_clean_poc_nosync(start, start + size);
> > }
> >
> > void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > if (dir == DMA_TO_DEVICE)
> > return;
> >
> > - dcache_inval_poc(start, start + size);
> > + dcache_inval_poc_nosync(start, start + size);
> > }
> >
> > void arch_dma_prep_coherent(struct page *page, size_t size)
> > diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> > index 4809204c674c..e7dd8a63b40e 100644
> > --- a/include/linux/dma-map-ops.h
> > +++ b/include/linux/dma-map-ops.h
> > @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > }
> > #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
> >
> > +#ifndef arch_sync_dma_flush
>
> You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
> as done in the surrounding code.
I've dropped the new Kconfig option and now rely on whether
arch_sync_dma_flush() is provided by the architecture. If an arch
does not define arch_sync_dma_flush() in its asm/cache.h, a no-op
implementation is used instead.
Do you still prefer keeping a config option to match the surrounding
code style? Note that on arm64, arch_sync_dma_flush() is already a
static inline rather than an extern, so it is not strictly aligned
with the others.
Having both CONFIG_ARCH_HAS_SYNC_DMA_FLUSH and
"#ifndef arch_sync_dma_flush" seems redundant.
Another potential optimization would be to drop these options
entirely and handle this via ifndefs, letting each architecture
define the macros in asm/cache.h instead.
Whether an arch implements arch_sync_dma_for_xx() as a static inline
or as an external function makes no difference, e.g.:
- #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
- void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
-			enum dma_data_direction dir);
- #else
+ #ifndef arch_sync_dma_for_cpu
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
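An architecture would then opt in from its asm/cache.h, along these lines
(hypothetical example, names illustrative):

/* some architecture's asm/cache.h */
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	/* arch-specific invalidation, without the trailing barrier */
}
#define arch_sync_dma_for_cpu arch_sync_dma_for_cpu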
>
> Thanks
>
> > +static inline void arch_sync_dma_flush(void)
> > +{
> > +}
> > +#endif
> > +
> > #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> > void arch_sync_dma_for_cpu_all(void);
> > #else
> >
Thanks
Barry
On Fri, Dec 26, 2025 at 02:31:42AM +1300, Barry Song wrote:
> On Fri, Dec 26, 2025 at 1:36 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > > > >
> > > > >
> > > > > OK. Could you take a look at [1] and see if any further
> > > > > improvements are needed before I send v2?
> > > >
> > > > Everything looks ok, except these renames:
> > > > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > > > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> > >
> > > Thanks!
> > > I'm happy to drop the rename as outlined below-feedback welcome :-)
> > >
> > > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> > > index dd2c8586a725..487fb7c355ed 100644
> > > --- a/arch/arm64/include/asm/cache.h
> > > +++ b/arch/arm64/include/asm/cache.h
> > > @@ -87,6 +87,12 @@ int cache_line_size(void);
> > >
> > > #define dma_get_cache_alignment cache_line_size
> > >
> > > +static inline void arch_sync_dma_flush(void)
> > > +{
> > > + dsb(sy);
> > > +}
> > > +#define arch_sync_dma_flush arch_sync_dma_flush
> > > +
> > > /* Compress a u64 MPIDR value into 32 bits. */
> > > static inline u64 arch_compact_of_hwid(u64 id)
> > > {
> > > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > > index b2b5792b2caa..ae1ae0280eef 100644
> > > --- a/arch/arm64/mm/dma-mapping.c
> > > +++ b/arch/arm64/mm/dma-mapping.c
> > > @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> > > {
> > > unsigned long start = (unsigned long)phys_to_virt(paddr);
> > >
> > > - dcache_clean_poc(start, start + size);
> > > + dcache_clean_poc_nosync(start, start + size);
> > > }
> > >
> > > void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > if (dir == DMA_TO_DEVICE)
> > > return;
> > >
> > > - dcache_inval_poc(start, start + size);
> > > + dcache_inval_poc_nosync(start, start + size);
> > > }
> > >
> > > void arch_dma_prep_coherent(struct page *page, size_t size)
> > > diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> > > index 4809204c674c..e7dd8a63b40e 100644
> > > --- a/include/linux/dma-map-ops.h
> > > +++ b/include/linux/dma-map-ops.h
> > > @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > }
> > > #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
> > >
> > > +#ifndef arch_sync_dma_flush
> >
> > You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
> > as done in the surrounding code.
>
> I've dropped the new Kconfig option and now rely on whether
> arch_sync_dma_flush() is provided by the architecture. If an arch
> does not define arch_sync_dma_flush() in its asm/cache.h, a no-op
> implementation is used instead.
I know.
>
> Do you still prefer keeping a config option to match the surrounding
> code style?
I don't have a strong preference here. Go ahead and try your current
version and see how people respond.
> Note that on arm64, arch_sync_dma_flush() is already a
> static inline rather than an extern, so it is not strictly aligned
> with the others.
> Having both CONFIG_ARCH_HAS_SYNC_DMA_FLUSH and
> "#ifndef arch_sync_dma_flush" seems duplicated.
>
> Another potential optimization would be to drop these options
> entirely and handle this via ifndefs, letting each architecture
> define the macros in asm/cache.h instead.
>
> Whether arch implements arch_sync_dma_for_xx() as static inline or
> as external functions makes no difference.
>
> - #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
> - void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> -			enum dma_data_direction dir);
> - #else
> + #ifndef arch_sync_dma_for_cpu
> static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> enum dma_data_direction dir)
> {
> }
> #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
>
> >
> > Thanks
> >
> > > +static inline void arch_sync_dma_flush(void)
> > > +{
> > > +}
> > > +#endif
> > > +
> > > #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> > > void arch_sync_dma_for_cpu_all(void);
> > > #else
> > >
>
> Thanks
> Barry
>
>
> >
> > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
>
> Sure.
>
> >
> > You can also minimize changes in dma_direct_map_phys() too, by extending
> > it's signature to provide if flush is needed or not.
>
> Yes. I have
>
> static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs, bool flush)
>
> and two wrappers:
> static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> }
>
> static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> }
>
> If you prefer exposing "flush" directly in dma_direct_map_phys()
> and updating its callers with flush=true, I think that’s fine.
>
> It could be also true for dma_direct_sync_single_for_device().
Sorry for the typo. I meant dma_direct_sync_single_for_cpu().
With flush passed as an argument, the patch becomes the following.
Please feel free to comment before I send v2.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..5c65d213eb37 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,11 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
+ arch_sync_dma_for_device_batch_add(paddr, sg->length,
dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -422,7 +424,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +432,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_flush();
+ }
}
/*
@@ -443,14 +447,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
+ } else {
+ need_sync = true;
dma_direct_unmap_phys(dev, sg->dma_address,
- sg_dma_len(sg), dir, attrs);
+ sg_dma_len(sg), dir, attrs, false);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -460,6 +469,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,8 +481,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
+ need_sync = true;
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
- sg->length, dir, attrs);
+ sg->length, dir, attrs, false);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
goto out_unmap;
@@ -491,6 +502,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..b13eb5bfd051 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -65,12 +65,15 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
}
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ bool flush)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
arch_sync_dma_for_cpu_all();
}
@@ -82,7 +85,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ unsigned long attrs, bool flush)
{
dma_addr_t dma_addr;
@@ -109,8 +112,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
+ }
return dma_addr;
err_overflow:
@@ -122,7 +128,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+ size_t size, enum dma_data_direction dir, unsigned long attrs,
+ bool flush)
{
phys_addr_t phys;
@@ -132,9 +139,10 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 37163eb49f9f..d8cfa56a3cbb 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -166,7 +166,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
- addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);
else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
else if (ops->map_phys)
@@ -207,7 +207,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
- dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
else if (ops->unmap_phys)
@@ -373,7 +373,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
else if (use_dma_iommu(dev))
iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu)
--
2.43.0