From: Barry Song <v-songbaohua@oppo.com>
This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
operations when possible. This significantly improves performance on
devices without hardware cache coherence.
Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
phone platform (MediaTek Dimensity 9500). The tests were performed by
pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
sg entries per buffer) for 200 iterations and then averaging the
results.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.c | 28 ++++++++++-----
kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
2 files changed, 95 insertions(+), 19 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..ed2339b0c5e7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
- dir);
+ arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -422,7 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +431,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_batch_flush();
+ }
}
/*
@@ -443,14 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
- dma_direct_unmap_phys(dev, sg->dma_address,
+ } else {
+ need_sync = true;
+ dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -460,6 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,7 +480,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
- sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ need_sync = true;
+ sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
@@ -491,6 +501,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..a211bab26478 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,15 +64,11 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, size, dir);
}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+ phys_addr_t paddr, size_t size, enum dma_data_direction dir)
{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
- }
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
@@ -80,7 +76,31 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+#endif
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -108,9 +128,6 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
}
- if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
err_overflow:
@@ -121,6 +138,53 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+
+ return dma_addr;
+}
+#endif
+
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device(phys, size, dir);
+
+ return dma_addr;
+}
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+
+ phys = dma_to_phys(dev, addr);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif
+
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
--
2.39.3 (Apple Git-146)
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on next-20251219]
[cannot apply to arm64/for-next/core v6.16-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20251220/202512201836.f6KX6WMH-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251220/202512201836.f6KX6WMH-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512201836.f6KX6WMH-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^
kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
| ^
>> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^
2 errors generated.
vim +/dma_direct_unmap_phys_batch_add +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
>
> All errors (new ones prefixed by >>):
>
> >> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
> | ^
> kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
> kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
> 188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
> | ^
> >> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
> | ^
> 2 errors generated.
>
>
Thanks very much for the report.
Can you please check if the diff below fixes the build issue?
From 5541aa1efa19777e435c9f3cca7cd2c6a490d9f1 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sun, 21 Dec 2025 13:09:36 +0800
Subject: [PATCH] kernel/dma: Fix build errors for dma_direct_map_phys
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512201836.f6KX6WMH-lkp@intel.com/
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.h | 38 ++++++++++++++++++++++++++------------
1 file changed, 26 insertions(+), 12 deletions(-)
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index a211bab26478..bcc398b5aa6b 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -138,8 +138,7 @@ static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
-static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -147,13 +146,13 @@ static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device_batch_add(phys, size, dir);
+ arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
}
-#endif
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -161,13 +160,20 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
return dma_addr;
}
+#else
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return dma_direct_map_phys(dev, phys, size, dir, attrs);
+}
+#endif
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
-static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys;
@@ -178,14 +184,14 @@ static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
-#endif
-static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys;
@@ -196,9 +202,17 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+#else
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+}
+#endif
+
#endif /* _KERNEL_DMA_DIRECT_H */
--
2.39.3 (Apple Git-146)
Thanks
Barry
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc1 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251221/202512211320.LaiSSLAc-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251221/202512211320.LaiSSLAc-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512211320.LaiSSLAc-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/dma/direct.c: In function 'dma_direct_unmap_sg':
>> kernel/dma/direct.c:456:25: error: implicit declaration of function 'dma_direct_unmap_phys_batch_add'; did you mean 'dma_direct_unmap_phys'? [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_unmap_phys
kernel/dma/direct.c: In function 'dma_direct_map_sg':
>> kernel/dma/direct.c:484:43: error: implicit declaration of function 'dma_direct_map_phys_batch_add'; did you mean 'dma_direct_map_phys'? [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_map_phys
vim +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc2 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: i386-buildonly-randconfig-006-20251222 (https://download.01.org/0day-ci/archive/20251222/202512222029.Dd6Vs1Eg-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251222/202512222029.Dd6Vs1Eg-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512222029.Dd6Vs1Eg-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/dma/direct.c:456:4: error: call to undeclared function 'dma_direct_unmap_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^
kernel/dma/direct.c:456:4: note: did you mean 'dma_direct_unmap_phys'?
kernel/dma/direct.h:188:20: note: 'dma_direct_unmap_phys' declared here
188 | static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
| ^
>> kernel/dma/direct.c:484:22: error: call to undeclared function 'dma_direct_map_phys_batch_add'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^
2 errors generated.
vim +/dma_direct_unmap_phys_batch_add +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Barry,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc2 next-20251219]
[cannot apply to arm64/for-next/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Barry-Song/arm64-Provide-dcache_by_myline_op_nosync-helper/20251219-195810
base: linus/master
patch link: https://lore.kernel.org/r/20251219053658.84978-6-21cnbao%40gmail.com
patch subject: [PATCH 5/6] dma-mapping: Allow batched DMA sync operations if supported by the arch
config: x86_64-randconfig-161-20251222 (https://download.01.org/0day-ci/archive/20251222/202512222137.rpXOEE5p-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251222/202512222137.rpXOEE5p-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512222137.rpXOEE5p-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/dma/direct.c: In function 'dma_direct_unmap_sg':
>> kernel/dma/direct.c:456:25: error: implicit declaration of function 'dma_direct_unmap_phys_batch_add'; did you mean 'dma_direct_unmap_phys'? [-Wimplicit-function-declaration]
456 | dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_unmap_phys
kernel/dma/direct.c: In function 'dma_direct_map_sg':
>> kernel/dma/direct.c:484:43: error: implicit declaration of function 'dma_direct_map_phys_batch_add'; did you mean 'dma_direct_map_phys'? [-Wimplicit-function-declaration]
484 | sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| dma_direct_map_phys
vim +456 kernel/dma/direct.c
439
440 /*
441 * Unmaps segments, except for ones marked as pci_p2pdma which do not
442 * require any further action as they contain a bus address.
443 */
444 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
445 int nents, enum dma_data_direction dir, unsigned long attrs)
446 {
447 struct scatterlist *sg;
448 int i;
449 bool need_sync = false;
450
451 for_each_sg(sgl, sg, nents, i) {
452 if (sg_dma_is_bus_address(sg)) {
453 sg_dma_unmark_bus_address(sg);
454 } else {
455 need_sync = true;
> 456 dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
457 sg_dma_len(sg), dir, attrs);
458 }
459 }
460 if (need_sync && !dev_is_dma_coherent(dev))
461 arch_sync_dma_batch_flush();
462 }
463 #endif
464
465 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
466 enum dma_data_direction dir, unsigned long attrs)
467 {
468 struct pci_p2pdma_map_state p2pdma_state = {};
469 struct scatterlist *sg;
470 int i, ret;
471 bool need_sync = false;
472
473 for_each_sg(sgl, sg, nents, i) {
474 switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
475 case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
476 /*
477 * Any P2P mapping that traverses the PCI host bridge
478 * must be mapped with CPU physical address and not PCI
479 * bus addresses.
480 */
481 break;
482 case PCI_P2PDMA_MAP_NONE:
483 need_sync = true;
> 484 sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
485 sg->length, dir, attrs);
486 if (sg->dma_address == DMA_MAPPING_ERROR) {
487 ret = -EIO;
488 goto out_unmap;
489 }
490 break;
491 case PCI_P2PDMA_MAP_BUS_ADDR:
492 sg->dma_address = pci_p2pdma_bus_addr_map(
493 p2pdma_state.mem, sg_phys(sg));
494 sg_dma_len(sg) = sg->length;
495 sg_dma_mark_bus_address(sg);
496 continue;
497 default:
498 ret = -EREMOTEIO;
499 goto out_unmap;
500 }
501 sg_dma_len(sg) = sg->length;
502 }
503
504 if (need_sync && !dev_is_dma_coherent(dev))
505 arch_sync_dma_batch_flush();
506 return nents;
507
508 out_unmap:
509 dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
510 return ret;
511 }
512
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Fri, Dec 19, 2025 at 01:36:57PM +0800, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
>
> This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
> dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
> operations when possible. This significantly improves performance on
> devices without hardware cache coherence.
>
> Tangquan's initial results show that batched synchronization can reduce
> dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
> phone platform (MediaTek Dimensity 9500). The tests were performed by
> pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
> running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
> sg entries per buffer) for 200 iterations and then averaging the
> results.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Marek Szyprowski <m.szyprowski@samsung.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
> Cc: Ard Biesheuvel <ardb@kernel.org>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> kernel/dma/direct.c | 28 ++++++++++-----
> kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
> 2 files changed, 95 insertions(+), 19 deletions(-)
<...>
> if (!dev_is_dma_coherent(dev))
> - arch_sync_dma_for_device(paddr, sg->length,
> - dir);
> + arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
<...>
> -static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> +static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> + phys_addr_t paddr = dma_to_phys(dev, addr);
> +
> + if (!dev_is_dma_coherent(dev))
> + arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
> +
> + __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
> +}
> +#endif
> +
> +static inline void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> + phys_addr_t paddr = dma_to_phys(dev, addr);
> +
> + if (!dev_is_dma_coherent(dev))
> + arch_sync_dma_for_cpu(paddr, size, dir);
> +
> + __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
> +}
> +
I'm wondering why you don't implement this batch‑sync support inside the
arch_sync_dma_*() functions. Doing so would minimize changes to the generic
kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
Thanks.
On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
[...]
> > +
>
> I'm wondering why you don't implement this batch‑sync support inside the
> arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
>
There are two cases: mapping an sg list and mapping a single
buffer. The former can be batched with
arch_sync_dma_*_batch_add() and flushed via
arch_sync_dma_batch_flush(), while the latter requires all work to
be done inside arch_sync_dma_*(). Therefore,
arch_sync_dma_*() cannot always batch and flush.
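Roughly, the two patterns look like this (just a sketch built on the
helpers proposed in this series; the sg loop is simplified):

/* sg path: queue per-entry maintenance, then one flush for the whole list */
for_each_sg(sgl, sg, nents, i)
	arch_sync_dma_for_device_batch_add(sg_phys(sg), sg->length, dir);
arch_sync_dma_batch_flush();

/*
 * single-buffer path: the plain helper has to stay self-contained,
 * i.e. it performs both the maintenance and the barrier itself.
 */
arch_sync_dma_for_device(paddr, size, dir);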
But yes, I can drop the ifdef in this patch. I have rewritten the entire
patch as shown below, and it will be tested today prior to
resending v2. Before I send v2, you are very welcome to comment.
From c03aae12c608b25fc1a84931ce78dbe3ef0f1ebe Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Wed, 29 Oct 2025 10:31:15 +0800
Subject: [PATCH v2 FOR DISCUSSION 5/6] dma-mapping: Allow batched DMA sync operations
This enables dma_direct_sync_sg_for_device, dma_direct_sync_sg_for_cpu,
dma_direct_map_sg, and dma_direct_unmap_sg to use batched DMA sync
operations when possible. This significantly improves performance on
devices without hardware cache coherence.
Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
phone platform (MediaTek Dimensity 9500). The tests were performed by
pinning the task to CPU7 and fixing the CPU frequency at 2.6 GHz,
running dma_map_sg() and dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB
sg entries per buffer) for 200 iterations and then averaging the
results.
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
kernel/dma/direct.c | 28 +++++++++++++++------
kernel/dma/direct.h | 59 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 69 insertions(+), 18 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..ed2339b0c5e7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
- dir);
+ arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -422,7 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +431,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_batch_flush();
+ }
}
/*
@@ -443,14 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
- dma_direct_unmap_phys(dev, sg->dma_address,
+ } else {
+ need_sync = true;
+ dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
}
#endif
@@ -460,6 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,7 +480,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
- sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ need_sync = true;
+ sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
@@ -491,6 +501,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..2e25af887204 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,13 +64,16 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, size, dir);
}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ bool flush)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+ if (flush)
+ arch_sync_dma_batch_flush();
arch_sync_dma_for_cpu_all();
}
@@ -80,9 +83,15 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ __dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ unsigned long attrs, bool flush)
{
dma_addr_t dma_addr;
@@ -109,8 +118,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+ if (flush)
+ arch_sync_dma_batch_flush();
+ }
return dma_addr;
err_overflow:
@@ -121,8 +133,23 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
-static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
+}
+
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
+}
+
+static inline void __dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs,
+ bool flush)
{
phys_addr_t phys;
@@ -132,9 +159,21 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ __dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ __dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
+}
+
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ __dma_direct_unmap_phys(dev, addr, size, dir, attrs, false);
+}
#endif /* _KERNEL_DMA_DIRECT_H */
--
2.39.3 (Apple Git-146)
On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> [...]
> > > +
> >
> > I'm wondering why you don't implement this batch‑sync support inside the
> > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> >
>
> There are two cases: mapping an sg list and mapping a single
> buffer. The former can be batched with
> arch_sync_dma_*_batch_add() and flushed via
> arch_sync_dma_batch_flush(), while the latter requires all work to
> be done inside arch_sync_dma_*(). Therefore,
> arch_sync_dma_*() cannot always batch and flush.
Probably in all cases you can call the _batch_ variant, followed by _flush_,
even when handling a single page. This keeps the code consistent across all
paths. On platforms that do not support _batch_, the _flush_ operation will be
a NOP anyway.
I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
You can also minimize changes in dma_direct_map_phys() by extending
its signature to indicate whether a flush is needed or not.
dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
static inline dma_addr_t dma_direct_map_phys(...., bool flush)
{
....
if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
{
arch_sync_dma_for_device(phys, size, dir);
if (flush)
arch_sync_dma_flush();
}
}
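Callers would then pass the flag explicitly. Roughly (a sketch only, not
the exact call sites):

/* sg path in dma_direct_map_sg(): defer the flush until after the loop */
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), sg->length,
				      dir, attrs, false);

if (need_sync && !dev_is_dma_coherent(dev))
	arch_sync_dma_flush();

/* single-buffer path: flush immediately */
dma_addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);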
Thanks
On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > [...]
> > > > +
> > >
> > > I'm wondering why you don't implement this batch‑sync support inside the
> > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > >
> >
> > There are two cases: mapping an sg list and mapping a single
> > buffer. The former can be batched with
> > arch_sync_dma_*_batch_add() and flushed via
> > arch_sync_dma_batch_flush(), while the latter requires all work to
> > be done inside arch_sync_dma_*(). Therefore,
> > arch_sync_dma_*() cannot always batch and flush.
>
> Probably in all cases you can call the _batch_ variant, followed by _flush_,
> even when handling a single page. This keeps the code consistent across all
> paths. On platforms that do not support _batch_, the _flush_ operation will be
> a NOP anyway.
We have a lot of code outside kernel/dma that also calls
arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
I guess we don’t want to modify so many things?
for kernel/dma, we have two "single" callers only:
kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
straightforward:
static inline void dma_direct_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
}
I guess moving to arch_sync_dma_for_device_batch + flush
doesn’t really look much better, does it?
>
> I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
Sure.
>
> You can also minimize changes in dma_direct_map_phys() by extending
> its signature to indicate whether a flush is needed or not.
Yes. I have
static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs, bool flush)
and two wrappers:
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
}
static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
}
If you prefer exposing "flush" directly in dma_direct_map_phys()
and updating its callers with flush=true, I think that’s fine.
It could also be true for dma_direct_sync_single_for_device().
>
> dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
>
> static inline dma_addr_t dma_direct_map_phys(...., bool flush)
> {
> ....
>
> if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
> !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
> {
> arch_sync_dma_for_device(phys, size, dir);
> if (flush)
> arch_sync_dma_flush();
> }
> }
>
Thanks
Barry
On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > [...]
> > > > > +
> > > >
> > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > >
> > >
> > > There are two cases: mapping an sg list and mapping a single
> > > buffer. The former can be batched with
> > > arch_sync_dma_*_batch_add() and flushed via
> > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > be done inside arch_sync_dma_*(). Therefore,
> > > arch_sync_dma_*() cannot always batch and flush.
> >
> > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > even when handling a single page. This keeps the code consistent across all
> > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > a NOP anyway.
>
> We have a lot of code outside kernel/dma that also calls
> arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> I guess we don’t want to modify so many things?
Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
>
> for kernel/dma, we have two "single" callers only:
> kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> straightforward:
>
> static inline void dma_direct_sync_single_for_device(struct device *dev,
> dma_addr_t addr, size_t size, enum dma_data_direction dir)
> {
> phys_addr_t paddr = dma_to_phys(dev, addr);
>
> swiotlb_sync_single_for_device(dev, paddr, size, dir);
>
> if (!dev_is_dma_coherent(dev))
> arch_sync_dma_for_device(paddr, size, dir);
> }
>
> I guess moving to arch_sync_dma_for_device_batch + flush
> doesn’t really look much better, does it?
>
> >
> > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
>
> Sure.
>
> >
> > You can also minimize changes in dma_direct_map_phys() by extending
> > its signature to indicate whether a flush is needed or not.
>
> Yes. I have
>
> static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs, bool flush)
My suggestion is to use it directly, without wrappers.
>
> and two wrappers:
> static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> }
>
> static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> }
>
> If you prefer exposing "flush" directly in dma_direct_map_phys()
> and updating its callers with flush=true, I think that’s fine.
Yes
>
> It could also be true for dma_direct_sync_single_for_device().
>
> >
> > dma_direct_map_phys(....) -> dma_direct_map_phys(...., bool flush):
> >
> > static inline dma_addr_t dma_direct_map_phys(...., bool flush)
> > {
> > ....
> >
> > if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
> > !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
> > {
> > arch_sync_dma_for_device(phys, size, dir);
> > if (flush)
> > arch_sync_dma_flush();
> > }
> > }
> >
>
> Thanks
> Barry
>
On Wed, Dec 24, 2025 at 3:14 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> > On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> > >
> > > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > [...]
> > > > > > +
> > > > >
> > > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > > >
> > > >
> > > > There are two cases: mapping an sg list and mapping a single
> > > > buffer. The former can be batched with
> > > > arch_sync_dma_*_batch_add() and flushed via
> > > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > > be done inside arch_sync_dma_*(). Therefore,
> > > > arch_sync_dma_*() cannot always batch and flush.
> > >
> > > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > > even when handling a single page. This keeps the code consistent across all
> > > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > > a NOP anyway.
> >
> > We have a lot of code outside kernel/dma that also calls
> > arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> > I guess we don’t want to modify so many things?
>
> Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
for arch/arm, arch/mips, they are arch-specific implementations.
xen is an exception:
static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr);
struct io_tlb_pool *pool;
BUG_ON(dir == DMA_NONE);
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
}
/* NOTE: We use dev_addr here, not paddr! */
pool = xen_swiotlb_find_pool(hwdev, dev_addr);
if (pool)
__swiotlb_tbl_unmap_single(hwdev, paddr, size, dir,
attrs, pool);
}
>
> >
> > for kernel/dma, we have two "single" callers only:
> > kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> > straightforward:
> >
> > static inline void dma_direct_sync_single_for_device(struct device *dev,
> > dma_addr_t addr, size_t size, enum dma_data_direction dir)
> > {
> > phys_addr_t paddr = dma_to_phys(dev, addr);
> >
> > swiotlb_sync_single_for_device(dev, paddr, size, dir);
> >
> > if (!dev_is_dma_coherent(dev))
> > arch_sync_dma_for_device(paddr, size, dir);
> > }
> >
> > I guess moving to arch_sync_dma_for_device_batch + flush
> > doesn’t really look much better, does it?
> >
> > >
> > > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
> >
> > Sure.
> >
> > >
> > > You can also minimize changes in dma_direct_map_phys() by extending
> > > its signature to indicate whether a flush is needed or not.
> >
> > Yes. I have
> >
> > static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs, bool flush)
>
> My suggestion is to use it directly, without wrappers.
>
> >
> > and two wrappers:
> > static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs)
> > {
> > return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> > }
> >
> > static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > unsigned long attrs)
> > {
> > return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> > }
> >
> > If you prefer exposing "flush" directly in dma_direct_map_phys()
> > and updating its callers with flush=true, I think that’s fine.
>
> Yes
>
OK. Could you take a look at [1] and see if any further
improvements are needed before I send v2?
[1] https://lore.kernel.org/lkml/20251223023648.31614-1-21cnbao@gmail.com/
Thanks
Barry
On Wed, Dec 24, 2025 at 02:29:13PM +1300, Barry Song wrote:
> On Wed, Dec 24, 2025 at 3:14 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Tue, Dec 23, 2025 at 01:02:55PM +1300, Barry Song wrote:
> > > On Mon, Dec 22, 2025 at 9:49 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > >
> > > > On Mon, Dec 22, 2025 at 03:24:58AM +0800, Barry Song wrote:
> > > > > On Sun, Dec 21, 2025 at 7:55 PM Leon Romanovsky <leon@kernel.org> wrote:
> > > > > [...]
> > > > > > > +
> > > > > >
> > > > > > I'm wondering why you don't implement this batch‑sync support inside the
> > > > > > arch_sync_dma_*() functions. Doing so would minimize changes to the generic
> > > > > > kernel/dma/* code and reduce the amount of #ifdef‑based spaghetti.
> > > > > >
> > > > >
> > > > > There are two cases: mapping an sg list and mapping a single
> > > > > buffer. The former can be batched with
> > > > > arch_sync_dma_*_batch_add() and flushed via
> > > > > arch_sync_dma_batch_flush(), while the latter requires all work to
> > > > > be done inside arch_sync_dma_*(). Therefore,
> > > > > arch_sync_dma_*() cannot always batch and flush.
> > > >
> > > > Probably in all cases you can call the _batch_ variant, followed by _flush_,
> > > > even when handling a single page. This keeps the code consistent across all
> > > > paths. On platforms that do not support _batch_, the _flush_ operation will be
> > > > a NOP anyway.
> > >
> > > We have a lot of code outside kernel/dma that also calls
> > > arch_sync_dma_for_* such as arch/arm, arch/mips, drivers/xen,
> > > I guess we don’t want to modify so many things?
> >
> > Aren't they using internal, arch specific, arch_sync_dma_for_* implementations?
>
> for arch/arm, arch/mips, they are arch-specific implementations.
> xen is an exception:
Right, and this is the only location outside of kernel/dma where you need to
invoke arch_sync_dma_flush().
>
> static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
> size_t size, enum dma_data_direction dir, unsigned long attrs)
> {
> phys_addr_t paddr = xen_dma_to_phys(hwdev, dev_addr);
> struct io_tlb_pool *pool;
>
> BUG_ON(dir == DMA_NONE);
>
> if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
> if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
> arch_sync_dma_for_cpu(paddr, size, dir);
> else
> xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
> }
>
> /* NOTE: We use dev_addr here, not paddr! */
> pool = xen_swiotlb_find_pool(hwdev, dev_addr);
> if (pool)
> __swiotlb_tbl_unmap_single(hwdev, paddr, size, dir,
> attrs, pool);
> }
>
> >
> > >
> > > for kernel/dma, we have two "single" callers only:
> > > kernel/dma/direct.h, kernel/dma/swiotlb.c, and they look quite
> > > straightforward:
> > >
> > > static inline void dma_direct_sync_single_for_device(struct device *dev,
> > > dma_addr_t addr, size_t size, enum dma_data_direction dir)
> > > {
> > > phys_addr_t paddr = dma_to_phys(dev, addr);
> > >
> > > swiotlb_sync_single_for_device(dev, paddr, size, dir);
> > >
> > > if (!dev_is_dma_coherent(dev))
> > > arch_sync_dma_for_device(paddr, size, dir);
> > > }
> > >
> > > I guess moving to arch_sync_dma_for_device_batch + flush
> > > doesn’t really look much better, does it?
> > >
> > > >
> > > > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
> > >
> > > Sure.
> > >
> > > >
> > > > You can also minimize changes in dma_direct_map_phys() by extending
> > > > its signature to indicate whether a flush is needed or not.
> > >
> > > Yes. I have
> > >
> > > static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs, bool flush)
> >
> > My suggestion is to use it directly, without wrappers.
> >
> > >
> > > and two wrappers:
> > > static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs)
> > > {
> > > return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> > > }
> > >
> > > static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> > > phys_addr_t phys, size_t size, enum dma_data_direction dir,
> > > unsigned long attrs)
> > > {
> > > return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> > > }
> > >
> > > If you prefer exposing "flush" directly in dma_direct_map_phys()
> > > and updating its callers with flush=true, I think that’s fine.
> >
> > Yes
> >
>
> OK. Could you take a look at [1] and see if any further
> improvements are needed before I send v2?
Everything looks ok, except these renames:
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
Thanks
>
> [1] https://lore.kernel.org/lkml/20251223023648.31614-1-21cnbao@gmail.com/
>
> Thanks
> Barry
>
> > >
> >
> > OK. Could you take a look at [1] and see if any further
> > improvements are needed before I send v2?
>
> Everything looks ok, except these renames:
> - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
Thanks!
I'm happy to drop the rename as outlined below; feedback welcome :-)
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index dd2c8586a725..487fb7c355ed 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -87,6 +87,12 @@ int cache_line_size(void);
#define dma_get_cache_alignment cache_line_size
+static inline void arch_sync_dma_flush(void)
+{
+ dsb(sy);
+}
+#define arch_sync_dma_flush arch_sync_dma_flush
+
/* Compress a u64 MPIDR value into 32 bits. */
static inline u64 arch_compact_of_hwid(u64 id)
{
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b2b5792b2caa..ae1ae0280eef 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
{
unsigned long start = (unsigned long)phys_to_virt(paddr);
- dcache_clean_poc(start, start + size);
+ dcache_clean_poc_nosync(start, start + size);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
if (dir == DMA_TO_DEVICE)
return;
- dcache_inval_poc(start, start + size);
+ dcache_inval_poc_nosync(start, start + size);
}
void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4809204c674c..e7dd8a63b40e 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
+#ifndef arch_sync_dma_flush
+static inline void arch_sync_dma_flush(void)
+{
+}
+#endif
+
#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(void);
#else
On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > >
> > >
> > > OK. Could you take a look at [1] and see if any further
> > > improvements are needed before I send v2?
> >
> > Everything looks ok, except these renames:
> > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
>
> Thanks!
> I'm happy to drop the rename as outlined below-feedback welcome :-)
>
> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> index dd2c8586a725..487fb7c355ed 100644
> --- a/arch/arm64/include/asm/cache.h
> +++ b/arch/arm64/include/asm/cache.h
> @@ -87,6 +87,12 @@ int cache_line_size(void);
>
> #define dma_get_cache_alignment cache_line_size
>
> +static inline void arch_sync_dma_flush(void)
> +{
> + dsb(sy);
> +}
> +#define arch_sync_dma_flush arch_sync_dma_flush
> +
> /* Compress a u64 MPIDR value into 32 bits. */
> static inline u64 arch_compact_of_hwid(u64 id)
> {
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index b2b5792b2caa..ae1ae0280eef 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> {
> unsigned long start = (unsigned long)phys_to_virt(paddr);
>
> - dcache_clean_poc(start, start + size);
> + dcache_clean_poc_nosync(start, start + size);
> }
>
> void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> if (dir == DMA_TO_DEVICE)
> return;
>
> - dcache_inval_poc(start, start + size);
> + dcache_inval_poc_nosync(start, start + size);
> }
>
> void arch_dma_prep_coherent(struct page *page, size_t size)
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 4809204c674c..e7dd8a63b40e 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> }
> #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
>
> +#ifndef arch_sync_dma_flush
You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
as done in the surrounding code.
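I.e. roughly along these lines, following the ARCH_HAS_SYNC_DMA_FOR_CPU
pattern above (sketch only):

#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH
void arch_sync_dma_flush(void);
#else
static inline void arch_sync_dma_flush(void)
{
}
#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FLUSH */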
Thanks
> +static inline void arch_sync_dma_flush(void)
> +{
> +}
> +#endif
> +
> #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> void arch_sync_dma_for_cpu_all(void);
> #else
>
On Fri, Dec 26, 2025 at 1:36 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > > >
> > > >
> > > > OK. Could you take a look at [1] and see if any further
> > > > improvements are needed before I send v2?
> > >
> > > Everything looks ok, except these renames:
> > > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> >
> > Thanks!
> > I'm happy to drop the rename as outlined below-feedback welcome :-)
> >
> > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> > index dd2c8586a725..487fb7c355ed 100644
> > --- a/arch/arm64/include/asm/cache.h
> > +++ b/arch/arm64/include/asm/cache.h
> > @@ -87,6 +87,12 @@ int cache_line_size(void);
> >
> > #define dma_get_cache_alignment cache_line_size
> >
> > +static inline void arch_sync_dma_flush(void)
> > +{
> > + dsb(sy);
> > +}
> > +#define arch_sync_dma_flush arch_sync_dma_flush
> > +
> > /* Compress a u64 MPIDR value into 32 bits. */
> > static inline u64 arch_compact_of_hwid(u64 id)
> > {
> > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > index b2b5792b2caa..ae1ae0280eef 100644
> > --- a/arch/arm64/mm/dma-mapping.c
> > +++ b/arch/arm64/mm/dma-mapping.c
> > @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> > {
> > unsigned long start = (unsigned long)phys_to_virt(paddr);
> >
> > - dcache_clean_poc(start, start + size);
> > + dcache_clean_poc_nosync(start, start + size);
> > }
> >
> > void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > if (dir == DMA_TO_DEVICE)
> > return;
> >
> > - dcache_inval_poc(start, start + size);
> > + dcache_inval_poc_nosync(start, start + size);
> > }
> >
> > void arch_dma_prep_coherent(struct page *page, size_t size)
> > diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> > index 4809204c674c..e7dd8a63b40e 100644
> > --- a/include/linux/dma-map-ops.h
> > +++ b/include/linux/dma-map-ops.h
> > @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > }
> > #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
> >
> > +#ifndef arch_sync_dma_flush
>
> You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
> as done in the surrounding code.
I've dropped the new Kconfig option and now rely on whether
arch_sync_dma_flush() is provided by the architecture. If an arch
does not define arch_sync_dma_flush() in its asm/cache.h, a no-op
implementation is used instead.
Do you still prefer keeping a config option to match the surrounding
code style? Note that on arm64, arch_sync_dma_flush() is already a
static inline rather than an extern, so it is not strictly aligned
with the others.
Having both CONFIG_ARCH_HAS_SYNC_DMA_FLUSH and
"#ifndef arch_sync_dma_flush" seems redundant.
Another potential optimization would be to drop these options
entirely and handle this via ifndefs, letting each architecture
define the macros in asm/cache.h instead.
Whether an arch implements arch_sync_dma_for_xx() as a static inline
or as an external function makes no difference, e.g.:
- #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
- void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
-			enum dma_data_direction dir);
- #else
+ #ifndef arch_sync_dma_for_cpu
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
}
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
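An architecture would then opt in from its asm/cache.h, along these lines
(hypothetical example, names illustrative):

/* some architecture's asm/cache.h */
static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	/* arch-specific invalidation, without the trailing barrier */
}
#define arch_sync_dma_for_cpu arch_sync_dma_for_cpu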
>
> Thanks
>
> > +static inline void arch_sync_dma_flush(void)
> > +{
> > +}
> > +#endif
> > +
> > #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> > void arch_sync_dma_for_cpu_all(void);
> > #else
> >
Thanks
Barry
On Fri, Dec 26, 2025 at 02:31:42AM +1300, Barry Song wrote:
> On Fri, Dec 26, 2025 at 1:36 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Thu, Dec 25, 2025 at 06:45:09PM +1300, Barry Song wrote:
> > > > > >
> > > > >
> > > > > OK. Could you take a look at [1] and see if any further
> > > > > improvements are needed before I send v2?
> > > >
> > > > Everything looks ok, except these renames:
> > > > - arch_sync_dma_for_cpu(paddr, sg->length, dir);
> > > > + arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> > >
> > > Thanks!
> > > I'm happy to drop the rename as outlined below-feedback welcome :-)
> > >
> > > diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> > > index dd2c8586a725..487fb7c355ed 100644
> > > --- a/arch/arm64/include/asm/cache.h
> > > +++ b/arch/arm64/include/asm/cache.h
> > > @@ -87,6 +87,12 @@ int cache_line_size(void);
> > >
> > > #define dma_get_cache_alignment cache_line_size
> > >
> > > +static inline void arch_sync_dma_flush(void)
> > > +{
> > > + dsb(sy);
> > > +}
> > > +#define arch_sync_dma_flush arch_sync_dma_flush
> > > +
> > > /* Compress a u64 MPIDR value into 32 bits. */
> > > static inline u64 arch_compact_of_hwid(u64 id)
> > > {
> > > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > > index b2b5792b2caa..ae1ae0280eef 100644
> > > --- a/arch/arm64/mm/dma-mapping.c
> > > +++ b/arch/arm64/mm/dma-mapping.c
> > > @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
> > > {
> > > unsigned long start = (unsigned long)phys_to_virt(paddr);
> > >
> > > - dcache_clean_poc(start, start + size);
> > > + dcache_clean_poc_nosync(start, start + size);
> > > }
> > >
> > > void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > if (dir == DMA_TO_DEVICE)
> > > return;
> > >
> > > - dcache_inval_poc(start, start + size);
> > > + dcache_inval_poc_nosync(start, start + size);
> > > }
> > >
> > > void arch_dma_prep_coherent(struct page *page, size_t size)
> > > diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> > > index 4809204c674c..e7dd8a63b40e 100644
> > > --- a/include/linux/dma-map-ops.h
> > > +++ b/include/linux/dma-map-ops.h
> > > @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> > > }
> > > #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
> > >
> > > +#ifndef arch_sync_dma_flush
> >
> > You likely need to wrap this in "#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FLUSH"
> > as done in the surrounding code.
>
> I've dropped the new Kconfig option and now rely on whether
> arch_sync_dma_flush() is provided by the architecture. If an arch
> does not define arch_sync_dma_flush() in its asm/cache.h, a no-op
> implementation is used instead.
I know.
>
> Do you still prefer keeping a config option to match the surrounding
> code style?
I don't have a strong preference here. Go ahead and try your current
version and see how people respond.
> Note that on arm64, arch_sync_dma_flush() is already a
> static inline rather than an extern, so it is not strictly aligned
> with the others.
> Having both CONFIG_ARCH_HAS_SYNC_DMA_FLUSH and
> "#ifndef arch_sync_dma_flush" seems duplicated.
>
> Another potential optimization would be to drop these options
> entirely and handle this via ifndefs, letting each architecture
> define the macros in asm/cache.h instead.
>
> Whether arch implements arch_sync_dma_for_xx() as static inline or
> as external functions makes no difference.
>
> - #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
> - void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> -			enum dma_data_direction dir);
> - #else
> + #ifndef arch_sync_dma_for_cpu
> static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
> enum dma_data_direction dir)
> {
> }
> #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
>
> >
> > Thanks
> >
> > > +static inline void arch_sync_dma_flush(void)
> > > +{
> > > +}
> > > +#endif
> > > +
> > > #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> > > void arch_sync_dma_for_cpu_all(void);
> > > #else
> > >
>
> Thanks
> Barry
>
>
> >
> > I would also rename arch_sync_dma_batch_flush() to arch_sync_dma_flush().
>
> Sure.
>
> >
> > You can also minimize changes in dma_direct_map_phys() too, by extending
> > it's signature to provide if flush is needed or not.
>
> Yes. I have
>
> static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs, bool flush)
>
> and two wrappers:
> static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, true);
> }
>
> static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
> phys_addr_t phys, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> return __dma_direct_map_phys(dev, phys, size, dir, attrs, false);
> }
>
> If you prefer exposing "flush" directly in dma_direct_map_phys()
> and updating its callers with flush=true, I think that’s fine.
>
> It could be also true for dma_direct_sync_single_for_device().
Sorry for the typo. I meant dma_direct_sync_single_for_cpu().
With flush passed as an argument, the patch becomes the following.
Please feel free to comment before I send v2.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..5c65d213eb37 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,11 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
+ arch_sync_dma_for_device_batch_add(paddr, sg->length,
dir);
}
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -422,7 +424,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(paddr, sg->length, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +432,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+ arch_sync_dma_flush();
+ }
}
/*
@@ -443,14 +447,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
+ } else {
+ need_sync = true;
dma_direct_unmap_phys(dev, sg->dma_address,
- sg_dma_len(sg), dir, attrs);
+ sg_dma_len(sg), dir, attrs, false);
+ }
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
}
#endif
@@ -460,6 +469,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+ bool need_sync = false;
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,8 +481,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
+ need_sync = true;
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
- sg->length, dir, attrs);
+ sg->length, dir, attrs, false);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
goto out_unmap;
@@ -491,6 +502,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_flush();
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..b13eb5bfd051 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -65,12 +65,15 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
}
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ bool flush)
{
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
arch_sync_dma_for_cpu_all();
}
@@ -82,7 +85,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ unsigned long attrs, bool flush)
{
dma_addr_t dma_addr;
@@ -109,8 +112,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+ if (flush)
+ arch_sync_dma_flush();
+ }
return dma_addr;
err_overflow:
@@ -122,7 +128,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
+ size_t size, enum dma_data_direction dir, unsigned long attrs,
+ bool flush)
{
phys_addr_t phys;
@@ -132,9 +139,10 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
+
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 37163eb49f9f..d8cfa56a3cbb 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -166,7 +166,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
- addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);
else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
else if (ops->map_phys)
@@ -207,7 +207,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
- dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
else if (ops->unmap_phys)
@@ -373,7 +373,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
else if (use_dma_iommu(dev))
iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu)
--
2.43.0