From: Catalin Marinas <catalin.marinas@arm.com>
Hardware DMA limit might not be power of 2. When RAM range starts above
0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
can not encode this limit.
Use direct phys_addr_t limit address for DMA zone limit.
Following commits will add explicit base address to DMA zone.
---
Catalin,
This is taken almost verbatim from your email:
https://lore.kernel.org/all/ZZ2HnHJV3gdzu1Aj@arm.com/
Would you provide your sign-off?
Thanks,
baruch
---
arch/arm64/mm/init.c | 32 ++++++++++----------------------
arch/powerpc/mm/mem.c | 9 ++++-----
arch/s390/mm/init.c | 2 +-
include/linux/dma-direct.h | 2 +-
kernel/dma/direct.c | 6 +++---
kernel/dma/pool.c | 2 +-
kernel/dma/swiotlb.c | 4 ++--
7 files changed, 22 insertions(+), 35 deletions(-)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 03efd86dce0a..00508c69ca9e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -113,36 +113,24 @@ static void __init arch_reserve_crashkernel(void)
low_size, high);
}
-/*
- * Return the maximum physical address for a zone accessible by the given bits
- * limit. If DRAM starts above 32-bit, expand the zone to the maximum
- * available memory, otherwise cap it at 32-bit.
- */
-static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
+static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
{
- phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
- phys_addr_t phys_start = memblock_start_of_DRAM();
-
- if (phys_start > U32_MAX)
- zone_mask = PHYS_ADDR_MAX;
- else if (phys_start > zone_mask)
- zone_mask = U32_MAX;
-
- return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
+ return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
}
static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
- unsigned int __maybe_unused acpi_zone_dma_bits;
- unsigned int __maybe_unused dt_zone_dma_bits;
- phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
+ phys_addr_t __maybe_unused acpi_zone_dma_limit;
+ phys_addr_t __maybe_unused dt_zone_dma_limit;
+ phys_addr_t __maybe_unused dma32_phys_limit =
+ max_zone_phys(DMA_BIT_MASK(32));
#ifdef CONFIG_ZONE_DMA
- acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
- dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
- zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
- arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
+ acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
+ dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
+ zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
+ arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3a440004b97d..4d6f575fd354 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -214,7 +214,7 @@ static int __init mark_nonram_nosave(void)
* everything else. GFP_DMA32 page allocations automatically fall back to
* ZONE_DMA.
*
- * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
* generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
* anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
* ZONE_DMA.
@@ -250,13 +250,12 @@ void __init paging_init(void)
* powerbooks.
*/
if (IS_ENABLED(CONFIG_PPC32))
- zone_dma_bits = 30;
+ zone_dma_limit = DMA_BIT_MASK(30);
else
- zone_dma_bits = 31;
+ zone_dma_limit = DMA_BIT_MASK(31);
#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
- 1UL << (zone_dma_bits - PAGE_SHIFT));
+ max_zone_pfns[ZONE_DMA] = min(max_low_pfn, zone_dma_limit >> PAGE_SHIFT);
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f6391442c0c2..5feaa60933b7 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -95,7 +95,7 @@ void __init paging_init(void)
vmem_map_init();
sparse_init();
- zone_dma_bits = 31;
+ zone_dma_limit = DMA_BIT_MASK(31);
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 3eb3589ff43e..7cf76f1d3239 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -12,7 +12,7 @@
#include <linux/mem_encrypt.h>
#include <linux/swiotlb.h>
-extern unsigned int zone_dma_bits;
+extern phys_addr_t zone_dma_limit;
/*
* Record the mapping of CPU physical to DMA addresses for a given region.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4d543b1e9d57..3b2ebcd4f576 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -20,7 +20,7 @@
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
-unsigned int zone_dma_bits __ro_after_init = 24;
+phys_addr_t zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
@@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
* zones.
*/
*phys_limit = dma_to_phys(dev, dma_limit);
- if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ if (*phys_limit <= zone_dma_limit)
return GFP_DMA;
if (*phys_limit <= DMA_BIT_MASK(32))
return GFP_DMA32;
@@ -584,7 +584,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
- min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+ min_mask = min_t(u64, min_mask, zone_dma_limit);
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d10613eb0f63..410a7b40e496 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,7 +70,7 @@ static bool cma_in_zone(gfp_t gfp)
/* CMA can't cross zone boundaries, see cma_activate_area() */
end = cma_get_base(cma) + size - 1;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
- return end <= DMA_BIT_MASK(zone_dma_bits);
+ return end <= zone_dma_limit;
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return end <= DMA_BIT_MASK(32);
return true;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 86fe172b5958..96d6eee7d215 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -446,7 +446,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
if (!remap)
io_tlb_default_mem.can_grow = true;
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
- io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+ io_tlb_default_mem.phys_limit = zone_dma_limit;
else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
else
@@ -625,7 +625,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
}
gfp &= ~GFP_ZONEMASK;
- if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+ if (phys_limit <= zone_dma_limit)
gfp |= __GFP_DMA;
else if (phys_limit <= DMA_BIT_MASK(32))
gfp |= __GFP_DMA32;
--
2.43.0
(finally getting around to looking at this series, sorry for the delay)
On Tue, Apr 09, 2024 at 09:17:54AM +0300, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@arm.com>
>
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
>
> Use direct phys_addr_t limit address for DMA zone limit.
>
> Following commits will add explicit base address to DMA zone.
>
> ---
> Catalin,
>
> This is taken almost verbatim from your email:
>
> https://lore.kernel.org/all/ZZ2HnHJV3gdzu1Aj@arm.com/
>
> Would you provide your sign-off?
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Thanks for writing a commit log. However, I think more work is needed.
See below.
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 03efd86dce0a..00508c69ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -113,36 +113,24 @@ static void __init arch_reserve_crashkernel(void)
> low_size, high);
> }
>
> -/*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> - * available memory, otherwise cap it at 32-bit.
> - */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
> {
> - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
> - phys_addr_t phys_start = memblock_start_of_DRAM();
> -
> - if (phys_start > U32_MAX)
> - zone_mask = PHYS_ADDR_MAX;
> - else if (phys_start > zone_mask)
> - zone_mask = U32_MAX;
> -
> - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
> }
>
> static void __init zone_sizes_init(void)
> {
> unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
> - unsigned int __maybe_unused acpi_zone_dma_bits;
> - unsigned int __maybe_unused dt_zone_dma_bits;
> - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> + phys_addr_t __maybe_unused acpi_zone_dma_limit;
> + phys_addr_t __maybe_unused dt_zone_dma_limit;
> + phys_addr_t __maybe_unused dma32_phys_limit =
> + max_zone_phys(DMA_BIT_MASK(32));
>
> #ifdef CONFIG_ZONE_DMA
> - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
> max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
> #endif
> #ifdef CONFIG_ZONE_DMA32
I think this goes wrong if zone_dma_limit ends up above 32-bit (e.g. no
restrictive dma-ranges properties) but the start of RAM is below 4G.
We'd simply reduce ZONE_DMA32 to zero and ZONE_DMA potentially covering
the whole RAM. Prior to this change, we capped zone_dma_bits to 32 via
min3(). I think we should maintain this cap if memblock_start_of_DRAM()
is below 4G.
We could fix this up in max_zone_phys() above:
if (memblock_start_of_DRAM() < U32_MAX)
zone_limit = min(U32_MAX, zone_limit);
return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 4d543b1e9d57..3b2ebcd4f576 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,7 @@
> * it for entirely different regions. In that case the arch code needs to
> * override the variable below for dma-direct to work properly.
> */
> -unsigned int zone_dma_bits __ro_after_init = 24;
> +phys_addr_t zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>
> static inline dma_addr_t phys_to_dma_direct(struct device *dev,
> phys_addr_t phys)
> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
> * zones.
> */
> *phys_limit = dma_to_phys(dev, dma_limit);
> - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> + if (*phys_limit <= zone_dma_limit)
> return GFP_DMA;
> if (*phys_limit <= DMA_BIT_MASK(32))
> return GFP_DMA32;
It's worth noting that if ZONE_DMA ends up entirely above 32-bit, there
won't be any ZONE_DMA32. Thinking about it, this could be a potential
problem. For example, if a device has a 32-bit DMA mask and an offset
that lifts this into the 32-36G range, the above may fail to set
GFP_DMA32.
Actually, I think these checks can go wrong even with the current
implementation, assuming RAM below 4G and no DMA offsets. For example,
we have two devices, one with a coherent mask of 30 bits, the other 31
bits. zone_dma_bits would be set to the smaller of the two, so 30 bit
(as per of_dma_get_max_cpu_address()). For the second device, phys_limit
would be ((1 << 31) - 1) but that's higher than DMA_BIT_MASK(30) so we
fail to set GFP_DMA. We do set GFP_DMA32 because of the second test but
that's not sufficient since that's 32-bit rather than 31-bit as the
device needs. Similarly if we have some weird device with a 33-bit DMA
coherent mask but the RAM is addressed by more bits. We'd fail to set
GFP_DMA32.
Ignoring this patch, I think the checks above in mainline should be
something like:
if (*phys_limit < DMA_BIT_MASK(32))
return GFP_DMA;
if (*phys_limit < memblock_end_of_DRAM())
return GFP_DMA32;
IOW, zone_dma_bits is pretty useless for this check IMHO. It gives us
the minimum hence not sufficient to test for devices that fall between
ZONE_DMA and ZONE_DMA32 coherent masks.
With your series, the above test wouldn't work since we don't have a
zone_dma32_limit and zone_dma_limit is above DMA_BIT_MASK(32). We might
need to introduce zone_dma32_limit and maybe drop zone_dma_limit
altogether.
--
Catalin
© 2016 - 2025 Red Hat, Inc.