[PATCH v1 21/36] mm/cma: refuse handing out non-contiguous page ranges

Posted by David Hildenbrand 1 month ago
Let's disallow handing out PFN ranges with non-contiguous pages, so we
can remove the nth-page usage in __cma_alloc(), and so any callers don't
have to worry about that either when wanting to blindly iterate pages.

This is really only a problem in configs with SPARSEMEM but without
SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some
cases.
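
For illustration only (this helper is made up and not part of the patch),
the "blind" iteration we want callers to be able to rely on boils down to:

static bool can_iterate_naively(struct page *page, unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		/*
		 * nth_page() translates via the PFN and is always correct;
		 * plain "page + i" is only correct while the memmap backing
		 * the range is contiguous, which with SPARSEMEM but without
		 * SPARSEMEM_VMEMMAP is only guaranteed within one section.
		 */
		if (page + i != nth_page(page, i))
			return false;
	}
	return true;
}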

Will this cause harm? Probably not, because it's mostly 32bit that does
not support SPARSEMEM_VMEMMAP. If this ever becomes a problem we could
look into allocating the memmap for the memory sections spanned by a
single CMA region in one go from memblock.

Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/mm.h |  6 ++++++
 mm/cma.c           | 39 ++++++++++++++++++++++++---------------
 mm/util.c          | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6880e3225c5c..2ca1eb2db63ec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -209,9 +209,15 @@ extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 #else
 #define nth_page(page,n) ((page) + (n))
+static inline bool page_range_contiguous(const struct page *page,
+		unsigned long nr_pages)
+{
+	return true;
+}
 #endif
 
 /* to align the pointer to the (next) page boundary */
diff --git a/mm/cma.c b/mm/cma.c
index e56ec64d0567e..813e6dc7b0954 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
 				unsigned long count, unsigned int align,
 				struct page **pagep, gfp_t gfp)
 {
-	unsigned long mask, offset;
-	unsigned long pfn = -1;
-	unsigned long start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+	unsigned long start, pfn, mask, offset;
 	int ret = -EBUSY;
 	struct page *page = NULL;
 
@@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
 	if (bitmap_count > bitmap_maxno)
 		goto out;
 
-	for (;;) {
+	for (start = 0; ; start = bitmap_no + mask + 1) {
 		spin_lock_irq(&cma->lock);
 		/*
 		 * If the request is larger than the available number
@@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
 			spin_unlock_irq(&cma->lock);
 			break;
 		}
+
+		pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+		page = pfn_to_page(pfn);
+
+		/*
+		 * Do not hand out page ranges that are not contiguous, so
+		 * callers can just iterate the pages without having to worry
+		 * about these corner cases.
+		 */
+		if (!page_range_contiguous(page, count)) {
+			spin_unlock_irq(&cma->lock);
+			pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]\n",
+					    __func__, cma->name, pfn, pfn + count - 1);
+			continue;
+		}
+
 		bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
 		cma->available_count -= count;
 		/*
@@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
 		 */
 		spin_unlock_irq(&cma->lock);
 
-		pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
 		mutex_lock(&cma->alloc_mutex);
 		ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
 		mutex_unlock(&cma->alloc_mutex);
-		if (ret == 0) {
-			page = pfn_to_page(pfn);
+		if (!ret)
 			break;
-		}
 
 		cma_clear_bitmap(cma, cmr, pfn, count);
 		if (ret != -EBUSY)
 			break;
 
 		pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
-			 __func__, pfn, pfn_to_page(pfn));
+			 __func__, pfn, page);
 
-		trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
-					   count, align);
-		/* try again with a bit different memory target */
-		start = bitmap_no + mask + 1;
+		trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align);
 	}
 out:
-	*pagep = page;
+	if (!ret)
+		*pagep = page;
 	return ret;
 }
 
@@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
 	 */
 	if (page) {
 		for (i = 0; i < count; i++)
-			page_kasan_tag_reset(nth_page(page, i));
+			page_kasan_tag_reset(page + i);
 	}
 
 	if (ret && !(gfp & __GFP_NOWARN)) {
diff --git a/mm/util.c b/mm/util.c
index d235b74f7aff7..0bf349b19b652 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1280,4 +1280,37 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
 {
 	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
 }
+
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/**
+ * page_range_contiguous - test whether the page range is contiguous
+ * @page: the start of the page range.
+ * @nr_pages: the number of pages in the range.
+ *
+ * Test whether the page range is contiguous, such that the pages can be
+ * iterated naively, corresponding to iterating a contiguous PFN range.
+ *
+ * This function should primarily be used for debug checks, or when
+ * working with page ranges that are not naturally contiguous (unlike,
+ * e.g., pages within a folio, which are).
+ *
+ * Returns true if contiguous, otherwise false.
+ */
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
+{
+	const unsigned long start_pfn = page_to_pfn(page);
+	const unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long pfn;
+
+	/*
+	 * The memmap is allocated per memory section. We need to check
+	 * each involved memory section once.
+	 */
+	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
+	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
+		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
+			return false;
+	return true;
+}
+#endif
 #endif /* CONFIG_MMU */
-- 
2.50.1
Re: [PATCH v1 21/36] mm/cma: refuse handing out non-contiguous page ranges
Posted by Lorenzo Stoakes 1 month ago
On Thu, Aug 28, 2025 at 12:01:25AM +0200, David Hildenbrand wrote:
> Let's disallow handing out PFN ranges with non-contiguous pages, so we
> can remove the nth-page usage in __cma_alloc(), and so any callers don't
> have to worry about that either when wanting to blindly iterate pages.
>
> This is really only a problem in configs with SPARSEMEM but without
> SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some
> cases.

I'm guessing this is something that we don't need to worry about in
reality?

>
> Will this cause harm? Probably not, because it's mostly 32bit that does
> not support SPARSEMEM_VMEMMAP. If this ever becomes a problem we could
> look into allocating the memmap for the memory sections spanned by a
> single CMA region in one go from memblock.
>
> Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>

LGTM other than refactoring point below.

CMA stuff looks fine afaict after staring at it for a while, on the proviso
that handing out ranges within the same section is always going to be the
case.

Anyway overall,

LGTM, so:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>


[...]
> +bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
> +{
> +	const unsigned long start_pfn = page_to_pfn(page);
> +	const unsigned long end_pfn = start_pfn + nr_pages;
> +	unsigned long pfn;
> +
> +	/*
> +	 * The memmap is allocated per memory section. We need to check
> +	 * each involved memory section once.
> +	 */
> +	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
> +	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
> +		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
> +			return false;

I find this pretty confusing, my test for this is how many times I have to read
the code to understand what it's doing :)

So we have something like:

  (pfn of page)
   start_pfn        pfn = align UP
        |                 |
        v                 v
 |         section        |
        <----------------->
          pfn - start_pfn

Then check page + (pfn - start_pfn) == pfn_to_page(pfn)

And loop such that:

  (pfn of page)
   start_pfn                                      pfn
        |                                          |
        v                                          v
 |         section        |         section        |
        <------------------------------------------>
                        pfn - start_pfn

Again check page + (pfn - start_pfn) == pfn_to_page(pfn)

And so on.

So the logic looks good, but it's just... that took me a hot second to
parse :)

I think a few simple fixups would help:

bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
{
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	/* The PFN of the start of the next section. */
	unsigned long pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
	/* The page we'd expected to see if the range were contiguous. */
	struct page *expected = page + (pfn - start_pfn);

	/*
	 * The memmap is allocated per memory section. We need to check
	 * each involved memory section once.
	 */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION, expected += PAGES_PER_SECTION)
		if (unlikely(expected != pfn_to_page(pfn)))
			return false;
	return true;
}

> +	return true;
> +}
> +#endif
>  #endif /* CONFIG_MMU */
> --
> 2.50.1
>
Re: [PATCH v1 21/36] mm/cma: refuse handing out non-contiguous page ranges
Posted by David Hildenbrand 1 month ago
On 28.08.25 19:28, Lorenzo Stoakes wrote:
> On Thu, Aug 28, 2025 at 12:01:25AM +0200, David Hildenbrand wrote:
>> Let's disallow handing out PFN ranges with non-contiguous pages, so we
>> can remove the nth-page usage in __cma_alloc(), and so any callers don't
>> have to worry about that either when wanting to blindly iterate pages.
>>
>> This is really only a problem in configs with SPARSEMEM but without
>> SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some
>> cases.
> 
> I'm guessing this is something that we don't need to worry about in
> reality?

That's my theory, yes.

[...]
>> +bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
>> +{
>> +	const unsigned long start_pfn = page_to_pfn(page);
>> +	const unsigned long end_pfn = start_pfn + nr_pages;
>> +	unsigned long pfn;
>> +
>> +	/*
>> +	 * The memmap is allocated per memory section. We need to check
>> +	 * each involved memory section once.
>> +	 */
>> +	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
>> +	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
>> +		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
>> +			return false;
> 
> I find this pretty confusing, my test for this is how many times I have to read
> the code to understand what it's doing :)
> 
> So we have something like:
> 
>    (pfn of page)
>     start_pfn        pfn = align UP
>          |                 |
>          v                 v
>   |         section        |
>          <----------------->
>            pfn - start_pfn
> 
> Then check page + (pfn - start_pfn) == pfn_to_page(pfn)
> 
> And loop such that:
> 
>    (pfn of page)
>     start_pfn                                      pfn
>          |                                          |
>          v                                          v
>   |         section        |         section        |
>          <------------------------------------------>
>                          pfn - start_pfn
> 
> Again check page + (pfn - start_pfn) == pfn_to_page(pfn)
> 
> And so on.
> 
> So the logic looks good, but it's just... that took me a hot second to
> parse :)
> 
> I think a few simple fixups would help:
> 
> bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
> {
> 	const unsigned long start_pfn = page_to_pfn(page);
> 	const unsigned long end_pfn = start_pfn + nr_pages;
> 	/* The PFN of the start of the next section. */
> 	unsigned long pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
> 	/* The page we'd expected to see if the range were contiguous. */
> 	struct page *expected = page + (pfn - start_pfn);
> 
> 	/*
> 	 * The memmap is allocated per memory section. We need to check
> 	 * each involved memory section once.
> 	 */
> 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION, expected += PAGES_PER_SECTION)
> 		if (unlikely(expected != pfn_to_page(pfn)))
> 			return false;
> 	return true;
> }
> 

Hm, I prefer my variant, especially where the pfn is calculated in the for loop. Likely a
matter of personal taste.

But I can see why skipping the first section might be a surprise when not
having the semantics of ALIGN() in the cache.

So I'll add the following on top:

diff --git a/mm/util.c b/mm/util.c
index 0bf349b19b652..fbdb73aaf35fe 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1303,8 +1303,10 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
         unsigned long pfn;
  
         /*
-        * The memmap is allocated per memory section. We need to check
-        * each involved memory section once.
+        * The memmap is allocated per memory section, so no need to check
+        * within the first section. However, we need to check each other
+        * spanned memory section once, making sure the first page in a
+        * section could similarly be reached by just iterating pages.
          */
         for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
              pfn < end_pfn; pfn += PAGES_PER_SECTION)

Thanks!

-- 
Cheers

David / dhildenb
Re: [PATCH v1 21/36] mm/cma: refuse handing out non-contiguous page ranges
Posted by Lorenzo Stoakes 1 month ago
On Fri, Aug 29, 2025 at 04:34:54PM +0200, David Hildenbrand wrote:
> On 28.08.25 19:28, Lorenzo Stoakes wrote:
> > On Thu, Aug 28, 2025 at 12:01:25AM +0200, David Hildenbrand wrote:
> > > Let's disallow handing out PFN ranges with non-contiguous pages, so we
> > > can remove the nth-page usage in __cma_alloc(), and so any callers don't
> > > have to worry about that either when wanting to blindly iterate pages.
> > >
> > > This is really only a problem in configs with SPARSEMEM but without
> > > SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some
> > > cases.
> >
> > I'm guessing this is something that we don't need to worry about in
> > reality?
>
> That's my theory, yes.

Let's hope that's correct, haha, but it seems reasonable.

[...]
> Hm, I prefer my variant, especially where the pfn is calculated in the for loop. Likely a
> matter of personal taste.

Sure this is always a factor in code :)

>
> But I can see why skipping the first section might be a surprise when not
> having the semantics of ALIGN() in the cache.

Yup!

>
> So I'll add the following on top:
>
> diff --git a/mm/util.c b/mm/util.c
> index 0bf349b19b652..fbdb73aaf35fe 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1303,8 +1303,10 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
>         unsigned long pfn;
>         /*
> -        * The memmap is allocated per memory section. We need to check
> -        * each involved memory section once.
> +        * The memmap is allocated per memory section, so no need to check
> +        * within the first section. However, we need to check each other
> +        * spanned memory section once, making sure the first page in a
> +        * section could similarly be reached by just iterating pages.
>          */
>         for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
>              pfn < end_pfn; pfn += PAGES_PER_SECTION)

Cool this helps clarify things, that'll do fine!


Cheers, Lorenzo