[PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages

David Hildenbrand (Arm) posted 1 patch 1 month, 2 weeks ago
arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 14 deletions(-)
[PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by David Hildenbrand (Arm) 1 month, 2 weeks ago
In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
from freeing non-boot page tables through __free_pages() to
pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the page_ptdesc(page)
is wrong. But worse, pagetable_free() calls

	__free_pages(page, compound_order(page));

As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
except for HVO, which doesn't apply here -- we will only free the first
page when freeing a PMD-sized vmemmap page, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing.
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up, let's fix that
while touching it.

Note that we'll try to get rid of that bootmem info handling soon. For
now, we'll handle it similar to free_pagetable(), just avoiding the
ifdef.

Tested-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Cc: stable@vger.kernel.org
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
---
Reproduced and tested with a simple VM with a virtio-mem device,
repeatedly adding and removing memory.

Found by code inspection while working on bootmem_info removal.
---
Changes in v2:
- Don't mess with the altmap with PTEs and add a comment why.
- Simplify "unsigned long nr_pages" handling.
- Link to v1: https://lore.kernel.org/r/20260428-vmemmap-v1-1-b2aa1e6db2c0@kernel.org
---
 arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df2261fa4f98..7e20b22d658b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}

---

base-commit: a2ddbfd1af0f54ea84bf17f0400088815d012e8d

change-id: 20260428-vmemmap-ab4b949aa727

--

Cheers,

David
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by Alison Schofield 2 weeks, 5 days ago
On Wed, Apr 29, 2026 at 12:49:14PM +0200, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.

Hi David,

Sneaking in here to share with nvdimm/dax folks as this affects their
nfit_test environment usage.

+ nvdimm@lists.linux.dev

NVDIMM, DAX folks,

This fixes a memory leak present since v6.19 that surfaces during DAX
and NVDIMM unit testing, as well as ad-hoc nfit_test usage. If you are
seeing the system gradually run out of memory across repeated test runs
or namespace reconfiguration cycles, this is likely the cause.

In my setup, a VM with 5.4 GiB MemAvailable and a 4 GiB nfit_test
namespace lost about 1.1 GiB of MemAvailable per DAX or NVDIMM test suite
run. The VM OOMs partway through the 4th consecutive run of either. The
number of survivable runs scales roughly with available VM memory.

Symptoms typically begin with "page allocation failure: order 0" messages
from unrelated processes. If a test run is active when memory is
sufficiently depleted, it eventually terminates with OOM.

I've tested both this posted fix and a revert of the Fixes commit and both
resolve the leak in my setup. If neither is an option, periodic reboot of
the test environment may be needed for longer test sessions.

-- Alison

> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---
> Changes in v2:
> - Don't mess with the altmap with PTEs and add a comment why.
> - Simplify "unsigned long nr_pages" handling.
> - Link to v1: https://lore.kernel.org/r/20260428-vmemmap-v1-1-b2aa1e6db2c0@kernel.org
> ---
>  arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
>  1 file changed, 26 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index df2261fa4f98..7e20b22d658b 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
>  #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
>  		enum bootmem_type type = bootmem_type(page);
>  
> -		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
> +		if (type == MIX_SECTION_INFO) {
>  			while (nr_pages--)
>  				put_page_bootmem(page++);
>  		} else {
> @@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
>  	}
>  }
>  
> -static void __meminit free_hugepage_table(struct page *page,
> +static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
>  		struct vmem_altmap *altmap)
>  {
> -	if (altmap)
> -		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
> -	else
> -		free_pagetable(page, get_order(PMD_SIZE));
> +	unsigned long nr_pages = 1u << order;
> +
> +	if (altmap) {
> +		vmem_altmap_free(altmap, nr_pages);
> +	} else if (PageReserved(page)) {
> +		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
> +		    bootmem_type(page) == SECTION_INFO) {
> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else {
> +			free_reserved_pages(page, nr_pages);
> +		}
> +	} else {
> +		__free_pages(page, order);
> +	}
>  }
>  
>  static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> @@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>  			return;
>  
>  		if (!direct)
> -			free_pagetable(pte_page(*pte), 0);
> +			/* We never populate base pages from the altmap. */
> +			free_vmemmap_pages(pte_page(*pte), 0, NULL);
>  
>  		spin_lock(&init_mm.page_table_lock);
>  		pte_clear(&init_mm, addr, pte);
> @@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  			if (IS_ALIGNED(addr, PMD_SIZE) &&
>  			    IS_ALIGNED(next, PMD_SIZE)) {
>  				if (!direct)
> -					free_hugepage_table(pmd_page(*pmd),
> -							    altmap);
> +					free_vmemmap_pages(pmd_page(*pmd),
> +							   PMD_ORDER, altmap);
>  
>  				spin_lock(&init_mm.page_table_lock);
>  				pmd_clear(pmd);
>  				spin_unlock(&init_mm.page_table_lock);
>  				pages++;
>  			} else if (vmemmap_pmd_is_unused(addr, next)) {
> -					free_hugepage_table(pmd_page(*pmd),
> -							    altmap);
> -					spin_lock(&init_mm.page_table_lock);
> -					pmd_clear(pmd);
> -					spin_unlock(&init_mm.page_table_lock);
> +				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
> +						   altmap);
> +				spin_lock(&init_mm.page_table_lock);
> +				pmd_clear(pmd);
> +				spin_unlock(&init_mm.page_table_lock);
>  			}
>  			continue;
>  		}
> 
> ---
> 
> base-commit: a2ddbfd1af0f54ea84bf17f0400088815d012e8d
> 
> change-id: 20260428-vmemmap-ab4b949aa727
> 
> --
> 
> Cheers,
> 
> David
>
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.
> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---

@x86 maintainers, do you want to take this through your tree or should we merge
this through the MM tree?

I have another MM series coming up that will touch this code (no fixes, though).

-- 
Cheers,

David
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by Peter Zijlstra 1 month, 1 week ago
On Fri, May 08, 2026 at 11:19:26AM +0200, David Hildenbrand (Arm) wrote:
> On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
> > In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> > from freeing non-boot page tables through __free_pages() to
> > pagetable_free().
> > 
> > However, the function is also called to free vmemmap pages.
> > 
> > Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> > is wrong. But worse, pagetable_free() calls
> > 
> > 	__free_pages(page, compound_order(page));
> > 
> > As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> > except for HVO, which doesn't apply here -- we will only free the first
> > page when freeing a PMD-sized vmemmap page, leaking the other ones.
> > 
> > Fix it by properly decoupling pagetable and vmemmap freeing.
> > free_pagetable() no longer has to mess with SECTION_INFO, as only the
> > vmemmap is marked like that in register_page_bootmem_memmap().
> > 
> > The indentation in remove_pmd_table() is messed up, let's fix that
> > while touching it.
> > 
> > Note that we'll try to get rid of that bootmem info handling soon. For
> > now, we'll handle it similar to free_pagetable(), just avoiding the
> > ifdef.
> > 
> > Tested-by: Lance Yang <lance.yang@linux.dev>
> > Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> > ---
> > Reproduced and tested with a simple VM with a virtio-mem device,
> > repeatedly adding and removing memory.
> > 
> > Found by code inspection while working on bootmem_info removal.
> > ---
> 
> @x86 maintainers, do you want to take this through your tree or should we merge
> this through the MM tree?
> 
> I have another MM series coming up that will touch this code (no fixes, though).

I'm thinking this should go in rather more urgent, yes?

It looks good to me, Dave you want to stick this in x86/urgent?
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by David Hildenbrand (Arm) 1 month, 1 week ago
On 5/8/26 11:23, Peter Zijlstra wrote:
> On Fri, May 08, 2026 at 11:19:26AM +0200, David Hildenbrand (Arm) wrote:
>> On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
>>> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
>>> from freeing non-boot page tables through __free_pages() to
>>> pagetable_free().
>>>
>>> However, the function is also called to free vmemmap pages.
>>>
>>> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
>>> is wrong. But worse, pagetable_free() calls
>>>
>>> 	__free_pages(page, compound_order(page));
>>>
>>> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
>>> except for HVO, which doesn't apply here -- we will only free the first
>>> page when freeing a PMD-sized vmemmap page, leaking the other ones.
>>>
>>> Fix it by properly decoupling pagetable and vmemmap freeing.
>>> free_pagetable() no longer has to mess with SECTION_INFO, as only the
>>> vmemmap is marked like that in register_page_bootmem_memmap().
>>>
>>> The indentation in remove_pmd_table() is messed up, let's fix that
>>> while touching it.
>>>
>>> Note that we'll try to get rid of that bootmem info handling soon. For
>>> now, we'll handle it similar to free_pagetable(), just avoiding the
>>> ifdef.
>>>
>>> Tested-by: Lance Yang <lance.yang@linux.dev>
>>> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>>> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
>>> ---
>>> Reproduced and tested with a simple VM with a virtio-mem device,
>>> repeatedly adding and removing memory.
>>>
>>> Found by code inspection while working on bootmem_info removal.
>>> ---
>>
>> @x86 maintainers, do you want to take this through your tree or should we merge
>> this through the MM tree?
>>
>> I have another MM series coming up that will touch this code (no fixes, though).
> 
> I'm thinking this should go in rather more urgent, yes?

Yes, please :)

-- 
Cheers,

David
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by Andrew Morton 3 weeks, 4 days ago
On Fri, 8 May 2026 12:51:31 +0200 "David Hildenbrand (Arm)" <david@kernel.org> wrote:

> >>> Tested-by: Lance Yang <lance.yang@linux.dev>
> >>> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> >>> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> >>> Cc: stable@vger.kernel.org
> >>> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> >>> ---
> >>> Reproduced and tested with a simple VM with a virtio-mem device,
> >>> repeatedly adding and removing memory.
> >>>
> >>> Found by code inspection while working on bootmem_info removal.
> >>> ---
> >>
> >> @x86 maintainers, do you want to take this through your tree or should we merge
> >> this through the MM tree?
> >>
> >> I have another MM series coming up that will touch this code (no fixes, though).
> > 
> > I'm thinking this should go in rather more urgent, yes?
> 
> Yes, please :)

I'm not seeing this in linux-next so I (re) queued it in mm.git's
mm-hotfixes-unstable queue, for a 7.1-rcX merge.


From: "David Hildenbrand (Arm)" <david@kernel.org>
Subject: x86/mm: fix freeing of PMD-sized vmemmap pages
Date: Wed, 29 Apr 2026 12:49:14 +0200

In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched from
freeing non-boot page tables through __free_pages() to pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the
page_ptdesc(page) is wrong.  But worse, pagetable_free() calls

	__free_pages(page, compound_order(page));

As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
except for HVO, which doesn't apply here -- we will only free the first
page when freeing a PMD-sized vmemmap page, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing. 
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up, let's fix that while
touching it.

Note that we'll try to get rid of that bootmem info handling soon.  For
now, we'll handle it similar to free_pagetable(), just avoiding the ifdef.

Link: https://lore.kernel.org/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baolu Lu <baolu.lu@linux.intel.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86/mm/init_64.c |   40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

--- a/arch/x86/mm/init_64.c~x86-mm-fix-freeing-of-pmd-sized-vmemmap-pages
+++ a/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(str
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(str
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsig
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsig
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}
_
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by David Hildenbrand (Arm) 3 weeks, 3 days ago
On 5/22/26 02:35, Andrew Morton wrote:
> On Fri, 8 May 2026 12:51:31 +0200 "David Hildenbrand (Arm)" <david@kernel.org> wrote:
> 
>>>
>>> I'm thinking this should go in rather more urgent, yes?
>>
>> Yes, please :)
> 
> I'm not seeing this in linux-next so I (re) queued it in mm.git's
> mm-hotfixes-unstable queue, for a 7.1-rcX merge.

Thanks. Dave is aware and didn't get to it yet.

So I'll let him speak up if he wants to let this sit a bit longer here.

-- 
Cheers,

David
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by Dave Hansen 2 weeks, 6 days ago
On 5/22/26 15:35, David Hildenbrand (Arm) wrote:
>> I'm not seeing this in linux-next so I (re) queued it in mm.git's
>> mm-hotfixes-unstable queue, for a 7.1-rcX merge.
> Thanks. Dave is aware and didn't get to it yet.
> 
> So I'll let him speak up if he wants to let this sit a bit longer here.

This isn't a new bug and it's getting a bit late in the -rc's. I'll
queue it for the next merge window.

Thanks for the reminder.
Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
Posted by Lance Yang 1 month, 2 weeks ago

On 2026/4/29 18:49, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.
> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---

Retested. Works as expected :)

Cheers, Lance
[tip: x86/mm] x86/mm: Fix freeing of PMD-sized vmemmap pages
Posted by tip-bot2 for David Hildenbrand (Arm) 2 weeks, 6 days ago
The following commit has been merged into the x86/mm branch of tip:

Commit-ID:     39406c05f8f150f1685839acd38ffdd69ff92031
Gitweb:        https://git.kernel.org/tip/39406c05f8f150f1685839acd38ffdd69ff92031
Author:        David Hildenbrand (Arm) <david@kernel.org>
AuthorDate:    Wed, 29 Apr 2026 12:49:14 +02:00
Committer:     Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Wed, 27 May 2026 11:39:38 -07:00

x86/mm: Fix freeing of PMD-sized vmemmap pages

Commit bf9e4e30f353 ("x86/mm: use pagetable_free()") switched from
freeing non-boot page tables through __free_pages() to
pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the page_ptdesc(page)
is wrong. But worse, pagetable_free() calls:

	__free_pages(page, compound_order(page));

Since vmemmap pages are not compound pages (see vmemmap_alloc_block())
-- except for HVO, which doesn't apply here -- only the first page of a
PMD-sized vmemmap page is freed, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing.
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up. Fix that while
touching it.

Bootmem info handling will soon be fixed up. For now, handle it
similar to free_pagetable(), just avoiding the ifdef.

[ dhansen: changelog munging. More imperative voice ]

Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Lance Yang <lance.yang@linux.dev>
Link: https://lore.kernel.org/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Link: https://patch.msgid.link/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Cc: stable@vger.kernel.org
---
 arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df2261f..7e20b22 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}