[PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs

Vishal Moola (Oracle) posted 3 patches 4 days, 11 hours ago
There is a newer version of this series
[PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs
Posted by Vishal Moola (Oracle) 4 days, 11 hours ago
In order to separately allocate ptdescs from pages, we need all allocation
and free sites to use the appropriate functions. Convert these pte
allocation/free sites to use ptdescs.

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
 arch/x86/mm/pat/set_memory.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6c6eb486f7a6..f9f9d4ca8e71 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1408,7 +1408,7 @@ static bool try_to_free_pte_page(pte_t *pte)
 		if (!pte_none(pte[i]))
 			return false;
 
-	free_page((unsigned long)pte);
+	pagetable_free(virt_to_ptdesc((void *)pte));
 	return true;
 }
 
@@ -1537,12 +1537,15 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 	 */
 }
 
-static int alloc_pte_page(pmd_t *pmd)
+static int alloc_pte_ptdesc(pmd_t *pmd)
 {
-	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
-	if (!pte)
+	pte_t *pte;
+	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+
+	if (!ptdesc)
 		return -1;
 
+	pte = (pte_t *) ptdesc_address(ptdesc);
 	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 	return 0;
 }
@@ -1600,7 +1603,7 @@ static long populate_pmd(struct cpa_data *cpa,
 		 */
 		pmd = pmd_offset(pud, start);
 		if (pmd_none(*pmd))
-			if (alloc_pte_page(pmd))
+			if (alloc_pte_ptdesc(pmd))
 				return -1;
 
 		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
@@ -1641,7 +1644,7 @@ static long populate_pmd(struct cpa_data *cpa,
 	if (start < end) {
 		pmd = pmd_offset(pud, start);
 		if (pmd_none(*pmd))
-			if (alloc_pte_page(pmd))
+			if (alloc_pte_ptdesc(pmd))
 				return -1;
 
 		populate_pte(cpa, start, end, num_pages - cur_pages,
-- 
2.52.0
Re: [PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs
Posted by Dave Hansen 3 days, 11 hours ago
On 2/2/26 09:20, Vishal Moola (Oracle) wrote:
> In order to separately allocate ptdescs from pages, we need all allocation
> and free sites to use the appropriate functions. Convert these pte
> allocation/free sites to use ptdescs.

Imperative voice, please.

> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 6c6eb486f7a6..f9f9d4ca8e71 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -1408,7 +1408,7 @@ static bool try_to_free_pte_page(pte_t *pte)
>  		if (!pte_none(pte[i]))
>  			return false;
>  
> -	free_page((unsigned long)pte);
> +	pagetable_free(virt_to_ptdesc((void *)pte));
>  	return true;
>  }

This looks wrong to me, or at least it suggests that the API needs
improvement. Most callers are going to have a pointer that they've been
modifying. They're not going to have a ptdesc handy.

So I think this needs to look like:

	pagetable_free(pte);

You can convert to ptdescs internally or do whatever you want with
ptdesc sanity checks, but the API needs to be on writeable pointers. If
the API takes a const pointer that requires callers to cast it, I think
the API is broken.

> @@ -1537,12 +1537,15 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
>  	 */
>  }
>  
> -static int alloc_pte_page(pmd_t *pmd)
> +static int alloc_pte_ptdesc(pmd_t *pmd)

Why change the name? Nobody cares what this is doing internally.

>  {
> -	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
> -	if (!pte)
> +	pte_t *pte;
> +	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
> +
> +	if (!ptdesc)
>  		return -1;

This also looks wrong.

What kind of maniac is ever going to allocate page tables without
__GFP_ZERO? __GFP_ZERO really should be a part of pagetable_alloc(),
don't you think?

> +	pte = (pte_t *) ptdesc_address(ptdesc);
>  	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>  	return 0;
>  }

Why is there a cast here? ptdesc_address() returns void*, no?

Also, if there were a ptdesc_pa(), this could be:

static int alloc_pte_ptdesc(pmd_t *pmd)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);

	if (!ptdesc)
 		return -1;

 	set_pmd(pmd, __pmd(ptdesc_pa(ptdesc) | _KERNPG_TABLE));
 	return 0;
}

This *should* be a very common pattern. After you allocate a page table
page, you almost always need its physical address because it's going to
get pointed to by another page table or a hardware register.

To me, it doesn't look like the ptdesc API is very mature yet, or at
least hasn't been expanded for ease of use by actual users. I don't want
to grow its use in arch/x86 until it's a wee bit more mature.
Re: [PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs
Posted by Vishal Moola (Oracle) 3 days, 7 hours ago
On Tue, Feb 03, 2026 at 09:23:47AM -0800, Dave Hansen wrote:
> On 2/2/26 09:20, Vishal Moola (Oracle) wrote:
> > In order to separately allocate ptdescs from pages, we need all allocation
> > and free sites to use the appropriate functions. Convert these pte
> > allocation/free sites to use ptdescs.
> 
> Imperative voice, please.

I'll fix it.

> > diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> > index 6c6eb486f7a6..f9f9d4ca8e71 100644
> > --- a/arch/x86/mm/pat/set_memory.c
> > +++ b/arch/x86/mm/pat/set_memory.c
> > @@ -1408,7 +1408,7 @@ static bool try_to_free_pte_page(pte_t *pte)
> >  		if (!pte_none(pte[i]))
> >  			return false;
> >  
> > -	free_page((unsigned long)pte);
> > +	pagetable_free(virt_to_ptdesc((void *)pte));
> >  	return true;
> >  }
> 
> This looks wrong to me, or at least that the API needs improvement. Most
> callers are going to have a pointer that they've been modifying. They're
> not going to have a ptdesc handy.

Yeah, the API needs improvement. The initial API I wrote was very
barebones back when I didn't understand enough about arch differences
and similarities in page table implementation.

> So I think this needs to look like:
> 
> 	pagetable_free(pte);
> 
> You can convert to ptdescs internally or do whatever you want with
> ptdesc sanity checks, but the API needs to be on writeable pointers. If
> the API takes a const pointer that requires callers to cast it, I think
> the API is broken.

Your logic makes sense to me. I can add ptdesc-using-address apis.

> > @@ -1537,12 +1537,15 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
> >  	 */
> >  }
> >  
> > -static int alloc_pte_page(pmd_t *pmd)
> > +static int alloc_pte_ptdesc(pmd_t *pmd)
> 
> Why change the name? Nobody cares what this is doing internally.
> 
> >  {
> > -	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
> > -	if (!pte)
> > +	pte_t *pte;
> > +	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
> > +
> > +	if (!ptdesc)
> >  		return -1;
> 
> This also looks wrong.
> 
> What kind of maniac is ever going to allocate page tables without
> __GFP_ZERO? __GFP_ZERO really should be a part of pagetable_alloc(),
> don't you think?

I thought the same thing... Turns out some architectures do. I didn't
question it, they might not even have good reason to do so.

Regardless, I do agree with you. I'm tempted to include
__GFP_ZERO as part of the ptdesc-using-address apis.

> > +	pte = (pte_t *) ptdesc_address(ptdesc);
> >  	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
> >  	return 0;
> >  }
> 
> Why is there a cast here? ptdesc_address() returns void*, no?

Yes it does.

Personally, I view casts as human hints to make implicit conversions
obvious. I didn't think it hurt readability so I left it in.

I don't have strong feelings either way, I can remove the casts. The
type is obvious enough here anyway.

> Also, if there were a ptdesc_pa(), this could be:
> 
> static int alloc_pte_ptdesc(pmd_t *pmd)
> {
> 	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
> 
> 	if (!ptdesc)
>  		return -1;
> 
>  	set_pmd(pmd, __pmd(ptdesc_pa(ptdesc) | _KERNPG_TABLE));
>  	return 0;
> }
> 
> This *should* be a very common pattern. After you allocate a page table
> page, you almost always need its physical address because it's going to
> get pointed to by other page table or hardware register.

I don't recall running into this pattern much, but I'll take a look.
It sounds sensible. If this happens far from the allocation sites, I
definitely would've missed them.

> To me, it doesn't look like the ptdesc API is very mature yet, or at
> least hasn't been expanded for ease for actual users. I don't want to
> grow its use in arch/x86 until it's a wee bit more mature.

I truly appreciate the review and comments :).

The use of struct ptdesc in cpa_collapse_large_pages() gets in the way
of short-term (stop refcounting page tables) and long-term (shrinking
struct page) goals. Particularly the pagetable_free() call.

Would you be ok with taking these patches if I add these relevant APIs:
1) A function that returns an address (like get_zeroed_page())
2) A function that frees by address (like free_page())
Re: [PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs
Posted by Dave Hansen 3 days, 7 hours ago
On 2/3/26 13:07, Vishal Moola (Oracle) wrote:
> Would you be ok with taking these patches if I add these relevant apis:
> 1) A function that returns an address (like get_zeroed_page())
> 2) A function that frees by address (like free_page())

I don't see any problems taking them if those changes are made. No
promises, but I don't see any other issues.
Re: [PATCH v3 1/3] x86/mm/pat: Convert pte code to use ptdescs
Posted by Mike Rapoport 3 days, 11 hours ago
Hi Vishal,

On Mon, Feb 02, 2026 at 09:20:03AM -0800, Vishal Moola (Oracle) wrote:
> In order to separately allocate ptdescs from pages, we need all allocation
> and free sites to use the appropriate functions. Convert these pte
> allocation/free sites to use ptdescs.
> 
> Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> ---
>  arch/x86/mm/pat/set_memory.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 6c6eb486f7a6..f9f9d4ca8e71 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -1408,7 +1408,7 @@ static bool try_to_free_pte_page(pte_t *pte)
>  		if (!pte_none(pte[i]))
>  			return false;
>  
> -	free_page((unsigned long)pte);
> +	pagetable_free(virt_to_ptdesc((void *)pte));
>  	return true;
>  }
>  
> @@ -1537,12 +1537,15 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
>  	 */
>  }
>  
> -static int alloc_pte_page(pmd_t *pmd)
> +static int alloc_pte_ptdesc(pmd_t *pmd)
>  {
> -	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
> -	if (!pte)
> +	pte_t *pte;
> +	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

AFAIR, x86 folks like reverse xmas tree for variable declarations.

> +
> +	if (!ptdesc)
>  		return -1;
>  
> +	pte = (pte_t *) ptdesc_address(ptdesc);

No need to cast void * to another pointer type.

Same comments are relevant for two other patches as well.

>  	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>  	return 0;
>  }
> @@ -1600,7 +1603,7 @@ static long populate_pmd(struct cpa_data *cpa,
>  		 */
>  		pmd = pmd_offset(pud, start);
>  		if (pmd_none(*pmd))
> -			if (alloc_pte_page(pmd))
> +			if (alloc_pte_ptdesc(pmd))
>  				return -1;
>  
>  		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
> @@ -1641,7 +1644,7 @@ static long populate_pmd(struct cpa_data *cpa,
>  	if (start < end) {
>  		pmd = pmd_offset(pud, start);
>  		if (pmd_none(*pmd))
> -			if (alloc_pte_page(pmd))
> +			if (alloc_pte_ptdesc(pmd))
>  				return -1;
>  
>  		populate_pte(cpa, start, end, num_pages - cur_pages,
> -- 
> 2.52.0
> 

-- 
Sincerely yours,
Mike.