This is the same thing as lookup_address_in_pgd(), but it returns the
pagetable unconditionally instead of returning NULL when the pagetable
is none. This will be used for looking up and modifying pages that are
*_none() in order to map memory into the ASI restricted address space.
For a [PATCH], if this logic is needed, the surrounding code should
probably first be somewhat refactored. It now looks pretty repetitive,
and it's confusing that lookup_address_in_pgd() returns NULL when
pmd_none() but note when pte_none(). For now here's something that
works.
Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
arch/x86/include/asm/pgtable_types.h | 2 ++
arch/x86/mm/pat/set_memory.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4b804531b03c3ce5cc48f0a75cb75d58b985777a..e09b509e525534f31c986d705e07b25dd9c04cb7 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -572,6 +572,8 @@ extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
unsigned int *level, bool *nx, bool *rw);
+extern pte_t *lookup_pgtable_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index ef4514d64c0524e5854fa106e3f37ff1e1ba10a2..d066bf2c9e93e126757bd32a7a666db89b2488b6 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -658,6 +658,40 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
return new;
}
+/*
+ * Lookup the page table entry for a virtual address in a specific pgd. Return
+ * the pointer to the entry, without implying that any mapping actually exists
+ * (the returned value may be zero).
+ */
+pte_t *lookup_pgtable_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ *level = PG_LEVEL_256T;
+ if (pgd_none(*pgd))
+ return (pte_t *)pgd;
+
+ *level = PG_LEVEL_512G;
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
+ return (pte_t *)p4d;
+
+ *level = PG_LEVEL_1G;
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
+ return (pte_t *)pud;
+
+ *level = PG_LEVEL_2M;
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || pmd_leaf(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ *level = PG_LEVEL_4K;
+ return pte_offset_kernel(pmd, address);
+}
+
/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry (or NULL if the entry does not exist),
--
2.49.0.rc1.451.g8f38331e32-goog
On Thu, Mar 13, 2025 at 06:11:22PM +0000, Brendan Jackman wrote:
> This is the same thing as lookup_address_in_pgd(), but it returns the
> pagetable unconditionally instead of returning NULL when the pagetable
> is none. This will be used for looking up and modifying pages that are
> *_none() in order to map memory into the ASI restricted address space.
>
> For a [PATCH], if this logic is needed, the surrounding code should
> probably first be somewhat refactored. It now looks pretty repetitive,
> and it's confusing that lookup_address_in_pgd() returns NULL when
> pmd_none() but note when pte_none(). For now here's something that
> works.
My first instinct reading this is that lookup_address_in_pgd() should be
calling lookup_pgtable_in_pgd(), but I didn't look too closely.
>
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---
> arch/x86/include/asm/pgtable_types.h | 2 ++
> arch/x86/mm/pat/set_memory.c | 34 ++++++++++++++++++++++++++++++++++
> 2 files changed, 36 insertions(+)
>
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 4b804531b03c3ce5cc48f0a75cb75d58b985777a..e09b509e525534f31c986d705e07b25dd9c04cb7 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -572,6 +572,8 @@ extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
> unsigned int *level);
> pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
> unsigned int *level, bool *nx, bool *rw);
> +extern pte_t *lookup_pgtable_in_pgd(pgd_t *pgd, unsigned long address,
> + unsigned int *level);
> extern pmd_t *lookup_pmd_address(unsigned long address);
> extern phys_addr_t slow_virt_to_phys(void *__address);
> extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index ef4514d64c0524e5854fa106e3f37ff1e1ba10a2..d066bf2c9e93e126757bd32a7a666db89b2488b6 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -658,6 +658,40 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
> return new;
> }
>
> +/*
> + * Lookup the page table entry for a virtual address in a specific pgd. Return
> + * the pointer to the entry, without implying that any mapping actually exists
> + * (the returned value may be zero).
> + */
> +pte_t *lookup_pgtable_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level)
> +{
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + *level = PG_LEVEL_256T;
> + if (pgd_none(*pgd))
> + return (pte_t *)pgd;
> +
> + *level = PG_LEVEL_512G;
> + p4d = p4d_offset(pgd, address);
> + if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
> + return (pte_t *)p4d;
> +
> + *level = PG_LEVEL_1G;
> + pud = pud_offset(p4d, address);
> + if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
> + return (pte_t *)pud;
> +
> + *level = PG_LEVEL_2M;
> + pmd = pmd_offset(pud, address);
> + if (pmd_none(*pmd) || pmd_leaf(*pmd) || !pmd_present(*pmd))
> + return (pte_t *)pmd;
> +
> + *level = PG_LEVEL_4K;
> + return pte_offset_kernel(pmd, address);
> +}
> +
> /*
> * Lookup the page table entry for a virtual address in a specific pgd.
> * Return a pointer to the entry (or NULL if the entry does not exist),
>
> --
> 2.49.0.rc1.451.g8f38331e32-goog
>
On Thu, Mar 13, 2025 at 10:09:21PM +0000, Yosry Ahmed wrote: > On Thu, Mar 13, 2025 at 06:11:22PM +0000, Brendan Jackman wrote: > > This is the same thing as lookup_address_in_pgd(), but it returns the > > pagetable unconditionally instead of returning NULL when the pagetable > > is none. This will be used for looking up and modifying pages that are > > *_none() in order to map memory into the ASI restricted address space. > > > > For a [PATCH], if this logic is needed, the surrounding code should > > probably first be somewhat refactored. It now looks pretty repetitive, > > and it's confusing that lookup_address_in_pgd() returns NULL when > > pmd_none() but note when pte_none(). For now here's something that > > works. > > My first instinct reading this is that lookup_address_in_pgd() should be > calling lookup_pgtable_in_pgd(), but I didn't look too closely. Yeah. That outer function would get a "generic" PTE pointer isntead of a strongly-typed p4d_t/pud_t/etc. So we either need to encode assumptions that all the page tables have the same structure at different levels for the bits we care about, or we need to have a switch(*level) and then be careful about pgtable_l5_enabled(). I think the former is fine but it needs a bit of care and attention to ensure we don't miss anything and avoid creating confusion/antipatterns in the code. And perhaps more importantly, lookup_adress_in_pgd_attr() sets *nx and *rw based on the level above the entry it returns. E.g. when it returns a pte_t* it sets *nx pased on pmd_flags(). I haven't looked into why this is. So yeah overall it needs a bit of research and most likely needs a couple of prep patches. Hopefully it's possible to do it in a way that leaves the existing code in a clearer state. Anyway, I was originally planning not to have asi_map()/asi_unmap() in asi.c at all, and instead just kinda make set_memory.c natively aware of ASI somehow. At that point I think this code is probably gonna look a bit different. That's something I ran out of time for and had to drop from the scope of this RFC. It's definitely not ideal in this series that e.g. page_alloc.c, asi.c, and set_memory.c are all implicitly coupled to one another (i.e. they are all colluding to ensure asi_[un]map() never has to allocate). Maybe I should've called this out as a TODO on the cover letter actually.
On Fri, Mar 14, 2025 at 09:12:04AM +0000, Brendan Jackman wrote: > On Thu, Mar 13, 2025 at 10:09:21PM +0000, Yosry Ahmed wrote: > > On Thu, Mar 13, 2025 at 06:11:22PM +0000, Brendan Jackman wrote: > > > This is the same thing as lookup_address_in_pgd(), but it returns the > > > pagetable unconditionally instead of returning NULL when the pagetable > > > is none. This will be used for looking up and modifying pages that are > > > *_none() in order to map memory into the ASI restricted address space. > > > > > > For a [PATCH], if this logic is needed, the surrounding code should > > > probably first be somewhat refactored. It now looks pretty repetitive, > > > and it's confusing that lookup_address_in_pgd() returns NULL when > > > pmd_none() but note when pte_none(). For now here's something that > > > works. > > > > My first instinct reading this is that lookup_address_in_pgd() should be > > calling lookup_pgtable_in_pgd(), but I didn't look too closely. > > Yeah. That outer function would get a "generic" PTE pointer isntead of > a strongly-typed p4d_t/pud_t/etc. So we either need to encode > assumptions that all the page tables have the same structure at > different levels for the bits we care about, or we need to have a > switch(*level) and then be careful about pgtable_l5_enabled(). I > think the former is fine but it needs a bit of care and attention to > ensure we don't miss anything and avoid creating > confusion/antipatterns in the code. Hmm another option is to have a common helper that takes in a lot of parameters to control the exact behavior (e.g. do we check for 'none'?), and have both lookup_pgtable_in_pgd() and lookup_address_in_pgd() call it with different parameters. This could avoid the need for a generic pointer, for example. > > And perhaps more importantly, lookup_adress_in_pgd_attr() sets *nx and > *rw based on the level above the entry it returns. E.g. when it > returns a pte_t* it sets *nx pased on pmd_flags(). I haven't looked > into why this is. > > So yeah overall it needs a bit of research and most likely needs a > couple of prep patches. Hopefully it's possible to do it in a way that > leaves the existing code in a clearer state. Agreed. > > Anyway, I was originally planning not to have asi_map()/asi_unmap() in > asi.c at all, and instead just kinda make set_memory.c natively aware > of ASI somehow. At that point I think this code is probably gonna look > a bit different. That's something I ran out of time for and had to > drop from the scope of this RFC. It's definitely not ideal in this > series that e.g. page_alloc.c, asi.c, and set_memory.c are all > implicitly coupled to one another (i.e. they are all colluding to > ensure asi_[un]map() never has to allocate). Maybe I should've called > this out as a TODO on the cover letter actually. Looking forward to seeing how this would look like :)
© 2016 - 2025 Red Hat, Inc.