Create the initial shared pagetable to hold all the mappings that will
be shared among ASI domains.
Mirror the physmap into the ASI pagetables, but with a maximum
granularity that's guaranteed to allow changing pageblock sensitivity
without having to allocate pagetables, and with everything as
non-present.
Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
arch/x86/include/asm/asi.h | 4 ++++
arch/x86/mm/asi.c | 19 +++++++++++++++++++
arch/x86/mm/init.c | 2 ++
arch/x86/mm/init_64.c | 25 +++++++++++++++++++++++--
include/linux/asi.h | 4 ++++
init/main.c | 1 +
6 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/asi.h b/arch/x86/include/asm/asi.h
index 32a4c04c4be0f6f425c7cbcff4c58f1827a4b4c4..85062f2a23e127c736a92bb0d49e54f6fdcc2a5b 100644
--- a/arch/x86/include/asm/asi.h
+++ b/arch/x86/include/asm/asi.h
@@ -12,4 +12,8 @@ static inline bool asi_enabled_static(void)
return cpu_feature_enabled(X86_FEATURE_ASI);
}
+void asi_init(void);
+
+extern pgd_t *asi_nonsensitive_pgd;
+
#endif /* _ASM_X86_ASI_H */
diff --git a/arch/x86/mm/asi.c b/arch/x86/mm/asi.c
index 8c907f3c84f43f66e412ecbfa99e67390d31a66f..7225f6aec936eedf98cd263d791dd62263d62575 100644
--- a/arch/x86/mm/asi.c
+++ b/arch/x86/mm/asi.c
@@ -1,11 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/asi.h>
#include <linux/init.h>
+#include <linux/memblock.h>
#include <linux/string.h>
#include <asm/cmdline.h>
#include <asm/cpufeature.h>
+#include "mm_internal.h"
+
+/*
+ * This is a bit like init_mm.pgd, it holds mappings shared among all ASI
+ * domains.
+ */
+pgd_t *asi_nonsensitive_pgd;
+
void __init asi_check_boottime_disable(void)
{
bool enabled = false;
@@ -26,3 +35,13 @@ void __init asi_check_boottime_disable(void)
if (enabled)
setup_force_cpu_cap(X86_FEATURE_ASI);
}
+
+void __init asi_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ASI))
+ return;
+
+ asi_nonsensitive_pgd = alloc_low_page();
+ if (WARN_ON(!asi_nonsensitive_pgd))
+ setup_clear_cpu_cap(X86_FEATURE_ASI);
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b877a41fc291284eb271ebe764a52730d51da3fc..8fd34475af7ccd49d0124e13a87342d3bfef3e05 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -773,6 +773,8 @@ void __init init_mem_mapping(void)
end = max_low_pfn << PAGE_SHIFT;
#endif
+ asi_init();
+
/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e98e85cf15f42db669696ba8195d8fc633351b26..7e0471d46767c63ceade479ae0d1bf738f14904a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -7,6 +7,7 @@
* Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
+#include <linux/asi.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -746,7 +747,8 @@ phys_pgd_init(pgd_t *pgd_page, unsigned long paddr_start, unsigned long paddr_en
{
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
- *pgd_changed = false;
+ if (pgd_changed)
+ *pgd_changed = false;
paddr_last = paddr_end;
vaddr = (unsigned long)__va(paddr_start);
@@ -780,7 +782,8 @@ phys_pgd_init(pgd_t *pgd_page, unsigned long paddr_start, unsigned long paddr_en
(pud_t *) p4d, init);
spin_unlock(&init_mm.page_table_lock);
- *pgd_changed = true;
+ if (pgd_changed)
+ *pgd_changed = true;
}
return paddr_last;
@@ -797,6 +800,24 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
paddr_last = phys_pgd_init(init_mm.pgd, paddr_start, paddr_end, page_size_mask,
prot, init, &pgd_changed);
+
+ /*
+ * Set up ASI's unrestricted physmap. This needs to be mapped at maximum 2M
+ * size so that regions can be mapped and unmapped at pageblock
+ * granularity without requiring allocations.
+ */
+ if (asi_nonsensitive_pgd) {
+ /*
+ * Since most memory is expected to end up sensitive, start with
+ * everything unmapped in this pagetable.
+ */
+ pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
+
+ VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
+ phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
+ prot_np, init, NULL);
+ }
+
if (pgd_changed)
sync_global_pgds((unsigned long)__va(paddr_start),
(unsigned long)__va(paddr_end) - 1);
diff --git a/include/linux/asi.h b/include/linux/asi.h
index 1832feb1b14d63f05bbfa3f87dd07753338ed70b..cc4bc957274dbf92ce5bf6185a418d0a8d1b7748 100644
--- a/include/linux/asi.h
+++ b/include/linux/asi.h
@@ -11,5 +11,9 @@
static inline void asi_check_boottime_disable(void) { }
static inline bool asi_enabled_static(void) { return false; }
+#define asi_nonsensitive_pgd NULL
+
+static inline void asi_init(void) { };
+
#endif /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
#endif /* _INCLUDE_ASI_H */
diff --git a/init/main.c b/init/main.c
index 07a3116811c5d72cbab48410493b3d0f89d1f1b2..0ec230ba123613c89c4dfbede27e0441207b2f88 100644
--- a/init/main.c
+++ b/init/main.c
@@ -12,6 +12,7 @@
#define DEBUG /* Enable initcall_debug */
+#include <linux/asi.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/extable.h>
--
2.50.1
On Wed, Sep 24, 2025 at 02:59:39PM +0000, Brendan Jackman wrote:
> @@ -797,6 +800,24 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
>
> paddr_last = phys_pgd_init(init_mm.pgd, paddr_start, paddr_end, page_size_mask,
> prot, init, &pgd_changed);
> +
> + /*
> + * Set up ASI's unrestricted physmap. This needs to mapped at minimum 2M
> + * size so that regions can be mapped and unmapped at pageblock
> + * granularity without requiring allocations.
> + */
> + if (asi_nonsensitive_pgd) {
> + /*
> + * Since most memory is expected to end up sensitive, start with
> + * everything unmapped in this pagetable.
> + */
> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
> +
> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
> + prot_np, init, NULL);
> + }
This looks weird: so you have some other function - asi_init() - which *must*
run before this one so that the pgd is allocated. But then you check it here
and in order to do such a "distributed" init, you export it too.
Instead, I'd simply add a function call here - asi_init_physmap() or whatever
- which is defined in asi.c and gets *only* called from here. And that
function returns the pgd or NULL. And then you use phys_pgd_init() on it.
Also, looking at kernel_map_pages_in_pgd() - and you mentioned set_memory.c
already - and if I squint my eyes hard enough, it does look like a bunch of
redundancy between there and init_64.c. But that's nasty code so unifying that
would be a hard task.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Tue Nov 11, 2025 at 2:55 PM UTC, Borislav Petkov wrote:
> On Wed, Sep 24, 2025 at 02:59:39PM +0000, Brendan Jackman wrote:
>> @@ -797,6 +800,24 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
>>
>> paddr_last = phys_pgd_init(init_mm.pgd, paddr_start, paddr_end, page_size_mask,
>> prot, init, &pgd_changed);
>> +
>> + /*
>> + * Set up ASI's unrestricted physmap. This needs to mapped at minimum 2M
>> + * size so that regions can be mapped and unmapped at pageblock
>> + * granularity without requiring allocations.
>> + */
>> + if (asi_nonsensitive_pgd) {
>> + /*
>> + * Since most memory is expected to end up sensitive, start with
>> + * everything unmapped in this pagetable.
>> + */
>> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
>> +
>> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
>> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
>> + prot_np, init, NULL);
>> + }
>
> This looks weird: so you have some other function - asi_init() - which *must*
> run before this one so that the pgd is allocated. But then you check it here
> and in order to do such a "distributed" init, you export it too.
>
> Instead, I'd simply add a function call here - asi_init_physmap() or whatever
> - which is defined in asi.c and gets *only* called from here. And that
> function returns the pgd or NULL. And then you use phys_pgd_init() on it.
Well, this isn't the only place that refers to asi_nonsensitive_pgd in
this patchset - it's also used as a global from set_memory.c for the
later updates.
Still, you're right about the janky distributed init / setup ordering
issues. So yeah what you suggested with asi_init_physmap() (or whatever
we call it) still makes sense to me, it's just that we'd still have to
export it to set_memory.c
> Also, looking at kernel_map_pages_in_pgd() - and you mentioned set_memory.c
> already - and if I squint my eyes hard enough, it does look like a bunch of
> redundancy between there and init_64.c. But that's nasty code so unifying that
> would be a hard task.
Yeah :/ Some folks pointed out to me that all this logic is kinda
separated between the upper levels of pagetables which are preallocated,
and the lower level ones which are more complicated. So I am still
planning to see if I can come up with some sort of refactoring that only
affects the upper levels.
However, in the meantime I have switched tracks since David H pointed
out an opportunity for me to help out with the guest_memfd stuff [0].
That lets me start getting an interesting subset of this series without
needing any changes to the x86 code just yet.
[0] https://lore.kernel.org/all/20250924151101.2225820-1-patrick.roy@campus.lmu.de/
On 9/24/25 07:59, Brendan Jackman wrote:
> Create the initial shared pagetable to hold all the mappings that will
> be shared among ASI domains.
>
> Mirror the physmap into the ASI pagetables, but with a maximum
> granularity that's guaranteed to allow changing pageblock sensitivity
> without having to allocate pagetables, and with everything as
> non-present.
Could you also talk about what this granularity _actually_ is and why it
has the property of never requiring page table alloc
...
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index e98e85cf15f42db669696ba8195d8fc633351b26..7e0471d46767c63ceade479ae0d1bf738f14904a 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -7,6 +7,7 @@
> * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
> */
>
> +#include <linux/asi.h>
> #include <linux/signal.h>
> #include <linux/sched.h>
> #include <linux/kernel.h>
> @@ -746,7 +747,8 @@ phys_pgd_init(pgd_t *pgd_page, unsigned long paddr_start, unsigned long paddr_en
> {
> unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
>
> - *pgd_changed = false;
> + if (pgd_changed)
> + *pgd_changed = false;
This 'pgd_changed' hunk isn't mentioned in the changelog.
...
> @@ -797,6 +800,24 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
>
> paddr_last = phys_pgd_init(init_mm.pgd, paddr_start, paddr_end, page_size_mask,
> prot, init, &pgd_changed);
> +
> + /*
> + * Set up ASI's unrestricted physmap. This needs to mapped at minimum 2M
> + * size so that regions can be mapped and unmapped at pageblock
> + * granularity without requiring allocations.
> + */
This took me a minute to wrap my head around.
Here, I think you're trying to convey that:
1. There's a higher-level design decision that all sensitivity will be
done at a 2M granularity. A 2MB physical region is either sensitive
or not.
2. Because of #1, 1GB mappings are not cool because splitting a 1GB
mapping into 2MB needs to allocate a page table page.
3. 4k mappings are OK because they can also have their permissions
changed at a 2MB granularity. It's just more laborious.
The "minimum 2M size" comment really threw me off because that, to me,
also includes 1G which is a no-no here.
I also can't help but wonder if it would have been easier and more
straightforward to just start this whole exercise at 4k: force all the
ASI tables to be 4k. Then, later, add the 2MB support and tie to
pageblocks on after.
> + if (asi_nonsensitive_pgd) {
> + /*
> + * Since most memory is expected to end up sensitive, start with
> + * everything unmapped in this pagetable.
> + */
> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
> +
> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
> + prot_np, init, NULL);
> + }
I'm also kinda wondering what the purpose is of having a whole page
table full of !_PAGE_PRESENT entries. It would be nice to know how this
eventually gets turned into something useful.
On Wed Oct 1, 2025 at 8:28 PM UTC, Dave Hansen wrote:
> On 9/24/25 07:59, Brendan Jackman wrote:
>> Create the initial shared pagetable to hold all the mappings that will
>> be shared among ASI domains.
>>
>> Mirror the physmap into the ASI pagetables, but with a maximum
>> granularity that's guaranteed to allow changing pageblock sensitivity
>> without having to allocate pagetables, and with everything as
>> non-present.
>
> Could you also talk about what this granularity _actually_ is and why it
> has the property of never requiring page table alloc
Ack, will expand on this (I think from your other comments that you
understand it now, and you're just asking me to improve the commit
message, let me know if I misread that).
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index e98e85cf15f42db669696ba8195d8fc633351b26..7e0471d46767c63ceade479ae0d1bf738f14904a 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -7,6 +7,7 @@
>> * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
>> */
>>
>> +#include <linux/asi.h>
>> #include <linux/signal.h>
>> #include <linux/sched.h>
>> #include <linux/kernel.h>
>> @@ -746,7 +747,8 @@ phys_pgd_init(pgd_t *pgd_page, unsigned long paddr_start, unsigned long paddr_en
>> {
>> unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
>>
>> - *pgd_changed = false;
>> + if (pgd_changed)
>> + *pgd_changed = false;
>
> This 'pgd_changed' hunk isn't mentioned in the changelog.
Oops, will add a note about that. The alternative would just be to
squash this into the commit that introduces phys_pgd_init(), let me know
if you have a preference.
>> @@ -797,6 +800,24 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
>>
>> paddr_last = phys_pgd_init(init_mm.pgd, paddr_start, paddr_end, page_size_mask,
>> prot, init, &pgd_changed);
>> +
>> + /*
>> + * Set up ASI's unrestricted physmap. This needs to mapped at minimum 2M
>> + * size so that regions can be mapped and unmapped at pageblock
>> + * granularity without requiring allocations.
>> + */
>
> This took me a minute to wrap my head around.
>
> Here, I think you're trying to convey that:
>
> 1. There's a higher-level design decision that all sensitivity will be
> done at a 2M granularity. A 2MB physical region is either sensitive
> or not.
> 2. Because of #1, 1GB mappings are not cool because splitting a 1GB
> mapping into 2MB needs to allocate a page table page.
> 3. 4k mappings are OK because they can also have their permissions
> changed at a 2MB granularity. It's just more laborious.
>
> The "minimum 2M size" comment really threw me off because that, to me,
> also includes 1G which is a no-no here.
Er yeah sorry that's just wrong, it should say "maximum size".
> I also can't help but wonder if it would have been easier and more
> straightforward to just start this whole exercise at 4k: force all the
> ASI tables to be 4k. Then, later, add the 2MB support and tie to
> pageblocks on after.
This would lead to a much smaller patchset, but I think it creates some
pretty yucky technical debt and complexity of its own. If you're
imagining a world where we just leave most of the allocator as-is, and
just inject "map into ASI" or "unmap from ASI" at the right moments...
I think to make this work you have to do one of:
- Say all free pages are unmapped from the restricted address space, we
map them on-demand in allocation (if !__GFP_SENSITIVE), and unmap them
again when they are freed. Because you can't flush the TLB
synchronously in the free path, you need an async worker to take care
of that for you.
This is what we did in the Google implementation (where "don't change
the page allocator more than you have to" kinda trumps everything) and
it's pretty nasty. We have lots of knobs we can turn to try and make
it perform well but in the end it's eventually gonna block deployment
to some environment or other.
- Say free pages are mapped into the restricted address space. So if you
get a __GFP_SENSITIVE alloc you unmap the pages and do the TLB flush
synchronously there, unless we think the caller might be atomic, in
which case.... I guess we'd have to have a sort of special atomic
reserve for this? Which... seems like a weaker and more awkward
version of the thing I'm proposing in this patchset.
Then when you free the page you need to map it back again, which means
you need to zero it.
I might have some tunnel-vision on this so please challenge me if it
sounds like I'm missing something.
>> + if (asi_nonsensitive_pgd) {
>> + /*
>> + * Since most memory is expected to end up sensitive, start with
>> + * everything unmapped in this pagetable.
>> + */
>> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
>> +
>> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
>> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
>> + prot_np, init, NULL);
>> + }
>
> I'm also kinda wondering what the purpose is of having a whole page
> table full of !_PAGE_PRESENT entries. It would be nice to know how this
> eventually gets turned into something useful.
If you are thinking of the fact that just clearing P doesn't really do
anything for Meltdown/L1TF.. yeah that's true! We'll actually need to
munge the PFN or something too, but here I wanted to just focus on the
broad strokes of integration without worrying too much about individual
CPU mitigations. Flipping _PAGE_PRESENT is already supported by
set_memory.c and IIRC it's good enough for everything newer than
Skylake.
Other than that, these pages being unmapped is the whole point.. later
on, the subset of memory that we don't need to protect will get flipped
to being present. Everything else will trigger a pagefault if touched
and we'll switch address spaces, do the flushing etc.
Sorry if I'm missing your point here...
On 10/2/25 07:05, Brendan Jackman wrote:
> On Wed Oct 1, 2025 at 8:28 PM UTC, Dave Hansen wrote:
...>> I also can't help but wonder if it would have been easier and more
>> straightforward to just start this whole exercise at 4k: force all the
>> ASI tables to be 4k. Then, later, add the 2MB support and tie to
>> pageblocks on after.
>
> This would lead to a much smaller patchset, but I think it creates some
> pretty yucky technical debt and complexity of its own. If you're
> imagining a world where we just leave most of the allocator as-is, and
> just inject "map into ASI" or "unmap from ASI" at the right moments...
...
I'm trying to separate out the two problems:
1. Have a set of page tables that never require allocations in order to
map or unmap sensitive data.
2. Manage each pageblock as either all sensitive or all not sensitive
There is a nonzero set of dependencies to make sure that the pageblock
size is compatible with the page table mapping size... unless you just
make the mapping size 4k.
If the mapping size is 4k, the pageblock size can be anything. There's
no dependency to satisfy.
So I'm not saying to make the sensitive/nonsensitive boundary 4k. Just
to make the _mapping_ size 4k. Then, come back later, and move the
mapping size over to 2MB as an optimization.
>>> + if (asi_nonsensitive_pgd) {
>>> + /*
>>> + * Since most memory is expected to end up sensitive, start with
>>> + * everything unmapped in this pagetable.
>>> + */
>>> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
>>> +
>>> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
>>> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
>>> + prot_np, init, NULL);
>>> + }
>>
>> I'm also kinda wondering what the purpose is of having a whole page
>> table full of !_PAGE_PRESENT entries. It would be nice to know how this
>> eventually gets turned into something useful.
>
> If you are thinking of the fact that just clearing P doesn't really do
> anything for Meltdown/L1TF.. yeah that's true! We'll actually need to
> munge the PFN or something too, but here I wanted do just focus on the
> broad strokes of integration without worrying too much about individual
> CPU mitigations. Flippping _PAGE_PRESENT is already supported by
> set_memory.c and IIRC it's good enough for everything newer than
> Skylake.
>
> Other than that, these pages being unmapped is the whole point.. later
> on, the subset of memory that we don't need to protect will get flipped
> to being present. Everything else will trigger a pagefault if touched
> and we'll switch address spaces, do the flushing etc.
>
> Sorry if I'm missing your point here...
What is the point of having a pgd if you can't put it in CR3? If you:
write_cr3(asi_nonsensitive_pgd);
you'll just triple fault because all kernel text is !_PAGE_PRESENT.
The critical point is when 'asi_nonsensitive_pgd' is functional enough
that it can be loaded into CR3 and handle a switch to the normal
init_mm->pgd.
On Thu Oct 2, 2025 at 4:14 PM UTC, Dave Hansen wrote:
> On 10/2/25 07:05, Brendan Jackman wrote:
>> On Wed Oct 1, 2025 at 8:28 PM UTC, Dave Hansen wrote:
> ...>> I also can't help but wonder if it would have been easier and more
>>> straightforward to just start this whole exercise at 4k: force all the
>>> ASI tables to be 4k. Then, later, add the 2MB support and tie to
>>> pageblocks on after.
>>
>> This would lead to a much smaller patchset, but I think it creates some
>> pretty yucky technical debt and complexity of its own. If you're
>> imagining a world where we just leave most of the allocator as-is, and
>> just inject "map into ASI" or "unmap from ASI" at the right moments...
> ...
>
> I'm trying to separate out the two problems:
>
> 1. Have a set of page tables that never require allocations in order to
> map or unmap sensitive data.
> 2. Manage each pageblock as either all sensitive or all not sensitive
>
> There is a nonzero set of dependencies to make sure that the pageblock
> size is compatible with the page table mapping size... unless you just
> make the mapping size 4k.
>
> If the mapping size is 4k, the pageblock size can be anything. There's
> no dependency to satisfy.
>
> So I'm not saying to make the sensitive/nonsensitive boundary 4k. Just
> to make the _mapping_ size 4k. Then, come back later, and move the
> mapping size over to 2MB as an optimization.
Ahh thanks, I get your point now. And yep I'm sold, I'll go to 4k for
v2.
>>>> + if (asi_nonsensitive_pgd) {
>>>> + /*
>>>> + * Since most memory is expected to end up sensitive, start with
>>>> + * everything unmapped in this pagetable.
>>>> + */
>>>> + pgprot_t prot_np = __pgprot(pgprot_val(prot) & ~_PAGE_PRESENT);
>>>> +
>>>> + VM_BUG_ON((PAGE_SHIFT + pageblock_order) < page_level_shift(PG_LEVEL_2M));
>>>> + phys_pgd_init(asi_nonsensitive_pgd, paddr_start, paddr_end, 1 << PG_LEVEL_2M,
>>>> + prot_np, init, NULL);
>>>> + }
>>>
>>> I'm also kinda wondering what the purpose is of having a whole page
>>> table full of !_PAGE_PRESENT entries. It would be nice to know how this
>>> eventually gets turned into something useful.
>>
>> If you are thinking of the fact that just clearing P doesn't really do
>> anything for Meltdown/L1TF.. yeah that's true! We'll actually need to
>> munge the PFN or something too, but here I wanted do just focus on the
>> broad strokes of integration without worrying too much about individual
>> CPU mitigations. Flippping _PAGE_PRESENT is already supported by
>> set_memory.c and IIRC it's good enough for everything newer than
>> Skylake.
>>
>> Other than that, these pages being unmapped is the whole point.. later
>> on, the subset of memory that we don't need to protect will get flipped
>> to being present. Everything else will trigger a pagefault if touched
>> and we'll switch address spaces, do the flushing etc.
>>
>> Sorry if I'm missing your point here...
>
> What is the point of having a pgd if you can't put it in CR3? If you:
>
> write_cr3(asi_nonsensitive_pgd);
>
> you'll just triple fault because all kernel text is !_PAGE_PRESENT.
>
> The critical point is when 'asi_nonsensitive_pgd' is functional enough
> that it can be loaded into CR3 and handle a switch to the normal
> init_mm->pgd.
Hm, are you saying that I should expand the scope of the patchset from
"set up the direct map" to "set up an ASI address space"? If so, yeah I
can do that, I don't think the patchset would get that much bigger. I
only left the other bits out because it feels weird to set up a whole
address space but never actually switch into it. Setting up the logic to
switch into it would make the patchset really big though.
Like I said in the cover letter, I could also always change tack:
we could instead start with all the address-space switching logic, but
just have the two address spaces be clones of each other. Then we could
come back and start poking holes in the ASI one for the second series. I
don't have a really strong opinion about the best place to start, but
I'll stick to my current course unless someone else does have a strong
opinion.
On 10/2/25 10:19, Brendan Jackman wrote:
> On Thu Oct 2, 2025 at 4:14 PM UTC, Dave Hansen wrote:
...
>> What is the point of having a pgd if you can't put it in CR3? If you:
>>
>> write_cr3(asi_nonsensitive_pgd);
>>
>> you'll just triple fault because all kernel text is !_PAGE_PRESENT.
>>
>> The critical point is when 'asi_nonsensitive_pgd' is functional enough
>> that it can be loaded into CR3 and handle a switch to the normal
>> init_mm->pgd.
>
> Hm, are you saying that I should expand the scope of the patchset from
> "set up the direct map" to "set up an ASI address space"? If so, yeah I
> can do that, I don't think the patchset would get that much bigger. I
> only left the other bits out because it feels weird to set up a whole
> address space but never actually switch into it. Setting up the logic to
> switch into it would make the patchset really big though.
The patch set has to _do_ something, though. It's fine for a patch series
to add code that then gets turned on at the end of the series. But, at
the end of the series, it has to have something to show for it.
If the series is small *and* useful, all the better. But, if I have to
choose between small or useful, it's always going to be useful.
> Like I said in the cover letter, I could also always change tack:
> we could instead start with all the address-space switching logic, but
> just have the two address spaces be clones of each other. Then we could
> come back and start poking holes in the ASI one for the second series. I
> don't have a really strong opinion about the best place to start, but
> I'll stick to my current course unless someone else does have a strong
> opinion.
Yeah, but the end of the series has to have holes poked that are
marginally useful for *SOMETHING*, at least if anyone wants it applied.
© 2016 - 2026 Red Hat, Inc.