The way the switch to virtual addresses was implemented in
commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
isn't safe enough:
* enable_mmu() depends on hooking all exceptions
  and the page fault handler.
* Any exception other than a page fault, or failing to take the
  expected page fault, causes it to malfunction, which means the boot
  will fail depending on where Xen was loaded into memory.
Instead of the previously proposed way of switching to virtual
addresses, it was decided to identity map the entire Xen image and,
after the switch to virtual addresses, remove the identity mapping
from the page tables.
Since it is not easy to keep track of where the identity map was
mapped, we look for the top-most entry exclusive to the identity
map and remove it.
Fixes: e66003e7be ("xen/riscv: introduce setup_initial_pages")
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes in V3:
- remove changes unrelated to the patch ( SPDX tags in config.h ).
- update definition of PGTBL_INITIAL_COUNT taking into account identity mapping.
- refactor remove_identity_mapping() function.
- add explanatory comments in xen.lds.S and mm.c.
- update commit message.
- move save/restore of a0/a1 registers to [PATCH v2 2/3] xen/riscv: introduce
function for physical offset calculation.
---
Changes in V2:
- update definition of PGTBL_INITIAL_COUNT and the comment above.
- code style fixes.
- 1:1 mapping for entire Xen.
- remove the id_addrs array because the entire Xen image is mapped.
- reverse the loop condition inside remove_identity_mapping().
- fix page table walk in remove_identity_mapping().
- update the commit message.
- add Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
- save hart_id and dtb_addr before calling MMU-related C functions.
- use the phys_offset variable instead of doing calculations to get the phys
  offset in head.S. ( this can easily be done as the entire Xen image is 1:1 mapped )
- declare enable_mmu() as __init.
---
xen/arch/riscv/include/asm/mm.h | 3 +-
xen/arch/riscv/mm.c | 98 +++++++++++++++++++++------------
xen/arch/riscv/riscv64/head.S | 22 ++++++++
xen/arch/riscv/setup.c | 14 +----
xen/arch/riscv/xen.lds.S | 4 ++
5 files changed, 92 insertions(+), 49 deletions(-)
diff --git a/xen/arch/riscv/include/asm/mm.h b/xen/arch/riscv/include/asm/mm.h
index d9c4205103..085eaab7fb 100644
--- a/xen/arch/riscv/include/asm/mm.h
+++ b/xen/arch/riscv/include/asm/mm.h
@@ -13,7 +13,8 @@ extern unsigned char cpu0_boot_stack[];
void setup_initial_pagetables(void);
void enable_mmu(void);
-void cont_after_mmu_is_enabled(void);
+
+void remove_identity_mapping(void);
void calc_phys_offset(void);
diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
index c84a8a7c3c..34cd62eefe 100644
--- a/xen/arch/riscv/mm.c
+++ b/xen/arch/riscv/mm.c
@@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
#define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
#define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
+/*
+ * Should be removed as soon as enough headers will be merged for inclusion of
+ * <xen/lib.h>.
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
/*
* It is expected that Xen won't be more then 2 MB.
* The check in xen.lds.S guarantees that.
@@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
*
* It might be needed one more page table in case when Xen load address
* isn't 2 MB aligned.
+ *
+ * CONFIG_PAGING_LEVELS page tables are needed for identity mapping.
*/
-#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
+#define PGTBL_INITIAL_COUNT (CONFIG_PAGING_LEVELS * 2 + 1)
pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
stage1_pgtbl_root[PAGETABLE_ENTRIES];
@@ -75,6 +83,7 @@ static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
unsigned int index;
pte_t *pgtbl;
unsigned long page_addr;
+ bool is_identity_mapping = map_start == pa_start;
if ( (unsigned long)_start % XEN_PT_LEVEL_SIZE(0) )
{
@@ -108,16 +117,18 @@ static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
{
unsigned long paddr = (page_addr - map_start) + pa_start;
unsigned int permissions = PTE_LEAF_DEFAULT;
+ unsigned long addr = is_identity_mapping
+ ? page_addr : LINK_TO_LOAD(page_addr);
pte_t pte_to_be_written;
index = pt_index(0, page_addr);
- if ( is_kernel_text(LINK_TO_LOAD(page_addr)) ||
- is_kernel_inittext(LINK_TO_LOAD(page_addr)) )
- permissions =
- PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
+ if ( is_kernel_text(addr) ||
+ is_kernel_inittext(addr) )
+ permissions =
+ PTE_EXECUTABLE | PTE_READABLE | PTE_VALID;
- if ( is_kernel_rodata(LINK_TO_LOAD(page_addr)) )
+ if ( is_kernel_rodata(addr) )
permissions = PTE_READABLE | PTE_VALID;
pte_to_be_written = paddr_to_pte(paddr, permissions);
@@ -211,6 +222,10 @@ void __init setup_initial_pagetables(void)
unsigned long linker_start = LOAD_TO_LINK(load_start);
unsigned long linker_end = LOAD_TO_LINK(load_end);
+ /*
+ * If the overlapping check will be removed then remove_identity_mapping()
+ * logic should be updated.
+ */
if ( (linker_start != load_start) &&
(linker_start <= load_end) && (load_start <= linker_end) )
{
@@ -232,22 +247,18 @@ void __init setup_initial_pagetables(void)
linker_start,
linker_end,
load_start);
+
+ if ( linker_start == load_start )
+ return;
+
+ setup_initial_mapping(&mmu_desc,
+ load_start,
+ load_end,
+ load_start);
}
-void __init noreturn noinline enable_mmu()
+void __init enable_mmu(void)
{
- /*
- * Calculate a linker time address of the mmu_is_enabled
- * label and update CSR_STVEC with it.
- * MMU is configured in a way where linker addresses are mapped
- * on load addresses so in a case when linker addresses are not equal
- * to load addresses, after MMU is enabled, it will cause
- * an exception and jump to linker time addresses.
- * Otherwise if load addresses are equal to linker addresses the code
- * after mmu_is_enabled label will be executed without exception.
- */
- csr_write(CSR_STVEC, LOAD_TO_LINK((unsigned long)&&mmu_is_enabled));
-
/* Ensure page table writes precede loading the SATP */
sfence_vma();
@@ -255,25 +266,40 @@ void __init noreturn noinline enable_mmu()
csr_write(CSR_SATP,
PFN_DOWN((unsigned long)stage1_pgtbl_root) |
RV_STAGE1_MODE << SATP_MODE_SHIFT);
+}
- asm volatile ( ".p2align 2" );
- mmu_is_enabled:
- /*
- * Stack should be re-inited as:
- * 1. Right now an address of the stack is relative to load time
- * addresses what will cause an issue in case of load start address
- * isn't equal to linker start address.
- * 2. Addresses in stack are all load time relative which can be an
- * issue in case when load start address isn't equal to linker
- * start address.
- *
- * We can't return to the caller because the stack was reseted
- * and it may have stash some variable on the stack.
- * Jump to a brand new function as the stack was reseted
- */
+void __init remove_identity_mapping(void)
+{
+ unsigned int i;
+ pte_t *pgtbl;
+ unsigned int index, xen_index;
+ unsigned long load_start = LINK_TO_LOAD(_start);
+
+ for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS; i; i-- )
+ {
+ index = pt_index(i - 1, load_start);
+ xen_index = pt_index(i - 1, XEN_VIRT_START);
+
+ if ( index != xen_index )
+ {
+ /* remove after it will be possible to include <xen/lib.h> */
+ #define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1))
+
+ unsigned long load_end = LINK_TO_LOAD(_end);
+ unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i - 1);
+ unsigned long xen_size = ROUNDUP(load_end - load_start, pt_level_size);
+ unsigned long page_entries_num = xen_size / pt_level_size;
+
+ while ( page_entries_num-- )
+ pgtbl[index++].pte = 0;
+
+ break;
- switch_stack_and_jump((unsigned long)cpu0_boot_stack + STACK_SIZE,
- cont_after_mmu_is_enabled);
+ #undef ROUNDUP
+ }
+
+ pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
+ }
}
/*
diff --git a/xen/arch/riscv/riscv64/head.S b/xen/arch/riscv/riscv64/head.S
index 9015d06233..41983ffe63 100644
--- a/xen/arch/riscv/riscv64/head.S
+++ b/xen/arch/riscv/riscv64/head.S
@@ -39,6 +39,28 @@ ENTRY(start)
jal calc_phys_offset
+ jal setup_initial_pagetables
+
+ jal enable_mmu
+
+ la t1, phys_offset
+ REG_L t1, (t1)
+
+ /* Calculate proper VA after jump from 1:1 mapping */
+ la t0, .L_primary_switched
+ sub t0, t0, t1
+
+ /* Jump from 1:1 mapping world */
+ jr t0
+
+.L_primary_switched:
+ /*
+ * cpu0_boot_stack address is 1:1 mapping related so it should be
+ * recalculated after jump from 1:1 mapping world as 1:1 mapping
+ * will be removed soon in start_xen().
+ */
+ jal reset_stack
+
/* restore bootcpu_id and dtb address */
mv a0, s0
mv a1, s1
diff --git a/xen/arch/riscv/setup.c b/xen/arch/riscv/setup.c
index dde8fb898b..6593f601c1 100644
--- a/xen/arch/riscv/setup.c
+++ b/xen/arch/riscv/setup.c
@@ -13,20 +13,10 @@ unsigned char __initdata cpu0_boot_stack[STACK_SIZE]
void __init noreturn start_xen(unsigned long bootcpu_id,
paddr_t dtb_addr)
{
- early_printk("Hello from C env\n");
-
- setup_initial_pagetables();
-
- enable_mmu();
-
- for ( ;; )
- asm volatile ("wfi");
+ remove_identity_mapping();
- unreachable();
-}
+ early_printk("Hello from C env\n");
-void __init noreturn cont_after_mmu_is_enabled(void)
-{
early_printk("All set up\n");
for ( ;; )
diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
index 9064852173..31ccebadcb 100644
--- a/xen/arch/riscv/xen.lds.S
+++ b/xen/arch/riscv/xen.lds.S
@@ -173,4 +173,8 @@ ASSERT(IS_ALIGNED(__bss_end, POINTER_ALIGN), "__bss_end is misaligned")
ASSERT(!SIZEOF(.got), ".got non-empty")
ASSERT(!SIZEOF(.got.plt), ".got.plt non-empty")
+/*
+ * Changing the size of Xen binary can require an update of
+ * PGTBL_INITIAL_COUNT.
+ */
ASSERT(_end - _start <= MB(2), "Xen too large for early-boot assumptions")
--
2.41.0
On 17.07.2023 16:40, Oleksii Kurochko wrote:
> The way how switch to virtual address was implemented in the
> commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
> isn't safe enough as:
> * enable_mmu() depends on hooking all exceptions
> and pagefault.
> * Any exception other than pagefault, or not taking a pagefault
> causes it to malfunction, which means you will fail to boot
> depending on where Xen was loaded into memory.
>
> Instead of the proposed way of switching to virtual addresses was
> decided to use identity mapping of the entrire Xen and after
> switching to virtual addresses identity mapping is removed from
> page-tables.
> Since it is not easy to keep track where the identity map was mapped,
> so we will look for the top-most entry exclusive to the identity
> map and remove it.
Doesn't this paragraph need adjustment now?
> --- a/xen/arch/riscv/mm.c
> +++ b/xen/arch/riscv/mm.c
> @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
> #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
> #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
>
> +/*
> + * Should be removed as soon as enough headers will be merged for inclusion of
> + * <xen/lib.h>.
> + */
> +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
Like said to Shawn for PPC in [1], there's now a pretty easy way to
get this macro available for use here without needing to include
xen/lib.h.
[1] https://lists.xen.org/archives/html/xen-devel/2023-07/msg01081.html
> @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
> *
> * It might be needed one more page table in case when Xen load address
> * isn't 2 MB aligned.
> + *
> + * CONFIG_PAGING_LEVELS page tables are needed for identity mapping.
> */
> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> +#define PGTBL_INITIAL_COUNT (CONFIG_PAGING_LEVELS * 2 + 1)
Where did the "- 1" go?
> @@ -255,25 +266,40 @@ void __init noreturn noinline enable_mmu()
> csr_write(CSR_SATP,
> PFN_DOWN((unsigned long)stage1_pgtbl_root) |
> RV_STAGE1_MODE << SATP_MODE_SHIFT);
> +}
>
> - asm volatile ( ".p2align 2" );
> - mmu_is_enabled:
> - /*
> - * Stack should be re-inited as:
> - * 1. Right now an address of the stack is relative to load time
> - * addresses what will cause an issue in case of load start address
> - * isn't equal to linker start address.
> - * 2. Addresses in stack are all load time relative which can be an
> - * issue in case when load start address isn't equal to linker
> - * start address.
> - *
> - * We can't return to the caller because the stack was reseted
> - * and it may have stash some variable on the stack.
> - * Jump to a brand new function as the stack was reseted
> - */
> +void __init remove_identity_mapping(void)
> +{
> + unsigned int i;
> + pte_t *pgtbl;
> + unsigned int index, xen_index;
> + unsigned long load_start = LINK_TO_LOAD(_start);
> +
> + for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS; i; i-- )
> + {
> + index = pt_index(i - 1, load_start);
> + xen_index = pt_index(i - 1, XEN_VIRT_START);
> +
> + if ( index != xen_index )
> + {
> + /* remove after it will be possible to include <xen/lib.h> */
> + #define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1))
ROUNDUP() is even part of the patch that I've submitted already.
> + unsigned long load_end = LINK_TO_LOAD(_end);
> + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i - 1);
> + unsigned long xen_size = ROUNDUP(load_end - load_start, pt_level_size);
> + unsigned long page_entries_num = xen_size / pt_level_size;
> +
> + while ( page_entries_num-- )
> + pgtbl[index++].pte = 0;
> +
> + break;
Unless there's a "not crossing a 2Mb boundary" guarantee somewhere
that I've missed, this "break" is still too early afaict.
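To make the boundary concern concrete, here is a standalone illustration with invented numbers (Sv39-style constants are assumed: 4K pages, 512 entries per table, so one L0 table spans 2Mb; the addresses are hypothetical and not taken from the patch):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define ENTRIES      512UL
    #define L0_INDEX(va) (((va) >> PAGE_SHIFT) & (ENTRIES - 1))

    int main(void)
    {
        /* Hypothetical load range: 4K aligned, 12K long, crossing a 2Mb boundary. */
        unsigned long load_start = 0x803ff000UL;
        unsigned long load_end = load_start + 0x3000UL;
        unsigned long entries = (load_end - load_start) >> PAGE_SHIFT;

        /* Prints "first L0 index: 511, entries to clear: 3". */
        printf("first L0 index: %lu, entries to clear: %lu\n",
               L0_INDEX(load_start), entries);
        return 0;
    }

If the removal walk only reaches L0 (all higher-level indexes matching XEN_VIRT_START) and then clears three entries starting at index 511 of the single table it holds, it runs past the end of that table; the remaining identity-map entries live in the next L0 table, under a different L1 entry, and are never cleared.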
Jan
On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
> > + unsigned long load_end = LINK_TO_LOAD(_end);
> > + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i -
> > 1);
> > + unsigned long xen_size = ROUNDUP(load_end -
> > load_start, pt_level_size);
> > + unsigned long page_entries_num = xen_size /
> > pt_level_size;
> > +
> > + while ( page_entries_num-- )
> > + pgtbl[index++].pte = 0;
> > +
> > + break;
>
> Unless there's a "not crossing a 2Mb boundary" guarantee somewhere
> that I've missed, this "break" is still too early afaict.
If I add a '2 MB boundary check' for load_start and linker_start,
could that be an upstreamable solution?
Something like:
if ( !IS_ALIGNED(load_start, MB(2)) )
    printk("load_start should be 2Mb aligned\n");
and
ASSERT( !IS_ALIGNED(XEN_VIRT_START, MB(2)) )
in xen.lds.S.
Then we will have completely different L0 tables for the identity
mapping and the non-identity mapping, and the code above will be correct.
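A minimal sketch of how the runtime side of such a check might look (assumptions: IS_ALIGNED()/MB() are usable in the early MM code, and the message and reaction are placeholders; the linker-script side would presumably assert alignment rather than misalignment, e.g. ASSERT(IS_ALIGNED(XEN_VIRT_START, MB(2)), ...) next to the existing IS_ALIGNED checks in xen.lds.S):

    /* Sketch only, not the patch's code. */
    unsigned long load_start = LINK_TO_LOAD(_start);

    if ( !IS_ALIGNED(load_start, MB(2)) )
    {
        early_printk("load_start should be 2Mb aligned\n");

        /* Hypothetical reaction: refuse to boot, mirroring the wfi loop in setup.c. */
        for ( ;; )
            asm volatile ("wfi");
    }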
~ Oleksii
On 19.07.2023 18:35, Oleksii wrote:
> On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
>>> + unsigned long load_end = LINK_TO_LOAD(_end);
>>> + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i -
>>> 1);
>>> + unsigned long xen_size = ROUNDUP(load_end -
>>> load_start, pt_level_size);
>>> + unsigned long page_entries_num = xen_size /
>>> pt_level_size;
>>> +
>>> + while ( page_entries_num-- )
>>> + pgtbl[index++].pte = 0;
>>> +
>>> + break;
>>
>> Unless there's a "not crossing a 2Mb boundary" guarantee somewhere
>> that I've missed, this "break" is still too early afaict.
> If I will add a '2 MB boundary check' for load_start and linker_start
> could it be an upstreamable solution?
>
> Something like:
> if ( !IS_ALIGNED(load_start, MB(2) )
> printk("load_start should be 2Mb algined\n");
> and
> ASSERT( !IS_ALIGNED(XEN_VIRT_START, MB(2) )
> in xen.lds.S.
Arranging for the linked address to be 2Mb-aligned is certainly
reasonable. Whether expecting the load address to also be depends
on whether that can be arranged for (which in turn depends on boot
loader behavior); it cannot be left to "luck".
> Then we will have completely different L0 tables for identity mapping
> and not identity and the code above will be correct.
As long as Xen won't grow beyond 2Mb total. Considering that at
some point you may want to use large page mappings for .text,
.data, and .rodata, that alone would grow Xen to 6 Mb (or really 8,
assuming .init goes separate as well). That's leaving aside the
realistic option of the mere sum of all sections being larger than
2. That said, even Arm64 with ACPI is still quite a bit below 2Mb.
x86 is nearing 2.5 though in even a somewhat limited config;
allyesconfig may well be beyond that already.
Of course you may legitimately leave dealing with that to the
future.
Jan
On Thu, 2023-07-20 at 07:58 +0200, Jan Beulich wrote:
> On 19.07.2023 18:35, Oleksii wrote:
> > On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
> > > > + unsigned long load_end = LINK_TO_LOAD(_end);
> > > > + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i
> > > > -
> > > > 1);
> > > > + unsigned long xen_size = ROUNDUP(load_end -
> > > > load_start, pt_level_size);
> > > > + unsigned long page_entries_num = xen_size /
> > > > pt_level_size;
> > > > +
> > > > + while ( page_entries_num-- )
> > > > + pgtbl[index++].pte = 0;
> > > > +
> > > > + break;
> > >
> > > Unless there's a "not crossing a 2Mb boundary" guarantee
> > > somewhere
> > > that I've missed, this "break" is still too early afaict.
> > If I will add a '2 MB boundary check' for load_start and
> > linker_start
> > could it be an upstreamable solution?
> >
> > Something like:
> > if ( !IS_ALIGNED(load_start, MB(2) )
> > printk("load_start should be 2Mb algined\n");
> > and
> > ASSERT( !IS_ALIGNED(XEN_VIRT_START, MB(2) )
> > in xen.lds.S.
>
> Arranging for the linked address to be 2Mb-aligned is certainly
> reasonable. Whether expecting the load address to also be depends
> on whether that can be arranged for (which in turn depends on boot
> loader behavior); it cannot be left to "luck".
Maybe I didn't quite understand you here, but if Xen has an alignment
check for the load address then the boot loader has to follow Xen's
alignment requirements. So it doesn't look like 'luck'.
>
> > Then we will have completely different L0 tables for identity
> > mapping
> > and not identity and the code above will be correct.
>
> As long as Xen won't grow beyond 2Mb total. Considering that at
> some point you may want to use large page mappings for .text,
> .data, and .rodata, that alone would grow Xen to 6 Mb (or really 8,
> assuming .init goes separate as well). That's leaving aside the
> realistic option of the mere sum of all sections being larger than
> 2. That said, even Arm64 with ACPI is still quite a bit below 2Mb.
> x86 is nearing 2.5 though in even a somewhat limited config;
> allyesconfig may well be beyond that already.
I am missing something about Xen's size. Let's assume that Xen is
mapped using only 4k pages ( as is done now ). Then if Xen is larger
than 2Mb, the only thing that changes is the number of page tables,
so it is only a question of changing PGTBL_INITIAL_COUNT ( in the
case of RISC-V ).
Could you please explain why Xen would grow to 6/8 MB in the case of
larger page mappings? With larger page mappings fewer tables are needed.
For example, if we would like to use 2Mb pages then we stop at the L1
page table and write a physical address into the L1 page table entry
instead of creating a new L0 page table.
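As a rough sketch of that last point (assumptions: pt_index() and paddr_to_pte() behave at level 1 the way the patch uses them at level 0, PTE_LEAF_DEFAULT is suitable for the mapping, and both addresses are already 2Mb aligned):

    /*
     * Map one 2Mb superpage by writing a leaf PTE directly into the L1
     * table, instead of allocating an L0 table and filling 512 4K entries.
     */
    static void map_2mb_superpage(pte_t *l1_tbl, unsigned long va, unsigned long pa)
    {
        unsigned int idx = pt_index(1, va);

        l1_tbl[idx] = paddr_to_pte(pa, PTE_LEAF_DEFAULT);
    }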
>
> Of course you may legitimately leave dealing with that to the
> future.
Then I'll send a new patch series with updated alignment requirements.
~ Oleksii
On 20.07.2023 10:28, Oleksii wrote:
> On Thu, 2023-07-20 at 07:58 +0200, Jan Beulich wrote:
>> On 19.07.2023 18:35, Oleksii wrote:
>>> On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
>>>>> + unsigned long load_end = LINK_TO_LOAD(_end);
>>>>> + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i
>>>>> -
>>>>> 1);
>>>>> + unsigned long xen_size = ROUNDUP(load_end -
>>>>> load_start, pt_level_size);
>>>>> + unsigned long page_entries_num = xen_size /
>>>>> pt_level_size;
>>>>> +
>>>>> + while ( page_entries_num-- )
>>>>> + pgtbl[index++].pte = 0;
>>>>> +
>>>>> + break;
>>>>
>>>> Unless there's a "not crossing a 2Mb boundary" guarantee
>>>> somewhere
>>>> that I've missed, this "break" is still too early afaict.
>>> If I will add a '2 MB boundary check' for load_start and
>>> linker_start
>>> could it be an upstreamable solution?
>>>
>>> Something like:
>>> if ( !IS_ALIGNED(load_start, MB(2) )
>>> printk("load_start should be 2Mb algined\n");
>>> and
>>> ASSERT( !IS_ALIGNED(XEN_VIRT_START, MB(2) )
>>> in xen.lds.S.
>>
>> Arranging for the linked address to be 2Mb-aligned is certainly
>> reasonable. Whether expecting the load address to also be depends
>> on whether that can be arranged for (which in turn depends on boot
>> loader behavior); it cannot be left to "luck".
> Maybe I didn't quite understand you here, but if Xen has an alignment
> check of load address then boot loader has to follow the alignment
> requirements of Xen. So it doesn't look as 'luck'.
That depends on (a) the alignment being properly expressed in the
final binary and (b) the boot loader honoring it. (b) is what you
double-check above, emitting a printk(), but I'm not sure about (a)
being sufficiently enforced with just the ASSERT in the linker
script. Maybe I'm wrong, though.
>>> Then we will have completely different L0 tables for identity
>>> mapping
>>> and not identity and the code above will be correct.
>>
>> As long as Xen won't grow beyond 2Mb total. Considering that at
>> some point you may want to use large page mappings for .text,
>> .data, and .rodata, that alone would grow Xen to 6 Mb (or really 8,
>> assuming .init goes separate as well). That's leaving aside the
>> realistic option of the mere sum of all sections being larger than
>> 2. That said, even Arm64 with ACPI is still quite a bit below 2Mb.
>> x86 is nearing 2.5 though in even a somewhat limited config;
>> allyesconfig may well be beyond that already.
> I am missing something about Xen size. Lets assume that Xen will be
> mapped using only 4k pagees ( like it is done now ). Then if Xen will
> be more then 2Mb then only what will be changed is a number of page
> tables so it is only question of changing of PGTBL_INITIAL_COUNT ( in
> case of RISC-V).
And the way you do the tearing down of the transient 1:1 mapping.
> Could you please explain why Xen will grow to 6/8 MB in case of larger
> page mappings? In case of larger page mapping fewer tables are needed.
> For example, if we would like to use 2Mb pages then we will stop at L1
> page table and write an physical address to L1 page table entry instead
> of creating new L0 page table.
When you use 2Mb mappings, then you will want to use separate ones
for .text, .rodata, and .data (plus perhaps .init), to express the
differing permissions correctly. Consequently you'll need more
virtual address space, but - yes - fewer page table pages. And of
course the 1:1 unmapping logic will again be slightly different.
Jan
On Thu, 2023-07-20 at 12:29 +0200, Jan Beulich wrote:
> On 20.07.2023 10:28, Oleksii wrote:
> > On Thu, 2023-07-20 at 07:58 +0200, Jan Beulich wrote:
> > > On 19.07.2023 18:35, Oleksii wrote:
> > > > On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
> > > > > > + unsigned long load_end = LINK_TO_LOAD(_end);
> > > > > > + unsigned long pt_level_size =
> > > > > > XEN_PT_LEVEL_SIZE(i
> > > > > > -
> > > > > > 1);
> > > > > > + unsigned long xen_size = ROUNDUP(load_end -
> > > > > > load_start, pt_level_size);
> > > > > > + unsigned long page_entries_num = xen_size /
> > > > > > pt_level_size;
> > > > > > +
> > > > > > + while ( page_entries_num-- )
> > > > > > + pgtbl[index++].pte = 0;
> > > > > > +
> > > > > > + break;
> > > > >
> > > > > Unless there's a "not crossing a 2Mb boundary" guarantee
> > > > > somewhere
> > > > > that I've missed, this "break" is still too early afaict.
> > > > If I will add a '2 MB boundary check' for load_start and
> > > > linker_start
> > > > could it be an upstreamable solution?
> > > >
> > > > Something like:
> > > > if ( !IS_ALIGNED(load_start, MB(2) )
> > > > printk("load_start should be 2Mb algined\n");
> > > > and
> > > > ASSERT( !IS_ALIGNED(XEN_VIRT_START, MB(2) )
> > > > in xen.lds.S.
> > >
> > > Arranging for the linked address to be 2Mb-aligned is certainly
> > > reasonable. Whether expecting the load address to also be depends
> > > on whether that can be arranged for (which in turn depends on
> > > boot
> > > loader behavior); it cannot be left to "luck".
> > Maybe I didn't quite understand you here, but if Xen has an
> > alignment
> > check of load address then boot loader has to follow the alignment
> > requirements of Xen. So it doesn't look as 'luck'.
>
> That depends on (a) the alignment being properly expressed in the
> final binary and (b) the boot loader honoring it. (b) is what you
> double-check above, emitting a printk(), but I'm not sure about (a)
> being sufficiently enforced with just the ASSERT in the linker
> script. Maybe I'm wrong, though.
It should be enough for the current purpose, but perhaps I am missing
something.
>
> > > > Then we will have completely different L0 tables for identity
> > > > mapping
> > > > and not identity and the code above will be correct.
> > >
> > > As long as Xen won't grow beyond 2Mb total. Considering that at
> > > some point you may want to use large page mappings for .text,
> > > .data, and .rodata, that alone would grow Xen to 6 Mb (or really
> > > 8,
> > > assuming .init goes separate as well). That's leaving aside the
> > > realistic option of the mere sum of all sections being larger
> > > than
> > > 2. That said, even Arm64 with ACPI is still quite a bit below
> > > 2Mb.
> > > x86 is nearing 2.5 though in even a somewhat limited config;
> > > allyesconfig may well be beyond that already.
> > I am missing something about Xen size. Lets assume that Xen will be
> > mapped using only 4k pagees ( like it is done now ). Then if Xen
> > will
> > be more then 2Mb then only what will be changed is a number of page
> > tables so it is only question of changing of PGTBL_INITIAL_COUNT (
> > in
> > case of RISC-V).
>
> And the way you do the tearing down of the transient 1:1 mapping.
It looks like removing the 1:1 mapping will be the same.
Let's assume that the size of Xen is 4 MB, that the load and linker
ranges don't overlap ( load and linker start addresses are 2Mb aligned ),
and that the difference between them isn't bigger than 1 GB. Then one L2
page table, one L1 page table, two L0 page tables for the identity
mapping, and two L0 page tables for the non-identity mapping are needed.
At L1 we will have different indexes for load_start and linker_start,
so what is needed is to clear two L1 page table entries starting from
some index.
The only issue I see now is that it won't work if the identity mapping
crosses a 1 Gb boundary. Then two L1 page tables are needed for the
identity mapping, and the identity mapping will only be removed from one
of them.
Do I miss anything else?
Wouldn't it be better to take that into account now?
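One way to handle such boundary crossings would be to walk the tables once per chunk of the identity-mapped range instead of clearing a run of entries in a single table. A sketch only, under the assumptions that the 1:1 map is distinct from the linked map and that the linked range does not extend into the slots being cleared (pt_index(), pte_to_paddr() and XEN_PT_LEVEL_SIZE() as used in the patch):

    void __init remove_identity_mapping(void)
    {
        unsigned long load_start = LINK_TO_LOAD(_start);
        unsigned long load_end = LINK_TO_LOAD(_end);

        while ( load_start < load_end )
        {
            pte_t *pgtbl = stage1_pgtbl_root;
            unsigned int i;

            for ( i = CONFIG_PAGING_LEVELS; i; i-- )
            {
                unsigned long level_size = XEN_PT_LEVEL_SIZE(i - 1);
                unsigned int index = pt_index(i - 1, load_start);

                if ( index != pt_index(i - 1, XEN_VIRT_START) )
                {
                    /* This slot is exclusive to the 1:1 map: drop it ... */
                    pgtbl[index].pte = 0;

                    /* ... and continue at the next boundary of this level. */
                    load_start = (load_start & ~(level_size - 1)) + level_size;
                    break;
                }

                pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
            }
        }
    }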
>
> > Could you please explain why Xen will grow to 6/8 MB in case of
> > larger
> > page mappings? In case of larger page mapping fewer tables are
> > needed.
> > For example, if we would like to use 2Mb pages then we will stop at
> > L1
> > page table and write an physical address to L1 page table entry
> > instead
> > of creating new L0 page table.
>
> When you use 2Mb mappings, then you will want to use separate ones
> for .text, .rodata, and .data (plus perhaps .init), to express the
> differing permissions correctly. Consequently you'll need more
> virtual address space, but - yes - fewer page table pages. And of
> course the 1:1 unmapping logic will again be slightly different.
Thanks for the clarification.
~ Oleksii
On 20.07.2023 15:34, Oleksii wrote:
> On Thu, 2023-07-20 at 12:29 +0200, Jan Beulich wrote:
>> On 20.07.2023 10:28, Oleksii wrote:
>>> On Thu, 2023-07-20 at 07:58 +0200, Jan Beulich wrote:
>>>> On 19.07.2023 18:35, Oleksii wrote:
>>>>> Then we will have completely different L0 tables for identity
>>>>> mapping and not identity and the code above will be correct.
>>>>
>>>> As long as Xen won't grow beyond 2Mb total. Considering that at
>>>> some point you may want to use large page mappings for .text,
>>>> .data, and .rodata, that alone would grow Xen to 6 Mb (or really 8,
>>>> assuming .init goes separate as well). That's leaving aside the
>>>> realistic option of the mere sum of all sections being larger than
>>>> 2. That said, even Arm64 with ACPI is still quite a bit below 2Mb.
>>>> x86 is nearing 2.5 though in even a somewhat limited config;
>>>> allyesconfig may well be beyond that already.
>>> I am missing something about Xen size. Lets assume that Xen will be
>>> mapped using only 4k pagees ( like it is done now ). Then if Xen will
>>> be more then 2Mb then only what will be changed is a number of page
>>> tables so it is only question of changing of PGTBL_INITIAL_COUNT ( in
>>> case of RISC-V).
>>
>> And the way you do the tearing down of the transient 1:1 mapping.
> It looks like removing 1:1 mapping will be the same.
>
> Let's assume that the size of Xen is 4 MB and that load and linker
> ranges don't overlap ( load and linker start address are 2Mb aligned ),
> and the difference between them isn't bigger than 1 GB. Then one L2
> page table, one L1 page table and two L0 page tables for identity
> mapping, and two L0 page tables for non-identity mapping are needed.
> Then at L1, we will have different indexes for load_start and
> linker_start. So what will be needed is to clean two L1 page table
> entries started from some index.
>
> The only issue I see now is that it won't work in case if identity
> mapping crosses a 1 Gb boundary. Then for identity mapping, it will be
> needed two L1 page tables, and only one of them identity mapping will
> be removed.
>
> Do I miss anything else?

Looks correct to me.

> Wouldn't it be better to take into account that now?

Sure, it's generally better to avoid leaving traps for someone to
fall into later.

Jan
On Thu, 2023-07-20 at 16:06 +0200, Jan Beulich wrote:
> On 20.07.2023 15:34, Oleksii wrote:
> > On Thu, 2023-07-20 at 12:29 +0200, Jan Beulich wrote:
> > > On 20.07.2023 10:28, Oleksii wrote:
> > > > On Thu, 2023-07-20 at 07:58 +0200, Jan Beulich wrote:
> > > > > On 19.07.2023 18:35, Oleksii wrote:
> > > > > > Then we will have completely different L0 tables for identity
> > > > > > mapping and not identity and the code above will be correct.
> > > > >
> > > > > As long as Xen won't grow beyond 2Mb total. Considering that at
> > > > > some point you may want to use large page mappings for .text,
> > > > > .data, and .rodata, that alone would grow Xen to 6 Mb (or really 8,
> > > > > assuming .init goes separate as well). That's leaving aside the
> > > > > realistic option of the mere sum of all sections being larger than
> > > > > 2. That said, even Arm64 with ACPI is still quite a bit below 2Mb.
> > > > > x86 is nearing 2.5 though in even a somewhat limited config;
> > > > > allyesconfig may well be beyond that already.
> > > > I am missing something about Xen size. Lets assume that Xen will be
> > > > mapped using only 4k pagees ( like it is done now ). Then if Xen will
> > > > be more then 2Mb then only what will be changed is a number of page
> > > > tables so it is only question of changing of PGTBL_INITIAL_COUNT ( in
> > > > case of RISC-V).
> > >
> > > And the way you do the tearing down of the transient 1:1 mapping.
> > It looks like removing 1:1 mapping will be the same.
> >
> > Let's assume that the size of Xen is 4 MB and that load and linker
> > ranges don't overlap ( load and linker start address are 2Mb aligned ),
> > and the difference between them isn't bigger than 1 GB. Then one L2
> > page table, one L1 page table and two L0 page tables for identity
> > mapping, and two L0 page tables for non-identity mapping are needed.
> > Then at L1, we will have different indexes for load_start and
> > linker_start. So what will be needed is to clean two L1 page table
> > entries started from some index.
> >
> > The only issue I see now is that it won't work in case if identity
> > mapping crosses a 1 Gb boundary. Then for identity mapping, it will be
> > needed two L1 page tables, and only one of them identity mapping will
> > be removed.
> >
> > Do I miss anything else?
>
> Looks correct to me.
>
> > Wouldn't it be better to take into account that now?
>
> Sure, it's generally better to avoid leaving traps for someone to
> fall into later.
Thanks a lot. Then it makes sense to update the removing identity mapping
algo.

~ Oleksii
On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
> On 17.07.2023 16:40, Oleksii Kurochko wrote:
> > The way how switch to virtual address was implemented in the
> > commit e66003e7be ("xen/riscv: introduce setup_initial_pages")
> > isn't safe enough as:
> > * enable_mmu() depends on hooking all exceptions
> > and pagefault.
> > * Any exception other than pagefault, or not taking a pagefault
> > causes it to malfunction, which means you will fail to boot
> > depending on where Xen was loaded into memory.
> >
> > Instead of the proposed way of switching to virtual addresses was
> > decided to use identity mapping of the entrire Xen and after
> > switching to virtual addresses identity mapping is removed from
> > page-tables.
> > Since it is not easy to keep track where the identity map was
> > mapped,
> > so we will look for the top-most entry exclusive to the identity
> > map and remove it.
>
> Doesn't this paragraph need adjustment now?
It should be. Thanks. I'll update it in the next patch version.
>
> > --- a/xen/arch/riscv/mm.c
> > +++ b/xen/arch/riscv/mm.c
> > @@ -25,6 +25,12 @@ unsigned long __ro_after_init phys_offset;
> > #define LOAD_TO_LINK(addr) ((unsigned long)(addr) - phys_offset)
> > #define LINK_TO_LOAD(addr) ((unsigned long)(addr) + phys_offset)
> >
> > +/*
> > + * Should be removed as soon as enough headers will be merged for
> > inclusion of
> > + * <xen/lib.h>.
> > + */
> > +#define ARRAY_SIZE(arr) (sizeof(arr) /
> > sizeof((arr)[0]))
>
> Like said to Shawn for PPC in [1], there's now a pretty easy way to
> get this macro available for use here without needing to include
> xen/lib.h.
>
> [1]
> https://lists.xen.org/archives/html/xen-devel/2023-07/msg01081.html
Great. It'll be very useful for me, so I'll add a dependency on the patch
where ARRAY_SIZE and ROUNDUP are moved to <xen/macros.h>.
>
> > @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
> > *
> > * It might be needed one more page table in case when Xen load
> > address
> > * isn't 2 MB aligned.
> > + *
> > + * CONFIG_PAGING_LEVELS page tables are needed for identity
> > mapping.
> > */
> > -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> > +#define PGTBL_INITIAL_COUNT (CONFIG_PAGING_LEVELS * 2 + 1)
>
> Where did the "- 1" go?
My fault. Should be:
#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS * 2 - 1) + 1)
>
> > @@ -255,25 +266,40 @@ void __init noreturn noinline enable_mmu()
> > csr_write(CSR_SATP,
> > PFN_DOWN((unsigned long)stage1_pgtbl_root) |
> > RV_STAGE1_MODE << SATP_MODE_SHIFT);
> > +}
> >
> > - asm volatile ( ".p2align 2" );
> > - mmu_is_enabled:
> > - /*
> > - * Stack should be re-inited as:
> > - * 1. Right now an address of the stack is relative to load
> > time
> > - * addresses what will cause an issue in case of load start
> > address
> > - * isn't equal to linker start address.
> > - * 2. Addresses in stack are all load time relative which can
> > be an
> > - * issue in case when load start address isn't equal to
> > linker
> > - * start address.
> > - *
> > - * We can't return to the caller because the stack was reseted
> > - * and it may have stash some variable on the stack.
> > - * Jump to a brand new function as the stack was reseted
> > - */
> > +void __init remove_identity_mapping(void)
> > +{
> > + unsigned int i;
> > + pte_t *pgtbl;
> > + unsigned int index, xen_index;
> > + unsigned long load_start = LINK_TO_LOAD(_start);
> > +
> > + for ( pgtbl = stage1_pgtbl_root, i = CONFIG_PAGING_LEVELS; i;
> > i-- )
> > + {
> > + index = pt_index(i - 1, load_start);
> > + xen_index = pt_index(i - 1, XEN_VIRT_START);
> > +
> > + if ( index != xen_index )
> > + {
> > + /* remove after it will be possible to include
> > <xen/lib.h> */
> > + #define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1))
>
> ROUNDUP() is even part of the patch that I've submitted already.
>
> > + unsigned long load_end = LINK_TO_LOAD(_end);
> > + unsigned long pt_level_size = XEN_PT_LEVEL_SIZE(i -
> > 1);
> > + unsigned long xen_size = ROUNDUP(load_end -
> > load_start, pt_level_size);
> > + unsigned long page_entries_num = xen_size /
> > pt_level_size;
> > +
> > + while ( page_entries_num-- )
> > + pgtbl[index++].pte = 0;
> > +
> > + break;
>
> Unless there's a "not crossing a 2Mb boundary" guarantee somewhere
> that I've missed, this "break" is still too early afaict.
You are right. I have to re-write this part again.
Thanks.
~ Oleksii
On 19.07.2023 12:39, Oleksii wrote:
> On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
>> On 17.07.2023 16:40, Oleksii Kurochko wrote:
>>> @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
>>>  *
>>>  * It might be needed one more page table in case when Xen load address
>>>  * isn't 2 MB aligned.
>>> + *
>>> + * CONFIG_PAGING_LEVELS page tables are needed for identity mapping.
>>>  */
>>> -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
>>> +#define PGTBL_INITIAL_COUNT (CONFIG_PAGING_LEVELS * 2 + 1)
>>
>> Where did the "- 1" go?
> My fault. Should be:
> #define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS * 2 - 1) + 1)

Why? I'd have expected

#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)

Jan
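For illustration, with Sv39 (CONFIG_PAGING_LEVELS == 3 -- an assumption about the configuration, as is the breakdown in the comment below) that expression works out as follows:

    /*
     * (CONFIG_PAGING_LEVELS - 1) * 2 + 1  ->  (3 - 1) * 2 + 1 = 5
     *
     * Presumably: two non-root tables for the linked mapping, two for the
     * 1:1 mapping, plus one extra table for a load address that is not
     * 2 MB aligned; stage1_pgtbl_root appears to be accounted for separately.
     */
    #define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)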
On Wed, 2023-07-19 at 13:38 +0200, Jan Beulich wrote:
> On 19.07.2023 12:39, Oleksii wrote:
> > On Tue, 2023-07-18 at 17:03 +0200, Jan Beulich wrote:
> > > On 17.07.2023 16:40, Oleksii Kurochko wrote:
> > > > @@ -35,8 +41,10 @@ unsigned long __ro_after_init phys_offset;
> > > >  *
> > > >  * It might be needed one more page table in case when Xen load address
> > > >  * isn't 2 MB aligned.
> > > > + *
> > > > + * CONFIG_PAGING_LEVELS page tables are needed for identity mapping.
> > > >  */
> > > > -#define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) + 1)
> > > > +#define PGTBL_INITIAL_COUNT (CONFIG_PAGING_LEVELS * 2 + 1)
> > >
> > > Where did the "- 1" go?
> > My fault. Should be:
> > #define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS * 2 - 1) + 1)
>
> Why? I'd have expected
>
> #define PGTBL_INITIAL_COUNT ((CONFIG_PAGING_LEVELS - 1) * 2 + 1)
>
I confused myself with definition of PGTBL_INITIAL_COUNT from another
branch so your option is correct. Thanks.

~ Oleksii