[for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode

Oleksii Kurochko posted 18 patches 3 months, 3 weeks ago
There is a newer version of this series
[for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Oleksii Kurochko 3 months, 3 weeks ago
Introduce gstage_mode_detect() and pre_gstage_init() to probe supported
G-stage paging modes at boot. The function iterates over possible
HGATP modes (Sv32x4 on RV32, Sv39x4/Sv48x4/Sv57x4 on RV64) and selects
the first valid one by programming CSR_HGATP and reading it back.

The selected mode is stored in gstage_mode (marked __ro_after_init)
and reported via printk. If no supported mode is found, Xen panics
since Bare mode is not expected to be used.

Finally, CSR_HGATP is cleared and a local_hfence_gvma_all() is issued
to avoid any potential speculative pollution of the TLB, as required
by the RISC-V spec.

The following issue starts to occur:
 ./<riscv>/asm/flushtlb.h:37:55: error: 'struct page_info'  declared inside
   parameter list will not be visible outside of this definition or
   declaration [-Werror]
 37 | static inline void page_set_tlbflush_timestamp(struct page_info *page)
To resolve it, forward declaration of struct page_info is added to
<asm/flushtlb.h.

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V5:
 - Add static and __initconst for local variable modes[] in
   gstage_mode_detect().
 - Change type for gstage_mode from 'unsigned long' to 'unsigned char'.
 - Update the comment inisde defintion if modes[] variable in
   gstage_mode_detect():
   - Add information about Bare mode.
   - Drop "a paged virtual-memory scheme described in Section 10.3" as it isn't
     relevant here.
 - Drop printing of function name when chosen G-stage mode message is printed.
 - Drop the call of gstage_mode_detect() from start_xen(). It will be added into
   p2m_init() when the latter will be introduced.
 - Introduce pre_gstage_init().
 - make gstage_mode_detect() static.
---
Changes in V4:
 - New patch.
---
 xen/arch/riscv/Makefile                     |  1 +
 xen/arch/riscv/include/asm/flushtlb.h       |  7 ++
 xen/arch/riscv/include/asm/p2m.h            |  4 +
 xen/arch/riscv/include/asm/riscv_encoding.h |  5 ++
 xen/arch/riscv/p2m.c                        | 96 +++++++++++++++++++++
 xen/arch/riscv/setup.c                      |  3 +
 6 files changed, 116 insertions(+)
 create mode 100644 xen/arch/riscv/p2m.c

diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
index e2b8aa42c8..264e265699 100644
--- a/xen/arch/riscv/Makefile
+++ b/xen/arch/riscv/Makefile
@@ -7,6 +7,7 @@ obj-y += intc.o
 obj-y += irq.o
 obj-y += mm.o
 obj-y += pt.o
+obj-y += p2m.o
 obj-$(CONFIG_RISCV_64) += riscv64/
 obj-y += sbi.o
 obj-y += setup.o
diff --git a/xen/arch/riscv/include/asm/flushtlb.h b/xen/arch/riscv/include/asm/flushtlb.h
index 51c8f753c5..e70badae0c 100644
--- a/xen/arch/riscv/include/asm/flushtlb.h
+++ b/xen/arch/riscv/include/asm/flushtlb.h
@@ -7,6 +7,13 @@
 
 #include <asm/sbi.h>
 
+struct page_info;
+
+static inline void local_hfence_gvma_all(void)
+{
+    asm volatile ( "hfence.gvma zero, zero" ::: "memory" );
+}
+
 /* Flush TLB of local processor for address va. */
 static inline void flush_tlb_one_local(vaddr_t va)
 {
diff --git a/xen/arch/riscv/include/asm/p2m.h b/xen/arch/riscv/include/asm/p2m.h
index e43c559e0c..3a5066f360 100644
--- a/xen/arch/riscv/include/asm/p2m.h
+++ b/xen/arch/riscv/include/asm/p2m.h
@@ -6,6 +6,8 @@
 
 #include <asm/page-bits.h>
 
+extern unsigned char gstage_mode;
+
 #define paddr_bits PADDR_BITS
 
 /*
@@ -88,6 +90,8 @@ static inline bool arch_acquire_resource_check(struct domain *d)
     return false;
 }
 
+void pre_gstage_init(void);
+
 #endif /* ASM__RISCV__P2M_H */
 
 /*
diff --git a/xen/arch/riscv/include/asm/riscv_encoding.h b/xen/arch/riscv/include/asm/riscv_encoding.h
index 6cc8f4eb45..b15f5ad0b4 100644
--- a/xen/arch/riscv/include/asm/riscv_encoding.h
+++ b/xen/arch/riscv/include/asm/riscv_encoding.h
@@ -131,13 +131,16 @@
 #define HGATP_MODE_SV32X4		_UL(1)
 #define HGATP_MODE_SV39X4		_UL(8)
 #define HGATP_MODE_SV48X4		_UL(9)
+#define HGATP_MODE_SV57X4		_UL(10)
 
 #define HGATP32_MODE_SHIFT		31
+#define HGATP32_MODE_MASK		_UL(0x80000000)
 #define HGATP32_VMID_SHIFT		22
 #define HGATP32_VMID_MASK		_UL(0x1FC00000)
 #define HGATP32_PPN			_UL(0x003FFFFF)
 
 #define HGATP64_MODE_SHIFT		60
+#define HGATP64_MODE_MASK		_ULL(0xF000000000000000)
 #define HGATP64_VMID_SHIFT		44
 #define HGATP64_VMID_MASK		_ULL(0x03FFF00000000000)
 #define HGATP64_PPN			_ULL(0x00000FFFFFFFFFFF)
@@ -170,6 +173,7 @@
 #define HGATP_VMID_SHIFT		HGATP64_VMID_SHIFT
 #define HGATP_VMID_MASK			HGATP64_VMID_MASK
 #define HGATP_MODE_SHIFT		HGATP64_MODE_SHIFT
+#define HGATP_MODE_MASK			HGATP64_MODE_MASK
 #else
 #define MSTATUS_SD			MSTATUS32_SD
 #define SSTATUS_SD			SSTATUS32_SD
@@ -181,6 +185,7 @@
 #define HGATP_VMID_SHIFT		HGATP32_VMID_SHIFT
 #define HGATP_VMID_MASK			HGATP32_VMID_MASK
 #define HGATP_MODE_SHIFT		HGATP32_MODE_SHIFT
+#define HGATP_MODE_MASK			HGATP32_MODE_MASK
 #endif
 
 #define TOPI_IID_SHIFT			16
diff --git a/xen/arch/riscv/p2m.c b/xen/arch/riscv/p2m.c
new file mode 100644
index 0000000000..00fe676089
--- /dev/null
+++ b/xen/arch/riscv/p2m.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/macros.h>
+#include <xen/sections.h>
+
+#include <asm/csr.h>
+#include <asm/flushtlb.h>
+#include <asm/riscv_encoding.h>
+
+unsigned char __ro_after_init gstage_mode;
+
+static void __init gstage_mode_detect(void)
+{
+    static const struct {
+        unsigned char mode;
+        unsigned int paging_levels;
+        const char name[8];
+    } modes[] __initconst = {
+        /*
+         * Based on the RISC-V spec:
+         *   Bare mode is always supported, regardless of SXLEN.
+         *   When SXLEN=32, the only other valid setting for MODE is Sv32.
+         *   When SXLEN=64, three paged virtual-memory schemes are defined:
+         *   Sv39, Sv48, and Sv57.
+         */
+#ifdef CONFIG_RISCV_32
+        { HGATP_MODE_SV32X4, 2, "Sv32x4" }
+#else
+        { HGATP_MODE_SV39X4, 3, "Sv39x4" },
+        { HGATP_MODE_SV48X4, 4, "Sv48x4" },
+        { HGATP_MODE_SV57X4, 5, "Sv57x4" },
+#endif
+    };
+
+    unsigned int mode_idx;
+
+    gstage_mode = HGATP_MODE_OFF;
+
+    for ( mode_idx = 0; mode_idx < ARRAY_SIZE(modes); mode_idx++ )
+    {
+        unsigned long mode = modes[mode_idx].mode;
+
+        csr_write(CSR_HGATP, MASK_INSR(mode, HGATP_MODE_MASK));
+
+        if ( MASK_EXTR(csr_read(CSR_HGATP), HGATP_MODE_MASK) == mode )
+        {
+            gstage_mode = mode;
+            break;
+        }
+    }
+
+    if ( gstage_mode == HGATP_MODE_OFF )
+        panic("Xen expects that G-stage won't be Bare mode\n");
+
+    printk("G-stage mode is %s\n", modes[mode_idx].name);
+
+    csr_write(CSR_HGATP, 0);
+
+    /*
+     * From RISC-V spec:
+     *   Speculative executions of the address-translation algorithm behave as
+     *   non-speculative executions of the algorithm do, except that they must
+     *   not set the dirty bit for a PTE, they must not trigger an exception,
+     *   and they must not create address-translation cache entries if those
+     *   entries would have been invalidated by any SFENCE.VMA instruction
+     *   executed by the hart since the speculative execution of the algorithm
+     *   began.
+     * The quote above mention explicitly SFENCE.VMA, but I assume it is true
+     * for HFENCE.VMA.
+     *
+     * Also, despite of the fact here it is mentioned that when V=0 two-stage
+     * address translation is inactivated:
+     *   The current virtualization mode, denoted V, indicates whether the hart
+     *   is currently executing in a guest. When V=1, the hart is either in
+     *   virtual S-mode (VS-mode), or in virtual U-mode (VU-mode) atop a guest
+     *   OS running in VS-mode. When V=0, the hart is either in M-mode, in
+     *   HS-mode, or in U-mode atop an OS running in HS-mode. The
+     *   virtualization mode also indicates whether two-stage address
+     *   translation is active (V=1) or inactive (V=0).
+     * But on the same side, writing to hgatp register activates it:
+     *   The hgatp register is considered active for the purposes of
+     *   the address-translation algorithm unless the effective privilege mode
+     *   is U and hstatus.HU=0.
+     *
+     * Thereby it leaves some room for speculation even in this stage of boot,
+     * so it could be that we polluted local TLB so flush all guest TLB.
+     */
+    local_hfence_gvma_all();
+}
+
+void __init pre_gstage_init(void)
+{
+    gstage_mode_detect();
+}
diff --git a/xen/arch/riscv/setup.c b/xen/arch/riscv/setup.c
index 483cdd7e17..c4f7793151 100644
--- a/xen/arch/riscv/setup.c
+++ b/xen/arch/riscv/setup.c
@@ -22,6 +22,7 @@
 #include <asm/early_printk.h>
 #include <asm/fixmap.h>
 #include <asm/intc.h>
+#include <asm/p2m.h>
 #include <asm/sbi.h>
 #include <asm/setup.h>
 #include <asm/traps.h>
@@ -148,6 +149,8 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
 
     console_init_postirq();
 
+    pre_gstage_init();
+
     printk("All set up\n");
 
     machine_halt();
-- 
2.51.0
Re: [for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Jan Beulich 3 months ago
On 20.10.2025 17:57, Oleksii Kurochko wrote:
> Changes in V5:
>  - Add static and __initconst for local variable modes[] in
>    gstage_mode_detect().
>  - Change type for gstage_mode from 'unsigned long' to 'unsigned char'.
>  - Update the comment inisde defintion if modes[] variable in
>    gstage_mode_detect():
>    - Add information about Bare mode.
>    - Drop "a paged virtual-memory scheme described in Section 10.3" as it isn't
>      relevant here.
>  - Drop printing of function name when chosen G-stage mode message is printed.
>  - Drop the call of gstage_mode_detect() from start_xen(). It will be added into
>    p2m_init() when the latter will be introduced.

Well, thanks, but ...

>  - Introduce pre_gstage_init().

... the same comment that I gave before now applies here: This doesn't look to
belong directly in start_xen(). In x86'es terms I'd say this is a tiny part of
paging_init().

> --- a/xen/arch/riscv/Makefile
> +++ b/xen/arch/riscv/Makefile
> @@ -7,6 +7,7 @@ obj-y += intc.o
>  obj-y += irq.o
>  obj-y += mm.o
>  obj-y += pt.o
> +obj-y += p2m.o

Nit: Please keep things sorted (numbers sort before letters).

> --- a/xen/arch/riscv/include/asm/p2m.h
> +++ b/xen/arch/riscv/include/asm/p2m.h
> @@ -6,6 +6,8 @@
>  
>  #include <asm/page-bits.h>
>  
> +extern unsigned char gstage_mode;

Better move down some, at the very least ...

>  #define paddr_bits PADDR_BITS

... past more fundamental #define-s?

> --- a/xen/arch/riscv/include/asm/riscv_encoding.h
> +++ b/xen/arch/riscv/include/asm/riscv_encoding.h
> @@ -131,13 +131,16 @@
>  #define HGATP_MODE_SV32X4		_UL(1)
>  #define HGATP_MODE_SV39X4		_UL(8)
>  #define HGATP_MODE_SV48X4		_UL(9)
> +#define HGATP_MODE_SV57X4		_UL(10)
>  
>  #define HGATP32_MODE_SHIFT		31
> +#define HGATP32_MODE_MASK		_UL(0x80000000)

Please can we avoid redundant (and then not even connected) #define-s? I
don't see why you would need HGATP32_MODE_SHIFT when you have
HGATP32_MODE_MASK. Similarly ...

>  #define HGATP32_VMID_SHIFT		22
>  #define HGATP32_VMID_MASK		_UL(0x1FC00000)

... here, while ...

>  #define HGATP32_PPN			_UL(0x003FFFFF)

... here the constant isn't even suffixed with _MASK.

> --- /dev/null
> +++ b/xen/arch/riscv/p2m.c
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <xen/init.h>
> +#include <xen/lib.h>
> +#include <xen/macros.h>
> +#include <xen/sections.h>
> +
> +#include <asm/csr.h>
> +#include <asm/flushtlb.h>
> +#include <asm/riscv_encoding.h>
> +
> +unsigned char __ro_after_init gstage_mode;
> +
> +static void __init gstage_mode_detect(void)
> +{
> +    static const struct {
> +        unsigned char mode;
> +        unsigned int paging_levels;
> +        const char name[8];
> +    } modes[] __initconst = {
> +        /*
> +         * Based on the RISC-V spec:
> +         *   Bare mode is always supported, regardless of SXLEN.
> +         *   When SXLEN=32, the only other valid setting for MODE is Sv32.
> +         *   When SXLEN=64, three paged virtual-memory schemes are defined:
> +         *   Sv39, Sv48, and Sv57.
> +         */
> +#ifdef CONFIG_RISCV_32
> +        { HGATP_MODE_SV32X4, 2, "Sv32x4" }
> +#else
> +        { HGATP_MODE_SV39X4, 3, "Sv39x4" },
> +        { HGATP_MODE_SV48X4, 4, "Sv48x4" },
> +        { HGATP_MODE_SV57X4, 5, "Sv57x4" },
> +#endif
> +    };
> +
> +    unsigned int mode_idx;
> +
> +    gstage_mode = HGATP_MODE_OFF;

Why is this not the variable's initializer?

> +    for ( mode_idx = 0; mode_idx < ARRAY_SIZE(modes); mode_idx++ )
> +    {
> +        unsigned long mode = modes[mode_idx].mode;
> +
> +        csr_write(CSR_HGATP, MASK_INSR(mode, HGATP_MODE_MASK));
> +
> +        if ( MASK_EXTR(csr_read(CSR_HGATP), HGATP_MODE_MASK) == mode )
> +        {
> +            gstage_mode = mode;
> +            break;
> +        }
> +    }

I take it that using the first available mode is only transient. To support bigger
guests, you may need to pick 48x4 or even 57x4 no matter that 39x4 is available.
I wonder whether you wouldn't be better off recording all supported modes right
away.

Jan
Re: [for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Oleksii Kurochko 2 months, 3 weeks ago
On 11/6/25 2:43 PM, Jan Beulich wrote:
> On 20.10.2025 17:57, Oleksii Kurochko wrote:
>> Changes in V5:
>>   - Add static and __initconst for local variable modes[] in
>>     gstage_mode_detect().
>>   - Change type for gstage_mode from 'unsigned long' to 'unsigned char'.
>>   - Update the comment inisde defintion if modes[] variable in
>>     gstage_mode_detect():
>>     - Add information about Bare mode.
>>     - Drop "a paged virtual-memory scheme described in Section 10.3" as it isn't
>>       relevant here.
>>   - Drop printing of function name when chosen G-stage mode message is printed.
>>   - Drop the call of gstage_mode_detect() from start_xen(). It will be added into
>>     p2m_init() when the latter will be introduced.
> Well, thanks, but ...
>
>>   - Introduce pre_gstage_init().
> ... the same comment that I gave before now applies here: This doesn't look to
> belong directly in start_xen(). In x86'es terms I'd say this is a tiny part of
> paging_init().

Does it only the question of function naming now?

IMO, ideally it would be nice to have it in p2m_init(), but there is no a lot of
sense to detect supported modes each time a domain is constructed. And it is the
reason why I put directly to start_xen().

>
>> --- a/xen/arch/riscv/include/asm/riscv_encoding.h
>> +++ b/xen/arch/riscv/include/asm/riscv_encoding.h
>> @@ -131,13 +131,16 @@
>>   #define HGATP_MODE_SV32X4		_UL(1)
>>   #define HGATP_MODE_SV39X4		_UL(8)
>>   #define HGATP_MODE_SV48X4		_UL(9)
>> +#define HGATP_MODE_SV57X4		_UL(10)
>>   
>>   #define HGATP32_MODE_SHIFT		31
>> +#define HGATP32_MODE_MASK		_UL(0x80000000)
> Please can we avoid redundant (and then not even connected) #define-s? I
> don't see why you would need HGATP32_MODE_SHIFT when you have
> HGATP32_MODE_MASK. Similarly ...
>
>>   #define HGATP32_VMID_SHIFT		22
>>   #define HGATP32_VMID_MASK		_UL(0x1FC00000)
> ... here, while ...
>
>>   #define HGATP32_PPN			_UL(0x003FFFFF)
> ... here the constant isn't even suffixed with _MASK.

I agree that it is enough to have only *_MASK.

I can do a separate cleanup patch (as what mentioned below were already introduced
before) which will drop:
  HGATP32_MODE_SHIFT, HGATP32_VMID_SHIFT, HGATP64_MODE_SHIFT, HGATP64_VMID_SHIFT
and rename:
  HGATP*_PPN to HGATP*_PPN_MASK

Does it make sense?

>
>> --- /dev/null
>> +++ b/xen/arch/riscv/p2m.c
>> @@ -0,0 +1,96 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +
>> +#include <xen/init.h>
>> +#include <xen/lib.h>
>> +#include <xen/macros.h>
>> +#include <xen/sections.h>
>> +
>> +#include <asm/csr.h>
>> +#include <asm/flushtlb.h>
>> +#include <asm/riscv_encoding.h>
>> +
>> +unsigned char __ro_after_init gstage_mode;
>> +
>> +static void __init gstage_mode_detect(void)
>> +{
>> +    static const struct {
>> +        unsigned char mode;
>> +        unsigned int paging_levels;
>> +        const char name[8];
>> +    } modes[] __initconst = {
>> +        /*
>> +         * Based on the RISC-V spec:
>> +         *   Bare mode is always supported, regardless of SXLEN.
>> +         *   When SXLEN=32, the only other valid setting for MODE is Sv32.
>> +         *   When SXLEN=64, three paged virtual-memory schemes are defined:
>> +         *   Sv39, Sv48, and Sv57.
>> +         */
>> +#ifdef CONFIG_RISCV_32
>> +        { HGATP_MODE_SV32X4, 2, "Sv32x4" }
>> +#else
>> +        { HGATP_MODE_SV39X4, 3, "Sv39x4" },
>> +        { HGATP_MODE_SV48X4, 4, "Sv48x4" },
>> +        { HGATP_MODE_SV57X4, 5, "Sv57x4" },
>> +#endif
>> +    };
>> +
>> +    unsigned int mode_idx;
>> +
>> +    gstage_mode = HGATP_MODE_OFF;
> Why is this not the variable's initializer?

Good point. It should be the variable's initializer.

>> +    for ( mode_idx = 0; mode_idx < ARRAY_SIZE(modes); mode_idx++ )
>> +    {
>> +        unsigned long mode = modes[mode_idx].mode;
>> +
>> +        csr_write(CSR_HGATP, MASK_INSR(mode, HGATP_MODE_MASK));
>> +
>> +        if ( MASK_EXTR(csr_read(CSR_HGATP), HGATP_MODE_MASK) == mode )
>> +        {
>> +            gstage_mode = mode;
>> +            break;
>> +        }
>> +    }
> I take it that using the first available mode is only transient. To support bigger
> guests, you may need to pick 48x4 or even 57x4 no matter that 39x4 is available.

I considered traversing the|modes[]| array in the opposite order so that the largest
mode would be checked first. However, I decided that 39x4 is sufficiently large and
provides a good balance between the number of page tables and supported address
space, at least for now.

> I wonder whether you wouldn't be better off recording all supported modes right
> away.

What would be the use case for recording and storing all supported modes?
For example, would it be used to indicate which mode is preferable for a guest
domain via the device tree?

Also, I’d like to note that it probably doesn’t make much sense to record all
supported modes. If we traverse the|modes[]| array in the opposite order—checking
|Sv57| first—then, according to the RISC-V specification:
- Implementations that support Sv57 must also support Sv48.
- Implementations that support Sv48 must also support Sv39.
So if Sv57 is supported then lower modes are supported too. (except Sv32 for RV32)

Based on this, it seems reasonable to start checking from Sv57, right?

Thanks.

~ Oleksii
Re: [for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Jan Beulich 2 months, 3 weeks ago
On 13.11.2025 17:18, Oleksii Kurochko wrote:
> On 11/6/25 2:43 PM, Jan Beulich wrote:
>> On 20.10.2025 17:57, Oleksii Kurochko wrote:
>>> Changes in V5:
>>>   - Add static and __initconst for local variable modes[] in
>>>     gstage_mode_detect().
>>>   - Change type for gstage_mode from 'unsigned long' to 'unsigned char'.
>>>   - Update the comment inisde defintion if modes[] variable in
>>>     gstage_mode_detect():
>>>     - Add information about Bare mode.
>>>     - Drop "a paged virtual-memory scheme described in Section 10.3" as it isn't
>>>       relevant here.
>>>   - Drop printing of function name when chosen G-stage mode message is printed.
>>>   - Drop the call of gstage_mode_detect() from start_xen(). It will be added into
>>>     p2m_init() when the latter will be introduced.
>> Well, thanks, but ...
>>
>>>   - Introduce pre_gstage_init().
>> ... the same comment that I gave before now applies here: This doesn't look to
>> belong directly in start_xen(). In x86'es terms I'd say this is a tiny part of
>> paging_init().
> 
> Does it only the question of function naming now?

Not just, no. My point is that you shouldn't pollute start_xen() with calls to
dozens of special-purpose functions. There wants to be one call dealing with
everything guest-mm related, I think.

> IMO, ideally it would be nice to have it in p2m_init(), but there is no a lot of
> sense to detect supported modes each time a domain is constructed. And it is the
> reason why I put directly to start_xen().

No per-domain function wants to be used for this, I agree. Hence why I pointed
you at x86'es paging_init().

>>> --- /dev/null
>>> +++ b/xen/arch/riscv/p2m.c
>>> @@ -0,0 +1,96 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +
>>> +#include <xen/init.h>
>>> +#include <xen/lib.h>
>>> +#include <xen/macros.h>
>>> +#include <xen/sections.h>
>>> +
>>> +#include <asm/csr.h>
>>> +#include <asm/flushtlb.h>
>>> +#include <asm/riscv_encoding.h>
>>> +
>>> +unsigned char __ro_after_init gstage_mode;
>>> +
>>> +static void __init gstage_mode_detect(void)
>>> +{
>>> +    static const struct {
>>> +        unsigned char mode;
>>> +        unsigned int paging_levels;
>>> +        const char name[8];
>>> +    } modes[] __initconst = {
>>> +        /*
>>> +         * Based on the RISC-V spec:
>>> +         *   Bare mode is always supported, regardless of SXLEN.
>>> +         *   When SXLEN=32, the only other valid setting for MODE is Sv32.
>>> +         *   When SXLEN=64, three paged virtual-memory schemes are defined:
>>> +         *   Sv39, Sv48, and Sv57.
>>> +         */
>>> +#ifdef CONFIG_RISCV_32
>>> +        { HGATP_MODE_SV32X4, 2, "Sv32x4" }
>>> +#else
>>> +        { HGATP_MODE_SV39X4, 3, "Sv39x4" },
>>> +        { HGATP_MODE_SV48X4, 4, "Sv48x4" },
>>> +        { HGATP_MODE_SV57X4, 5, "Sv57x4" },
>>> +#endif
>>> +    };
>>> +
>>> +    unsigned int mode_idx;
>>> +
>>> +    gstage_mode = HGATP_MODE_OFF;
>> Why is this not the variable's initializer?
> 
> Good point. It should be the variable's initializer.
> 
>>> +    for ( mode_idx = 0; mode_idx < ARRAY_SIZE(modes); mode_idx++ )
>>> +    {
>>> +        unsigned long mode = modes[mode_idx].mode;
>>> +
>>> +        csr_write(CSR_HGATP, MASK_INSR(mode, HGATP_MODE_MASK));
>>> +
>>> +        if ( MASK_EXTR(csr_read(CSR_HGATP), HGATP_MODE_MASK) == mode )
>>> +        {
>>> +            gstage_mode = mode;
>>> +            break;
>>> +        }
>>> +    }
>> I take it that using the first available mode is only transient. To support bigger
>> guests, you may need to pick 48x4 or even 57x4 no matter that 39x4 is available.
> 
> I considered traversing the|modes[]| array in the opposite order so that the largest
> mode would be checked first. However, I decided that 39x4 is sufficiently large and
> provides a good balance between the number of page tables and supported address
> space, at least for now.
> 
>> I wonder whether you wouldn't be better off recording all supported modes right
>> away.
> 
> What would be the use case for recording and storing all supported modes?
> For example, would it be used to indicate which mode is preferable for a guest
> domain via the device tree?

Why device tree? That's what's exposed to guests, isn't it? Here we talk about
what Xen uses to run guests. And that can vary from guest to guest.

> Also, I’d like to note that it probably doesn’t make much sense to record all
> supported modes. If we traverse the|modes[]| array in the opposite order—checking
> |Sv57| first—then, according to the RISC-V specification:
> - Implementations that support Sv57 must also support Sv48.
> - Implementations that support Sv48 must also support Sv39.
> So if Sv57 is supported then lower modes are supported too. (except Sv32 for RV32)
> 
> Based on this, it seems reasonable to start checking from Sv57, right?

No. Bigger guests want running in 48x4, huge ones in 57x4 (each: if available),
and most ones in 39x4. It doesn't matter what direction you do the checks, you
want to know what you have available.

Jan

Re: [for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Oleksii Kurochko 2 months, 3 weeks ago
On 11/13/25 5:32 PM, Jan Beulich wrote:
> On 13.11.2025 17:18, Oleksii Kurochko wrote:
>> On 11/6/25 2:43 PM, Jan Beulich wrote:
>>> On 20.10.2025 17:57, Oleksii Kurochko wrote:
>>>> Changes in V5:
>>>>    - Add static and __initconst for local variable modes[] in
>>>>      gstage_mode_detect().
>>>>    - Change type for gstage_mode from 'unsigned long' to 'unsigned char'.
>>>>    - Update the comment inisde defintion if modes[] variable in
>>>>      gstage_mode_detect():
>>>>      - Add information about Bare mode.
>>>>      - Drop "a paged virtual-memory scheme described in Section 10.3" as it isn't
>>>>        relevant here.
>>>>    - Drop printing of function name when chosen G-stage mode message is printed.
>>>>    - Drop the call of gstage_mode_detect() from start_xen(). It will be added into
>>>>      p2m_init() when the latter will be introduced.
>>> Well, thanks, but ...
>>>
>>>>    - Introduce pre_gstage_init().
>>> ... the same comment that I gave before now applies here: This doesn't look to
>>> belong directly in start_xen(). In x86'es terms I'd say this is a tiny part of
>>> paging_init().
>> Does it only the question of function naming now?
> Not just, no. My point is that you shouldn't pollute start_xen() with calls to
> dozens of special-purpose functions. There wants to be one call dealing with
> everything guest-mm related, I think.

I think I understand your point now. I’ll introduce|guest_mm_init()| and move
everything related to it inside that function (at the moment, this includes
the VMID initialization and G-stage mode detection).

>
>> IMO, ideally it would be nice to have it in p2m_init(), but there is no a lot of
>> sense to detect supported modes each time a domain is constructed. And it is the
>> reason why I put directly to start_xen().
> No per-domain function wants to be used for this, I agree. Hence why I pointed
> you at x86'es paging_init().
>
>>>> --- /dev/null
>>>> +++ b/xen/arch/riscv/p2m.c
>>>> @@ -0,0 +1,96 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>> +
>>>> +#include <xen/init.h>
>>>> +#include <xen/lib.h>
>>>> +#include <xen/macros.h>
>>>> +#include <xen/sections.h>
>>>> +
>>>> +#include <asm/csr.h>
>>>> +#include <asm/flushtlb.h>
>>>> +#include <asm/riscv_encoding.h>
>>>> +
>>>> +unsigned char __ro_after_init gstage_mode;
>>>> +
>>>> +static void __init gstage_mode_detect(void)
>>>> +{
>>>> +    static const struct {
>>>> +        unsigned char mode;
>>>> +        unsigned int paging_levels;
>>>> +        const char name[8];
>>>> +    } modes[] __initconst = {
>>>> +        /*
>>>> +         * Based on the RISC-V spec:
>>>> +         *   Bare mode is always supported, regardless of SXLEN.
>>>> +         *   When SXLEN=32, the only other valid setting for MODE is Sv32.
>>>> +         *   When SXLEN=64, three paged virtual-memory schemes are defined:
>>>> +         *   Sv39, Sv48, and Sv57.
>>>> +         */
>>>> +#ifdef CONFIG_RISCV_32
>>>> +        { HGATP_MODE_SV32X4, 2, "Sv32x4" }
>>>> +#else
>>>> +        { HGATP_MODE_SV39X4, 3, "Sv39x4" },
>>>> +        { HGATP_MODE_SV48X4, 4, "Sv48x4" },
>>>> +        { HGATP_MODE_SV57X4, 5, "Sv57x4" },
>>>> +#endif
>>>> +    };
>>>> +
>>>> +    unsigned int mode_idx;
>>>> +
>>>> +    gstage_mode = HGATP_MODE_OFF;
>>> Why is this not the variable's initializer?
>> Good point. It should be the variable's initializer.
>>
>>>> +    for ( mode_idx = 0; mode_idx < ARRAY_SIZE(modes); mode_idx++ )
>>>> +    {
>>>> +        unsigned long mode = modes[mode_idx].mode;
>>>> +
>>>> +        csr_write(CSR_HGATP, MASK_INSR(mode, HGATP_MODE_MASK));
>>>> +
>>>> +        if ( MASK_EXTR(csr_read(CSR_HGATP), HGATP_MODE_MASK) == mode )
>>>> +        {
>>>> +            gstage_mode = mode;
>>>> +            break;
>>>> +        }
>>>> +    }
>>> I take it that using the first available mode is only transient. To support bigger
>>> guests, you may need to pick 48x4 or even 57x4 no matter that 39x4 is available.
>> I considered traversing the|modes[]| array in the opposite order so that the largest
>> mode would be checked first. However, I decided that 39x4 is sufficiently large and
>> provides a good balance between the number of page tables and supported address
>> space, at least for now.
>>
>>> I wonder whether you wouldn't be better off recording all supported modes right
>>> away.
>> What would be the use case for recording and storing all supported modes?
>> For example, would it be used to indicate which mode is preferable for a guest
>> domain via the device tree?
> Why device tree? That's what's exposed to guests, isn't it? Here we talk about
> what Xen uses to run guests. And that can vary from guest to guest.

At the same time, the bootloader also passes a device tree to Xen, and Xen
uses it, at least—to determine the RAM addresses and sizes. I also referred to
device tree because it can indicate the largest MMU mode supported on a hart:
   mmu-type:
     description:
       Identifies the largest MMU address translation mode supported by
       this hart.  These values originate from the RISC-V Privileged
       Specification document, available from
       https://riscv.org/specifications/
     $ref: /schemas/types.yaml#/definitions/string
     enum:
       - riscv,sv32
       - riscv,sv39
       - riscv,sv48
       - riscv,sv57
       - riscv,none
And so, I thought, that Xen could also re-use this information and use it as starting
value for Mode detection. But considering how much modes are supported by RISC-V spec,
it seems that it won't be too long just to detect which are supported in the way it
is done now.

>
>> Also, I’d like to note that it probably doesn’t make much sense to record all
>> supported modes. If we traverse the|modes[]| array in the opposite order—checking
>> |Sv57| first—then, according to the RISC-V specification:
>> - Implementations that support Sv57 must also support Sv48.
>> - Implementations that support Sv48 must also support Sv39.
>> So if Sv57 is supported then lower modes are supported too. (except Sv32 for RV32)
>>
>> Based on this, it seems reasonable to start checking from Sv57, right?
> No. Bigger guests want running in 48x4, huge ones in 57x4 (each: if available),
> and most ones in 39x4. It doesn't matter what direction you do the checks, you
> want to know what you have available.

My point was that if we change the direction, then once we find the first (largest)
supported MMU mode, there is no need to check the others (lower modes) as according
to the RISC-V specification, the lower modes must be supported automatically.

~ Oleksii
Re: [for 4.22 v5 01/18] xen/riscv: detect and initialize G-stage mode
Posted by Jan Beulich 2 months, 3 weeks ago
On 18.11.2025 09:56, Oleksii Kurochko wrote:
> On 11/13/25 5:32 PM, Jan Beulich wrote:
>> On 13.11.2025 17:18, Oleksii Kurochko wrote:
>>> Also, I’d like to note that it probably doesn’t make much sense to record all
>>> supported modes. If we traverse the|modes[]| array in the opposite order—checking
>>> |Sv57| first—then, according to the RISC-V specification:
>>> - Implementations that support Sv57 must also support Sv48.
>>> - Implementations that support Sv48 must also support Sv39.
>>> So if Sv57 is supported then lower modes are supported too. (except Sv32 for RV32)
>>>
>>> Based on this, it seems reasonable to start checking from Sv57, right?
>> No. Bigger guests want running in 48x4, huge ones in 57x4 (each: if available),
>> and most ones in 39x4. It doesn't matter what direction you do the checks, you
>> want to know what you have available.
> 
> My point was that if we change the direction, then once we find the first (largest)
> supported MMU mode, there is no need to check the others (lower modes) as according
> to the RISC-V specification, the lower modes must be supported automatically.

Oh, I see, makes sense.

Jan