arch/riscv/mm/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-)
The RISC-V kernel already has checks to ensure that memory which would
lie outside of the linear mapping is not used. However those checks
use memory_limit, which is used to implement the mem= kernel command
line option (to limit the total amount of memory, not its address
range). When memory is made up of two or more non-contiguous memory
banks this check is incorrect.
Two changes are made here:
- add a call in setup_bootmem() to memblock_cap_memory_range() which
will cause any memory which falls outside the linear mapping to be
removed from the memory regions.
- remove the check in create_linear_mapping_page_table() which was
intended to remove memory which is outside the liner mapping based
on memory_limit, as it is no longer needed. Note a check for
mapping more memory than memory_limit (to implement mem=) is
unnecessary because of the existing call to
memblock_enforce_memory_limit().
This issue was seen when booting on a SV39 platform with two memory
banks:
0x00,80000000 1GiB
0x20,00000000 32GiB
This memory range is 158GiB from top to bottom, but the linear mapping
is limited to 128GiB, so the lower block of RAM will be mapped at
PAGE_OFFSET, and the upper block straddles the top of the linear
mapping.
This causes the following Oops:
[ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
[ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
[ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
...
[ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
[ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
[ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
[ 0.000000] Oops [#1]
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
[ 0.000000] Hardware name: codasip,a70x (DT)
[ 0.000000] epc : __memset+0x8c/0x104
[ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
[ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
[ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
[ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
[ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
[ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
[ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
[ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
[ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
[ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
[ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
[ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
[ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
[ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
[ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
[ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
[ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
[ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
[ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
[ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
[ 0.000000] ---[ end trace 0000000000000000 ]---
[ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
The problem is that memblock (unaware that some physical memory cannot
be used) has allocated memory from the top of memory but which is
outside the linear mapping region.
Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
Reviewed-by: David McKay <david.mckay@codasip.com>
---
arch/riscv/mm/init.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index e3405e4b99af..7e25606f858a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
*/
memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
- phys_ram_end = memblock_end_of_DRAM();
-
/*
* Make sure we align the start of the memory on a PMD boundary so that
* at worst, we map the linear mapping with PMD mappings.
@@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
+ /*
+ * The size of the linear page mapping may restrict the amount of
+ * usable RAM.
+ */
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
+ memblock_cap_memory_range(phys_ram_base,
+ max_mapped_addr - phys_ram_base);
+ }
+
/*
* Reserve physical address space that would be mapped to virtual
* addresses greater than (void *)(-PAGE_SIZE) because:
@@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
}
+ phys_ram_end = memblock_end_of_DRAM();
min_low_pfn = PFN_UP(phys_ram_base);
max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
@@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
if (start <= __pa(PAGE_OFFSET) &&
__pa(PAGE_OFFSET) < end)
start = __pa(PAGE_OFFSET);
- if (end >= __pa(PAGE_OFFSET) + memory_limit)
- end = __pa(PAGE_OFFSET) + memory_limit;
create_linear_mapping_range(start, end, 0);
}
--
2.43.0
Hi Stuart,
On Sat, Jun 22, 2024 at 1:43 PM Stuart Menefy <stuart.menefy@codasip.com> wrote:
> The RISC-V kernel already has checks to ensure that memory which would
> lie outside of the linear mapping is not used. However those checks
> use memory_limit, which is used to implement the mem= kernel command
> line option (to limit the total amount of memory, not its address
> range). When memory is made up of two or more non-contiguous memory
> banks this check is incorrect.
>
> Two changes are made here:
> - add a call in setup_bootmem() to memblock_cap_memory_range() which
> will cause any memory which falls outside the linear mapping to be
> removed from the memory regions.
> - remove the check in create_linear_mapping_page_table() which was
> intended to remove memory which is outside the liner mapping based
> on memory_limit, as it is no longer needed. Note a check for
> mapping more memory than memory_limit (to implement mem=) is
> unnecessary because of the existing call to
> memblock_enforce_memory_limit().
>
> This issue was seen when booting on a SV39 platform with two memory
> banks:
> 0x00,80000000 1GiB
> 0x20,00000000 32GiB
> This memory range is 158GiB from top to bottom, but the linear mapping
> is limited to 128GiB, so the lower block of RAM will be mapped at
> PAGE_OFFSET, and the upper block straddles the top of the linear
> mapping.
>
> This causes the following Oops:
> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
> [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
> [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
> ...
> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
> [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
> [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
> [ 0.000000] Oops [#1]
> [ 0.000000] Modules linked in:
> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
> [ 0.000000] Hardware name: codasip,a70x (DT)
> [ 0.000000] epc : __memset+0x8c/0x104
> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
> [ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
> [ 0.000000] ---[ end trace 0000000000000000 ]---
> [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
>
> The problem is that memblock (unaware that some physical memory cannot
> be used) has allocated memory from the top of memory but which is
> outside the linear mapping region.
>
> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
> Reviewed-by: David McKay <david.mckay@codasip.com>
Thanks for your patch, which is now commit 3b6564427aea83b7 ("riscv:
Fix linear mapping checks for non-contiguous memory regions") in
v6.11-rc2.
This causes the boot to hang on SiPEED MAiX BiT (K210 64-bit nommu):
Linux version 6.11.0-rc1-k210-00008-g3b6564427aea (geert@rox)
(riscv64-linux-gnu-gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld
(GNU Binutils for Ubuntu) 2.38) #452 SMP Tue Aug 20 15:02:49 CEST 2024
Machine model: SiPeed MAIX BiT
Forcing kernel command line to: earlycon console=ttySIF0 rootdelay=2
root=/dev/mmcblk0p1 ro
earlycon: sifive0 at MMIO 0x0000000038000000 (options '115200n8')
printk: legacy bootconsole [sifive0] enabled
Zone ranges:
DMA32 [mem 0x0000000080000000-0x00000000807fffff]
Normal empty
Movable zone start for each node
Early memory node ranges
node 0: [mem 0x0000000080000000-0x00000000807fffff]
Initmem setup node 0 [mem 0x0000000080000000-0x00000000807fffff]
Falling back to deprecated "riscv,isa"
riscv: base ISA extensions acdfim
riscv: ELF capabilities acdfim
percpu: max_distance=0x1a000 too large for vmalloc space 0x0
percpu: Embedded 13 pages/cpu s20512 r0 d32736 u53248
percpu: wasting 6 pages per chunk
Kernel command line: earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro
Dentry cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
Inode-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
Built 1 zonelists, mobility grouping off. Total pages: 2048
mem auto-init: stack:off, heap alloc:off, heap free:off
SLUB: HWalign=64, Order=0-1, MinObjects=0, CPUs=2, Nodes=1
rcu: Hierarchical RCU implementation.
rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
Adding a bit of debug info shows:
setup_bootmem:236: old phys_ram_end = 0x80800000
setup_bootmem:258: max_mapped_addr = 0x7fffffff
setup_bootmem:259: memblock_cap_memory_range(0x80000000, 0xffffffffffffffff)
memblock_remove_range:857: Removing region at 0x80000000 size 0x20d1e8
setup_bootmem:281: new phys_ram_end = 0x80800000
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
> */
> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
>
> - phys_ram_end = memblock_end_of_DRAM();
> -
> /*
> * Make sure we align the start of the memory on a PMD boundary so that
> * at worst, we map the linear mapping with PMD mappings.
> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
>
> + /*
> + * The size of the linear page mapping may restrict the amount of
> + * usable RAM.
> + */
> + if (IS_ENABLED(CONFIG_64BIT)) {
> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
> + memblock_cap_memory_range(phys_ram_base,
> + max_mapped_addr - phys_ram_base);
> + }
> +
> /*
> * Reserve physical address space that would be mapped to virtual
> * addresses greater than (void *)(-PAGE_SIZE) because:
> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
> memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
> }
>
> + phys_ram_end = memblock_end_of_DRAM();
> min_low_pfn = PFN_UP(phys_ram_base);
> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
> @@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
> if (start <= __pa(PAGE_OFFSET) &&
> __pa(PAGE_OFFSET) < end)
> start = __pa(PAGE_OFFSET);
> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
> - end = __pa(PAGE_OFFSET) + memory_limit;
>
> create_linear_mapping_range(start, end, 0);
> }
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
Hi Geert,
On 20/08/2024 15:32, Geert Uytterhoeven wrote:
> Hi Stuart,
>
> On Sat, Jun 22, 2024 at 1:43 PM Stuart Menefy <stuart.menefy@codasip.com> wrote:
>> The RISC-V kernel already has checks to ensure that memory which would
>> lie outside of the linear mapping is not used. However those checks
>> use memory_limit, which is used to implement the mem= kernel command
>> line option (to limit the total amount of memory, not its address
>> range). When memory is made up of two or more non-contiguous memory
>> banks this check is incorrect.
>>
>> Two changes are made here:
>> - add a call in setup_bootmem() to memblock_cap_memory_range() which
>> will cause any memory which falls outside the linear mapping to be
>> removed from the memory regions.
>> - remove the check in create_linear_mapping_page_table() which was
>> intended to remove memory which is outside the liner mapping based
>> on memory_limit, as it is no longer needed. Note a check for
>> mapping more memory than memory_limit (to implement mem=) is
>> unnecessary because of the existing call to
>> memblock_enforce_memory_limit().
>>
>> This issue was seen when booting on a SV39 platform with two memory
>> banks:
>> 0x00,80000000 1GiB
>> 0x20,00000000 32GiB
>> This memory range is 158GiB from top to bottom, but the linear mapping
>> is limited to 128GiB, so the lower block of RAM will be mapped at
>> PAGE_OFFSET, and the upper block straddles the top of the linear
>> mapping.
>>
>> This causes the following Oops:
>> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
>> [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
>> [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
>> ...
>> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
>> [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
>> [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
>> [ 0.000000] Oops [#1]
>> [ 0.000000] Modules linked in:
>> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
>> [ 0.000000] Hardware name: codasip,a70x (DT)
>> [ 0.000000] epc : __memset+0x8c/0x104
>> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
>> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
>> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
>> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
>> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
>> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
>> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
>> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
>> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
>> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
>> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
>> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
>> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
>> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
>> [ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
>> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
>> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
>> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
>> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
>> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
>> [ 0.000000] ---[ end trace 0000000000000000 ]---
>> [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
>> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
>>
>> The problem is that memblock (unaware that some physical memory cannot
>> be used) has allocated memory from the top of memory but which is
>> outside the linear mapping region.
>>
>> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
>> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
>> Reviewed-by: David McKay <david.mckay@codasip.com>
> Thanks for your patch, which is now commit 3b6564427aea83b7 ("riscv:
> Fix linear mapping checks for non-contiguous memory regions") in
> v6.11-rc2.
>
> This causes the boot to hang on SiPEED MAiX BiT (K210 64-bit nommu):
>
> Linux version 6.11.0-rc1-k210-00008-g3b6564427aea (geert@rox)
> (riscv64-linux-gnu-gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld
> (GNU Binutils for Ubuntu) 2.38) #452 SMP Tue Aug 20 15:02:49 CEST 2024
> Machine model: SiPeed MAIX BiT
> Forcing kernel command line to: earlycon console=ttySIF0 rootdelay=2
> root=/dev/mmcblk0p1 ro
> earlycon: sifive0 at MMIO 0x0000000038000000 (options '115200n8')
> printk: legacy bootconsole [sifive0] enabled
> Zone ranges:
> DMA32 [mem 0x0000000080000000-0x00000000807fffff]
> Normal empty
> Movable zone start for each node
> Early memory node ranges
> node 0: [mem 0x0000000080000000-0x00000000807fffff]
> Initmem setup node 0 [mem 0x0000000080000000-0x00000000807fffff]
> Falling back to deprecated "riscv,isa"
> riscv: base ISA extensions acdfim
> riscv: ELF capabilities acdfim
> percpu: max_distance=0x1a000 too large for vmalloc space 0x0
> percpu: Embedded 13 pages/cpu s20512 r0 d32736 u53248
> percpu: wasting 6 pages per chunk
> Kernel command line: earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro
> Dentry cache hash table entries: 1024 (order: 1, 8192 bytes, linear)
> Inode-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
> Built 1 zonelists, mobility grouping off. Total pages: 2048
> mem auto-init: stack:off, heap alloc:off, heap free:off
> SLUB: HWalign=64, Order=0-1, MinObjects=0, CPUs=2, Nodes=1
> rcu: Hierarchical RCU implementation.
> rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
> NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
>
> Adding a bit of debug info shows:
>
> setup_bootmem:236: old phys_ram_end = 0x80800000
> setup_bootmem:258: max_mapped_addr = 0x7fffffff
> setup_bootmem:259: memblock_cap_memory_range(0x80000000, 0xffffffffffffffff)
> memblock_remove_range:857: Removing region at 0x80000000 size 0x20d1e8
> setup_bootmem:281: new phys_ram_end = 0x80800000
Thank you very much for the report!
The following patch should fix this, can you give it a try?
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index eb0649a61b4c..1785782c2e55 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -252,7 +252,7 @@ static void __init setup_bootmem(void)
* The size of the linear page mapping may restrict the amount of
* usable RAM.
*/
- if (IS_ENABLED(CONFIG_64BIT)) {
+ if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) {
max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
memblock_cap_memory_range(phys_ram_base,
max_mapped_addr - phys_ram_base);
It makes no sense to restrict physical memory size because of linear
mapping size constraints when there is no linear mapping :)
I'll send a proper patch when you confirm it works for you :)
Thanks again,
Alex
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
>> */
>> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
>>
>> - phys_ram_end = memblock_end_of_DRAM();
>> -
>> /*
>> * Make sure we align the start of the memory on a PMD boundary so that
>> * at worst, we map the linear mapping with PMD mappings.
>> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
>> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
>> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
>>
>> + /*
>> + * The size of the linear page mapping may restrict the amount of
>> + * usable RAM.
>> + */
>> + if (IS_ENABLED(CONFIG_64BIT)) {
>> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
>> + memblock_cap_memory_range(phys_ram_base,
>> + max_mapped_addr - phys_ram_base);
>> + }
>> +
>> /*
>> * Reserve physical address space that would be mapped to virtual
>> * addresses greater than (void *)(-PAGE_SIZE) because:
>> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
>> memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
>> }
>>
>> + phys_ram_end = memblock_end_of_DRAM();
>> min_low_pfn = PFN_UP(phys_ram_base);
>> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
>> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
>> @@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
>> if (start <= __pa(PAGE_OFFSET) &&
>> __pa(PAGE_OFFSET) < end)
>> start = __pa(PAGE_OFFSET);
>> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
>> - end = __pa(PAGE_OFFSET) + memory_limit;
>>
>> create_linear_mapping_range(start, end, 0);
>> }
> Gr{oetje,eeting}s,
>
> Geert
>
On 22/06/2024 13:42, Stuart Menefy wrote:
> The RISC-V kernel already has checks to ensure that memory which would
> lie outside of the linear mapping is not used. However those checks
> use memory_limit, which is used to implement the mem= kernel command
> line option (to limit the total amount of memory, not its address
> range). When memory is made up of two or more non-contiguous memory
> banks this check is incorrect.
>
> Two changes are made here:
> - add a call in setup_bootmem() to memblock_cap_memory_range() which
> will cause any memory which falls outside the linear mapping to be
> removed from the memory regions.
> - remove the check in create_linear_mapping_page_table() which was
> intended to remove memory which is outside the liner mapping based
> on memory_limit, as it is no longer needed. Note a check for
> mapping more memory than memory_limit (to implement mem=) is
> unnecessary because of the existing call to
> memblock_enforce_memory_limit().
>
> This issue was seen when booting on a SV39 platform with two memory
> banks:
> 0x00,80000000 1GiB
> 0x20,00000000 32GiB
> This memory range is 158GiB from top to bottom, but the linear mapping
> is limited to 128GiB, so the lower block of RAM will be mapped at
> PAGE_OFFSET, and the upper block straddles the top of the linear
> mapping.
>
> This causes the following Oops:
> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
> [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
> [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
> ...
> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
> [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
> [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
> [ 0.000000] Oops [#1]
> [ 0.000000] Modules linked in:
> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
> [ 0.000000] Hardware name: codasip,a70x (DT)
> [ 0.000000] epc : __memset+0x8c/0x104
> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
> [ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
> [ 0.000000] ---[ end trace 0000000000000000 ]---
> [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
>
> The problem is that memblock (unaware that some physical memory cannot
> be used) has allocated memory from the top of memory but which is
> outside the linear mapping region.
>
> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
> Reviewed-by: David McKay <david.mckay@codasip.com>
>
> ---
> arch/riscv/mm/init.c | 15 +++++++++++----
> 1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index e3405e4b99af..7e25606f858a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
> */
> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
>
> - phys_ram_end = memblock_end_of_DRAM();
> -
> /*
> * Make sure we align the start of the memory on a PMD boundary so that
> * at worst, we map the linear mapping with PMD mappings.
> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
>
> + /*
> + * The size of the linear page mapping may restrict the amount of
> + * usable RAM.
> + */
> + if (IS_ENABLED(CONFIG_64BIT)) {
> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
This is only true for sv39 once the following patch lands
https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexghiti@rivosinc.com/
Otherwise, sv39 is "weirdly" restricted to 124GB. You mention in the
changelog that the linear mapping size is limited to 128GB, does that
mean you tested your patch on top this one ^? If so, would you mind
adding a Tested-by to it? Otherwise, would you mind testing on top of it
:) ?
I'll see with Palmer, but maybe we can't take both patches to -fixes.
You can add:
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Thanks,
Alex
> + memblock_cap_memory_range(phys_ram_base,
> + max_mapped_addr - phys_ram_base);
> + }
> +
> /*
> * Reserve physical address space that would be mapped to virtual
> * addresses greater than (void *)(-PAGE_SIZE) because:
> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
> memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
> }
>
> + phys_ram_end = memblock_end_of_DRAM();
> min_low_pfn = PFN_UP(phys_ram_base);
> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
> @@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
> if (start <= __pa(PAGE_OFFSET) &&
> __pa(PAGE_OFFSET) < end)
> start = __pa(PAGE_OFFSET);
> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
> - end = __pa(PAGE_OFFSET) + memory_limit;
>
> create_linear_mapping_range(start, end, 0);
> }
On Mon, Jun 24, 2024 at 8:29 AM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
>
> On 22/06/2024 13:42, Stuart Menefy wrote:
> > The RISC-V kernel already has checks to ensure that memory which would
> > lie outside of the linear mapping is not used. However those checks
> > use memory_limit, which is used to implement the mem= kernel command
> > line option (to limit the total amount of memory, not its address
> > range). When memory is made up of two or more non-contiguous memory
> > banks this check is incorrect.
> >
> > Two changes are made here:
> > - add a call in setup_bootmem() to memblock_cap_memory_range() which
> > will cause any memory which falls outside the linear mapping to be
> > removed from the memory regions.
> > - remove the check in create_linear_mapping_page_table() which was
> > intended to remove memory which is outside the liner mapping based
> > on memory_limit, as it is no longer needed. Note a check for
> > mapping more memory than memory_limit (to implement mem=) is
> > unnecessary because of the existing call to
> > memblock_enforce_memory_limit().
> >
> > This issue was seen when booting on a SV39 platform with two memory
> > banks:
> > 0x00,80000000 1GiB
> > 0x20,00000000 32GiB
> > This memory range is 158GiB from top to bottom, but the linear mapping
> > is limited to 128GiB, so the lower block of RAM will be mapped at
> > PAGE_OFFSET, and the upper block straddles the top of the linear
> > mapping.
> >
> > This causes the following Oops:
> > [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
> > [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
> > [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
> > ...
> > [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
> > [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
> > [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
> > [ 0.000000] Oops [#1]
> > [ 0.000000] Modules linked in:
> > [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
> > [ 0.000000] Hardware name: codasip,a70x (DT)
> > [ 0.000000] epc : __memset+0x8c/0x104
> > [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
> > [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
> > [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
> > [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
> > [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
> > [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
> > [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
> > [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
> > [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
> > [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
> > [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
> > [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
> > [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
> > [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
> > [ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
> > [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
> > [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
> > [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
> > [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
> > [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
> > [ 0.000000] ---[ end trace 0000000000000000 ]---
> > [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
> > [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
> >
> > The problem is that memblock (unaware that some physical memory cannot
> > be used) has allocated memory from the top of memory but which is
> > outside the linear mapping region.
> >
> > Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
> > Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
> > Reviewed-by: David McKay <david.mckay@codasip.com>
> >
> > ---
> > arch/riscv/mm/init.c | 15 +++++++++++----
> > 1 file changed, 11 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> > index e3405e4b99af..7e25606f858a 100644
> > --- a/arch/riscv/mm/init.c
> > +++ b/arch/riscv/mm/init.c
> > @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
> > */
> > memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
> >
> > - phys_ram_end = memblock_end_of_DRAM();
> > -
> > /*
> > * Make sure we align the start of the memory on a PMD boundary so that
> > * at worst, we map the linear mapping with PMD mappings.
> > @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
> > if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
> > kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
> >
> > + /*
> > + * The size of the linear page mapping may restrict the amount of
> > + * usable RAM.
> > + */
> > + if (IS_ENABLED(CONFIG_64BIT)) {
> > + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
>
>
> This is only true for sv39 once the following patch lands
> https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexghiti@rivosinc.com/
Hi Alex
I've seen this problem whether your patch has been applied or not.
> Otherwise, sv39 is "weirdly" restricted to 124GB. You mention in the
> changelog that the linear mapping size is limited to 128GB, does that
> mean you tested your patch on top this one ^? If so, would you mind
> adding a Tested-by to it? Otherwise, would you mind testing on top of it
> :) ?
I've tested in both cases, so I'll add a Tested-by.
Note that to actually use 128GiB on Sv39 systems yet another patch is
needed which I'll also post.
> I'll see with Palmer, but maybe we can't take both patches to -fixes.
All three patches address different issues, so I think it would be safe to
take them in any combination. However I understand the reluctance
when making changes in this area.
Thanks
Stuart
>
> You can add:
>
> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>
> Thanks,
>
> Alex
>
>
> > + memblock_cap_memory_range(phys_ram_base,
> > + max_mapped_addr - phys_ram_base);
> > + }
> > +
> > /*
> > * Reserve physical address space that would be mapped to virtual
> > * addresses greater than (void *)(-PAGE_SIZE) because:
> > @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
> > memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
> > }
> >
> > + phys_ram_end = memblock_end_of_DRAM();
> > min_low_pfn = PFN_UP(phys_ram_base);
> > max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
> > high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
> > @@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
> > if (start <= __pa(PAGE_OFFSET) &&
> > __pa(PAGE_OFFSET) < end)
> > start = __pa(PAGE_OFFSET);
> > - if (end >= __pa(PAGE_OFFSET) + memory_limit)
> > - end = __pa(PAGE_OFFSET) + memory_limit;
> >
> > create_linear_mapping_range(start, end, 0);
> > }
On 24/06/2024 13:20, Stuart Menefy wrote:
> On Mon, Jun 24, 2024 at 8:29 AM Alexandre Ghiti <alex@ghiti.fr> wrote:
>>
>> On 22/06/2024 13:42, Stuart Menefy wrote:
>>> The RISC-V kernel already has checks to ensure that memory which would
>>> lie outside of the linear mapping is not used. However those checks
>>> use memory_limit, which is used to implement the mem= kernel command
>>> line option (to limit the total amount of memory, not its address
>>> range). When memory is made up of two or more non-contiguous memory
>>> banks this check is incorrect.
>>>
>>> Two changes are made here:
>>> - add a call in setup_bootmem() to memblock_cap_memory_range() which
>>> will cause any memory which falls outside the linear mapping to be
>>> removed from the memory regions.
>>> - remove the check in create_linear_mapping_page_table() which was
>>> intended to remove memory which is outside the liner mapping based
>>> on memory_limit, as it is no longer needed. Note a check for
>>> mapping more memory than memory_limit (to implement mem=) is
>>> unnecessary because of the existing call to
>>> memblock_enforce_memory_limit().
>>>
>>> This issue was seen when booting on a SV39 platform with two memory
>>> banks:
>>> 0x00,80000000 1GiB
>>> 0x20,00000000 32GiB
>>> This memory range is 158GiB from top to bottom, but the linear mapping
>>> is limited to 128GiB, so the lower block of RAM will be mapped at
>>> PAGE_OFFSET, and the upper block straddles the top of the linear
>>> mapping.
>>>
>>> This causes the following Oops:
>>> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22 11:34:22 BST 2024
>>> [ 0.000000] memblock_add: [0x0000000080000000-0x00000000bfffffff] early_init_dt_add_memory_arch+0x4a/0x52
>>> [ 0.000000] memblock_add: [0x0000002000000000-0x00000027ffffffff] early_init_dt_add_memory_arch+0x4a/0x52
>>> ...
>>> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1 from=0x0000000000000000 max_addr=0x0000000000000000 early_init_dt_alloc_memory_arch+0x1e/0x48
>>> [ 0.000000] memblock_reserve: [0x00000027ffff5350-0x00000027ffffaffb] memblock_alloc_range_nid+0xb8/0x132
>>> [ 0.000000] Unable to handle kernel paging request at virtual address fffffffe7fff5350
>>> [ 0.000000] Oops [#1]
>>> [ 0.000000] Modules linked in:
>>> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
>>> [ 0.000000] Hardware name: codasip,a70x (DT)
>>> [ 0.000000] epc : __memset+0x8c/0x104
>>> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
>>> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp : ffffffff80e03d50
>>> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 : fffffffe7fff52f8
>>> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 : ffffffff80e03d90
>>> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 : 0000000000000000
>>> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 : 000000000000002c
>>> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 : 0000000000000030
>>> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 : 0000000000000000
>>> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 : 0000000000000001
>>> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10: 0000000000000000
>>> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 : ffffffff8080a928
>>> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
>>> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350 cause: 000000000000000f
>>> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
>>> [ 0.000000] [<ffffffff8062349c>] early_init_dt_alloc_memory_arch+0x1e/0x48
>>> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
>>> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
>>> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
>>> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
>>> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823 04b2 (bc23) 04b2
>>> [ 0.000000] ---[ end trace 0000000000000000 ]---
>>> [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
>>> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---
>>>
>>> The problem is that memblock (unaware that some physical memory cannot
>>> be used) has allocated memory from the top of memory but which is
>>> outside the linear mapping region.
>>>
>>> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
>>> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not use the kernel mapping")
>>> Reviewed-by: David McKay <david.mckay@codasip.com>
>>>
>>> ---
>>> arch/riscv/mm/init.c | 15 +++++++++++----
>>> 1 file changed, 11 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>> index e3405e4b99af..7e25606f858a 100644
>>> --- a/arch/riscv/mm/init.c
>>> +++ b/arch/riscv/mm/init.c
>>> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
>>> */
>>> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
>>>
>>> - phys_ram_end = memblock_end_of_DRAM();
>>> -
>>> /*
>>> * Make sure we align the start of the memory on a PMD boundary so that
>>> * at worst, we map the linear mapping with PMD mappings.
>>> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
>>> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
>>> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
>>>
>>> + /*
>>> + * The size of the linear page mapping may restrict the amount of
>>> + * usable RAM.
>>> + */
>>> + if (IS_ENABLED(CONFIG_64BIT)) {
>>> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
>>
>> This is only true for sv39 once the following patch lands
>> https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexghiti@rivosinc.com/
> Hi Alex
>
> I've seen this problem whether your patch has been applied or not.
Sure, I just meant the use of KERN_VIRT_SIZE above, sorry it was not clear.
>
>> Otherwise, sv39 is "weirdly" restricted to 124GB. You mention in the
>> changelog that the linear mapping size is limited to 128GB, does that
>> mean you tested your patch on top this one ^? If so, would you mind
>> adding a Tested-by to it? Otherwise, would you mind testing on top of it
>> :) ?
> I've tested in both cases, so I'll add a Tested-by.
Thanks!
>
> Note that to actually use 128GiB on Sv39 systems yet another patch is
> needed which I'll also post.
Ok, I'm curious! But if indeed another patch is needed, then we need to
merge the 3 at the same time. I'll add this other patch to my fixes branch.
Thanks,
Alex
>
>> I'll see with Palmer, but maybe we can't take both patches to -fixes.
> All three patches address different issues, so I think it would be safe to
> take them in any combination. However I understand the reluctance
> when making changes in this area.
>
> Thanks
>
> Stuart
>
>
>> You can add:
>>
>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>>
>> Thanks,
>>
>> Alex
>>
>>
>>> + memblock_cap_memory_range(phys_ram_base,
>>> + max_mapped_addr - phys_ram_base);
>>> + }
>>> +
>>> /*
>>> * Reserve physical address space that would be mapped to virtual
>>> * addresses greater than (void *)(-PAGE_SIZE) because:
>>> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
>>> memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
>>> }
>>>
>>> + phys_ram_end = memblock_end_of_DRAM();
>>> min_low_pfn = PFN_UP(phys_ram_base);
>>> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
>>> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
>>> @@ -1289,8 +1298,6 @@ static void __init create_linear_mapping_page_table(void)
>>> if (start <= __pa(PAGE_OFFSET) &&
>>> __pa(PAGE_OFFSET) < end)
>>> start = __pa(PAGE_OFFSET);
>>> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
>>> - end = __pa(PAGE_OFFSET) + memory_limit;
>>>
>>> create_linear_mapping_range(start, end, 0);
>>> }
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
On 24/06/2024 13:32, Alexandre Ghiti wrote:
> On 24/06/2024 13:20, Stuart Menefy wrote:
>> On Mon, Jun 24, 2024 at 8:29 AM Alexandre Ghiti <alex@ghiti.fr> wrote:
>>>
>>> On 22/06/2024 13:42, Stuart Menefy wrote:
>>>> The RISC-V kernel already has checks to ensure that memory which would
>>>> lie outside of the linear mapping is not used. However those checks
>>>> use memory_limit, which is used to implement the mem= kernel command
>>>> line option (to limit the total amount of memory, not its address
>>>> range). When memory is made up of two or more non-contiguous memory
>>>> banks this check is incorrect.
>>>>
>>>> Two changes are made here:
>>>> - add a call in setup_bootmem() to memblock_cap_memory_range()
>>>> which
>>>> will cause any memory which falls outside the linear mapping
>>>> to be
>>>> removed from the memory regions.
>>>> - remove the check in create_linear_mapping_page_table() which was
>>>> intended to remove memory which is outside the liner mapping
>>>> based
>>>> on memory_limit, as it is no longer needed. Note a check for
>>>> mapping more memory than memory_limit (to implement mem=) is
>>>> unnecessary because of the existing call to
>>>> memblock_enforce_memory_limit().
>>>>
>>>> This issue was seen when booting on a SV39 platform with two memory
>>>> banks:
>>>> 0x00,80000000 1GiB
>>>> 0x20,00000000 32GiB
>>>> This memory range is 158GiB from top to bottom, but the linear mapping
>>>> is limited to 128GiB, so the lower block of RAM will be mapped at
>>>> PAGE_OFFSET, and the upper block straddles the top of the linear
>>>> mapping.
>>>>
>>>> This causes the following Oops:
>>>> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty
>>>> (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC)
>>>> 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22
>>>> 11:34:22 BST 2024
>>>> [ 0.000000] memblock_add:
>>>> [0x0000000080000000-0x00000000bfffffff]
>>>> early_init_dt_add_memory_arch+0x4a/0x52
>>>> [ 0.000000] memblock_add:
>>>> [0x0000002000000000-0x00000027ffffffff]
>>>> early_init_dt_add_memory_arch+0x4a/0x52
>>>> ...
>>>> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1
>>>> from=0x0000000000000000 max_addr=0x0000000000000000
>>>> early_init_dt_alloc_memory_arch+0x1e/0x48
>>>> [ 0.000000] memblock_reserve:
>>>> [0x00000027ffff5350-0x00000027ffffaffb]
>>>> memblock_alloc_range_nid+0xb8/0x132
>>>> [ 0.000000] Unable to handle kernel paging request at virtual
>>>> address fffffffe7fff5350
>>>> [ 0.000000] Oops [#1]
>>>> [ 0.000000] Modules linked in:
>>>> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted
>>>> 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
>>>> [ 0.000000] Hardware name: codasip,a70x (DT)
>>>> [ 0.000000] epc : __memset+0x8c/0x104
>>>> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
>>>> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp :
>>>> ffffffff80e03d50
>>>> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 :
>>>> fffffffe7fff52f8
>>>> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 :
>>>> ffffffff80e03d90
>>>> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 :
>>>> 0000000000000000
>>>> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 :
>>>> 000000000000002c
>>>> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 :
>>>> 0000000000000030
>>>> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 :
>>>> 0000000000000000
>>>> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 :
>>>> 0000000000000001
>>>> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10:
>>>> 0000000000000000
>>>> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 :
>>>> ffffffff8080a928
>>>> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
>>>> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350
>>>> cause: 000000000000000f
>>>> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
>>>> [ 0.000000] [<ffffffff8062349c>]
>>>> early_init_dt_alloc_memory_arch+0x1e/0x48
>>>> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
>>>> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
>>>> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
>>>> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
>>>> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823
>>>> 04b2 (bc23) 04b2
>>>> [ 0.000000] ---[ end trace 0000000000000000 ]---
>>>> [ 0.000000] Kernel panic - not syncing: Attempted to kill the
>>>> idle task!
>>>> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to
>>>> kill the idle task! ]---
>>>>
>>>> The problem is that memblock (unaware that some physical memory cannot
>>>> be used) has allocated memory from the top of memory but which is
>>>> outside the linear mapping region.
>>>>
>>>> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
>>>> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not
>>>> use the kernel mapping")
>>>> Reviewed-by: David McKay <david.mckay@codasip.com>
>>>>
>>>> ---
>>>> arch/riscv/mm/init.c | 15 +++++++++++----
>>>> 1 file changed, 11 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>>>> index e3405e4b99af..7e25606f858a 100644
>>>> --- a/arch/riscv/mm/init.c
>>>> +++ b/arch/riscv/mm/init.c
>>>> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
>>>> */
>>>> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
>>>>
>>>> - phys_ram_end = memblock_end_of_DRAM();
>>>> -
>>>> /*
>>>> * Make sure we align the start of the memory on a PMD
>>>> boundary so that
>>>> * at worst, we map the linear mapping with PMD mappings.
>>>> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
>>>> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
>>>> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
>>>>
>>>> + /*
>>>> + * The size of the linear page mapping may restrict the
>>>> amount of
>>>> + * usable RAM.
>>>> + */
>>>> + if (IS_ENABLED(CONFIG_64BIT)) {
>>>> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
>>>
>>> This is only true for sv39 once the following patch lands
>>> https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexghiti@rivosinc.com/
>>>
>> Hi Alex
>>
>> I've seen this problem whether your patch has been applied or not.
>
>
> Sure, I just meant the use of KERN_VIRT_SIZE above, sorry it was not
> clear.
>
>
>>
>>> Otherwise, sv39 is "weirdly" restricted to 124GB. You mention in the
>>> changelog that the linear mapping size is limited to 128GB, does that
>>> mean you tested your patch on top this one ^? If so, would you mind
>>> adding a Tested-by to it? Otherwise, would you mind testing on top
>>> of it
>>> :) ?
>> I've tested in both cases, so I'll add a Tested-by.
>
>
> Thanks!
>
>
>>
>> Note that to actually use 128GiB on Sv39 systems yet another patch is
>> needed which I'll also post.
>
>
> Ok, I'm curious! But if indeed another patch is needed, then we need
> to merge the 3 at the same time. I'll add this other patch to my fixes
> branch.
>
> Thanks,
>
> Alex
>
>
>>
>>> I'll see with Palmer, but maybe we can't take both patches to -fixes.
>> All three patches address different issues, so I think it would be
>> safe to
>> take them in any combination. However I understand the reluctance
>> when making changes in this area.
>>
>> Thanks
>>
>> Stuart
>>
>>
>>> You can add:
>>>
>>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>>>
>>> Thanks,
>>>
>>> Alex
Actually, this patch breaks KASAN on sv39:
[ 0.000000] kernel BUG at arch/riscv/mm/kasan_init.c:393!
[ 0.000000] Kernel BUG [#1]
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted
6.10.0-rc5-defconfig_kasan_sparsemem_vmemmmap-g018bb5470bac #1
[ 0.000000] Hardware name: riscv-virtio,qemu (DT)
[ 0.000000] epc : kasan_shallow_populate_pud+0xf2/0x228
[ 0.000000] ra : kasan_shallow_populate_pud+0x2a/0x228
[ 0.000000] epc : ffffffff81c0d2bc ra : ffffffff81c0d1f4 sp :
ffffffff82607c60
[ 0.000000] gp : ffffffff82a79548 tp : ffffffff82615740 t0 :
ffffffd7fefed000
[ 0.000000] t1 : ffffffd7fefec000 t2 : 0000000000000000 s0 :
ffffffff82607ce0
[ 0.000000] s1 : ffffffff82a85ef8 a0 : 000000005fbfb001 a1 :
0000400000000000
[ 0.000000] a2 : 0000000000000000 a3 : dfffffff00000000 a4 :
ffffffffffffffff
[ 0.000000] a5 : 0000000000000000 a6 : ffffffd7fefecfff a7 :
1ffffffaffdfd9ff
[ 0.000000] s2 : fffffff800000000 s3 : fffffff800000000 s4 :
fffffff7ffffffff
[ 0.000000] s5 : ffffffff8248ab80 s6 : 0000002000000000 s7 :
000000003fffffff
[ 0.000000] s8 : ffffffff8248ab58 s9 : ffffffff8248ab88 s10:
ffffffff8248b20c
[ 0.000000] s11: ffffffff8248b210 t3 : fffffff9ffdfda00 t4 :
0000000000000200
[ 0.000000] t5 : 0000000000000000 t6 : ffffffff82ac35e0
[ 0.000000] status: 0000000200000100 badaddr: 0000000000000000 cause:
0000000000000003
[ 0.000000] [<ffffffff81c0d2bc>] kasan_shallow_populate_pud+0xf2/0x228
[ 0.000000] [<ffffffff81c0d1aa>] kasan_shallow_populate_p4d+0x108/0x128
[ 0.000000] [<ffffffff81c0d082>] kasan_shallow_populate_pgd+0x15a/0x17a
[ 0.000000] [<ffffffff81c0bf26>] kasan_init+0x178/0x384
[ 0.000000] [<ffffffff81c060c0>] setup_arch+0x90/0x10a
[ 0.000000] [<ffffffff81c005b0>] start_kernel+0x5e/0x340
[ 0.000000] Code: 6a46 6aa6 6b06 7be2 7c42 7ca2 7d02 6de2 6109 8082
(9002) 0e13
[ 0.000000] ---[ end trace 0000000000000000 ]---
[ 0.000000] Kernel panic - not syncing: Fatal exception in interrupt
[ 0.000000] ---[ end Kernel panic - not syncing: Fatal exception in
interrupt ]---
I took a quick look and the issue is that KASAN is populating the kernel
page table but falls onto an already populated PUD entry (which is a PGD
entry on sv39) and then bugs since this should not happen.
@Stuart: Can you take a look? A simple defconfig + KASAN run on qemu
with sv39 triggers the issue. I'm currently investigating another
failure with rc5 and once finished I can take over/help if needed.
Thanks,
Alex
>>>
>>>
>>>> + memblock_cap_memory_range(phys_ram_base,
>>>> + max_mapped_addr -
>>>> phys_ram_base);
>>>> + }
>>>> +
>>>> /*
>>>> * Reserve physical address space that would be mapped to
>>>> virtual
>>>> * addresses greater than (void *)(-PAGE_SIZE) because:
>>>> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
>>>> memblock_reserve(max_mapped_addr,
>>>> (phys_addr_t)-max_mapped_addr);
>>>> }
>>>>
>>>> + phys_ram_end = memblock_end_of_DRAM();
>>>> min_low_pfn = PFN_UP(phys_ram_base);
>>>> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
>>>> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
>>>> @@ -1289,8 +1298,6 @@ static void __init
>>>> create_linear_mapping_page_table(void)
>>>> if (start <= __pa(PAGE_OFFSET) &&
>>>> __pa(PAGE_OFFSET) < end)
>>>> start = __pa(PAGE_OFFSET);
>>>> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
>>>> - end = __pa(PAGE_OFFSET) + memory_limit;
>>>>
>>>> create_linear_mapping_range(start, end, 0);
>>>> }
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
On Mon, Jun 24, 2024 at 5:49 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> On 24/06/2024 13:32, Alexandre Ghiti wrote:
> > On 24/06/2024 13:20, Stuart Menefy wrote:
> >> On Mon, Jun 24, 2024 at 8:29 AM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >>>
> >>> On 22/06/2024 13:42, Stuart Menefy wrote:
> >>>> The RISC-V kernel already has checks to ensure that memory which would
> >>>> lie outside of the linear mapping is not used. However those checks
> >>>> use memory_limit, which is used to implement the mem= kernel command
> >>>> line option (to limit the total amount of memory, not its address
> >>>> range). When memory is made up of two or more non-contiguous memory
> >>>> banks this check is incorrect.
> >>>>
> >>>> Two changes are made here:
> >>>> - add a call in setup_bootmem() to memblock_cap_memory_range()
> >>>> which
> >>>> will cause any memory which falls outside the linear mapping
> >>>> to be
> >>>> removed from the memory regions.
> >>>> - remove the check in create_linear_mapping_page_table() which was
> >>>> intended to remove memory which is outside the liner mapping
> >>>> based
> >>>> on memory_limit, as it is no longer needed. Note a check for
> >>>> mapping more memory than memory_limit (to implement mem=) is
> >>>> unnecessary because of the existing call to
> >>>> memblock_enforce_memory_limit().
> >>>>
> >>>> This issue was seen when booting on a SV39 platform with two memory
> >>>> banks:
> >>>> 0x00,80000000 1GiB
> >>>> 0x20,00000000 32GiB
> >>>> This memory range is 158GiB from top to bottom, but the linear mapping
> >>>> is limited to 128GiB, so the lower block of RAM will be mapped at
> >>>> PAGE_OFFSET, and the upper block straddles the top of the linear
> >>>> mapping.
> >>>>
> >>>> This causes the following Oops:
> >>>> [ 0.000000] Linux version 6.10.0-rc2-gd3b8dd5b51dd-dirty
> >>>> (stuart.menefy@codasip.com) (riscv64-codasip-linux-gcc (GCC)
> >>>> 13.2.0, GNU ld (GNU Binutils) 2.41.0.20231213) #20 SMP Sat Jun 22
> >>>> 11:34:22 BST 2024
> >>>> [ 0.000000] memblock_add:
> >>>> [0x0000000080000000-0x00000000bfffffff]
> >>>> early_init_dt_add_memory_arch+0x4a/0x52
> >>>> [ 0.000000] memblock_add:
> >>>> [0x0000002000000000-0x00000027ffffffff]
> >>>> early_init_dt_add_memory_arch+0x4a/0x52
> >>>> ...
> >>>> [ 0.000000] memblock_alloc_try_nid: 23724 bytes align=0x8 nid=-1
> >>>> from=0x0000000000000000 max_addr=0x0000000000000000
> >>>> early_init_dt_alloc_memory_arch+0x1e/0x48
> >>>> [ 0.000000] memblock_reserve:
> >>>> [0x00000027ffff5350-0x00000027ffffaffb]
> >>>> memblock_alloc_range_nid+0xb8/0x132
> >>>> [ 0.000000] Unable to handle kernel paging request at virtual
> >>>> address fffffffe7fff5350
> >>>> [ 0.000000] Oops [#1]
> >>>> [ 0.000000] Modules linked in:
> >>>> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted
> >>>> 6.10.0-rc2-gd3b8dd5b51dd-dirty #20
> >>>> [ 0.000000] Hardware name: codasip,a70x (DT)
> >>>> [ 0.000000] epc : __memset+0x8c/0x104
> >>>> [ 0.000000] ra : memblock_alloc_try_nid+0x74/0x84
> >>>> [ 0.000000] epc : ffffffff805e88c8 ra : ffffffff806148f6 sp :
> >>>> ffffffff80e03d50
> >>>> [ 0.000000] gp : ffffffff80ec4158 tp : ffffffff80e0bec0 t0 :
> >>>> fffffffe7fff52f8
> >>>> [ 0.000000] t1 : 00000027ffffb000 t2 : 5f6b636f6c626d65 s0 :
> >>>> ffffffff80e03d90
> >>>> [ 0.000000] s1 : 0000000000005cac a0 : fffffffe7fff5350 a1 :
> >>>> 0000000000000000
> >>>> [ 0.000000] a2 : 0000000000005cac a3 : fffffffe7fffaff8 a4 :
> >>>> 000000000000002c
> >>>> [ 0.000000] a5 : ffffffff805e88c8 a6 : 0000000000005cac a7 :
> >>>> 0000000000000030
> >>>> [ 0.000000] s2 : fffffffe7fff5350 s3 : ffffffffffffffff s4 :
> >>>> 0000000000000000
> >>>> [ 0.000000] s5 : ffffffff8062347e s6 : 0000000000000000 s7 :
> >>>> 0000000000000001
> >>>> [ 0.000000] s8 : 0000000000002000 s9 : 00000000800226d0 s10:
> >>>> 0000000000000000
> >>>> [ 0.000000] s11: 0000000000000000 t3 : ffffffff8080a928 t4 :
> >>>> ffffffff8080a928
> >>>> [ 0.000000] t5 : ffffffff8080a928 t6 : ffffffff8080a940
> >>>> [ 0.000000] status: 0000000200000100 badaddr: fffffffe7fff5350
> >>>> cause: 000000000000000f
> >>>> [ 0.000000] [<ffffffff805e88c8>] __memset+0x8c/0x104
> >>>> [ 0.000000] [<ffffffff8062349c>]
> >>>> early_init_dt_alloc_memory_arch+0x1e/0x48
> >>>> [ 0.000000] [<ffffffff8043e892>] __unflatten_device_tree+0x52/0x114
> >>>> [ 0.000000] [<ffffffff8062441e>] unflatten_device_tree+0x9e/0xb8
> >>>> [ 0.000000] [<ffffffff806046fe>] setup_arch+0xd4/0x5bc
> >>>> [ 0.000000] [<ffffffff806007aa>] start_kernel+0x76/0x81a
> >>>> [ 0.000000] Code: b823 02b2 bc23 02b2 b023 04b2 b423 04b2 b823
> >>>> 04b2 (bc23) 04b2
> >>>> [ 0.000000] ---[ end trace 0000000000000000 ]---
> >>>> [ 0.000000] Kernel panic - not syncing: Attempted to kill the
> >>>> idle task!
> >>>> [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to
> >>>> kill the idle task! ]---
> >>>>
> >>>> The problem is that memblock (unaware that some physical memory cannot
> >>>> be used) has allocated memory from the top of memory but which is
> >>>> outside the linear mapping region.
> >>>>
> >>>> Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
> >>>> Fixes: c99127c45248 ("riscv: Make sure the linear mapping does not
> >>>> use the kernel mapping")
> >>>> Reviewed-by: David McKay <david.mckay@codasip.com>
> >>>>
> >>>> ---
> >>>> arch/riscv/mm/init.c | 15 +++++++++++----
> >>>> 1 file changed, 11 insertions(+), 4 deletions(-)
> >>>>
> >>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> >>>> index e3405e4b99af..7e25606f858a 100644
> >>>> --- a/arch/riscv/mm/init.c
> >>>> +++ b/arch/riscv/mm/init.c
> >>>> @@ -233,8 +233,6 @@ static void __init setup_bootmem(void)
> >>>> */
> >>>> memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
> >>>>
> >>>> - phys_ram_end = memblock_end_of_DRAM();
> >>>> -
> >>>> /*
> >>>> * Make sure we align the start of the memory on a PMD
> >>>> boundary so that
> >>>> * at worst, we map the linear mapping with PMD mappings.
> >>>> @@ -249,6 +247,16 @@ static void __init setup_bootmem(void)
> >>>> if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
> >>>> kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
> >>>>
> >>>> + /*
> >>>> + * The size of the linear page mapping may restrict the
> >>>> amount of
> >>>> + * usable RAM.
> >>>> + */
> >>>> + if (IS_ENABLED(CONFIG_64BIT)) {
> >>>> + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
> >>>
> >>> This is only true for sv39 once the following patch lands
> >>> https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexghiti@rivosinc.com/
> >>>
> >> Hi Alex
> >>
> >> I've seen this problem whether your patch has been applied or not.
> >
> >
> > Sure, I just meant the use of KERN_VIRT_SIZE above, sorry it was not
> > clear.
> >
> >
> >>
> >>> Otherwise, sv39 is "weirdly" restricted to 124GB. You mention in the
> >>> changelog that the linear mapping size is limited to 128GB, does that
> >>> mean you tested your patch on top this one ^? If so, would you mind
> >>> adding a Tested-by to it? Otherwise, would you mind testing on top
> >>> of it
> >>> :) ?
> >> I've tested in both cases, so I'll add a Tested-by.
> >
> >
> > Thanks!
> >
> >
> >>
> >> Note that to actually use 128GiB on Sv39 systems yet another patch is
> >> needed which I'll also post.
> >
> >
> > Ok, I'm curious! But if indeed another patch is needed, then we need
> > to merge the 3 at the same time. I'll add this other patch to my fixes
> > branch.
> >
> > Thanks,
> >
> > Alex
> >
> >
> >>
> >>> I'll see with Palmer, but maybe we can't take both patches to -fixes.
> >> All three patches address different issues, so I think it would be
> >> safe to
> >> take them in any combination. However I understand the reluctance
> >> when making changes in this area.
> >>
> >> Thanks
> >>
> >> Stuart
> >>
> >>
> >>> You can add:
> >>>
> >>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> >>>
> >>> Thanks,
> >>>
> >>> Alex
>
>
> Actually, this patch breaks KASAN on sv39:
>
> [ 0.000000] kernel BUG at arch/riscv/mm/kasan_init.c:393!
> [ 0.000000] Kernel BUG [#1]
> [ 0.000000] Modules linked in:
> [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted
> 6.10.0-rc5-defconfig_kasan_sparsemem_vmemmmap-g018bb5470bac #1
> [ 0.000000] Hardware name: riscv-virtio,qemu (DT)
> [ 0.000000] epc : kasan_shallow_populate_pud+0xf2/0x228
> [ 0.000000] ra : kasan_shallow_populate_pud+0x2a/0x228
> [ 0.000000] epc : ffffffff81c0d2bc ra : ffffffff81c0d1f4 sp :
> ffffffff82607c60
> [ 0.000000] gp : ffffffff82a79548 tp : ffffffff82615740 t0 :
> ffffffd7fefed000
> [ 0.000000] t1 : ffffffd7fefec000 t2 : 0000000000000000 s0 :
> ffffffff82607ce0
> [ 0.000000] s1 : ffffffff82a85ef8 a0 : 000000005fbfb001 a1 :
> 0000400000000000
> [ 0.000000] a2 : 0000000000000000 a3 : dfffffff00000000 a4 :
> ffffffffffffffff
> [ 0.000000] a5 : 0000000000000000 a6 : ffffffd7fefecfff a7 :
> 1ffffffaffdfd9ff
> [ 0.000000] s2 : fffffff800000000 s3 : fffffff800000000 s4 :
> fffffff7ffffffff
> [ 0.000000] s5 : ffffffff8248ab80 s6 : 0000002000000000 s7 :
> 000000003fffffff
> [ 0.000000] s8 : ffffffff8248ab58 s9 : ffffffff8248ab88 s10:
> ffffffff8248b20c
> [ 0.000000] s11: ffffffff8248b210 t3 : fffffff9ffdfda00 t4 :
> 0000000000000200
> [ 0.000000] t5 : 0000000000000000 t6 : ffffffff82ac35e0
> [ 0.000000] status: 0000000200000100 badaddr: 0000000000000000 cause:
> 0000000000000003
> [ 0.000000] [<ffffffff81c0d2bc>] kasan_shallow_populate_pud+0xf2/0x228
> [ 0.000000] [<ffffffff81c0d1aa>] kasan_shallow_populate_p4d+0x108/0x128
> [ 0.000000] [<ffffffff81c0d082>] kasan_shallow_populate_pgd+0x15a/0x17a
> [ 0.000000] [<ffffffff81c0bf26>] kasan_init+0x178/0x384
> [ 0.000000] [<ffffffff81c060c0>] setup_arch+0x90/0x10a
> [ 0.000000] [<ffffffff81c005b0>] start_kernel+0x5e/0x340
> [ 0.000000] Code: 6a46 6aa6 6b06 7be2 7c42 7ca2 7d02 6de2 6109 8082
> (9002) 0e13
> [ 0.000000] ---[ end trace 0000000000000000 ]---
> [ 0.000000] Kernel panic - not syncing: Fatal exception in interrupt
> [ 0.000000] ---[ end Kernel panic - not syncing: Fatal exception in
> interrupt ]---
>
>
> I took a quick look and the issue is that KASAN is populating the kernel
> page table but falls onto an already populated PUD entry (which is a PGD
> entry on sv39) and then bugs since this should not happen.
>
> @Stuart: Can you take a look? A simple defconfig + KASAN run on qemu
> with sv39 triggers the issue. I'm currently investigating another
> failure with rc5 and once finished I can take over/help if needed.
The way the current KASAN initialisation is implemented requires that
the start of shallow shadow mappings are aligned on a pgdir boundary for Sv39.
The address of the shadow mappings are derived from the original address
shifted right by KASAN_SHADOW_SCALE_SHIFT, which is 3. So as a
pgdir boundary is 1GiB this means the original address must be 8GiB aligned.
The patch to increase the size of the linear mapping from 124GiB to 128GiB
moves the start address of the linear mapping by 4GiB, and as a side effect
moves the start of the vmalloc region by 4GiB, thus breaking the
8GiB alignment requirement for KASAN.
I've reworked the patch to simply shift the start address of the linear mapping
by 8GiB instead, and it appears to fix the KASAN problem, as well as allowing
the full 128GiB to be supported. A cleaner fix might be to make the alignment
requirements explicit in the definition of VMALLOC_START, or alternatively
fix the KASAN initialisation to cope with non-aligned start addresses. What
do you think?
Stuart
>
> Thanks,
>
> Alex
>
>
> >>>
> >>>
> >>>> + memblock_cap_memory_range(phys_ram_base,
> >>>> + max_mapped_addr -
> >>>> phys_ram_base);
> >>>> + }
> >>>> +
> >>>> /*
> >>>> * Reserve physical address space that would be mapped to
> >>>> virtual
> >>>> * addresses greater than (void *)(-PAGE_SIZE) because:
> >>>> @@ -265,6 +273,7 @@ static void __init setup_bootmem(void)
> >>>> memblock_reserve(max_mapped_addr,
> >>>> (phys_addr_t)-max_mapped_addr);
> >>>> }
> >>>>
> >>>> + phys_ram_end = memblock_end_of_DRAM();
> >>>> min_low_pfn = PFN_UP(phys_ram_base);
> >>>> max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
> >>>> high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
> >>>> @@ -1289,8 +1298,6 @@ static void __init
> >>>> create_linear_mapping_page_table(void)
> >>>> if (start <= __pa(PAGE_OFFSET) &&
> >>>> __pa(PAGE_OFFSET) < end)
> >>>> start = __pa(PAGE_OFFSET);
> >>>> - if (end >= __pa(PAGE_OFFSET) + memory_limit)
> >>>> - end = __pa(PAGE_OFFSET) + memory_limit;
> >>>>
> >>>> create_linear_mapping_range(start, end, 0);
> >>>> }
> >> _______________________________________________
> >> linux-riscv mailing list
> >> linux-riscv@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/linux-riscv
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
© 2016 - 2025 Red Hat, Inc.