arch/x86/mm/pat/set_memory.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+)
x86 has STRICT_*_RWX, but not even a warning when someone violates it.
Add this warning and fully refuse the transition.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/mm/pat/set_memory.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -580,6 +580,33 @@ static inline pgprot_t static_protection
}
/*
+ * Validate and enforce strict W^X semantics.
+ *
+ * Looks only at the _PAGE_RW and _PAGE_NX bits of @old and @new. @new is
+ * returned unchanged when the NX feature is not enabled, when neither bit
+ * differs between @old and @new, or when @new is not both writable and
+ * executable (_PAGE_RW set with _PAGE_NX clear). Otherwise a one-time
+ * warning is emitted reporting both protection values, the address range
+ * [@start, @start + @npg * PAGE_SIZE - 1] and @pfn, and @old is returned
+ * so that the caller keeps the previous protections.
+ *
+ * NOTE(review): the refusal is visible to the caller only through the
+ * returned protection value; no error code is produced here.
+ */
+static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
+ unsigned long pfn, unsigned long npg)
+{
+ unsigned long end;
+ /* Nothing to verify when the NX feature is not enabled. */
+ if (!cpu_feature_enabled(X86_FEATURE_NX))
+ return new;
+ /* Only transitions that change _PAGE_RW or _PAGE_NX are of interest. */
+ if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
+ return new;
+ /* Anything other than writable (_PAGE_RW) plus executable (!_PAGE_NX) passes. */
+ if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
+ return new;
+ /* Last byte of the affected range; used for the report only. */
+ end = start + npg * PAGE_SIZE - 1;
+ WARN_ONCE(1, "CPA refuse W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
+ (unsigned long long)pgprot_val(old),
+ (unsigned long long)pgprot_val(new),
+ start, end, pfn);
+
+ /* refuse the transition into WX */
+ return old;
+}
+
+/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry and the level of the mapping.
*/
@@ -885,6 +912,8 @@ static int __should_split_large_page(pte
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+
/*
* If there is a conflict, split the large page.
*
@@ -1525,6 +1554,7 @@ static int __change_page_attr(struct cpa
if (level == PG_LEVEL_4K) {
pte_t new_pte;
+ pgprot_t old_prot = pte_pgprot(old_pte);
pgprot_t new_prot = pte_pgprot(old_pte);
unsigned long pfn = pte_pfn(old_pte);
@@ -1536,6 +1566,8 @@ static int __change_page_attr(struct cpa
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+
new_prot = pgprot_clear_protnone_bits(new_prot);
/*
On Mon, 29 Aug 2022 12:18:03 +0200 Peter Zijlstra <peterz@infradead.org> wrote: > x86 has STRICT_*_RWX, but not even a warning when someone violates it. > > Add this warning and fully refuse the transition. > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> So, this now triggers on enabling function tracing at boot up: "ftrace=function" This: SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=4, Nodes=1 Kernel/User page tables isolation: enabled ftrace: allocating 68738 entries in 269 pages ftrace: allocated 269 pages with 4 groups Starting tracer 'function' ------------[ cut here ]------------ CPA detected W^X violation: 8000000000000063 -> 0000000000000063 range: 0xffffffffc0013000 - 0xffffffffc0013fff PFN 10031b WARNING: CPU: 0 PID: 0 at arch/x86/mm/pat/set_memory.c:609 verify_rwx+0x61/0x6d Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.1.0-rc1-test+ #3 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 RIP: 0010:verify_rwx+0x61/0x6d Code: e5 01 00 75 27 49 c1 e0 0c 48 89 d1 48 89 fe 48 c7 c7 5b b3 92 84 4e 8d 44 02 ff 48 89 da c6 05 71 29 e5 01 01 e8 35 90 e2 00 <0f> 0b 48 89 d8 5b 5d e9 6f 95 1a 01 0f 1f 44 00 00 55 48 89 e5 53 RSP: 0000:ffffffff84c03b08 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000003 RDX: 0000000000000003 RSI: ffffffff84c039b0 RDI: 0000000000000001 RBP: ffffffff84c03b10 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000025 R12: ffff8e730031c098 R13: 000000000010031b R14: 800000010031b063 R15: 8000000000000063 FS: 0000000000000000(0000) GS:ffff8e7416a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8e73fd801000 CR3: 00000001fcc22001 CR4: 00000000000606f0 Call Trace: <TASK> __change_page_attr_set_clr+0x146/0x8a6 ? __mutex_unlock_slowpath+0x41/0x213 ? mutex_unlock+0x12/0x18 ? _vm_unmap_aliases+0x126/0x136 change_page_attr_set_clr+0x135/0x268 ? find_vmap_area+0x32/0x3e ? 
__fentry__+0x10/0x10 change_page_attr_clear.constprop.0+0x16/0x1c set_memory_x+0x2c/0x32 arch_ftrace_update_trampoline+0x218/0x2db ? ftrace_caller_op_ptr+0x17/0x17 ftrace_update_trampoline+0x16/0xa1 ? tracing_gen_ctx+0x1c/0x1c __register_ftrace_function+0x93/0xb2 ftrace_startup+0x21/0xf0 ? tracing_gen_ctx+0x1c/0x1c register_ftrace_function_nolock+0x26/0x40 register_ftrace_function+0x4e/0x143 ? mutex_unlock+0x12/0x18 ? tracing_gen_ctx+0x1c/0x1c function_trace_init+0x7d/0xc3 tracer_init+0x23/0x2c tracing_set_tracer+0x1d5/0x206 register_tracer+0x1c0/0x1e4 init_function_trace+0x90/0x96 early_trace_init+0x25c/0x352 start_kernel+0x424/0x6e4 x86_64_start_reservations+0x24/0x2a x86_64_start_kernel+0x8c/0x95 secondary_startup_64_no_verify+0xe0/0xeb </TASK> ---[ end trace 0000000000000000 ]--- -- Steve
Hi, On Mon, Aug 29, 2022 at 12:18:03PM +0200, Peter Zijlstra wrote: > > x86 has STRICT_*_RWX, but not even a warning when someone violates it. > > Add this warning and fully refuse the transition. > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> I see the following crash when trying to boot qemu using images with PAE enabled. I checked again after applying "x86/mm/32: Fix W^X detection when page tables do not support NX", but that did not fix the problem. Guenter --- [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0 ILLOPC: cbc65efa: 0f 0b [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 [ 2.043743] Modules linked in: [ 2.043978] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc6-next-20220921 #1 [ 2.044277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 2.044572] EIP: __change_page_attr_set_clr+0xdca/0xdd0 [ 2.044751] Code: 10 8b 45 ac 89 7c 24 04 89 74 24 14 89 4c 24 1c 8d 8e ff 0f 00 00 89 4c 24 18 89 44 24 08 c7 04 24 44 67 08 cd e8 56 38 fb 00 <0f> 0b eb 83 66 90 55 89 e5 57 56 89 d6 53 89 c3 83 ec 58 31 d2 8b [ 2.045179] EAX: 00000074 EBX: 000a0063 ECX: 00000000 EDX: 00000002 [ 2.045315] ESI: c00a0000 EDI: 00000063 EBP: c115fe4c ESP: c115fd34 [ 2.045445] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000282 [ 2.045585] CR0: 80050033 CR2: ffbff000 CR3: 0d57c000 CR4: 000006f0 [ 2.046170] Call Trace: [ 2.046631] ? __purge_vmap_area_lazy+0x6c/0x640 [ 2.046768] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0 [ 2.046923] ? __mutex_unlock_slowpath+0x2b/0x2b0 [ 2.047035] ? purge_fragmented_blocks_allcpus+0x64/0x2c0 [ 2.047199] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0 [ 2.047315] ? 
_vm_unmap_aliases.part.0+0x54/0x1f0 [ 2.047496] change_page_attr_set_clr+0x11d/0x2d0 [ 2.047738] set_memory_x+0x56/0x60 [ 2.047863] pci_pcbios_init+0xc8/0x28c [ 2.047981] ? pcibios_resource_survey+0x63/0x63 [ 2.048152] pci_arch_init+0x3c/0x73 [ 2.048242] ? pcibios_resource_survey+0x63/0x63 [ 2.048340] do_one_initcall+0x4f/0x2e0 [ 2.048442] ? __this_cpu_preempt_check+0xf/0x11 [ 2.048578] ? rcu_read_lock_sched_held+0x41/0x70 [ 2.048684] ? trace_initcall_level+0x65/0xa6 [ 2.048805] kernel_init_freeable+0x210/0x264 [ 2.048908] ? rest_init+0x140/0x140 [ 2.049002] kernel_init+0x15/0x110 [ 2.049211] ? schedule_tail_wrapper+0x9/0xc [ 2.049312] ret_from_fork+0x1c/0x28 [ 2.049547] irq event stamp: 7715 [ 2.049633] hardirqs last enabled at (7723): [<cbce7119>] __up_console_sem+0x69/0x80 [ 2.049822] hardirqs last disabled at (7730): [<cbce70fd>] __up_console_sem+0x4d/0x80 [ 2.049972] softirqs last enabled at (7176): [<cbc29ac7>] call_on_stack+0x47/0x60 [ 2.050153] softirqs last disabled at (7167): [<cbc29ac7>] call_on_stack+0x47/0x60 [ 2.050307] ---[ end trace 0000000000000000 ]--- [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX. [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? 
(uid: 0) [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf [ 2.051115] #PF: supervisor instruction fetch in kernel mode [ 2.051115] #PF: error_code(0x0011) - permissions violation [ 2.051115] *pdpt = 000000000d578001 *pde = 000000000dc18063 *pte = 80000000000fd063 [ 2.051115] Oops: 0011 [#1] PREEMPT SMP PTI [ 2.051115] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.0.0-rc6-next-20220921 #1 [ 2.051115] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 2.051115] EIP: 0xc00fd2bf [ 2.051115] Code: 06 1e 8c d0 8e d8 66 89 e3 66 0f b7 e4 66 89 e0 66 e8 43 e8 ff ff 66 89 dc 1f 07 66 5f 66 5e 66 5d 66 5b 66 5a 66 59 66 58 cf <9c> 3d 24 50 43 49 75 13 bb 00 00 0f 00 b9 00 00 01 00 ba 1d d2 00 [ 2.051115] EAX: 49435024 EBX: 00000000 ECX: 00000000 EDX: cd1a027f [ 2.051115] ESI: 00000200 EDI: cd50e7f4 EBP: c115ff08 ESP: c115fee0 [ 2.051115] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000046 [ 2.051115] CR0: 80050033 CR2: c00fd2bf CR3: 0d57c000 CR4: 000006f0 [ 2.051115] Call Trace: [ 2.051115] ? pci_pcbios_init+0xfa/0x28c [ 2.051115] ? pcibios_resource_survey+0x63/0x63 [ 2.051115] pci_arch_init+0x3c/0x73 [ 2.051115] ? pcibios_resource_survey+0x63/0x63 [ 2.051115] do_one_initcall+0x4f/0x2e0 [ 2.051115] ? __this_cpu_preempt_check+0xf/0x11 [ 2.051115] ? rcu_read_lock_sched_held+0x41/0x70 [ 2.051115] ? trace_initcall_level+0x65/0xa6 [ 2.051115] kernel_init_freeable+0x210/0x264 [ 2.051115] ? rest_init+0x140/0x140 [ 2.051115] kernel_init+0x15/0x110 [ 2.051115] ? 
schedule_tail_wrapper+0x9/0xc [ 2.051115] ret_from_fork+0x1c/0x28 [ 2.051115] Modules linked in: [ 2.051115] CR2: 00000000c00fd2bf [ 2.051115] ---[ end trace 0000000000000000 ]--- [ 2.051115] EIP: 0xc00fd2bf [ 2.051115] Code: 06 1e 8c d0 8e d8 66 89 e3 66 0f b7 e4 66 89 e0 66 e8 43 e8 ff ff 66 89 dc 1f 07 66 5f 66 5e 66 5d 66 5b 66 5a 66 59 66 58 cf <9c> 3d 24 50 43 49 75 13 bb 00 00 0f 00 b9 00 00 01 00 ba 1d d2 00 [ 2.051115] EAX: 49435024 EBX: 00000000 ECX: 00000000 EDX: cd1a027f [ 2.051115] ESI: 00000200 EDI: cd50e7f4 EBP: c115ff08 ESP: c115fee0 [ 2.051115] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000046 [ 2.051115] CR0: 80050033 CR2: c00fd2bf CR3: 0d57c000 CR4: 000006f0 [ 2.051426] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 --- # bad: [ef08d387bbbc20df740ced8caee0ffac835869ac] Add linux-next specific files for 20220920 # good: [521a547ced6477c54b4b0cc206000406c221b4d6] Linux 6.0-rc6 git bisect start 'HEAD' 'v6.0-rc6' # good: [df970c033333b10c728198606fe787535e08ab8a] Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git git bisect good df970c033333b10c728198606fe787535e08ab8a # bad: [c46ae7d9b6ad0283ffd7b40117b52444d68e083e] Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git git bisect bad c46ae7d9b6ad0283ffd7b40117b52444d68e083e # good: [6a21588fd7f579342d71f2c543d7dca6fd44ff8a] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git git bisect good 6a21588fd7f579342d71f2c543d7dca6fd44ff8a # bad: [9b5a7d7a43dc87c6326a23394f37d0786dc9e712] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git git bisect bad 9b5a7d7a43dc87c6326a23394f37d0786dc9e712 # good: [00a0886a99d2aba28e8c9f1c124d9cbbaadab693] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git git bisect good 00a0886a99d2aba28e8c9f1c124d9cbbaadab693 # good: [57b16b0bfae3a029815b845e8e623fb02d255d68] 
Merge branch into tip/master: 'x86/cache' git bisect good 57b16b0bfae3a029815b845e8e623fb02d255d68 # good: [2632186d3de796a47b2dc00ac9dc9bbe6e70796b] Merge remote-tracking branch 'spi/for-6.1' into spi-next git bisect good 2632186d3de796a47b2dc00ac9dc9bbe6e70796b # good: [65c4764941bb230ef00164771fba0cdad0bfd3e4] dt-bindings: phy: hisilicon,hi3670-usb3: simplify example git bisect good 65c4764941bb230ef00164771fba0cdad0bfd3e4 # bad: [32aefecc271aa1ca4431e0f9094e5a578922527b] Merge branch into tip/master: 'x86/mm' git bisect bad 32aefecc271aa1ca4431e0f9094e5a578922527b # good: [16ac81825892970fbe5f32fb379466d19d3d3134] Merge branch into tip/master: 'x86/cpu' git bisect good 16ac81825892970fbe5f32fb379466d19d3d3134 # good: [77614503f9f135323315a53d60dc001f1a429f7c] Merge branch into tip/master: 'x86/misc' git bisect good 77614503f9f135323315a53d60dc001f1a429f7c # bad: [1043897681808118c0f7e70b210774000fe06621] Merge branch 'linus' into x86/mm, to refresh the branch git bisect bad 1043897681808118c0f7e70b210774000fe06621 # bad: [652c5bf380ad018e15006a7f8349800245ddbbad] x86/mm: Refuse W^X violations git bisect bad 652c5bf380ad018e15006a7f8349800245ddbbad # good: [86af8230ce138e0423f43f6b104f3fa050aced6d] x86/mm: Rename set_memory_present() to set_memory_p() git bisect good 86af8230ce138e0423f43f6b104f3fa050aced6d # first bad commit: [652c5bf380ad018e15006a7f8349800245ddbbad] x86/mm: Refuse W^X violations
On 9/21/22 13:07, Guenter Roeck wrote: > [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0 > ILLOPC: cbc65efa: 0f 0b > [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 ... > [ 2.050307] ---[ end trace 0000000000000000 ]--- > [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX. > [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) > [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf This _looks_ like it is working as intended. The PCI BIOS code tried to make a RWX page. The CPA code refused to do it and presumably returned an error, leaving a RW, non-executable page. The PCI code didn't check the set_memory_x() return code and tried to go execute anyway. That resulted in the oops. I was able to reproduce this pretty easily. The workaround from dmesg is pci=nobios. That seems to do the trick for me, although that advice was sandwiched between a warning and an oops, so not the easiest to find. I'm a bit torn what to do on this one. Breaking the boot is bad, but so is leaving RWX memory around. Thoughts?
On Wed 2022-09-21 13:59:06, Dave Hansen wrote: > On 9/21/22 13:07, Guenter Roeck wrote: > > [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0 > > ILLOPC: cbc65efa: 0f 0b > > [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 > ... > > [ 2.050307] ---[ end trace 0000000000000000 ]--- > > [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX. > > [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) > > [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf > > This _looks_ like it is working as intended. The PCI BIOS code tried to > make a RWX page. The CPA code refused to do it and presumably returned > an error, leaving a RW page, non-executable page. The PCI code didn't > check the set_memory_x() return code and tried to go execute anyway. > That resulted in the oops. > > I was able to reproduce this pretty easily. The workaround from dmesg > is pci=nobios. That seems to do the trick for me, although that advise > was sandwiched between a warning and an oops, so not the easiest to find. > > I'm a bit torn what to do on this one. Breaking the boot is bad, but so > is leaving RWX memory around. Well, the original patch is bad. Boot regressions are not acceptable. We should first add a WARN_ON(), debug and fix the failures, then we can start refusing the transitions. Best regards, Pavel -- People of Russia, stop Putin before his war on Ukraine escalates.
On 9/21/22 13:59, Dave Hansen wrote: > On 9/21/22 13:07, Guenter Roeck wrote: >> [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0 >> ILLOPC: cbc65efa: 0f 0b >> [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 > ... >> [ 2.050307] ---[ end trace 0000000000000000 ]--- >> [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX. >> [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) >> [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf > > This _looks_ like it is working as intended. The PCI BIOS code tried to > make a RWX page. The CPA code refused to do it and presumably returned > an error, leaving a RW page, non-executable page. The PCI code didn't > check the set_memory_x() return code and tried to go execute anyway. > That resulted in the oops. > > I was able to reproduce this pretty easily. The workaround from dmesg > is pci=nobios. That seems to do the trick for me, although that advise > was sandwiched between a warning and an oops, so not the easiest to find. > > I'm a bit torn what to do on this one. Breaking the boot is bad, but so > is leaving RWX memory around. > > Thoughts? For my part I'll do what the above suggests, ie run tests with PAE enabled with pci=nobios command line option. AFAICS that hides the problem in my tests. I am just not sure if that is really appropriate. Guenter
On 9/21/22 15:59, Guenter Roeck wrote: > On 9/21/22 13:59, Dave Hansen wrote: >> On 9/21/22 13:07, Guenter Roeck wrote: >>> [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0 >>> ILLOPC: cbc65efa: 0f 0b >>> [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 >> ... >>> [ 2.050307] ---[ end trace 0000000000000000 ]--- >>> [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX. >>> [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) >>> [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf >> >> This _looks_ like it is working as intended. The PCI BIOS code tried to >> make a RWX page. The CPA code refused to do it and presumably returned >> an error, leaving a RW page, non-executable page. The PCI code didn't >> check the set_memory_x() return code and tried to go execute anyway. >> That resulted in the oops. >> >> I was able to reproduce this pretty easily. The workaround from dmesg >> is pci=nobios. That seems to do the trick for me, although that advise >> was sandwiched between a warning and an oops, so not the easiest to find. >> >> I'm a bit torn what to do on this one. Breaking the boot is bad, but so >> is leaving RWX memory around. >> >> Thoughts? > > For my part I'll do what the above suggests, ie run tests with PAE enabled > with pci=nobios command line option. AFAICS that hides the problem in my tests. > I am just not sure if that is really appropriate. > Oh well, that "helped" to hide one of the crashes. Here is another one. This is with PAE enabled and booting through efi32. 
Guenter --- [ 1.080779] ------------[ cut here ]------------ [ 1.080959] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000d0770000 - 0x00000000d0770fff PFN edcd ILLOPC: c7465efa: 0f 0b [ 1.081467] WARNING: CPU: 0 PID: 0 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0 [ 1.082120] Modules linked in: [ 1.082476] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.0-rc6-next-20220921 #1 [ 1.082706] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 [ 1.082988] EIP: __change_page_attr_set_clr+0xdca/0xdd0 [ 1.083187] Code: 10 8b 45 ac 89 7c 24 04 89 74 24 14 89 4c 24 1c 8d 8e ff 0f 00 00 89 4c 24 18 89 44 24 08 c7 04 24 38 67 88 c8 e8 56 38 fb 00 <0f> 0b eb 83 66 90 55 89 e5 57 56 89 d6 53 89 c3 83 ec 58 31 d2 8b [ 1.083672] EAX: 00000076 EBX: 0edcd063 ECX: 00000000 EDX: 00000003 [ 1.083830] ESI: d0770000 EDI: 00000063 EBP: c8a3dea8 ESP: c8a3dd90 [ 1.083984] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00200296 [ 1.084286] CR0: 80050033 CR2: ffbff000 CR3: 08d7c000 CR4: 000006b0 [ 1.084501] Call Trace: [ 1.084849] ? __this_cpu_preempt_check+0xf/0x11 [ 1.085053] ? __purge_vmap_area_lazy+0x6c/0x640 [ 1.085269] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0 [ 1.085415] ? __mutex_unlock_slowpath+0x2b/0x2b0 [ 1.085536] ? purge_fragmented_blocks_allcpus+0x64/0x2c0 [ 1.085696] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0 [ 1.085820] ? _vm_unmap_aliases.part.0+0x54/0x1f0 [ 1.086004] change_page_attr_set_clr+0x11d/0x2d0 [ 1.086313] ? __efi_memmap_init+0x70/0xd3 [ 1.086475] set_memory_x+0x56/0x60 [ 1.086592] efi_runtime_update_mappings+0x36/0x42 [ 1.086717] efi_enter_virtual_mode+0x351/0x36e [ 1.086860] start_kernel+0x57d/0x60f [ 1.086956] ? 
set_intr_gate+0x42/0x55 [ 1.087079] i386_start_kernel+0x43/0x45 [ 1.087272] startup_32_smp+0x161/0x164 [ 1.087491] irq event stamp: 6582 [ 1.087593] hardirqs last enabled at (6590): [<c74e7119>] __up_console_sem+0x69/0x80 [ 1.087824] hardirqs last disabled at (6597): [<c74e70fd>] __up_console_sem+0x4d/0x80 [ 1.088010] softirqs last enabled at (6571): [<c7429a94>] call_on_stack+0x14/0x60 [ 1.088278] softirqs last disabled at (6614): [<c7429a94>] call_on_stack+0x14/0x60 [ 1.088466] ---[ end trace 0000000000000000 ]--- [ 1.089237] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) [ 1.089237] BUG: unable to handle page fault for address: d0810e2a [ 1.089237] #PF: supervisor instruction fetch in kernel mode [ 1.089237] #PF: error_code(0x0011) - permissions violation [ 1.089237] *pdpt = 0000000008d78001 *pde = 000000000eec6067 *pte = 800000000fe98063 [ 1.089237] Oops: 0011 [#1] PREEMPT SMP PTI [ 1.089237] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 6.0.0-rc6-next-20220921 #1 [ 1.089237] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 [ 1.089237] EIP: 0xd0810e2a [ 1.089237] Code: 75 0c ff 75 08 68 c1 45 81 d0 6a 40 e8 ef ce ff ff 83 c4 20 83 ec 0c 53 e8 d4 cf ff ff 83 c4 10 31 c0 8d 65 f4 5b 5e 5f 5d c3 <55> 89 e5 57 56 53 bb 02 00 00 80 83 ec 5c 8b 7d 08 85 ff 0f 84 ed [ 1.089237] EAX: d0810e2a EBX: 00200202 ECX: 00000049 EDX: 00000000 [ 1.089237] ESI: c8a3df30 EDI: c84c5000 EBP: c8a3df20 ESP: c8a3def8 [ 1.089237] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00200202 [ 1.089237] CR0: 80050033 CR2: d0810e2a CR3: 08d7c000 CR4: 000006b0 [ 1.089237] Call Trace: [ 1.089237] ? virt_efi_set_variable_nonblocking+0x80/0xf0 [ 1.089237] ? virt_efi_reset_system+0xe0/0xe0 [ 1.089237] efi_delete_dummy_variable+0x55/0x70 [ 1.089237] efi_enter_virtual_mode+0x356/0x36e [ 1.089237] start_kernel+0x57d/0x60f [ 1.089237] ? 
set_intr_gate+0x42/0x55 [ 1.089237] i386_start_kernel+0x43/0x45 [ 1.089237] startup_32_smp+0x161/0x164 [ 1.089237] Modules linked in: [ 1.089237] CR2: 00000000d0810e2a [ 1.089237] ---[ end trace 0000000000000000 ]--- [ 1.089237] EIP: 0xd0810e2a
On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote: > Oh well, that "helped" to hide one of the crashes. Here is another one. > This is with PAE enabled and booting through efi32. > [ 1.086592] efi_runtime_update_mappings+0x36/0x42 > [ 1.086717] efi_enter_virtual_mode+0x351/0x36e > [ 1.086860] start_kernel+0x57d/0x60f > [ 1.086956] ? set_intr_gate+0x42/0x55 > [ 1.087079] i386_start_kernel+0x43/0x45 > [ 1.087272] startup_32_smp+0x161/0x164 Does this help? Dave; perhaps we should just let i386 be i386 and let it bitrot :/ diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e06a199423c0..d81e379fcd43 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; + set_memory_ro(md->virt_addr, md->num_pages); set_memory_x(md->virt_addr, md->num_pages); } }
On 9/22/22 00:46, Peter Zijlstra wrote: > On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote: > >> Oh well, that "helped" to hide one of the crashes. Here is another one. >> This is with PAE enabled and booting through efi32. > >> [ 1.086592] efi_runtime_update_mappings+0x36/0x42 >> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e >> [ 1.086860] start_kernel+0x57d/0x60f >> [ 1.086956] ? set_intr_gate+0x42/0x55 >> [ 1.087079] i386_start_kernel+0x43/0x45 >> [ 1.087272] startup_32_smp+0x161/0x164 > > Does this help? Dave; perhaps we should just let i386 be i386 and let it > bitrot :/ > > diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c > index e06a199423c0..d81e379fcd43 100644 > --- a/arch/x86/platform/efi/efi_32.c > +++ b/arch/x86/platform/efi/efi_32.c > @@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void) > if (md->type != EFI_RUNTIME_SERVICES_CODE) > continue; > > + set_memory_ro(md->virt_addr, md->num_pages); > set_memory_x(md->virt_addr, md->num_pages); > } > } Yes, it does. Tested-by: Guenter Roeck <linux@roeck-us.net> Guenter
From: Peter Zijlstra <peterz@infradead.org>
I'm planning on sticking this in x86/mm so that it goes upstream
along with the W+X detection code.
--
A recent x86/mm change warns and refuses to create W+X mappings.
The 32-bit EFI code tries to create such a mapping and trips over
the new W+X refusal.
Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Andy Shevchenko <andy@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: linux-efi@vger.kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lore.kernel.org/all/d8cd7c7e-24c1-7f70-24a9-91c77aa634af@roeck-us.net/
---
arch/x86/platform/efi/efi_32.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e06a199423c0..d81e379fcd43 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;
+ set_memory_ro(md->virt_addr, md->num_pages);
set_memory_x(md->virt_addr, md->num_pages);
}
}
--
2.34.1
On Thu, 22 Sept 2022 at 21:32, Dave Hansen <dave.hansen@linux.intel.com> wrote: > > From: Peter Zijlstra <peterz@infradead.org> > > I'm planning on sticking this in x86/mm so that it goes upstream > along with the W+X detection code. > > -- > > A recent x86/mm change warns and refuses to create W+X mappings. > > The 32-bit EFI code tries to create such a mapping and trips over > the new W+X refusal. > > Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it. > This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and .data sections of the EFI runtime PE/COFF executables in memory, so you are essentially making .data and .bss read-only. (Whether those executables actually modify their .data and .bss at runtime is a different matter, but the point is that it used to be possible) More recent firmwares may provide a 'memory attributes table' separately which describes the individual sections, but older 32-bit firmwares are not even built with 4k section alignment, so code and data may share a single page. Note that we haven't wired up this memory attributes table on i386 at the moment, and I seriously doubt that 32-bit firmware in the field exposes it. Can we just turn off this feature for 32-bit? > Reported-by: Guenter Roeck <linux@roeck-us.net> > Signed-off-by: Peter Zijlstra <peterz@infradead.org> > Tested-by: Guenter Roeck <linux@roeck-us.net> > Cc: Ard Biesheuvel <ardb@kernel.org> > Cc: Darren Hart <dvhart@infradead.org> > Cc: Andy Shevchenko <andy@infradead.org> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Borislav Petkov <bp@alien8.de> > Cc: x86@kernel.org > Cc: linux-efi@vger.kernel.org > Cc: "H. 
Peter Anvin" <hpa@zytor.com> > Link: https://lore.kernel.org/all/d8cd7c7e-24c1-7f70-24a9-91c77aa634af@roeck-us.net/ > --- > arch/x86/platform/efi/efi_32.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c > index e06a199423c0..d81e379fcd43 100644 > --- a/arch/x86/platform/efi/efi_32.c > +++ b/arch/x86/platform/efi/efi_32.c > @@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void) > if (md->type != EFI_RUNTIME_SERVICES_CODE) > continue; > > + set_memory_ro(md->virt_addr, md->num_pages); > set_memory_x(md->virt_addr, md->num_pages); > } > }
On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote: > On Thu, 22 Sept 2022 at 21:32, Dave Hansen <dave.hansen@linux.intel.com> wrote: > > > > From: Peter Zijlstra <peterz@infradead.org> > > > > I'm planning on sticking this in x86/mm so that it goes upstream > > along with the W+X detection code. > > > > -- > > > > A recent x86/mm change warns and refuses to create W+X mappings. > > > > The 32-bit EFI code tries to create such a mapping and trips over > > the new W+X refusal. > > > > Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it. > > > > This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and > .data sections of the EFI runtime PE/COFF executables in memory, so > you are essentially making .data and .bss read-only. (Whether those > executables actually modify their .data and .bss at runtime is a > different matter, but the point is that it used to be possible) > > More recent firmwares may provide a 'memory attributes table' > separately which describes the individual sections, but older 32-bit > firmwares are not even built with 4k section alignment, so code and > data may share a single page. Note that we haven't wired up this > memory attributes table on i386 at the moment, and I seriously doubt > that 32-bit firmware in the field exposes it. > > Can we just turn off this feature for 32-bit? Goodie; some seriously security minded people who did that EFI turd :/ Let's just heap it on the pile of 32bit sucks and should not be considered a security target anymore and indeed kill this feature.
(cc Kees) On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <peterz@infradead.org> wrote: > > On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote: > > On Thu, 22 Sept 2022 at 21:32, Dave Hansen <dave.hansen@linux.intel.com> wrote: > > > > > > From: Peter Zijlstra <peterz@infradead.org> > > > > > > I'm planning on sticking this in x86/mm so that it goes upstream > > > along with the W+X detection code. > > > > > > -- > > > > > > A recent x86/mm change warns and refuses to create W+X mappings. > > > > > > The 32-bit EFI code tries to create such a mapping and trips over > > > the new W+X refusal. > > > > > > Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it. > > > > > > > This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and > > .data sections of the EFI runtime PE/COFF executables in memory, so > > you are essentially making .data and .bss read-only. (Whether those > > executables actually modify their .data and .bss at runtime is a > > different matter, but the point is that it used to be possible) > > > > More recent firmwares may provide a 'memory attributes table' > > separately which describes the individual sections, but older 32-bit > > firmwares are not even built with 4k section alignment, so code and > > data may share a single page. Note that we haven't wired up this > > memory attributes table on i386 at the moment, and I seriously doubt > > that 32-bit firmware in the field exposes it. > > > > Can we just turn off this feature for 32-bit? > > Goodie; some seriously security minded people who did that EFI turd :/ To be fair, most people tended to care more about memory footprint than about security at the time. And I don't recall a lot of enthusiasm in the Linux community either for rounding up kernel sections so they could be mapped with W^X permissions. And without PAE, all memory is executable anyway. 
> Let's just heap it on the pile of 32bit sucks and should not be > considered a security target anymore and indeed kill this feature. > I take it this issue is triggered by the fact that i386 maps the EFI runtime regions into the kernel page tables, and are therefore always mapped, right? If anyone cares enough about this to go and fix it, we could switch to the approach we use everywhere else, i.e., treat EFI memory as user space mappings, and activate them only while a runtime service is in progress. But frankly, why would anyone still be running this? With the EFI mixed mode support, only systems with CPUs that don't actually implement long mode still need this, and I am skeptical that such deployments would use recent kernels.
On 9/23/22 02:49, Ard Biesheuvel wrote: > (cc Kees) > > On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <peterz@infradead.org> wrote: >> >> On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote: >>> On Thu, 22 Sept 2022 at 21:32, Dave Hansen <dave.hansen@linux.intel.com> wrote: >>>> >>>> From: Peter Zijlstra <peterz@infradead.org> >>>> >>>> I'm planning on sticking this in x86/mm so that it goes upstream >>>> along with the W+X detection code. >>>> >>>> -- >>>> >>>> A recent x86/mm change warns and refuses to create W+X mappings. >>>> >>>> The 32-bit EFI code tries to create such a mapping and trips over >>>> the new W+X refusal. >>>> >>>> Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it. >>>> >>> >>> This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and >>> .data sections of the EFI runtime PE/COFF executables in memory, so >>> you are essentially making .data and .bss read-only. (Whether those >>> executables actually modify their .data and .bss at runtime is a >>> different matter, but the point is that it used to be possible) >>> >>> More recent firmwares may provide a 'memory attributes table' >>> separately which describes the individual sections, but older 32-bit >>> firmwares are not even built with 4k section alignment, so code and >>> data may share a single page. Note that we haven't wired up this >>> memory attributes table on i386 at the moment, and I seriously doubt >>> that 32-bit firmware in the field exposes it. >>> >>> Can we just turn off this feature for 32-bit? >> >> Goodie; some seriously security minded people who did that EFI turd :/ > > To be fair, most people tended to care more about memory footprint > than about security at the time. And I don't recall a lot of > enthusiasm in the Linux community either for rounding up kernel > sections so they could be mapped with W^X permissions. And without > PAE, all memory is executable anyway. 
> >> Let's just heap it on the pile of 32bit sucks and should not be >> considered a security target anymore and indeed kill this feature. >> > > I take it this issue is triggered by the fact that i386 maps the EFI > runtime regions into the kernel page tables, and are therefore always > mapped, right? If anyone cares enough about this to go and fix it, we > could switch to the approach we use everywhere else, i.e., treat EFI > memory as user space mappings, and activate them only while a runtime > service is in progress. > > But frankly, why would anyone still be running this? With the EFI > mixed mode support, only systems with CPUs that don't actually > implement long mode still need this, and I am skeptical that such > deployments would use recent kernels. It is supported, thus I run qemu tests for it. That is the whole point of testing, after all. If PAE (assuming that is what you are talking about) is no longer supported or supportable, its support should be removed. If so, I'll be very happy to stop testing it. Thanks, Guenter
On Fri, 23 Sept 2022 at 15:58, Guenter Roeck <linux@roeck-us.net> wrote: > > On 9/23/22 02:49, Ard Biesheuvel wrote: > > (cc Kees) > > > > On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <peterz@infradead.org> wrote: > >> > >> On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote: > >>> On Thu, 22 Sept 2022 at 21:32, Dave Hansen <dave.hansen@linux.intel.com> wrote: > >>>> > >>>> From: Peter Zijlstra <peterz@infradead.org> > >>>> > >>>> I'm planning on sticking this in x86/mm so that it goes upstream > >>>> along with the W+X detection code. > >>>> > >>>> -- > >>>> > >>>> A recent x86/mm change warns and refuses to create W+X mappings. > >>>> > >>>> The 32-bit EFI code tries to create such a mapping and trips over > >>>> the new W+X refusal. > >>>> > >>>> Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it. > >>>> > >>> > >>> This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and > >>> .data sections of the EFI runtime PE/COFF executables in memory, so > >>> you are essentially making .data and .bss read-only. (Whether those > >>> executables actually modify their .data and .bss at runtime is a > >>> different matter, but the point is that it used to be possible) > >>> > >>> More recent firmwares may provide a 'memory attributes table' > >>> separately which describes the individual sections, but older 32-bit > >>> firmwares are not even built with 4k section alignment, so code and > >>> data may share a single page. Note that we haven't wired up this > >>> memory attributes table on i386 at the moment, and I seriously doubt > >>> that 32-bit firmware in the field exposes it. > >>> > >>> Can we just turn off this feature for 32-bit? > >> > >> Goodie; some seriously security minded people who did that EFI turd :/ > > > > To be fair, most people tended to care more about memory footprint > > than about security at the time. 
And I don't recall a lot of > > enthusiasm in the Linux community either for rounding up kernel > > sections so they could be mapped with W^X permissions. And without > > PAE, all memory is executable anyway. > > > >> Let's just heap it on the pile of 32bit sucks and should not be > >> considered a security target anymore and indeed kill this feature. > >> > > > > I take it this issue is triggered by the fact that i386 maps the EFI > > runtime regions into the kernel page tables, and are therefore always > > mapped, right? If anyone cares enough about this to go and fix it, we > > could switch to the approach we use everywhere else, i.e., treat EFI > > memory as user space mappings, and activate them only while a runtime > > service is in progress. > > > > But frankly, why would anyone still be running this? With the EFI > > mixed mode support, only systems with CPUs that don't actually > > implement long mode still need this, and I am skeptical that such > > deployments would use recent kernels. > > It is supported, thus I run qemu tests for it. That is the whole point > of testing, after all. I completely agree with that, and I think all the testing you do is extremely valuable. > If PAE (assuming that is what you are talking about) Not at all - I was referring to i386 support in general. I was basically making the point that we still support i386 without PAE (which is a prerequisite for supporting non-executable mappings), and if we are going to be pedantic about security on this architecture, we should probably make PAE mandatory as well. If we are ok with the current state, enabling this permission check on i386 makes no sense. > is no longer supported or supportable, its support should be > removed. If so, I'll be very happy to stop testing it. > I'd say there are better ways to spend those cycles, but for the time being, I think we should continue testing it, as otherwise it will just bit rot.
On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote: > I was basically making the point that we still support i386 without > PAE (which is a prerequisite for supporting non-executable mappings), > and if we are going to be pedantic about security on this > architecture, we should probably make PAE mandatory as well. My expectation would be that if someone is running modern kernels on i386, they're not using PAE. If they care about PAE, I'd expect them to have long since moved to x86_64. > If we are ok with the current state, enabling this permission check on > i386 makes no sense. I'd agree. If it's a choice between "spend a lot of time making sure this works correctly on i386" and "don't do this at all on i386", I would pick the latter. If someone steps up to do the former, then by all means take the patches. -- Kees Cook
On Fri, 23 Sept 2022 at 20:31, Kees Cook <keescook@chromium.org> wrote: > > On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote: > > I was basically making the point that we still support i386 without > > PAE (which is a prerequisite for supporting non-executable mappings), > > and if we are going to be pedantic about security on this > > architecture, we should probably make PAE mandatory as well. > > My expectation would be that if someone is running modern kernels on i386, > they're not using PAE. If they care about PAE, I'd expect them to have > long since moved to x86_64. > Not sure I follow. If they care about PAE, they turn it on. Or do you mean 'if they care about being able to address lots of physical memory'? Because the *other* reason you might care about PAE is because it gives you NX support. But currently, PAE is not even enabled in the i386_defconfig, and defaults to off. This means people that are unaware of this won't enable it, and will be running without NX support. > > If we are ok with the current state, enabling this permission check on > > i386 makes no sense. > > I'd agree. If it's a choice between "spend a lot of time making sure > this works correctly on i386" and "don't do this at all on i386", I > would pick the latter. If someone steps up to do the former, then by > all means take the patches. > OK, so it seems we're all in violent agreement here. And if there is ever a push for enabling security features on 32-bit, we can add this to the laundry list of things that need to be looked at.
On Fri, Sep 23, 2022 at 09:53:02PM +0200, Ard Biesheuvel wrote: > On Fri, 23 Sept 2022 at 20:31, Kees Cook <keescook@chromium.org> wrote: > > > > On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote: > > > I was basically making the point that we still support i386 without > > > PAE (which is a prerequisite for supporting non-executable mappings), > > > and if we are going to be pedantic about security on this > > > architecture, we should probably make PAE mandatory as well. > > > > My expectation would be that if someone is running modern kernels on i386, > > they're not using PAE. If they care about PAE, I'd expect them to have > > long since moved to x86_64. > > > > Not sure I follow. If they care about PAE, they turn it on. Or do you > mean 'if they care about being able to address lots of physical > memory'? Because the *other* reason you might care about PAE is > because it gives you NX support. Right, I meant if they care about NX (and the topic of this thread) they want PAE, and if they want PAE, they likely moved to x86_64 long long ago for new kernels. > But currently, PAE is not even enabled in the i386_defconfig, and > defaults to off. This means people that are unaware of this won't > enable it, and will be running without NX support. And they all make me cry. ;) > > > If we are ok with the current state, enabling this permission check on > > > i386 makes no sense. > > > > I'd agree. If it's a choice between "spend a lot of time making sure > > this works correctly on i386" and "don't do this at all on i386", I > > would pick the latter. If someone steps up to do the former, then by > > all means take the patches. > > > > OK, so it seems we're all in violent agreement here. And if there is > ever a push for enabling security features on 32-bit, we can add this > to the laundry list of things that need to be looked at. Yup. -- Kees Cook
On 9/23/22 14:19, Kees Cook wrote: >> But currently, PAE is not even enabled in the i386_defconfig, and >> defaults to off. This means people that are unaware of this won't >> enable it, and will be running without NX support. > And they all make me cry. ;) It's been like that for a long time, presumably because the defconfig should *boot* in as many cases as possible. It wouldn't be hard to change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on targeted builds for CPUs that don't support it. Patch attached to do that, if anyone else has an opinion. We should probably just leave i386 alone, but it breaks my heart to see Kees in tears.
On Fri, Sep 23, 2022 at 03:15:15PM -0700, Dave Hansen wrote: > On 9/23/22 14:19, Kees Cook wrote: > >> But currently, PAE is not even enabled in the i386_defconfig, and > >> defaults to off. This means people that are unaware of this won't > >> enable it, and will be running without NX support. > > And they all make me cry. ;) > > It's been like that for a long time, presumably because the defconfig > should *boot* in as many cases as possible. It wouldn't be hard to > change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on > targeted builds for CPUs that don't support it. Patch attached to do > that, if anyone else has an opinion. > > We should probably just leave i386 alone, but it breaks my heart to see > Kees in tears. *dabs his eyes with tissue* > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index f9920f1341c8..fad978c7b7c5 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -1363,9 +1363,14 @@ config X86_CPUID > with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to > /dev/cpu/31/cpuid. > > +config CPU_HAS_PAE > + def_bool y > + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 > + > choice > prompt "High Memory Support" > default HIGHMEM4G > + default HIGHMEM64G if CPU_HAS_PAE > depends on X86_32 > > config NOHIGHMEM > @@ -1412,7 +1417,7 @@ config HIGHMEM4G > > config HIGHMEM64G > bool "64GB" > - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 > + depends on CPU_HAS_PAE > select X86_PAE > help > Select this if you have a 32-bit processor and more than 4 I feel happy now! :) -- Kees Cook
Dave Hansen <dave.hansen@intel.com> writes: > On 9/23/22 14:19, Kees Cook wrote: >>> But currently, PAE is not even enabled in the i386_defconfig, and >>> defaults to off. This means people that are unaware of this won't >>> enable it, and will be running without NX support. >> And they all make me cry. ;) > > It's been like that for a long time, presumably because the defconfig > should *boot* in as many cases as possible. It wouldn't be hard to > change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on > targeted builds for CPUs that don't support it. Patch attached to do > that, if anyone else has an opinion. > > We should probably just leave i386 alone, but it breaks my heart to see > Kees in tears. Is it at all possible to simply drop efi support for 32bit builds? Last I looked (and it was quite a while ago) efi was only supported on the same architecture. So we are talking about 32bit efi for 32bit kernels. I think there were only a handful of systems that ever shipped 32bit efi, because when 32bit efi came out 64bit processors had been shipping for several years already. We still probably need to deal with whatever is needed for the BIOS. If there are enough interesting systems to care to keep the few systems that shipped with 32bit efi support going it probably does make sense to change how it is implemented because using the kernel's page tables has been nasty and given kexec all kinds of challenges to support because not only does efi have strange mapping attributes but efi also winds up living at a fixed virtual address, that can't be changed. So if you care about anything like address space layout randomization efi provides a well-known fixed target that defeats all of your work there as well. Can we do something to isolate 32bit efi so it is not a pain point? Given how long 8bit and 16bit systems have lasted I rather suspect 32bit x86 will last in some embedded form for a very long time. 
PAE came in around the first Pentiums, I think, so most embedded i386 processors should support it. Eric
On 9/22/22 00:46, Peter Zijlstra wrote: > On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote: > >> Oh well, that "helped" to hide one of the crashes. Here is another one. >> This is with PAE enabled and booting through efi32. > >> [ 1.086592] efi_runtime_update_mappings+0x36/0x42 >> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e >> [ 1.086860] start_kernel+0x57d/0x60f >> [ 1.086956] ? set_intr_gate+0x42/0x55 >> [ 1.087079] i386_start_kernel+0x43/0x45 >> [ 1.087272] startup_32_smp+0x161/0x164 > > Does this help? Dave; perhaps we should just let i386 be i386 and let it > bitrot :/ How about we just turn off enforcement for now so that the poor i386 folks can at least boot? I have the feeling we're going to get bored with even the warnings if they persist for too long, though. Untested patch to make i386 violations harmless is attached.
On 9/22/22 08:00, Dave Hansen wrote: > On 9/22/22 00:46, Peter Zijlstra wrote: >> On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote: >> >>> Oh well, that "helped" to hide one of the crashes. Here is another one. >>> This is with PAE enabled and booting through efi32. >> >>> [ 1.086592] efi_runtime_update_mappings+0x36/0x42 >>> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e >>> [ 1.086860] start_kernel+0x57d/0x60f >>> [ 1.086956] ? set_intr_gate+0x42/0x55 >>> [ 1.087079] i386_start_kernel+0x43/0x45 >>> [ 1.087272] startup_32_smp+0x161/0x164 >> >> Does this help? Dave; perhaps we should just let i386 be i386 and let it >> bitrot :/ > > How about we just turn off enforcement for now so that the poor i386 > folks can at least boot? I have the feeling we're going to get bored > with even the warnings if they persist for too long, though. > The problem with unfixed warnings is that they hide other problems if persistent, and they result in warnings being seen as just useless noise. Case in point: In ChromeOS, we get literally hundreds of thousands of warning reports each day (most from drm and wireless drivers). Those originate from upstream code. No one really cares, and none ever get fixed. Please don't add more if you don't plan to fix them. Thanks, Guenter
On Mon, Aug 29, 2022 at 12:18:03PM +0200, Peter Zijlstra wrote: > > x86 has STRICT_*_RWX, but not even a warning when someone violates it. > > Add this warning and fully refuse the transition. > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Kees Cook <keescook@chromium.org> -- Kees Cook
The following commit has been merged into the x86/mm branch of tip:
Commit-ID: 652c5bf380ad018e15006a7f8349800245ddbbad
Gitweb: https://git.kernel.org/tip/652c5bf380ad018e15006a7f8349800245ddbbad
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Mon, 29 Aug 2022 12:18:03 +02:00
Committer: Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Thu, 01 Sep 2022 11:10:19 -07:00
x86/mm: Refuse W^X violations
x86 has STRICT_*_RWX, but not even a warning when someone violates it.
Add this warning and fully refuse the transition.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/YwySW3ROc21hN7g9@hirez.programming.kicks-ass.net
---
arch/x86/mm/pat/set_memory.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6a9043b..1a2d637 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -580,6 +580,33 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
}
/*
+ * Validate and enforce strict W^X semantics.
+ */
+static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
+ unsigned long pfn, unsigned long npg)
+{
+ unsigned long end;
+
+ if (!cpu_feature_enabled(X86_FEATURE_NX))
+ return new;
+
+ if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
+ return new;
+
+ if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
+ return new;
+
+ end = start + npg * PAGE_SIZE - 1;
+ WARN_ONCE(1, "CPA refuse W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
+ (unsigned long long)pgprot_val(old),
+ (unsigned long long)pgprot_val(new),
+ start, end, pfn);
+
+ /* refuse the transition into WX */
+ return old;
+}
+
+/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry and the level of the mapping.
*/
@@ -885,6 +912,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+
/*
* If there is a conflict, split the large page.
*
@@ -1525,6 +1554,7 @@ repeat:
if (level == PG_LEVEL_4K) {
pte_t new_pte;
+ pgprot_t old_prot = pte_pgprot(old_pte);
pgprot_t new_prot = pte_pgprot(old_pte);
unsigned long pfn = pte_pfn(old_pte);
@@ -1536,6 +1566,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+
new_prot = pgprot_clear_protnone_bits(new_prot);
/*
© 2016 - 2026 Red Hat, Inc.