From: David Woodhouse <dwmw@amazon.co.uk>
The virtual mapping of the control page may have been _PAGE_GLOBAL and
thus its PTE might not have been flushed on the %cr3 switch and it might
effectively still be read-only. Move the writes to it down into the
identity_mapped() function where the same %rip-relative addressing will
get the new mapping.
The stack is fine, as that's using the identity mapped address anyway.
Fixes: 5a82223e0743 ("x86/kexec: Mark relocate_kernel page as ROX instead of RWX")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: "Ning, Hongyu" <hongyu.ning@linux.intel.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219592
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/kernel/relocate_kernel_64.S | 32 +++++++++++++++++-----------
1 file changed, 20 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 553d67845b84..b9c80b3091c8 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -90,22 +90,17 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movq kexec_pa_table_page(%rip), %r9
movq %r9, %cr3
- /* Save %rsp and CRs. */
- movq %rsp, saved_rsp(%rip)
- movq %rax, saved_cr3(%rip)
- movq %cr0, %rax
- movq %rax, saved_cr0(%rip)
- /* Leave CR4 in %r13 to enable the right paging mode later. */
- movq %cr4, %r13
- movq %r13, saved_cr4(%rip)
-
- /* save indirection list for jumping back */
- movq %rdi, pa_backup_pages_map(%rip)
+ /*
+ * The control page still might not be writable because the original
+ * kernel PTE may have had the _PAGE_GLOBAL bit set. Don't write to
+ * it except through the *identmap* address.
+ */
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
+ movq %rsp, %rbp
lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
@@ -118,6 +113,19 @@ SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
UNWIND_HINT_END_OF_STACK
+
+ /* Save original %rsp and CRs. */
+ movq %rbp, saved_rsp(%rip)
+ movq %rax, saved_cr3(%rip)
+ movq %cr0, %rax
+ movq %rax, saved_cr0(%rip)
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
+ movq %r13, saved_cr4(%rip)
+
+ /* save indirection list for jumping back */
+ movq %rdi, pa_backup_pages_map(%rip)
+
/*
* %rdi indirection page
* %rdx start address
@@ -185,7 +193,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* - Machine check exception on TDX guest, if it was enabled before.
* Clearing MCE might not be allowed in TDX guests, depending on setup.
*
- * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * Use R13 that contains the original CR4 value
* PAE is always set in the original CR4.
*/
andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
--
2.47.0
On 12/12/24 12:11, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> The virtual mapping of the control page may have been _PAGE_GLOBAL and
> thus its PTE might not have been flushed on the %cr3 switch and it might
> effectively still be read-only. Move the writes to it down into the
> identity_mapped() function where the same %rip-relative addressing will
> get the new mapping.
>
> The stack is fine, as that's using the identity mapped address anyway.

Shouldn't we also ensure that Global entries don't bite anyone else?
Something like the completely untested attached patch?
On 12 December 2024 21:18:10 GMT, Dave Hansen <dave.hansen@intel.com> wrote:
>On 12/12/24 12:11, David Woodhouse wrote:
>> From: David Woodhouse <dwmw@amazon.co.uk>
>>
>> The virtual mapping of the control page may have been _PAGE_GLOBAL and
>> thus its PTE might not have been flushed on the %cr3 switch and it might
>> effectively still be read-only. Move the writes to it down into the
>> identity_mapped() function where the same %rip-relative addressing will
>> get the new mapping.
>>
>> The stack is fine, as that's using the identity mapped address anyway.
>
>Shouldn't we also ensure that Global entries don't bite anyone else?
>Something like the completely untested attached patch?

Doesn't hurt, but this is an identity mapping so absolutely everything
other than this one page is going to be in the low (positive) part of
the canonical address space, so won't have had global pages in the first
place will they?

Probably a kind thing to do for whatever we're passing control to though :)

I'll round it up into the tree and send it out with the next batch of
debug support. Care to give me a SoB for it? You can s/CR0_PGE/CR4_PGE/
too if you like but I can do that myself as well.
On 12/12/24 13:32, David Woodhouse wrote:
> On 12 December 2024 21:18:10 GMT, Dave Hansen <dave.hansen@intel.com> wrote:
>> On 12/12/24 12:11, David Woodhouse wrote:
>>> From: David Woodhouse <dwmw@amazon.co.uk>
>>>
>>> The virtual mapping of the control page may have been _PAGE_GLOBAL and
>>> thus its PTE might not have been flushed on the %cr3 switch and
>>> it might effectively still be read-only. Move the writes to it
>>> down into the identity_mapped() function where the same
>>> %rip-relative addressing will get the new mapping.
>>>
>>> The stack is fine, as that's using the identity mapped address
>>> anyway.
>>
>> Shouldn't we also ensure that Global entries don't bite anyone
>> else? Something like the completely untested attached patch?
> Doesn't hurt, but this is an identity mapping so absolutely
> everything other than this one page is going to be in the low
> (positive) part of the canonical address space, so won't have had
> global pages in the first place will they?

Right, it's generally _not_ a problem. But it _can_ be a surprising
problem which is why we're all looking at it today. ;)

> Probably a kind thing to do for whatever we're passing control to
> though :)
>
> I'll round it up into the tree and send it out with the next batch of
> debug support. Care to give me a SoB for it? You can
> s/CR0_PGE/CR4_PGE/ too if you like but I can do that myself as well.

Here's a fixed one with a changelog and a SoB. Still 100% gloriously
untested though.
On 12 December 2024 21:43:57 GMT, Dave Hansen <dave.hansen@intel.com> wrote:
>On 12/12/24 13:32, David Woodhouse wrote:
>> On 12 December 2024 21:18:10 GMT, Dave Hansen <dave.hansen@intel.com> wrote:
>>> On 12/12/24 12:11, David Woodhouse wrote:
>>>> From: David Woodhouse <dwmw@amazon.co.uk>
>>>>
>>>> The virtual mapping of the control page may have been _PAGE_GLOBAL and
>>>> thus its PTE might not have been flushed on the %cr3 switch and
>>>> it might effectively still be read-only. Move the writes to it
>>>> down into the identity_mapped() function where the same
>>>> %rip-relative addressing will get the new mapping.
>>>>
>>>> The stack is fine, as that's using the identity mapped address
>>>> anyway.
>>>
>>> Shouldn't we also ensure that Global entries don't bite anyone
>>> else? Something like the completely untested attached patch?
>> Doesn't hurt, but this is an identity mapping so absolutely
>> everything other than this one page is going to be in the low
>> (positive) part of the canonical address space, so won't have had
>> global pages in the first place will they?
>
>Right, it's generally _not_ a problem. But it _can_ be a surprising
>problem which is why we're all looking at it today. ;)
>
>> Probably a kind thing to do for whatever we're passing control to
>> though :)
>>
>> I'll round it up into the tree and send it out with the next batch of
>> debug support. Care to give me a SoB for it? You can
>> s/CR0_PGE/CR4_PGE/ too if you like but I can do that myself as well.
>Here's a fixed one with a changelog and a SoB. Still 100% gloriously
>untested though.

Ta. I'll play with it in the morning. May actually shift it earlier and
use it instead of my other fix, so we can actually write to the virtual
address.
From: David Woodhouse <dwmw@amazon.co.uk>
The kernel switches to a new set of page tables during kexec. The global
mappings (_PAGE_GLOBAL==1) can remain in the TLB after this switch. This
is generally not a problem because the new page tables use a different
portion of the virtual address space than the normal kernel mappings.
The critical exception to that generalisation (and the only mapping
which isn't an identity mapping) is the kexec control page itself —
which was ROX in the original kernel mapping, but should be RWX in the
new page tables. If there is a global TLB entry for that in its prior
read-only state, it definitely needs to be flushed before attempting to
write through that virtual mapping.
It would be possible to just avoid writing to the virtual address of the
page and defer all writes until they can be done through the identity
mapping. But there's no good reason to keep the old TLB entries around,
as they can cause nothing but trouble.
Clear the PGE bit in %cr4 early, before storing data in the control page.
Fixes: 5a82223e0743 ("x86/kexec: Mark relocate_kernel page as ROX instead of RWX")
Co-authored-by: Dave Hansen <dave.hansen@linux.intel.com>
Reported-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: "Ning, Hongyu" <hongyu.ning@linux.intel.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219592
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Nathan Chancellor <nathan@kernel.org>
---
This supersedes the previous 'Only write through identity mapping of
control page' patch as Dave's approach is much saner now he's actually
figured out what's going on.
arch/x86/kernel/relocate_kernel_64.S | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 553d67845b84..cbadf0142fcb 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -90,14 +90,20 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movq kexec_pa_table_page(%rip), %r9
movq %r9, %cr3
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
+
+ /* Disable global pages immediately to ensure this mapping is RWX */
+ movq %r13, %r12
+ andq $~(X86_CR4_PGE), %r12
+ movq %r12, %cr4
+
/* Save %rsp and CRs. */
+ movq %r13, saved_cr4(%rip)
movq %rsp, saved_rsp(%rip)
movq %rax, saved_cr3(%rip)
movq %cr0, %rax
movq %rax, saved_cr0(%rip)
- /* Leave CR4 in %r13 to enable the right paging mode later. */
- movq %cr4, %r13
- movq %r13, saved_cr4(%rip)
/* save indirection list for jumping back */
movq %rdi, pa_backup_pages_map(%rip)
--
2.47.0
On Thu, Dec 12, 2024 at 08:11:19PM +0000, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> The virtual mapping of the control page may have been _PAGE_GLOBAL and
> thus its PTE might not have been flushed on the %cr3 switch and it might
> effectively still be read-only. Move the writes to it down into the
> identity_mapped() function where the same %rip-relative addressing will
> get the new mapping.
>
> The stack is fine, as that's using the identity mapped address anyway.
>
> Fixes: 5a82223e0743 ("x86/kexec: Mark relocate_kernel page as ROX instead of RWX")
> Reported-by: Nathan Chancellor <nathan@kernel.org>
> Reported-by: "Ning, Hongyu" <hongyu.ning@linux.intel.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219592
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Nathan Chancellor <nathan@kernel.org>
> ---
> arch/x86/kernel/relocate_kernel_64.S | 32 +++++++++++++++++-----------
> 1 file changed, 20 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> index 553d67845b84..b9c80b3091c8 100644
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -90,22 +90,17 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
> movq kexec_pa_table_page(%rip), %r9
> movq %r9, %cr3
>
> - /* Save %rsp and CRs. */
> - movq %rsp, saved_rsp(%rip)
> - movq %rax, saved_cr3(%rip)
> - movq %cr0, %rax
> - movq %rax, saved_cr0(%rip)
> - /* Leave CR4 in %r13 to enable the right paging mode later. */
> - movq %cr4, %r13
> - movq %r13, saved_cr4(%rip)
> -
> - /* save indirection list for jumping back */
> - movq %rdi, pa_backup_pages_map(%rip)
> + /*
> + * The control page still might not be writable because the original
> + * kernel PTE may have had the _PAGE_GLOBAL bit set. Don't write to
> + * it except through the *identmap* address.
> + */
>
> /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
> movq %rcx, %r11
>
> /* setup a new stack at the end of the physical control page */
> + movq %rsp, %rbp
> lea PAGE_SIZE(%rsi), %rsp
>
> /* jump to identity mapped page */
> @@ -118,6 +113,19 @@ SYM_CODE_END(relocate_kernel)
>
> SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> UNWIND_HINT_END_OF_STACK
> +
> + /* Save original %rsp and CRs. */
> + movq %rbp, saved_rsp(%rip)
> + movq %rax, saved_cr3(%rip)
> + movq %cr0, %rax
> + movq %rax, saved_cr0(%rip)
> + /* Leave CR4 in %r13 to enable the right paging mode later. */
> + movq %cr4, %r13
> + movq %r13, saved_cr4(%rip)
> +
> + /* save indirection list for jumping back */
> + movq %rdi, pa_backup_pages_map(%rip)
> +
> /*
> * %rdi indirection page
> * %rdx start address
> @@ -185,7 +193,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> * - Machine check exception on TDX guest, if it was enabled before.
> * Clearing MCE might not be allowed in TDX guests, depending on setup.
> *
> - * Use R13 that contains the original CR4 value, read in relocate_kernel().
> + * Use R13 that contains the original CR4 value
> * PAE is always set in the original CR4.
> */
> andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
> --
> 2.47.0
>
>
© 2016 - 2025 Red Hat, Inc.