From: David Woodhouse <dwmw@amazon.co.uk>
A few places in the kexec control code page make the assumption that the
first instruction of relocate_kernel is at the very start of the page.
To allow for Clang CFI information to be added to relocate_kernel(), as
well as the general principle of removing unwarranted assumptions, fix
them to use the external __relocate_kernel_start symbol that the linker
adds. This means using a separate addq and subq for calculating offsets,
as the assembler can no longer calculate the delta directly for itself
and relocations aren't that versatile.
Turn the jump from relocate_kernel() to identity_mapped() into a real
indirect 'jmp *%rsi' too, while touching it. There was no real reason
for it to be a push+ret in the first place, and adding Clang CFI info
will also give objtool enough visibility to start complaining 'return
with modified stack frame' about it.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/kernel/relocate_kernel_64.S | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 1996cea909ff..d74798d78263 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -95,11 +95,10 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %rsi
- pushq %rsi
- ANNOTATE_UNRET_SAFE
- ret
- int3
+ addq $identity_mapped, %rsi
+ subq $__relocate_kernel_start, %rsi
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -219,16 +218,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* get the re-entry point of the peer system */
popq %rbp
- leaq relocate_kernel(%rip), %r8
movq kexec_pa_swap_page(%rip), %r10
movq pa_backup_pages_map(%rip), %rdi
movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
+
+ /* Find start (and end) of this physical mapping of control page */
+ leaq (%rip), %r8
+ ANNOTATE_NOENDBR
+ andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
movq $1, %r11 /* Ensure preserve_context flag is set */
call swap_pages
movq kexec_va_control_page(%rip), %rax
- addq $(virtual_mapped - relocate_kernel), %rax
+ addq $virtual_mapped, %rax
+ subq $__relocate_kernel_start, %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
--
2.47.0
On Tue, 17 Dec 2024 at 00:37, David Woodhouse <dwmw2@infradead.org> wrote:
>
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> A few places in the kexec control code page make the assumption that the
> first instruction of relocate_kernel is at the very start of the page.
>
> To allow for Clang CFI information to be added to relocate_kernel(), as
> well as the general principle of removing unwarranted assumptions, fix
> them to use the external __relocate_kernel_start symbol that the linker
> adds. This means using a separate addq and subq for calculating offsets,
> as the assembler can no longer calculate the delta directly for itself
> and relocations aren't that versatile.
>
You can still avoid the absolute relocations though, ...
> Turn the jump from relocate_kernel() to identity_mapped() into a real
> indirect 'jmp *%rsi' too, while touching it. There was no real reason
> for it to be a push+ret in the first place, and adding Clang CFI info
> will also give objtool enough visibility to start complaining 'return
> with modified stack frame' about it.
>
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
> arch/x86/kernel/relocate_kernel_64.S | 18 +++++++++++-------
> 1 file changed, 11 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> index 1996cea909ff..d74798d78263 100644
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -95,11 +95,10 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
> lea PAGE_SIZE(%rsi), %rsp
>
> /* jump to identity mapped page */
> - addq $(identity_mapped - relocate_kernel), %rsi
> - pushq %rsi
> - ANNOTATE_UNRET_SAFE
> - ret
> - int3
> + addq $identity_mapped, %rsi
> + subq $__relocate_kernel_start, %rsi
... if you turn this into
0: addq $identity_mapped - 0b, %rsi
subq $__relocate_kernel_start - 0b, %rsi
> + ANNOTATE_RETPOLINE_SAFE
> + jmp *%rsi
> SYM_CODE_END(relocate_kernel)
>
> SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> @@ -219,16 +218,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>
> /* get the re-entry point of the peer system */
> popq %rbp
> - leaq relocate_kernel(%rip), %r8
> movq kexec_pa_swap_page(%rip), %r10
> movq pa_backup_pages_map(%rip), %rdi
> movq kexec_pa_table_page(%rip), %rax
> movq %rax, %cr3
> +
> + /* Find start (and end) of this physical mapping of control page */
> + leaq (%rip), %r8
> + ANNOTATE_NOENDBR
> + andq $PAGE_MASK, %r8
> lea PAGE_SIZE(%r8), %rsp
> movq $1, %r11 /* Ensure preserve_context flag is set */
> call swap_pages
> movq kexec_va_control_page(%rip), %rax
> - addq $(virtual_mapped - relocate_kernel), %rax
> + addq $virtual_mapped, %rax
> + subq $__relocate_kernel_start, %rax
> pushq %rax
> ANNOTATE_UNRET_SAFE
> ret
> --
> 2.47.0
>
On 17 December 2024 09:47:36 CET, Ard Biesheuvel <ardb@kernel.org> wrote: >On Tue, 17 Dec 2024 at 00:37, David Woodhouse <dwmw2@infradead.org> wrote: >> >> From: David Woodhouse <dwmw@amazon.co.uk> >> >> A few places in the kexec control code page make the assumption that the >> first instruction of relocate_kernel is at the very start of the page. >> >> To allow for Clang CFI information to be added to relocate_kernel(), as >> well as the general principle of removing unwarranted assumptions, fix >> them to use the external __relocate_kernel_start symbol that the linker >> adds. This means using a separate addq and subq for calculating offsets, >> as the assembler can no longer calculate the delta directly for itself >> and relocations aren't that versatile. >> > >You can still avoid the absolute relocations though, ... ... >> + addq $identity_mapped, %rsi >> + subq $__relocate_kernel_start, %rsi > >... if you turn this into > >0: addq $identity_mapped - 0b, %rsi > subq $__relocate_kernel_start - 0b, %rsi Is there any benefit to doing so? Are absolute relocations problematic?
On Tue, 17 Dec 2024 at 10:17, David Woodhouse <dwmw2@infradead.org> wrote: > > On 17 December 2024 09:47:36 CET, Ard Biesheuvel <ardb@kernel.org> wrote: > >On Tue, 17 Dec 2024 at 00:37, David Woodhouse <dwmw2@infradead.org> wrote: > >> > >> From: David Woodhouse <dwmw@amazon.co.uk> > >> > >> A few places in the kexec control code page make the assumption that the > >> first instruction of relocate_kernel is at the very start of the page. > >> > >> To allow for Clang CFI information to be added to relocate_kernel(), as > >> well as the general principle of removing unwarranted assumptions, fix > >> them to use the external __relocate_kernel_start symbol that the linker > >> adds. This means using a separate addq and subq for calculating offsets, > >> as the assembler can no longer calculate the delta directly for itself > >> and relocations aren't that versatile. > >> > > > >You can still avoid the absolute relocations though, ... > ... > >> + addq $identity_mapped, %rsi > >> + subq $__relocate_kernel_start, %rsi > > > >... if you turn this into > > > >0: addq $identity_mapped - 0b, %rsi > > subq $__relocate_kernel_start - 0b, %rsi > > Is there any benefit to doing so? Are absolute relocations problematic? Every absolute relocation produces an entry in the relocation table that needs to be applied at every boot when KASLR is in effect. Beyond that, it doesn't matter. I've looked into PIC codegen/PIE linking for the core kernel, which is why this caught my eye. If that effort ever advances, I'll need to revisit this code as well and apply the change I suggested.
On Tue, 2024-12-17 at 10:25 +0100, Ard Biesheuvel wrote: > > > > You can still avoid the absolute relocations though, ... > > ... > > > > + addq $identity_mapped, %rsi > > > > + subq $__relocate_kernel_start, %rsi > > > > > > ... if you turn this into > > > > > > 0: addq $identity_mapped - 0b, %rsi > > > subq $__relocate_kernel_start - 0b, %rsi > > > > Is there any benefit to doing so? Are absolute relocations problematic? > > Every absolute relocation produces an entry in the relocation table > that needs to be applied at every boot when KASLR is in effect. Beyond > that, it doesn't matter. > > I've looked into PIC codegen/PIE linking for the core kernel, which is > why this caught my eye. If that effort ever advances, I'll need to > revisit this code as well and apply the change I suggested. OK, since it looks like I'll be reposting this series once I'm back at a keyboard for real, I've done that in my tree. There's one more absolute relocation, for saved_context just before returning to the kernel from the 'virtual_mapped' code. That's only reloading the GDT, and we could probably do that from the C code in machine_kexec().
On Fri, 3 Jan 2025 at 11:10, David Woodhouse <dwmw2@infradead.org> wrote:
>
> On Tue, 2024-12-17 at 10:25 +0100, Ard Biesheuvel wrote:
> >
> > > > You can still avoid the absolute relocations though, ...
> > > ...
> > > > > + addq $identity_mapped, %rsi
> > > > > + subq $__relocate_kernel_start, %rsi
> > > >
> > > > ... if you turn this into
> > > >
> > > > 0: addq $identity_mapped - 0b, %rsi
> > > > subq $__relocate_kernel_start - 0b, %rsi
> > >
> > > Is there any benefit to doing so? Are absolute relocations problematic?
> >
> > Every absolute relocation produces an entry in the relocation table
> > that needs to be applied at every boot when KASLR is in effect. Beyond
> > that, it doesn't matter.
> >
> > I've looked into PIC codegen/PIE linking for the core kernel, which is
> > why this caught my eye. If that effort ever advances, I'll need to
> > revisit this code as well and apply the change I suggested.
>
> OK, since it looks like I'll be reposting this series once I'm back at
> a keyboard for real, I've done that in my tree.
>
Thanks
> There's one more absolute relocation, for saved_context just before
> returning to the kernel from the 'virtual_mapped' code. That's only
> reloading the GDT, and we could probably do that from the C code in
> machine_kexec().
I suppose you're referring to
#ifdef CONFIG_KEXEC_JUMP
/* Saved in save_processor_state. */
movq $saved_context, %rax
lgdt saved_context_gdt_desc(%rax)
#endif
Any reason not to simply use
lgdt saved_context+saved_context_gdt_desc(%rip)
here?
On Mon, 2025-01-06 at 17:09 +0100, Ard Biesheuvel wrote: > > I suppose you're referring to > > #ifdef CONFIG_KEXEC_JUMP > /* Saved in save_processor_state. */ > movq $saved_context, %rax > lgdt saved_context_gdt_desc(%rax) > #endif > > Any reason not to simply use > > lgdt saved_context+saved_context_gdt_desc(%rip) > > here? Because the %rip isn't what you (and the linker) think it is. This code is copied into a control page which is allocated as part of the kexec kimage. It can only access things within that same page via %rip. (Which is not as much of a restriction as it sounds, as for most of its execution the rest of the kernel isn't even present in the page tables).
On Mon, 6 Jan 2025 at 17:13, David Woodhouse <dwmw2@infradead.org> wrote: > > On Mon, 2025-01-06 at 17:09 +0100, Ard Biesheuvel wrote: > > > > I suppose you're referring to > > > > #ifdef CONFIG_KEXEC_JUMP > > /* Saved in save_processor_state. */ > > movq $saved_context, %rax > > lgdt saved_context_gdt_desc(%rax) > > #endif > > > > Any reason not to simply use > > > > lgdt saved_context+saved_context_gdt_desc(%rip) > > > > here? > > Because the %rip isn't what you (and the linker) think it is. > > This code is copied into a control page which is allocated as part of > the kexec kimage. It can only access things within that same page via > %rip. (Which is not as much of a restriction as it sounds, as for most > of its execution the rest of the kernel isn't even present in the page > tables). Ah I was looking at an older version of that file - I now see that the preceding (%rip) references have been replaced as well. In any case, thanks for the head's up - I'll get back to this at some point and cc you if making any further changes here.
On 17 December 2024 10:25:56 CET, Ard Biesheuvel <ardb@kernel.org> wrote: >On Tue, 17 Dec 2024 at 10:17, David Woodhouse <dwmw2@infradead.org> wrote: >> >> On 17 December 2024 09:47:36 CET, Ard Biesheuvel <ardb@kernel.org> wrote: >> >On Tue, 17 Dec 2024 at 00:37, David Woodhouse <dwmw2@infradead.org> wrote: >> >> >> >> From: David Woodhouse <dwmw@amazon.co.uk> >> >> >> >> A few places in the kexec control code page make the assumption that the >> >> first instruction of relocate_kernel is at the very start of the page. >> >> >> >> To allow for Clang CFI information to be added to relocate_kernel(), as >> >> well as the general principle of removing unwarranted assumptions, fix >> >> them to use the external __relocate_kernel_start symbol that the linker >> >> adds. This means using a separate addq and subq for calculating offsets, >> >> as the assembler can no longer calculate the delta directly for itself >> >> and relocations aren't that versatile. >> >> >> > >> >You can still avoid the absolute relocations though, ... >> ... >> >> + addq $identity_mapped, %rsi >> >> + subq $__relocate_kernel_start, %rsi >> > >> >... if you turn this into >> > >> >0: addq $identity_mapped - 0b, %rsi >> > subq $__relocate_kernel_start - 0b, %rsi >> >> Is there any benefit to doing so? Are absolute relocations problematic? > >Every absolute relocation produces an entry in the relocation table >that needs to be applied at every boot when KASLR is in effect. Beyond >that, it doesn't matter. > >I've looked into PIC codegen/PIE linking for the core kernel, which is >why this caught my eye. If that effort ever advances, I'll need to >revisit this code as well and apply the change I suggested. Ack, I'll roll it in to the next revision if Ingo hasn't rounded this set up of fixes up already. (On which topic, since I took the easy way out of the CFO thing, this patch isn't strictly needed to fix a regression in tip/x86/boot so can be dropped for now too.) Thanks.
© 2016 - 2025 Red Hat, Inc.