From: David Woodhouse <dwmw@amazon.co.uk>
This currently calls set_memory_x() from machine_kexec_prepare() just
like the 32-bit version does. That's actually a bit earlier than I'd
like, as it leaves the page RWX all the time the image is even *loaded*.
Subsequent commits will eliminate all the writes to the page between the
point it's marked executable in machine_kexec_prepare() the time that
relocate_kernel() is running and has switched to the identmap %cr3, so
that it can be ROX. But that can't happen until it's moved to the .data
section of the kernel, and *that* can't happen until we start executing
the copy instead of executing it in place in the kernel .text. So break
the circular dependency in those commits by letting it be RWX for now.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/kernel/machine_kexec_64.c | 30 ++++++++++++++++++++++------
arch/x86/kernel/relocate_kernel_64.S | 5 ++++-
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 3a4cbac1a0c6..9567347f7a9b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -157,7 +157,12 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
pmd_t *pmd;
pte_t *pte;
- vaddr = (unsigned long)relocate_kernel;
+ /*
+ * For the transition to the identity mapped page tables, the control
+ * code page also needs to be mapped at the virtual address it starts
+ * off running from.
+ */
+ vaddr = (unsigned long)__va(control_page);
paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
@@ -311,11 +316,17 @@ int machine_kexec_prepare(struct kimage *image)
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ set_memory_x((unsigned long)control_page, 1);
+
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
+ void *control_page = page_address(image->control_code_page);
+
+ set_memory_nx((unsigned long)control_page, 1);
+
free_transition_pgtable(image);
}
@@ -325,6 +336,11 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address,
+ unsigned int preserve_context,
+ unsigned int host_mem_enc_active);
unsigned long page_list[PAGES_NR];
unsigned int host_mem_enc_active;
int save_ftrace_enabled;
@@ -371,6 +387,8 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
+ relocate_kernel_ptr = control_page;
+
/*
* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -390,11 +408,11 @@ void machine_kexec(struct kimage *image)
native_gdt_invalidate();
/* now call it */
- image->start = relocate_kernel((unsigned long)image->head,
- (unsigned long)page_list,
- image->start,
- image->preserve_context,
- host_mem_enc_active);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context,
+ host_mem_enc_active);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 50cc33f2ecb7..b48bd82843fd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -39,6 +39,7 @@
#define CP_PA_TABLE_PAGE DATA(0x20)
#define CP_PA_SWAP_PAGE DATA(0x28)
#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+#define CP_VA_CONTROL_PAGE DATA(0x38)
.text
.align PAGE_SIZE
@@ -99,6 +100,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movq %r9, CP_PA_TABLE_PAGE(%r11)
movq %r10, CP_PA_SWAP_PAGE(%r11)
movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ movq %r11, CP_VA_CONTROL_PAGE(%r11)
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
@@ -235,7 +237,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %rax, %cr3
lea PAGE_SIZE(%r8), %rsp
call swap_pages
- movq $virtual_mapped, %rax
+ movq CP_VA_CONTROL_PAGE(%r8), %rax
+ addq $(virtual_mapped - relocate_kernel), %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
--
2.47.0
Hi David,
On Thu, Dec 05, 2024 at 03:05:13PM +0000, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> This currently calls set_memory_x() from machine_kexec_prepare() just
> like the 32-bit version does. That's actually a bit earlier than I'd
> like, as it leaves the page RWX all the time the image is even *loaded*.
>
> Subsequent commits will eliminate all the writes to the page between the
> point it's marked executable in machine_kexec_prepare() the time that
> relocate_kernel() is running and has switched to the identmap %cr3, so
> that it can be ROX. But that can't happen until it's moved to the .data
> section of the kernel, and *that* can't happen until we start executing
> the copy instead of executing it in place in the kernel .text. So break
> the circular dependency in those commits by letting it be RWX for now.
>
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
> arch/x86/kernel/machine_kexec_64.c | 30 ++++++++++++++++++++++------
> arch/x86/kernel/relocate_kernel_64.S | 5 ++++-
> 2 files changed, 28 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 3a4cbac1a0c6..9567347f7a9b 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
...
> void machine_kexec(struct kimage *image)
> {
> + unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
> + unsigned long page_list,
> + unsigned long start_address,
> + unsigned int preserve_context,
> + unsigned int host_mem_enc_active);
> unsigned long page_list[PAGES_NR];
> unsigned int host_mem_enc_active;
> int save_ftrace_enabled;
> @@ -371,6 +387,8 @@ void machine_kexec(struct kimage *image)
> page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
> << PAGE_SHIFT);
>
> + relocate_kernel_ptr = control_page;
> +
Because of this change...
> /*
> * The segment registers are funny things, they have both a
> * visible and an invisible part. Whenever the visible part is
> @@ -390,11 +408,11 @@ void machine_kexec(struct kimage *image)
> native_gdt_invalidate();
>
> /* now call it */
> - image->start = relocate_kernel((unsigned long)image->head,
> - (unsigned long)page_list,
> - image->start,
> - image->preserve_context,
> - host_mem_enc_active);
> + image->start = relocate_kernel_ptr((unsigned long)image->head,
> + (unsigned long)page_list,
> + image->start,
> + image->preserve_context,
> + host_mem_enc_active);
kexec-ing on a CONFIG_CFI_CLANG kernel crashes and burns:
RAX=0000000000000018 RBX=ff408cf982596000 RCX=0000000000000000 RDX=000000047fffb280
RSI=ff6c99a785943d20 RDI=000000011e145002 RBP=ff6c99a785943d70 RSP=ff6c99a785943d10
R8 =0000000000000000 R9 =0000000000000000 R10=00000000ab150dc6 R11=ff408cf99e139000
R12=0000000028121969 R13=00000000fee1dead R14=0000000000000000 R15=0000000000000001
RIP=ffffffff84c9c510 RFL=00010086 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
SS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
DS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
FS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
GS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
LDT=0000 0000000000000000 ffffffff 00c00000
TR =0040 fffffe441f360000 00004087 00008b00 DPL=0 TSS64-busy
GDT= 0000000000000000 00000000
IDT= 0000000000000000 00000000
CR0=80050033 CR2=00007ffde71dbfc0 CR3=00000001140b0002 CR4=00771ef0
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000
DR6=00000000fffe0ff0 DR7=0000000000000400
EFER=0000000000000d01
Code=83 e1 01 48 8d 74 24 10 41 ba c6 0d 15 ab 45 03 53 f1 74 02 <0f> 0b 41 ff d3 0f 1f 00 48 89 43 18 f6 83 78 02 00 00 02 74 05 e8 16 06 da 00 44 89 3d 17
a0c: 83 e1 01 andl $0x1, %ecx
a0f: 48 8d 74 24 10 leaq 0x10(%rsp), %rsi
; image->start = relocate_kernel_ptr((unsigned long)image->head,
a14: 41 ba 67 a6 7c e6 movl $0xe67ca667, %r10d # imm = 0xE67CA667
a1a: 45 03 53 f1 addl -0xf(%r11), %r10d
a1e: 74 02 je 0xa22 <machine_kexec+0x1c2>
a20: 0f 0b ud2
a22: 2e e8 00 00 00 00 callq 0xa28 <machine_kexec+0x1c8>
a28: 48 89 43 18 movq %rax, 0x18(%rbx)
I guess this seems somewhat unavoidable because control_page is just a
'void *', perhaps machine_kexec() should just be marked as __nocfi? This
diff resolves that issue for me.
Cheers,
Nathan
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9567347f7a9b..e77110c4bb91 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -334,7 +334,7 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-void machine_kexec(struct kimage *image)
+void __nocfi machine_kexec(struct kimage *image)
{
unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
unsigned long page_list,
On Sat, 2024-12-14 at 16:08 -0700, Nathan Chancellor wrote: > > I guess this seems somewhat unavoidable because control_page is just a > 'void *', perhaps machine_kexec() should just be marked as __nocfi? This > diff resolves that issue for me. The patch below seems to work too. I already wanted to deal with the case where relocate_kernel isn't at the start of the page, so it forces me to do that. For some reason it also started complaining vmlinux.o: warning: objtool: relocate_kernel+0x6a: return with modified stack frame ... which is easy to fix just by turning it into a jmp *%rsi; I have no idea why it was done with a ret like that in the first place. I don't know why it puts 16 bytes of NOPs between __reloc_start and __cfi_relocate_kernel (in addition to the 16 before relocate_kernel itself), and space is *fairly* tight in the control page, but it's tolerable. To make the CFI check actually give useful output if it triggers, I'm tempted to do the IDT/GDT invalidation relocate_kernel itself, instead of before the call. diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ccb297765e08..e6befd11fee4 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -6,6 +6,7 @@ #include <linux/linkage.h> #include <linux/stringify.h> +#include <linux/cfi_types.h> #include <asm/alternative.h> #include <asm/page_types.h> #include <asm/kexec.h> @@ -61,7 +62,10 @@ SYM_DATA_END(kexec_debug_idt) .section .text.relocate_kernel,"ax"; .code64 -SYM_CODE_START_NOALIGN(relocate_kernel) +__reloc_start: + ANNOTATE_NOENDBR + +SYM_TYPED_FUNC_START(relocate_kernel) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* @@ -115,10 +119,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) lea PAGE_SIZE(%rsi), %rsp /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %rsi - pushq %rsi - ANNOTATE_UNRET_SAFE - ret + addq $(identity_mapped - __reloc_start), %rsi + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi int3 SYM_CODE_END(relocate_kernel) @@ -263,7 +266,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* get the re-entry point of the peer system */ popq %rbp - leaq relocate_kernel(%rip), %r8 + leaq __reloc_start(%rip), %r8 movq kexec_pa_swap_page(%rip), %r10 movq pa_backup_pages_map(%rip), %rdi movq kexec_pa_table_page(%rip), %rax @@ -272,7 +275,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq $1, %r11 /* Ensure preserve_context flag is set */ call swap_pages movq kexec_va_control_page(%rip), %rax - addq $(virtual_mapped - relocate_kernel), %rax + addq $(virtual_mapped - __reloc_start), %rax pushq %rax ANNOTATE_UNRET_SAFE ret
On Sun, Dec 15, 2024 at 10:09:57AM +0000, David Woodhouse wrote: > On Sat, 2024-12-14 at 16:08 -0700, Nathan Chancellor wrote: > > > > I guess this seems somewhat unavoidable because control_page is just a > > 'void *', perhaps machine_kexec() should just be marked as __nocfi? This > > diff resolves that issue for me. > > The patch below seems to work too. I already wanted to deal with the Can confirm, thanks for the quick fix. With your fix for the first issue I reported, the fix I sent for LTO, and this patch below, I can kexec on a CFI and LTO enabled kernel without any issues. > case where relocate_kernel isn't at the start of the page, so it forces > me to do that. > > For some reason it also started complaining > vmlinux.o: warning: objtool: relocate_kernel+0x6a: return with modified stack frame > ... which is easy to fix just by turning it into a jmp *%rsi; I have no > idea why it was done with a ret like that in the first place. > > I don't know why it puts 16 bytes of NOPs between __reloc_start and > __cfi_relocate_kernel (in addition to the 16 before relocate_kernel > itself), and space is *fairly* tight in the control page, but it's > tolerable. I think this is something to do with FineIBT IIRC? PeterZ might have more details. > To make the CFI check actually give useful output if it triggers, I'm > tempted to do the IDT/GDT invalidation relocate_kernel itself, instead > of before the call. > > > diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S > index ccb297765e08..e6befd11fee4 100644 > --- a/arch/x86/kernel/relocate_kernel_64.S > +++ b/arch/x86/kernel/relocate_kernel_64.S > @@ -6,6 +6,7 @@ > > #include <linux/linkage.h> > #include <linux/stringify.h> > +#include <linux/cfi_types.h> > #include <asm/alternative.h> > #include <asm/page_types.h> > #include <asm/kexec.h> > @@ -61,7 +62,10 @@ SYM_DATA_END(kexec_debug_idt) > > .section .text.relocate_kernel,"ax"; > .code64 > -SYM_CODE_START_NOALIGN(relocate_kernel) > +__reloc_start: > + ANNOTATE_NOENDBR > + > +SYM_TYPED_FUNC_START(relocate_kernel) > UNWIND_HINT_END_OF_STACK > ANNOTATE_NOENDBR > /* > @@ -115,10 +119,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) > lea PAGE_SIZE(%rsi), %rsp > > /* jump to identity mapped page */ > - addq $(identity_mapped - relocate_kernel), %rsi > - pushq %rsi > - ANNOTATE_UNRET_SAFE > - ret > + addq $(identity_mapped - __reloc_start), %rsi > + ANNOTATE_RETPOLINE_SAFE > + jmp *%rsi > int3 > SYM_CODE_END(relocate_kernel) > > @@ -263,7 +266,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) > > /* get the re-entry point of the peer system */ > popq %rbp > - leaq relocate_kernel(%rip), %r8 > + leaq __reloc_start(%rip), %r8 > movq kexec_pa_swap_page(%rip), %r10 > movq pa_backup_pages_map(%rip), %rdi > movq kexec_pa_table_page(%rip), %rax > @@ -272,7 +275,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) > movq $1, %r11 /* Ensure preserve_context flag is set */ > call swap_pages > movq kexec_va_control_page(%rip), %rax > - addq $(virtual_mapped - relocate_kernel), %rax > + addq $(virtual_mapped - __reloc_start), %rax > pushq %rax > ANNOTATE_UNRET_SAFE > ret >
On Sun, 2024-12-15 at 22:49 -0700, Nathan Chancellor wrote:
> On Sun, Dec 15, 2024 at 10:09:57AM +0000, David Woodhouse wrote:
> > On Sat, 2024-12-14 at 16:08 -0700, Nathan Chancellor wrote:
> > >
> > > I guess this seems somewhat unavoidable because control_page is just a
> > > 'void *', perhaps machine_kexec() should just be marked as __nocfi? This
> > > diff resolves that issue for me.
> >
> > The patch below seems to work too. I already wanted to deal with the
>
> Can confirm, thanks for the quick fix. With your fix for the first issue
> I reported, the fix I sent for LTO, and this patch below, I can kexec on
> a CFI and LTO enabled kernel without any issues.
Argh, using SYM_TYPED_FUNC_START() leads to objtool having more opinions.
I have the Clang build working in my tree now, but the GCC build now complains
vmlinux.o: warning: objtool: relocate_kernel+0x0: unreachable instruction
It seems like adding UNWIND_HINT_FUNC ought to be the answer for that?
But then it complains about this instead:
vmlinux.o: warning: objtool: relocate_kernel+0x69: unsupported stack register modification
That's the lea instruction at
lea PAGE_SIZE(%rsi), %rsp
79: 48 8d a6 00 10 00 00 lea 0x1000(%rsi),%rsp
I've pushed what I have to my kexec-debug branch; I was hoping to post
the fixes for the tip/x86/boot branch this morning but I've run out of
time and will be travelling for the rest of the week. Will try to get
something sent out this evening when I land.
I may resort to the __nocfi option for now. As noted on the typedef
patch and in IRC, the whole SYM_TYPED_FUNC_START() thing using the type
information from the *C* code which is actually doing the call anyway,
is a bit tautological anyway.
On Mon, 2024-12-16 at 12:09 +0000, David Woodhouse wrote: > On Sun, 2024-12-15 at 22:49 -0700, Nathan Chancellor wrote: > > On Sun, Dec 15, 2024 at 10:09:57AM +0000, David Woodhouse wrote: > > > On Sat, 2024-12-14 at 16:08 -0700, Nathan Chancellor wrote: > > > > > > > > I guess this seems somewhat unavoidable because control_page is just a > > > > 'void *', perhaps machine_kexec() should just be marked as __nocfi? This > > > > diff resolves that issue for me. > > > > > > The patch below seems to work too. I already wanted to deal with the > > > > Can confirm, thanks for the quick fix. With your fix for the first issue > > I reported, the fix I sent for LTO, and this patch below, I can kexec on > > a CFI and LTO enabled kernel without any issues. > > Argh, using SYM_TYPED_FUNC_START() leads to objtool having more opinions. > > I have the Clang build working in my tree now, but the GCC build now complains > > vmlinux.o: warning: objtool: relocate_kernel+0x0: unreachable instruction > > It seems like adding UNWIND_HINT_FUNC ought to be the answer for that? > But then it complains about this instead: > > vmlinux.o: warning: objtool: relocate_kernel+0x69: unsupported stack register modification > > That's the lea instruction at > lea PAGE_SIZE(%rsi), %rsp > 79: 48 8d a6 00 10 00 00 lea 0x1000(%rsi),%rsp > > I've pushed what I have to my kexec-debug branch; I was hoping to post > the fixes for the tip/x86/boot branch this morning but I've run out of > time and will be travelling for the rest of the week. Will try to get > something sent out this evening when I land. > > I may resort to the __nocfi option for now. As noted on the typedef > patch and in IRC, the whole SYM_TYPED_FUNC_START() thing using the type > information from the *C* code which is actually doing the call anyway, > is a bit tautological anyway. I've dropped this for now and just posted the __nocfi thing as the regression fix. I think we *should* provide the CFI information in relocate_kernel_64.S though, so I've left these commits in my tree at https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/kexec-debug I'd really appreciate some help in getting objtool to stop whining about them, for *both* Clang and GCC builds at the same time :)
On Tue, Dec 17, 2024 at 01:03:07PM +0100, David Woodhouse wrote: > I've dropped this for now and just posted the __nocfi thing as the > regression fix. I think we *should* provide the CFI information in > relocate_kernel_64.S though, so I've left these commits in my tree at > https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/kexec-debug > > I'd really appreciate some help in getting objtool to stop whining > about them, for *both* Clang and GCC builds at the same time :) Hm, I can't fetch for some reason: $ git fetch https://git.infradead.org/users/dwmw2/linux.git kexec-debug fatal: https://git.infradead.org/users/dwmw2/linux.git/info/refs not valid: could not determine hash algorithm; is this a git repository? At some point we had discussed placing the code in .rodata, was it the alternative preventing that? -- Josh
On Wed, 2024-12-18 at 01:03 -0800, Josh Poimboeuf wrote: > On Tue, Dec 17, 2024 at 01:03:07PM +0100, David Woodhouse wrote: > > I've dropped this for now and just posted the __nocfi thing as the > > regression fix. I think we *should* provide the CFI information in > > relocate_kernel_64.S though, so I've left these commits in my tree at > > https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/kexec-debug > > > > I'd really appreciate some help in getting objtool to stop whining > > about them, for *both* Clang and GCC builds at the same time :) > > Hm, I can't fetch for some reason: > > $ git fetch https://git.infradead.org/users/dwmw2/linux.git kexec-debug > fatal: https://git.infradead.org/users/dwmw2/linux.git/info/refs not valid: could not determine hash algorithm; is this a git repository? > The https URL is just gitweb. You can fetch from git://git.infradead.org/users/dwmw2/linux.git > At some point we had discussed placing the code in .rodata, was it the > alternative preventing that? No, the alternative seems to be fine, and it's all in the .data section now (since the kernel does write some variables there which are then accessed %rip-relative from the code itself). The problem is when I use SYM_TYPED_FUNC_START for relocate_kernel itself, objtool starts to whine that the first instruction of relocate_kernel is unreachable (presumably) because it is never jumped to directly, but copied into the control page and then called via a calculated function pointer). There's also an objtool warning about loading the stack pointer. I seem to get different warnings from objtool for Clang and GCC builds, and can't find a set of hints which will make them both shut up at the same time.
On Wed, Dec 18, 2024 at 10:44:25AM +0100, David Woodhouse wrote:
> > At some point we had discussed placing the code in .rodata, was it the
> > alternative preventing that?
>
> No, the alternative seems to be fine, and it's all in the .data section
> now (since the kernel does write some variables there which are then
> accessed %rip-relative from the code itself).
The linker script does place it in .data, but objtool runs on the object
file before linking, where it's still in an executable section
(.text..relocate_kernel).
How about something like below?
- move text to .data..relocate_kernel
- remove objtool annotations
- replace the alternative with a runtime check
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 5081d0b9e290..126d777a5695 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -61,6 +61,7 @@ struct kimage;
extern unsigned long kexec_va_control_page;
extern unsigned long kexec_pa_table_page;
extern unsigned long kexec_pa_swap_page;
+extern unsigned int kexec_preserve_mce;
extern gate_desc kexec_debug_idt[];
extern unsigned char kexec_debug_exc_vectors[];
extern uint16_t kexec_debug_8250_port;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 853f3cdc7c75..751adeea0db8 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -363,6 +363,8 @@ int machine_kexec_prepare(struct kimage *image)
if (image->type == KEXEC_TYPE_DEFAULT)
kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+ kexec_preserve_mce = boot_cpu_has(X86_FEATURE_TDX_GUEST);
+
prepare_debug_idt((unsigned long)__pa(control_page),
(unsigned long)kexec_debug_exc_vectors - reloc_start);
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4dca50141586..04b7a9f11e98 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -24,8 +24,8 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
- * The .text..relocate_kernel and .data..relocate_kernel sections are copied
- * into the control page, and the remainder of the page is used as the stack.
+ * The .data..relocate_kernel section is copied into the control page, and the
+ * remainder of the page is used as the stack.
*/
.section .data..relocate_kernel,"a";
@@ -41,6 +41,7 @@ SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)
+SYM_DATA(kexec_preserve_mce, .word 0)
#ifdef CONFIG_KEXEC_DEBUG
.balign 16
@@ -60,12 +61,14 @@ SYM_DATA_END(kexec_debug_idt)
#endif /* CONFIG_KEXEC_DEBUG */
- .section .text..relocate_kernel,"ax";
+/*
+ * This code is copied rather than called in place. It's free to use indirect
+ * branches and bare returns, and doesn't need ORC unwinding data. Keep it in
+ * a data section so objtool doesn't see it.
+ */
+
.code64
SYM_TYPED_FUNC_START(relocate_kernel)
- UNWIND_HINT_END_OF_STACK
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
/*
* %rdi indirection_page
* %rsi pa_control_page
@@ -124,12 +127,10 @@ SYM_TYPED_FUNC_START(relocate_kernel)
/* jump to identity mapped page */
addq $identity_mapped, %rsi
subq $__relocate_kernel_start, %rsi
- ANNOTATE_RETPOLINE_SAFE
jmp *%rsi
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
- UNWIND_HINT_END_OF_STACK
/*
* %rdi indirection page
* %rdx start address
@@ -199,7 +200,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* PAE is always set in the original CR4.
*/
andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
- ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movw kexec_preserve_mce(%rip), %ax
+ testw %ax, %ax
+ jz .Lwrite_cr4
+ orl $X86_CR4_MCE, %r13d
+.Lwrite_cr4:
movq %r13, %cr4
/* Flush the TLB (needed?) */
@@ -250,7 +255,6 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %r14d, %r14d
xorl %r15d, %r15d
- ANNOTATE_UNRET_SAFE
ret
int3
@@ -264,7 +268,6 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* push the existing entry point onto the callee's stack */
pushq %rdx
- ANNOTATE_RETPOLINE_SAFE
call *%rdx
/* get the re-entry point of the peer system */
@@ -276,7 +279,6 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* Find start (and end) of this physical mapping of control page */
leaq (%rip), %r8
- ANNOTATE_NOENDBR
andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
movq $1, %r11 /* Ensure preserve_context flag is set */
@@ -285,14 +287,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
addq $virtual_mapped, %rax
subq $__relocate_kernel_start, %rax
pushq %rax
- ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // RET target, above
movq saved_rsp(%rip), %rsp
movq saved_cr4(%rip), %rax
movq %rax, %cr4
@@ -317,14 +316,12 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popq %r12
popq %rbp
popq %rbx
- ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
- UNWIND_HINT_END_OF_STACK
/*
* %rdi indirection page
* %r11 preserve_context
@@ -387,7 +384,6 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
lea PAGE_SIZE(%rax), %rsi
jmp .Lloop
.Ldone:
- ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(swap_pages)
@@ -404,8 +400,6 @@ SYM_CODE_END(swap_pages)
#define LSR 5 /* Line Status */
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
addw $LSR, %dx
xchg %al, %ah
.Lxmtrdy_loop:
@@ -420,15 +414,11 @@ SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
xchg %al, %ah
outb %al, %dx
pr_char_null:
- ANNOTATE_NOENDBR
- ANNOTATE_UNRET_SAFE
ret
SYM_CODE_END(pr_char_8250)
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
movb (LSR*4)(%rdx), %ah
testb $XMTRDY, %ah
@@ -438,7 +428,6 @@ SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
.Lready_mmio:
movb %al, (%rdx)
- ANNOTATE_UNRET_SAFE
ret
SYM_CODE_END(pr_char_8250_mmio32)
@@ -463,7 +452,6 @@ SYM_CODE_END(pr_char_8250_mmio32)
/* Print the nybble in %bl, clobber %rax */
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
- UNWIND_HINT_FUNC
movb %bl, %al
nop
andb $0x0f, %al
@@ -471,33 +459,26 @@ SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
cmpb $0x3a, %al
jb 1f
addb $('a' - '0' - 10), %al
- ANNOTATE_RETPOLINE_SAFE
1: jmp *%rsi
SYM_CODE_END(pr_nybble)
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
- UNWIND_HINT_FUNC
movq $16, %rcx
1: rolq $4, %rbx
call pr_nybble
loop 1b
movb $'\n', %al
- ANNOTATE_RETPOLINE_SAFE
jmp *%rsi
SYM_CODE_END(pr_qword)
.macro print_reg a, b, c, d, r
movb $\a, %al
- ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\b, %al
- ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\c, %al
- ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\d, %al
- ANNOTATE_RETPOLINE_SAFE
call *%rsi
movq \r, %rbx
call pr_qword
@@ -506,7 +487,6 @@ SYM_CODE_END(pr_qword)
SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
/* Each of these is 6 bytes. */
.macro vec_err exc
- UNWIND_HINT_ENTRY
. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
nop
nop
@@ -515,14 +495,12 @@ SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
.endm
.macro vec_noerr exc
- UNWIND_HINT_ENTRY
. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
pushq $0
pushq $\exc
jmp exc_handler
.endm
- ANNOTATE_NOENDBR
vec_noerr 0 // #DE
vec_noerr 1 // #DB
vec_noerr 2 // #NMI
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 63ff60a11be5..9c2abfd6ea26 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -100,7 +100,6 @@ const_pcpu_hot = pcpu_hot;
#define KEXEC_RELOCATE_KERNEL \
. = ALIGN(0x100); \
__relocate_kernel_start = .; \
- *(.text..relocate_kernel); \
*(.data..relocate_kernel); \
__relocate_kernel_end = .;
On Wed, 2024-12-18 at 13:23 -0800, Josh Poimboeuf wrote: > > The linker script does place it in .data, but objtool runs on the object > file before linking, where it's still in an executable section > (.text..relocate_kernel). > > How about something like below? > > - move text to .data..relocate_kernel > - remove objtool annotations > - replace the alternative with a runtime check That leaves me unable to use 'objdump -S arch/x86/kernel/relocate_kernel_64.o' so I hate it :) At the moment objtool is *mostly* happy with the code in here; is there no way to make it happy even with the CFI annotation? In practice I probably don't even need to use SYM_TYPED_FUNC_START() anyway, as it's doing the wrong thing. It's pointless if it just uses the external __cfi_typeid_relocate_kernel symbol that the C code emits, because that's obviously going to match the prototype that the C code expects. So I might emit the __cfi_relocate_kernel prologue entirely manually, and then maybe objtool will thinking it's entitled to opinions :)
On Wed, Dec 18, 2024 at 11:27:27PM +0100, David Woodhouse wrote: > On Wed, 2024-12-18 at 13:23 -0800, Josh Poimboeuf wrote: > > > > The linker script does place it in .data, but objtool runs on the object > > file before linking, where it's still in an executable section > > (.text..relocate_kernel). > > > > How about something like below? > > > > - move text to .data..relocate_kernel > > - remove objtool annotations > > - replace the alternative with a runtime check > > That leaves me unable to use 'objdump -S > arch/x86/kernel/relocate_kernel_64.o' so I hate it :) Well, it's already written in assembly, there's not much benefit in disassembling it ;-) But you can still force gdb to do so with something like "x/50i <addr>". Isn't that easier than putting in all these hacks to make objtool happy (and continue to keep it happy over the coming years), when it doesn't need to care about this code in the first place? Anyway, what I think you're looking for is UNWIND_HINT_UNDEFINED. In fact all the unwind annotations in that file should be UNDEFINED since the hints are all referring to the wrong addresses (because copied code) and the ORC unwinder isn't reachable for most of that code anyway. Also, it's fine to make relocate_kernel() a proper function with SYM_FUNC_END, you'd just need to add the following line afterwards: STACK_FRAME_NON_STANDARD relocate_kernel -- Josh
On Wed, 2024-12-18 at 16:20 -0800, Josh Poimboeuf wrote: > On Wed, Dec 18, 2024 at 11:27:27PM +0100, David Woodhouse wrote: > > On Wed, 2024-12-18 at 13:23 -0800, Josh Poimboeuf wrote: > > > > > > The linker script does place it in .data, but objtool runs on the object > > > file before linking, where it's still in an executable section > > > (.text..relocate_kernel). > > > > > > How about something like below? > > > > > > - move text to .data..relocate_kernel > > > - remove objtool annotations > > > - replace the alternative with a runtime check > > > > That leaves me unable to use 'objdump -S > > arch/x86/kernel/relocate_kernel_64.o' so I hate it :) > > Well, it's already written in assembly, there's not much benefit in > disassembling it ;-) But you can still force gdb to do so with > something like "x/50i <addr>". I've spent a lot of time recently wanting to know precisely which instruction it was that faulted (or that objtool is complaining about), so I *have* wanted to disassemble it. But yeah, in the general case that shouldn't be something we're doing all the time. Perhaps you're right, and we should just rip out all the annotations and move it to .data..relocate_kernel. But I'll give it a bit more effort to see if we can make objtool happy and keep the ALTERNATIVE working without having to do that manually... > Isn't that easier than putting in all these hacks to make objtool happy > (and continue to keep it happy over the coming years), when it doesn't > need to care about this code in the first place? Less worried about the 'continue' part. This code doesn't get much churn (not until I just came along and blitzed it, at least). > Anyway, what I think you're looking for is UNWIND_HINT_UNDEFINED. In > fact all the unwind annotations in that file should be UNDEFINED since > the hints are all referring to the wrong addresses (because copied code) > and the ORC unwinder isn't reachable for most of that code anyway. > > Also, it's fine to make relocate_kernel() a proper function with > SYM_FUNC_END, you'd just need to add the following line afterwards: > > STACK_FRAME_NON_STANDARD relocate_kernel > Thanks, I'll take a look. The next thing objtool is actually complaining about is that relocate_kernel() itself is not reachable (since it's never invoked in its original location): vmlinux.o: warning: objtool: relocate_kernel+0x0: unreachable instruction I'm not sure why objtool has only just started whining about that with the very last patches at the top of my tree; it's been true ever since the commit which executes from the relocated copy.
On Thu, Dec 19, 2024 at 11:02:55AM +0100, David Woodhouse wrote: > On Wed, 2024-12-18 at 16:20 -0800, Josh Poimboeuf wrote: > > Anyway, what I think you're looking for is UNWIND_HINT_UNDEFINED. In > > fact all the unwind annotations in that file should be UNDEFINED since > > the hints are all referring to the wrong addresses (because copied code) > > and the ORC unwinder isn't reachable for most of that code anyway. > > > > Also, it's fine to make relocate_kernel() a proper function with > > SYM_FUNC_END, you'd just need to add the following line afterwards: > > > > STACK_FRAME_NON_STANDARD relocate_kernel > > > > Thanks, I'll take a look. The next thing objtool is actually > complaining about is that relocate_kernel() itself is not reachable > (since it's never invoked in its original location): > > vmlinux.o: warning: objtool: relocate_kernel+0x0: unreachable instruction Right, I think UNWIND_HINT_UNDEFINED should fix that. -- Josh
On Sun, 2024-12-15 at 22:49 -0700, Nathan Chancellor wrote: > On Sun, Dec 15, 2024 at 10:09:57AM +0000, David Woodhouse wrote: > > On Sat, 2024-12-14 at 16:08 -0700, Nathan Chancellor wrote: > > > > > > I guess this seems somewhat unavoidable because control_page is just a > > > 'void *', perhaps machine_kexec() should just be marked as __nocfi? This > > > diff resolves that issue for me. > > > > The patch below seems to work too. I already wanted to deal with the > > Can confirm, thanks for the quick fix. With your fix for the first issue > I reported, the fix I sent for LTO, and this patch below, I can kexec on > a CFI and LTO enabled kernel without any issues. > Thank you for testing. Which is the LTO one? Is it in my tree yet or have I missed it? https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/kexec-debug (The fix we're discussing here isn't there yet; it still needs polishing)
On 14 December 2024 23:08:18 GMT, Nathan Chancellor <nathan@kernel.org> wrote:
>Hi David,
>
>On Thu, Dec 05, 2024 at 03:05:13PM +0000, David Woodhouse wrote:
>> From: David Woodhouse <dwmw@amazon.co.uk>
>>
>> This currently calls set_memory_x() from machine_kexec_prepare() just
>> like the 32-bit version does. That's actually a bit earlier than I'd
>> like, as it leaves the page RWX all the time the image is even *loaded*.
>>
>> Subsequent commits will eliminate all the writes to the page between the
>> point it's marked executable in machine_kexec_prepare() the time that
>> relocate_kernel() is running and has switched to the identmap %cr3, so
>> that it can be ROX. But that can't happen until it's moved to the .data
>> section of the kernel, and *that* can't happen until we start executing
>> the copy instead of executing it in place in the kernel .text. So break
>> the circular dependency in those commits by letting it be RWX for now.
>>
>> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>> ---
>> arch/x86/kernel/machine_kexec_64.c | 30 ++++++++++++++++++++++------
>> arch/x86/kernel/relocate_kernel_64.S | 5 ++++-
>> 2 files changed, 28 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>> index 3a4cbac1a0c6..9567347f7a9b 100644
>> --- a/arch/x86/kernel/machine_kexec_64.c
>> +++ b/arch/x86/kernel/machine_kexec_64.c
>...
>> void machine_kexec(struct kimage *image)
>> {
>> + unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
>> + unsigned long page_list,
>> + unsigned long start_address,
>> + unsigned int preserve_context,
>> + unsigned int host_mem_enc_active);
>> unsigned long page_list[PAGES_NR];
>> unsigned int host_mem_enc_active;
>> int save_ftrace_enabled;
>> @@ -371,6 +387,8 @@ void machine_kexec(struct kimage *image)
>> page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
>> << PAGE_SHIFT);
>>
>> + relocate_kernel_ptr = control_page;
>> +
>
>Because of this change...
>
>> /*
>> * The segment registers are funny things, they have both a
>> * visible and an invisible part. Whenever the visible part is
>> @@ -390,11 +408,11 @@ void machine_kexec(struct kimage *image)
>> native_gdt_invalidate();
>>
>> /* now call it */
>> - image->start = relocate_kernel((unsigned long)image->head,
>> - (unsigned long)page_list,
>> - image->start,
>> - image->preserve_context,
>> - host_mem_enc_active);
>> + image->start = relocate_kernel_ptr((unsigned long)image->head,
>> + (unsigned long)page_list,
>> + image->start,
>> + image->preserve_context,
>> + host_mem_enc_active);
>
>kexec-ing on a CONFIG_CFI_CLANG kernel crashes and burns:
>
>RAX=0000000000000018 RBX=ff408cf982596000 RCX=0000000000000000 RDX=000000047fffb280
>RSI=ff6c99a785943d20 RDI=000000011e145002 RBP=ff6c99a785943d70 RSP=ff6c99a785943d10
>R8 =0000000000000000 R9 =0000000000000000 R10=00000000ab150dc6 R11=ff408cf99e139000
>R12=0000000028121969 R13=00000000fee1dead R14=0000000000000000 R15=0000000000000001
>RIP=ffffffff84c9c510 RFL=00010086 [--S--P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
>ES =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
>CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
>SS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
>DS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
>FS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
>GS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS [-WA]
>LDT=0000 0000000000000000 ffffffff 00c00000
>TR =0040 fffffe441f360000 00004087 00008b00 DPL=0 TSS64-busy
>GDT= 0000000000000000 00000000
>IDT= 0000000000000000 00000000
>CR0=80050033 CR2=00007ffde71dbfc0 CR3=00000001140b0002 CR4=00771ef0
>DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000
>DR6=00000000fffe0ff0 DR7=0000000000000400
>EFER=0000000000000d01
>Code=83 e1 01 48 8d 74 24 10 41 ba c6 0d 15 ab 45 03 53 f1 74 02 <0f> 0b 41 ff d3 0f 1f 00 48 89 43 18 f6 83 78 02 00 00 02 74 05 e8 16 06 da 00 44 89 3d 17
>
> a0c: 83 e1 01 andl $0x1, %ecx
> a0f: 48 8d 74 24 10 leaq 0x10(%rsp), %rsi
>; image->start = relocate_kernel_ptr((unsigned long)image->head,
> a14: 41 ba 67 a6 7c e6 movl $0xe67ca667, %r10d # imm = 0xE67CA667
> a1a: 45 03 53 f1 addl -0xf(%r11), %r10d
> a1e: 74 02 je 0xa22 <machine_kexec+0x1c2>
> a20: 0f 0b ud2
> a22: 2e e8 00 00 00 00 callq 0xa28 <machine_kexec+0x1c8>
> a28: 48 89 43 18 movq %rax, 0x18(%rbx)
>
>I guess this seems somewhat unavoidable because control_page is just a
>'void *', perhaps machine_kexec() should just be marked as __nocfi? This
>diff resolves that issue for me.
>
>Cheers,
>Nathan
>
>diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>index 9567347f7a9b..e77110c4bb91 100644
>--- a/arch/x86/kernel/machine_kexec_64.c
>+++ b/arch/x86/kernel/machine_kexec_64.c
>@@ -334,7 +334,7 @@ void machine_kexec_cleanup(struct kimage *image)
> * Do not allocate memory (or fail in any way) in machine_kexec().
> * We are past the point of no return, committed to rebooting now.
> */
>-void machine_kexec(struct kimage *image)
>+void __nocfi machine_kexec(struct kimage *image)
> {
> unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
> unsigned long page_list,
Thanks. Could I have that with a SoB please? I can craft a commit message if you need. I'll round it up with a couple of other fixups I already have in my tree, and send them on.
The following commit has been merged into the x86/boot branch of tip:
Commit-ID: eeebbde57113730db7b3ec7380ada61a0193d27c
Gitweb: https://git.kernel.org/tip/eeebbde57113730db7b3ec7380ada61a0193d27c
Author: David Woodhouse <dwmw@amazon.co.uk>
AuthorDate: Thu, 05 Dec 2024 15:05:13
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 06 Dec 2024 10:41:59 +01:00
x86/kexec: Invoke copy of relocate_kernel() instead of the original
This currently calls set_memory_x() from machine_kexec_prepare() just
like the 32-bit version does. That's actually a bit earlier than I'd
like, as it leaves the page RWX all the time the image is even *loaded*.
Subsequent commits will eliminate all the writes to the page between the
point it's marked executable in machine_kexec_prepare() the time that
relocate_kernel() is running and has switched to the identmap %cr3, so
that it can be ROX. But that can't happen until it's moved to the .data
section of the kernel, and *that* can't happen until we start executing
the copy instead of executing it in place in the kernel .text. So break
the circular dependency in those commits by letting it be RWX for now.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lore.kernel.org/r/20241205153343.3275139-8-dwmw2@infradead.org
---
arch/x86/kernel/machine_kexec_64.c | 30 +++++++++++++++++++++------
arch/x86/kernel/relocate_kernel_64.S | 5 ++++-
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 3a4cbac..9567347 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -157,7 +157,12 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
pmd_t *pmd;
pte_t *pte;
- vaddr = (unsigned long)relocate_kernel;
+ /*
+ * For the transition to the identity mapped page tables, the control
+ * code page also needs to be mapped at the virtual address it starts
+ * off running from.
+ */
+ vaddr = (unsigned long)__va(control_page);
paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
@@ -311,11 +316,17 @@ int machine_kexec_prepare(struct kimage *image)
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ set_memory_x((unsigned long)control_page, 1);
+
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
+ void *control_page = page_address(image->control_code_page);
+
+ set_memory_nx((unsigned long)control_page, 1);
+
free_transition_pgtable(image);
}
@@ -325,6 +336,11 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address,
+ unsigned int preserve_context,
+ unsigned int host_mem_enc_active);
unsigned long page_list[PAGES_NR];
unsigned int host_mem_enc_active;
int save_ftrace_enabled;
@@ -371,6 +387,8 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
+ relocate_kernel_ptr = control_page;
+
/*
* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -390,11 +408,11 @@ void machine_kexec(struct kimage *image)
native_gdt_invalidate();
/* now call it */
- image->start = relocate_kernel((unsigned long)image->head,
- (unsigned long)page_list,
- image->start,
- image->preserve_context,
- host_mem_enc_active);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context,
+ host_mem_enc_active);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ca7f1e1..d0a87b3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -39,6 +39,7 @@
#define CP_PA_TABLE_PAGE DATA(0x20)
#define CP_PA_SWAP_PAGE DATA(0x28)
#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+#define CP_VA_CONTROL_PAGE DATA(0x38)
.text
.align PAGE_SIZE
@@ -99,6 +100,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movq %r9, CP_PA_TABLE_PAGE(%r11)
movq %r10, CP_PA_SWAP_PAGE(%r11)
movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ movq %r11, CP_VA_CONTROL_PAGE(%r11)
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
@@ -235,7 +237,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %rax, %cr3
lea PAGE_SIZE(%r8), %rsp
call swap_pages
- movq $virtual_mapped, %rax
+ movq CP_VA_CONTROL_PAGE(%r8), %rax
+ addq $(virtual_mapped - relocate_kernel), %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
© 2016 - 2025 Red Hat, Inc.