[RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception

David Woodhouse posted 7 patches 1 year, 3 months ago
There is a newer version of this series
[RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by David Woodhouse 1 year, 3 months ago
From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/relocate_kernel_64.S | 80 ++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 2a2a6e693e18..1c18cffe5229 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -398,6 +398,53 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
 SYM_CODE_END(swap_pages)
 
 #ifdef DEBUG
+/*
+ * This allows other types of serial ports to be used.
+ *  - %al: Character to be printed (no clobber %rax)
+ *  - %rdx: MMIO address or port.
+ */
+.macro pr_char
+	outb	%al, %dx
+.endm
+
+/* Print the byte in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
+	movb	%bl, %al
+	nop
+	andb	$0x0f, %al
+	addb 	$0x30, %al
+	cmpb	$0x3a, %al
+	jb	1f
+	addb	$('a' - '0' - 10), %al
+1:	pr_char
+	ANNOTATE_UNRET_SAFE
+	ret
+SYM_CODE_END(pr_byte)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+	movq	$16, %rcx
+1:	rolq	$4, %rbx
+	call	pr_byte
+	loop	1b
+	movb	$'\n', %al
+	pr_char
+	ANNOTATE_UNRET_SAFE
+	ret
+SYM_CODE_END(pr_qword)
+
+.macro print_reg a, b, c, d, r
+	movb	$\a, %al
+	pr_char
+	movb	$\b, %al
+	pr_char
+	movb	$\c, %al
+	pr_char
+	movb	$\d, %al
+	pr_char
+	movq	\r, %rbx
+	call	pr_qword
+.endm
+
 SYM_CODE_START_LOCAL_NOALIGN(exc_vectors)
 	/* Each of these is 6 bytes. */
 .macro vec_err exc
@@ -437,11 +484,39 @@ SYM_CODE_END(exc_vectors)
 
 SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
 	pushq	%rax
+	pushq	%rbx
+	pushq	%rcx
 	pushq	%rdx
+
 	movw	$0x3f8, %dx
-	movb	$'A', %al
-	outb	%al, %dx
+
+	/* rip and exception info */
+	print_reg 'E', 'x', 'c', ':', 32(%rsp)
+	print_reg 'E', 'r', 'r', ':', 40(%rsp)
+	print_reg 'r', 'i', 'p', ':', 48(%rsp)
+
+	/* We spilled these to the stack */
+	print_reg 'r', 'a', 'x', ':', 24(%rsp)
+	print_reg 'r', 'b', 'x', ':', 16(%rsp)
+	print_reg 'r', 'c', 'x', ':', 8(%rsp)
+	print_reg 'r', 'd', 'x', ':', (%rsp)
+
+	/* Other registers */
+	print_reg 'r', 's', 'i', ':', %rsi
+	print_reg 'r', 'd', 'i', ':', %rdi
+	print_reg 'r', '8', ' ', ':', %r8
+	print_reg 'r', '9', ' ', ':', %r9
+	print_reg 'r', '1', '0', ':', %r10
+	print_reg 'r', '1', '1', ':', %r11
+	print_reg 'r', '1', '2', ':', %r12
+	print_reg 'r', '1', '3', ':', %r13
+	print_reg 'r', '1', '4', ':', %r14
+	print_reg 'r', '1', '5', ':', %r15
+	print_reg 'c', 'r', '2', ':', %cr2
+
 	popq	%rdx
+	popq	%rcx
+	popq	%rbx
 	popq	%rax
 
 	/* Only return from int3 */
@@ -454,7 +529,6 @@ SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
 .Ldie:
 	hlt
 	jmp	.Ldie
-
 SYM_CODE_END(exc_handler)
 
 .Lreloc_kernel_gdt:
-- 
2.44.0
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by Woodhouse, David 1 year, 3 months ago
On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
> 
> +
> +/* Print the byte in %bl, clobber %rax */
> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
> +       movb    %bl, %al
> +       nop
> +       andb    $0x0f, %al
> +       addb    $0x30, %al
> +       cmpb    $0x3a, %al
> +       jb      1f
> +       addb    $('a' - '0' - 10), %al
> +1:     pr_char
> +       ANNOTATE_UNRET_SAFE
> +       ret
> +SYM_CODE_END(pr_byte)
> +

Obviously that function name (and comment) are wrong; fixed in my tree.
at
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug

This function (and also pr_qword) are also what objtool is complaining
about:

vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction

I don't quite see why, because pr_qword() quite blatantly calls
pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
pr_qword().

But most of the objtool annotations I've added here were just to make
it shut up and build, without much though. Peter, Josh, any chance you
can help me fix it up please?

It would also be really useful if objtool would let me have data inside
a "code" segment, without complaining that it can't decode it as
instructions — and without also failing to decode the first instruction
of the *subsequent* function. I've put the GDT at the end to work
around that, but it's a bit nasty.



Amazon Development Centre (London) Ltd. Registered in England and Wales with registration number 04543232 with its registered office at 1 Principal Place, Worship Street, London EC2A 2FA, United Kingdom.


Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On 11/5/24 12:38, Woodhouse, David wrote:
> On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>>
>> +
>> +/* Print the byte in %bl, clobber %rax */
>> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> +       movb    %bl, %al
>> +       nop
>> +       andb    $0x0f, %al
>> +       addb    $0x30, %al
>> +       cmpb    $0x3a, %al
>> +       jb      1f
>> +       addb    $('a' - '0' - 10), %al
>> +1:     pr_char
>> +       ANNOTATE_UNRET_SAFE
>> +       ret
>> +SYM_CODE_END(pr_byte)
>> +
> 
> Obviously that function name (and comment) are wrong; fixed in my tree.
> at
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
> 
> This function (and also pr_qword) are also what objtool is complaining
> about:
> 
> vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
> vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
> 
> I don't quite see why, because pr_qword() quite blatantly calls
> pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
> pr_qword().
> 
> But most of the objtool annotations I've added here were just to make
> it shut up and build, without much though. Peter, Josh, any chance you
> can help me fix it up please?
> 
> It would also be really useful if objtool would let me have data inside
> a "code" segment, without complaining that it can't decode it as
> instructions — and without also failing to decode the first instruction
> of the *subsequent* function. I've put the GDT at the end to work
> around that, but it's a bit nasty.
> 

Looking at your code, you have a much bigger problem here:

+/*
+ * This allows other types of serial ports to be used.
+ *  - %al: Character to be printed (no clobber %rax)
+ *  - %rdx: MMIO address or port.
+ */
+.macro pr_char
+       outb    %al, %dx
+.endm
+

This will overflow your UART buffer very quickly since you are now 
dumping a whole bunch of data. The URT buffer -- if you even have one 
and it is enabled -- is only 16 bytes in a standard 16550A UART. In 
older UARTs (or emulated older UARTs) you might not have a buffer at 
all. To print more than a handful of bytes, you need to poll for the 
THRE bit=1 (bit 5 of register 5).

What is the point of writing this code in assembly in the first place? A 
much more logical thing to do is to just push the registers you haven't 
pushed already onto the stack and call a C function to do the actual 
dumping? It isn't like it is in any shape, way or form performance critical.

	-hpa

Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by David Woodhouse 1 year, 3 months ago
On Tue, 2024-11-05 at 13:37 -0800, H. Peter Anvin wrote:
> 
> Looking at your code, you have a much bigger problem here:
> 
> +/*
> + * This allows other types of serial ports to be used.
> + *  - %al: Character to be printed (no clobber %rax)
> + *  - %rdx: MMIO address or port.
> + */
> +.macro pr_char
> +       outb    %al, %dx
> +.endm
> +
> 
> This will overflow your UART buffer very quickly since you are now 
> dumping a whole bunch of data. The URT buffer -- if you even have one
> and it is enabled -- is only 16 bytes in a standard 16550A UART. In 
> older UARTs (or emulated older UARTs) you might not have a buffer at 
> all. To print more than a handful of bytes, you need to poll for the 
> THRE bit=1 (bit 5 of register 5).

Emulated UARTs are generally fine because they don't really emulate the
buffer at all. And when I originally wrote this it was purely a hack to
debug an issue for myself, and used a different type of logging device
altogether.

But yeah, if this were to be used on bare metal 16550A it would indeed
need to wait for space in the FIFO/THR.

> What is the point of writing this code in assembly in the first place? A 
> much more logical thing to do is to just push the registers you haven't 
> pushed already onto the stack and call a C function to do the actual 
> dumping? It isn't like it is in any shape, way or form performance critical.

If we fix it up to use a proper linker script, that's slightly more
feasible. As things stand, it's only really possible to do it in the
existing asm file. 

And it's only the core of the exception handler "function" which could
be moved out to C; it didn't seem particularly worth bothering. Would
be nice to have the IDT generated from C code *before* calling
relocate_kernel() instead of inside relocate_kernel itself, perhaps,
but I was also trying to keep the #define DEBUG version of the code
fairly self-contained.


Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On November 5, 2024 6:43:44 PM PST, David Woodhouse <dwmw2@infradead.org> wrote:
>On Tue, 2024-11-05 at 13:37 -0800, H. Peter Anvin wrote:
>> 
>> Looking at your code, you have a much bigger problem here:
>> 
>> +/*
>> + * This allows other types of serial ports to be used.
>> + *  - %al: Character to be printed (no clobber %rax)
>> + *  - %rdx: MMIO address or port.
>> + */
>> +.macro pr_char
>> +       outb    %al, %dx
>> +.endm
>> +
>> 
>> This will overflow your UART buffer very quickly since you are now 
>> dumping a whole bunch of data. The URT buffer -- if you even have one
>> and it is enabled -- is only 16 bytes in a standard 16550A UART. In 
>> older UARTs (or emulated older UARTs) you might not have a buffer at 
>> all. To print more than a handful of bytes, you need to poll for the 
>> THRE bit=1 (bit 5 of register 5).
>
>Emulated UARTs are generally fine because they don't really emulate the
>buffer at all. And when I originally wrote this it was purely a hack to
>debug an issue for myself, and used a different type of logging device
>altogether.
>
>But yeah, if this were to be used on bare metal 16550A it would indeed
>need to wait for space in the FIFO/THR.
>
>> What is the point of writing this code in assembly in the first place? A 
>> much more logical thing to do is to just push the registers you haven't 
>> pushed already onto the stack and call a C function to do the actual 
>> dumping? It isn't like it is in any shape, way or form performance critical.
>
>If we fix it up to use a proper linker script, that's slightly more
>feasible. As things stand, it's only really possible to do it in the
>existing asm file. 
>
>And it's only the core of the exception handler "function" which could
>be moved out to C; it didn't seem particularly worth bothering. Would
>be nice to have the IDT generated from C code *before* calling
>relocate_kernel() instead of inside relocate_kernel itself, perhaps,
>but I was also trying to keep the #define DEBUG version of the code
>fairly self-contained.
>
>

Yes, the linker script needs to happen. 

This is a case of doing it right vs doing it quickly.
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by David Woodhouse 1 year, 3 months ago
On Tue, 2024-11-05 at 18:47 -0800, H. Peter Anvin wrote:
> 
> 
> Yes, the linker script needs to happen. 


How does this look? Need to do 32-bit too, and actually test the
preserve_context case. And then port the debugging bits on top...

David Woodhouse (2):
      x86/kexec: Use linker script for relocate_kernel page layout
      x86/kexec: Add data section to relocate_kernel

 arch/x86/include/asm/sections.h      |  1 +
 arch/x86/kernel/machine_kexec_64.c   |  6 ++--
 arch/x86/kernel/relocate_kernel_64.S | 64 ++++++++++++++++++++----------------
 arch/x86/kernel/vmlinux.lds.S        | 12 ++++++-
 4 files changed, 51 insertions(+), 32 deletions(-)

[RFC PATCH 1/2] x86/kexec: Use linker script for relocate_kernel page layout
Posted by David Woodhouse 1 year, 3 months ago
From: David Woodhouse <dwmw@amazon.co.uk>

---
 arch/x86/include/asm/sections.h      |  1 +
 arch/x86/kernel/machine_kexec_64.c   |  6 ++++--
 arch/x86/kernel/relocate_kernel_64.S |  6 +-----
 arch/x86/kernel/vmlinux.lds.S        | 12 +++++++++++-
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 3fa87e5e11ab..30e8ee7006f9 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -5,6 +5,7 @@
 #include <asm-generic/sections.h>
 #include <asm/extable.h>
 
+extern char __relocate_kernel_start[], __relocate_kernel_end[];
 extern char __brk_base[], __brk_limit[];
 extern char __end_rodata_aligned[];
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9c9ac606893e..f873bef6eefd 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -156,7 +156,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 	pmd_t *pmd;
 	pte_t *pte;
 
-	vaddr = (unsigned long)relocate_kernel;
+	vaddr = (unsigned long)__relocate_kernel_start;
 	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
 	pgd += pgd_index(vaddr);
 	if (!pgd_present(*pgd)) {
@@ -321,6 +321,8 @@ void machine_kexec_cleanup(struct kimage *image)
  */
 void machine_kexec(struct kimage *image)
 {
+	unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+	unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
 	unsigned long page_list[PAGES_NR];
 	unsigned int host_mem_enc_active;
 	int save_ftrace_enabled;
@@ -358,7 +360,7 @@ void machine_kexec(struct kimage *image)
 	}
 
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+	__memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
 
 	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 2848d086ceb0..1efcbd340528 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -40,10 +40,8 @@
 #define CP_PA_SWAP_PAGE		DATA(0x28)
 #define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
 
-	.text
-	.align PAGE_SIZE
+	.section .text.relocate_kernel,"ax";
 	.code64
-SYM_CODE_START_NOALIGN(relocate_range)
 SYM_CODE_START_NOALIGN(relocate_kernel)
 	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR
@@ -332,5 +330,3 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
 	int3
 SYM_CODE_END(swap_pages)
 
-	.skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
-SYM_CODE_END(relocate_range);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b8c5741d2fb4..ad451371e179 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -95,7 +95,16 @@ const_pcpu_hot = pcpu_hot;
 #define BSS_DECRYPTED
 
 #endif
-
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL_TEXT				\
+	. = ALIGN(PAGE_SIZE);					\
+	__relocate_kernel_start = .;				\
+	*(.text.relocate_kernel);				\
+	*(.rodata.relocate_kernel);				\
+	__relocate_kernel_end = .;
+#else
+#define KEXEC_RELOCATE_KERNEL_TEXT
+#endif
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
 	data PT_LOAD FLAGS(6);          /* RW_ */
@@ -124,6 +133,7 @@ SECTIONS
 		/* bootstrapping code */
 		HEAD_TEXT
 		TEXT_TEXT
+		KEXEC_RELOCATE_KERNEL_TEXT
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
-- 
2.44.0
[RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by David Woodhouse 1 year, 3 months ago
From: David Woodhouse <dwmw@amazon.co.uk>

Now that it's handled sanely by a linker script we can have actual data,
and just use %rip-relative addressing to access it.

If we could call the *copy* instead of the original relocate_kernel in
the kernel text, then we could use %rip-relative addressing everywhere.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------
 arch/x86/kernel/vmlinux.lds.S        |  2 +-
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 1efcbd340528..577aa1672349 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -27,18 +27,28 @@
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
-#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
 
+	.section .data.relocate_kernel,"a";
 /* Minimal CPU state */
-#define RSP			DATA(0x0)
-#define CR0			DATA(0x8)
-#define CR3			DATA(0x10)
-#define CR4			DATA(0x18)
-
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
 /* other data */
-#define CP_PA_TABLE_PAGE	DATA(0x20)
-#define CP_PA_SWAP_PAGE		DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
+SYM_DATA_LOCAL(pa_table_page, .quad 0)
+SYM_DATA_LOCAL(pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+
+/*
+ * There are two physical copies of relocate_kernel(), one in the original
+ * Kernel text and the other copied to the control page. There is a virtual
+ * mapping of each, in the original kernel. It is the *original* which is
+ * called from machine_kexec(), largely becaose the copy isn't mapped as an
+ * executable page. Thus, this code cannot just use %rip-relative addressing
+ * until after the %cr3 change and the jump to identity_mapped(). Until
+ * then, some pointer arithmetic is required.
+ */
+#define DATA(x) (x - relocate_kernel)
 
 	.section .text.relocate_kernel,"ax";
 	.code64
@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	pushf
 
 	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
-	movq	%rsp, RSP(%r11)
+	movq	%rsp, DATA(saved_rsp)(%r11)
 	movq	%cr0, %rax
-	movq	%rax, CR0(%r11)
+	movq	%rax, DATA(saved_cr0)(%r11)
 	movq	%cr3, %rax
-	movq	%rax, CR3(%r11)
+	movq	%rax, DATA(saved_cr3)(%r11)
 	movq	%cr4, %rax
-	movq	%rax, CR4(%r11)
+	movq	%rax, DATA(saved_cr4)(%r11)
 
 	/* Save CR4. Required to enable the right paging mode later. */
 	movq	%rax, %r13
@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
 
 	/* save some information for jumping back */
-	movq	%r9, CP_PA_TABLE_PAGE(%r11)
-	movq	%r10, CP_PA_SWAP_PAGE(%r11)
-	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+	movq	%r9, DATA(pa_table_page)(%r11)
+	movq	%r10, DATA(pa_swap_page)(%r11)
+	movq	%rdi, DATA(pa_backup_pages_map)(%r11)
 
 	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
 	movq	%rcx, %r11
@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	/* set return address to 0 if not preserving context */
 	pushq	$0
 	/* store the start address on the stack */
-	pushq   %rdx
+	pushq   start_address(%rip)
 
 	/*
 	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	/* get the re-entry point of the peer system */
 	movq	0(%rsp), %rbp
 	leaq	relocate_kernel(%rip), %r8
-	movq	CP_PA_SWAP_PAGE(%r8), %r10
-	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
-	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	pa_swap_page(%rip), %r10
+	movq	pa_backup_pages_map(%rip), %rdi
+	movq	pa_table_page(%rip), %rax
 	movq	%rax, %cr3
 	lea	PAGE_SIZE(%r8), %rsp
 	call	swap_pages
@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped)
 SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
 	UNWIND_HINT_END_OF_STACK
 	ANNOTATE_NOENDBR // RET target, above
-	movq	RSP(%r8), %rsp
-	movq	CR4(%r8), %rax
+	movq	saved_rsp(%rip), %rsp
+	movq	saved_cr4(%rip), %rax
 	movq	%rax, %cr4
-	movq	CR3(%r8), %rax
-	movq	CR0(%r8), %r8
+	movq	saved_cr3(%rip), %rax
+	movq	saved_cr0(%r8), %r8
 	movq	%rax, %cr3
 	movq	%r8, %cr0
 	movq	%rbp, %rax
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ad451371e179..65f879b31a82 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
 	. = ALIGN(PAGE_SIZE);					\
 	__relocate_kernel_start = .;				\
 	*(.text.relocate_kernel);				\
-	*(.rodata.relocate_kernel);				\
+	*(.data.relocate_kernel);				\
 	__relocate_kernel_end = .;
 #else
 #define KEXEC_RELOCATE_KERNEL_TEXT
-- 
2.44.0
Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by H. Peter Anvin 1 year, 3 months ago
On November 8, 2024 6:22:41 AM GMT+01:00, David Woodhouse <dwmw2@infradead.org> wrote:
>From: David Woodhouse <dwmw@amazon.co.uk>
>
>Now that it's handled sanely by a linker script we can have actual data,
>and just use %rip-relative addressing to access it.
>
>If we could call the *copy* instead of the original relocate_kernel in
>the kernel text, then we could use %rip-relative addressing everywhere.
>
>Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>---
> arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------
> arch/x86/kernel/vmlinux.lds.S        |  2 +-
> 2 files changed, 35 insertions(+), 25 deletions(-)
>
>diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
>index 1efcbd340528..577aa1672349 100644
>--- a/arch/x86/kernel/relocate_kernel_64.S
>+++ b/arch/x86/kernel/relocate_kernel_64.S
>@@ -27,18 +27,28 @@
>  * ~ control_page + PAGE_SIZE are used as data storage and stack for
>  * jumping back
>  */
>-#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
> 
>+	.section .data.relocate_kernel,"a";
> /* Minimal CPU state */
>-#define RSP			DATA(0x0)
>-#define CR0			DATA(0x8)
>-#define CR3			DATA(0x10)
>-#define CR4			DATA(0x18)
>-
>+SYM_DATA_LOCAL(saved_rsp, .quad 0)
>+SYM_DATA_LOCAL(saved_cr0, .quad 0)
>+SYM_DATA_LOCAL(saved_cr3, .quad 0)
>+SYM_DATA_LOCAL(saved_cr4, .quad 0)
> /* other data */
>-#define CP_PA_TABLE_PAGE	DATA(0x20)
>-#define CP_PA_SWAP_PAGE		DATA(0x28)
>-#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
>+SYM_DATA_LOCAL(pa_table_page, .quad 0)
>+SYM_DATA_LOCAL(pa_swap_page, .quad 0)
>+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
>+
>+/*
>+ * There are two physical copies of relocate_kernel(), one in the original
>+ * Kernel text and the other copied to the control page. There is a virtual
>+ * mapping of each, in the original kernel. It is the *original* which is
>+ * called from machine_kexec(), largely becaose the copy isn't mapped as an
>+ * executable page. Thus, this code cannot just use %rip-relative addressing
>+ * until after the %cr3 change and the jump to identity_mapped(). Until
>+ * then, some pointer arithmetic is required.
>+ */
>+#define DATA(x) (x - relocate_kernel)
> 
> 	.section .text.relocate_kernel,"ax";
> 	.code64
>@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
> 	pushf
> 
> 	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
>-	movq	%rsp, RSP(%r11)
>+	movq	%rsp, DATA(saved_rsp)(%r11)
> 	movq	%cr0, %rax
>-	movq	%rax, CR0(%r11)
>+	movq	%rax, DATA(saved_cr0)(%r11)
> 	movq	%cr3, %rax
>-	movq	%rax, CR3(%r11)
>+	movq	%rax, DATA(saved_cr3)(%r11)
> 	movq	%cr4, %rax
>-	movq	%rax, CR4(%r11)
>+	movq	%rax, DATA(saved_cr4)(%r11)
> 
> 	/* Save CR4. Required to enable the right paging mode later. */
> 	movq	%rax, %r13
>@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
> 	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
> 
> 	/* save some information for jumping back */
>-	movq	%r9, CP_PA_TABLE_PAGE(%r11)
>-	movq	%r10, CP_PA_SWAP_PAGE(%r11)
>-	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
>+	movq	%r9, DATA(pa_table_page)(%r11)
>+	movq	%r10, DATA(pa_swap_page)(%r11)
>+	movq	%rdi, DATA(pa_backup_pages_map)(%r11)
> 
> 	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
> 	movq	%rcx, %r11
>@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> 	/* set return address to 0 if not preserving context */
> 	pushq	$0
> 	/* store the start address on the stack */
>-	pushq   %rdx
>+	pushq   start_address(%rip)
> 
> 	/*
> 	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
>@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> 	/* get the re-entry point of the peer system */
> 	movq	0(%rsp), %rbp
> 	leaq	relocate_kernel(%rip), %r8
>-	movq	CP_PA_SWAP_PAGE(%r8), %r10
>-	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
>-	movq	CP_PA_TABLE_PAGE(%r8), %rax
>+	movq	pa_swap_page(%rip), %r10
>+	movq	pa_backup_pages_map(%rip), %rdi
>+	movq	pa_table_page(%rip), %rax
> 	movq	%rax, %cr3
> 	lea	PAGE_SIZE(%r8), %rsp
> 	call	swap_pages
>@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped)
> SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
> 	UNWIND_HINT_END_OF_STACK
> 	ANNOTATE_NOENDBR // RET target, above
>-	movq	RSP(%r8), %rsp
>-	movq	CR4(%r8), %rax
>+	movq	saved_rsp(%rip), %rsp
>+	movq	saved_cr4(%rip), %rax
> 	movq	%rax, %cr4
>-	movq	CR3(%r8), %rax
>-	movq	CR0(%r8), %r8
>+	movq	saved_cr3(%rip), %rax
>+	movq	saved_cr0(%r8), %r8
> 	movq	%rax, %cr3
> 	movq	%r8, %cr0
> 	movq	%rbp, %rax
>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>index ad451371e179..65f879b31a82 100644
>--- a/arch/x86/kernel/vmlinux.lds.S
>+++ b/arch/x86/kernel/vmlinux.lds.S
>@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> 	. = ALIGN(PAGE_SIZE);					\
> 	__relocate_kernel_start = .;				\
> 	*(.text.relocate_kernel);				\
>-	*(.rodata.relocate_kernel);				\
>+	*(.data.relocate_kernel);				\
> 	__relocate_kernel_end = .;
> #else
> #define KEXEC_RELOCATE_KERNEL_TEXT

Looks good at first glance. I'm currently traveling so I haven't fully reviewed it though.
Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by David Woodhouse 1 year, 2 months ago
On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote:
> 
> > --- a/arch/x86/kernel/vmlinux.lds.S
> > +++ b/arch/x86/kernel/vmlinux.lds.S
> > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> >         . = ALIGN(PAGE_SIZE);                                   \
> >         __relocate_kernel_start = .;                            \
> >         *(.text.relocate_kernel);                               \
> > -       *(.rodata.relocate_kernel);                             \
> > +       *(.data.relocate_kernel);                               \
> >         __relocate_kernel_end = .;
> > #else
> > #define KEXEC_RELOCATE_KERNEL_TEXT
> 
> Looks good at first glance. I'm currently traveling so I haven't
> fully reviewed it though.

Turns out it doesn't help much. It's neater, obviously, but objtool
still wants to disassemble the .data.relocate_kernel section after it
gets included in the overall kernel text section.

So once I rebase the debugging support on top and add the GDT¹ into the
data section, it gets very unhappy:

arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder bug, please report this.
arch/x86/tools/insn_decoder_test: warning: ffffffff8219250a:	eb fd                1f 00 00 00 00 00 00 00 ff ff 00 00 00 9a cf 00     ................ff ff 00 00 00 9a af 00 ff ff 00 00 00 92 cf 00     ................cc cc cc cc cc cc cc cc cc cc cc cc                 ............	jmp    ffffffffarch/x86/tools/insn_decoder_test: warning: objdump says 23 bytes, but insn_get_length() says 2
arch/x86/tools/insn_decoder_test: error: malformed line 5236442:
82192509 <exc_handler+0x18e>

It's also still complaining that pr_nybble() and pr_qword() are
unreachable, although it doesn't say that of the exc_handler() function
which blatantly calls them. Although I can look at that harder if I do
factor it out to C code.

¹ https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by Peter Zijlstra 1 year, 2 months ago
On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote:
> On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote:
> > 
> > > --- a/arch/x86/kernel/vmlinux.lds.S
> > > +++ b/arch/x86/kernel/vmlinux.lds.S
> > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> > >         . = ALIGN(PAGE_SIZE);                                   \
> > >         __relocate_kernel_start = .;                            \
> > >         *(.text.relocate_kernel);                               \
> > > -       *(.rodata.relocate_kernel);                             \
> > > +       *(.data.relocate_kernel);                               \

Why are we having data in the middle of the text section?

> > >         __relocate_kernel_end = .;
> > > #else
> > > #define KEXEC_RELOCATE_KERNEL_TEXT
> > 
> > Looks good at first glance. I'm currently traveling so I haven't
> > fully reviewed it though.
> 
> Turns out it doesn't help much. It's neater, obviously, but objtool
> still wants to disassemble the .data.relocate_kernel section after it
> gets included in the overall kernel text section.

Objtool only decodes stuff that has SHF_EXECINSTR set. Why would your
data sections have that on?
Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by David Woodhouse 1 year, 2 months ago
On Tue, 2024-11-12 at 11:14 +0100, Peter Zijlstra wrote:
> On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote:
> > On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote:
> > > 
> > > > --- a/arch/x86/kernel/vmlinux.lds.S
> > > > +++ b/arch/x86/kernel/vmlinux.lds.S
> > > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> > > >         . = ALIGN(PAGE_SIZE);                                   \
> > > >         __relocate_kernel_start = .;                            \
> > > >         *(.text.relocate_kernel);                               \
> > > > -       *(.rodata.relocate_kernel);                             \
> > > > +       *(.data.relocate_kernel);                               \
> 
> Why are we having data in the middle of the text section?

This is the relocate_kernel() page. It's the last thing the kernel
calls on kexec.

The kernel first takes a *copy* of it and places it into the identity
mapping page tables which are set up for kexec.

The relocate_kernel() function is then called at its *original*
location in the kernel text. Before reloading %cr3, it stores some
information which is going to become unavailable into its own page
(currently using a bit of a nasty hack based on a hard-coded
KEXEC_CONTROL_CODE_MAX_SIZE because we can't just use data symbol
references).

Then it reloads %cr3 and jumps to the identity-mapped copy of itself.

Could we put .text.relocate_kernel and .data.relocate_kernel somewhere
*other* than the main kernel text segment? Probably... it use use
alternative instructions, but we could deal with that. And if we call
from machine_kexec() directly into the *copy* (having marked it
executable), maybe...?


Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by jpoimboe 1 year, 2 months ago
On Tue, Nov 12, 2024 at 10:45:05AM +0000, David Woodhouse wrote:
> On Tue, 2024-11-12 at 11:14 +0100, Peter Zijlstra wrote:
> > On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote:
> > > On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote:
> > > > 
> > > > > --- a/arch/x86/kernel/vmlinux.lds.S
> > > > > +++ b/arch/x86/kernel/vmlinux.lds.S
> > > > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> > > > >         . = ALIGN(PAGE_SIZE);                                   \
> > > > >         __relocate_kernel_start = .;                            \
> > > > >         *(.text.relocate_kernel);                               \
> > > > > -       *(.rodata.relocate_kernel);                             \
> > > > > +       *(.data.relocate_kernel);                               \
> > 
> > Why are we having data in the middle of the text section?
> 
> This is the relocate_kernel() page. It's the last thing the kernel
> calls on kexec.
> 
> The kernel first takes a *copy* of it and places it into the identity
> mapping page tables which are set up for kexec.
> 
> The relocate_kernel() function is then called at its *original*
> location in the kernel text. Before reloading %cr3, it stores some
> information which is going to become unavailable into its own page
> (currently using a bit of a nasty hack based on a hard-coded
> KEXEC_CONTROL_CODE_MAX_SIZE because we can't just use data symbol
> references).
> 
> Then it reloads %cr3 and jumps to the identity-mapped copy of itself.
> 
> Could we put .text.relocate_kernel and .data.relocate_kernel somewhere
> *other* than the main kernel text segment? Probably... it use use
> alternative instructions, but we could deal with that. And if we call
> from machine_kexec() directly into the *copy* (having marked it
> executable), maybe...?

I fetched your branch and only saw a "RET before UNTRAIN" warning, did
you happen to already fix the other warnings up?

Something like the below fixes the warning I saw.

Though, maybe it's better to just tell objtool to keep its paws off of
the problematic functions by use of the STACK_FRAME_NON_STANDARD macro?
Then you could get rid of all the unwind hints and annotations.


diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 6604f5d038aa..9ba10530b9c7 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4012,8 +4012,11 @@ static int validate_unret(struct objtool_file *file, struct instruction *insn)
 			return 0;
 
 		case INSN_RETURN:
-			WARN_INSN(insn, "RET before UNTRAIN");
-			return 1;
+			if (!insn->retpoline_safe) {
+				WARN_INSN(insn, "RET before UNTRAIN");
+				return 1;
+			}
+			break;
 
 		case INSN_NOP:
 			if (insn->retpoline_safe)

Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by David Woodhouse 1 year, 3 months ago
On 8 November 2024 03:26:58 GMT-08:00, "H. Peter Anvin" <hpa@zytor.com> wrote:
>On November 8, 2024 6:22:41 AM GMT+01:00, David Woodhouse <dwmw2@infradead.org> wrote:
>>From: David Woodhouse <dwmw@amazon.co.uk>
>>
>>Now that it's handled sanely by a linker script we can have actual data,
>>and just use %rip-relative addressing to access it.
>>
>>If we could call the *copy* instead of the original relocate_kernel in
>>the kernel text, then we could use %rip-relative addressing everywhere.
>>
>>Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>>---
>> arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------
>> arch/x86/kernel/vmlinux.lds.S        |  2 +-
>> 2 files changed, 35 insertions(+), 25 deletions(-)
>>
>>diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
>>index 1efcbd340528..577aa1672349 100644
>>--- a/arch/x86/kernel/relocate_kernel_64.S
>>+++ b/arch/x86/kernel/relocate_kernel_64.S
>>@@ -27,18 +27,28 @@
>>  * ~ control_page + PAGE_SIZE are used as data storage and stack for
>>  * jumping back
>>  */
>>-#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
>> 
>>+	.section .data.relocate_kernel,"a";
>> /* Minimal CPU state */
>>-#define RSP			DATA(0x0)
>>-#define CR0			DATA(0x8)
>>-#define CR3			DATA(0x10)
>>-#define CR4			DATA(0x18)
>>-
>>+SYM_DATA_LOCAL(saved_rsp, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr0, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr3, .quad 0)
>>+SYM_DATA_LOCAL(saved_cr4, .quad 0)
>> /* other data */
>>-#define CP_PA_TABLE_PAGE	DATA(0x20)
>>-#define CP_PA_SWAP_PAGE		DATA(0x28)
>>-#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
>>+SYM_DATA_LOCAL(pa_table_page, .quad 0)
>>+SYM_DATA_LOCAL(pa_swap_page, .quad 0)
>>+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
>>+
>>+/*
>>+ * There are two physical copies of relocate_kernel(), one in the original
>>+ * Kernel text and the other copied to the control page. There is a virtual
>>+ * mapping of each, in the original kernel. It is the *original* which is
>>+ * called from machine_kexec(), largely becaose the copy isn't mapped as an
>>+ * executable page. Thus, this code cannot just use %rip-relative addressing
>>+ * until after the %cr3 change and the jump to identity_mapped(). Until
>>+ * then, some pointer arithmetic is required.
>>+ */
>>+#define DATA(x) (x - relocate_kernel)
>> 
>> 	.section .text.relocate_kernel,"ax";
>> 	.code64
>>@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
>> 	pushf
>> 
>> 	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
>>-	movq	%rsp, RSP(%r11)
>>+	movq	%rsp, DATA(saved_rsp)(%r11)
>> 	movq	%cr0, %rax
>>-	movq	%rax, CR0(%r11)
>>+	movq	%rax, DATA(saved_cr0)(%r11)
>> 	movq	%cr3, %rax
>>-	movq	%rax, CR3(%r11)
>>+	movq	%rax, DATA(saved_cr3)(%r11)
>> 	movq	%cr4, %rax
>>-	movq	%rax, CR4(%r11)
>>+	movq	%rax, DATA(saved_cr4)(%r11)
>> 
>> 	/* Save CR4. Required to enable the right paging mode later. */
>> 	movq	%rax, %r13
>>@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
>> 	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
>> 
>> 	/* save some information for jumping back */
>>-	movq	%r9, CP_PA_TABLE_PAGE(%r11)
>>-	movq	%r10, CP_PA_SWAP_PAGE(%r11)
>>-	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
>>+	movq	%r9, DATA(pa_table_page)(%r11)
>>+	movq	%r10, DATA(pa_swap_page)(%r11)
>>+	movq	%rdi, DATA(pa_backup_pages_map)(%r11)
>> 
>> 	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
>> 	movq	%rcx, %r11
>>@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>> 	/* set return address to 0 if not preserving context */
>> 	pushq	$0
>> 	/* store the start address on the stack */
>>-	pushq   %rdx
>>+	pushq   start_address(%rip)
>> 
>> 	/*
>> 	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
>>@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>> 	/* get the re-entry point of the peer system */
>> 	movq	0(%rsp), %rbp
>> 	leaq	relocate_kernel(%rip), %r8
>>-	movq	CP_PA_SWAP_PAGE(%r8), %r10
>>-	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
>>-	movq	CP_PA_TABLE_PAGE(%r8), %rax
>>+	movq	pa_swap_page(%rip), %r10
>>+	movq	pa_backup_pages_map(%rip), %rdi
>>+	movq	pa_table_page(%rip), %rax
>> 	movq	%rax, %cr3
>> 	lea	PAGE_SIZE(%r8), %rsp
>> 	call	swap_pages
>>@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped)
>> SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
>> 	UNWIND_HINT_END_OF_STACK
>> 	ANNOTATE_NOENDBR // RET target, above
>>-	movq	RSP(%r8), %rsp
>>-	movq	CR4(%r8), %rax
>>+	movq	saved_rsp(%rip), %rsp
>>+	movq	saved_cr4(%rip), %rax
>> 	movq	%rax, %cr4
>>-	movq	CR3(%r8), %rax
>>-	movq	CR0(%r8), %r8
>>+	movq	saved_cr3(%rip), %rax
>>+	movq	saved_cr0(%r8), %r8
>> 	movq	%rax, %cr3
>> 	movq	%r8, %cr0
>> 	movq	%rbp, %rax
>>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>>index ad451371e179..65f879b31a82 100644
>>--- a/arch/x86/kernel/vmlinux.lds.S
>>+++ b/arch/x86/kernel/vmlinux.lds.S
>>@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
>> 	. = ALIGN(PAGE_SIZE);					\
>> 	__relocate_kernel_start = .;				\
>> 	*(.text.relocate_kernel);				\
>>-	*(.rodata.relocate_kernel);				\
>>+	*(.data.relocate_kernel);				\
>> 	__relocate_kernel_end = .;
>> #else
>> #define KEXEC_RELOCATE_KERNEL_TEXT
>
>Looks good at first glance. I'm currently traveling so I haven't fully reviewed it though.

Ta. That's good enough for me to go ahead and port the rest over. 

Is there a selftest for the preserve-context mode somewhere, with a payload that just does a "ret"?
Re: [RFC PATCH 2/2] x86/kexec: Add data section to relocate_kernel
Posted by David Woodhouse 1 year, 3 months ago
On Fri, 2024-11-08 at 05:22 +0000, David Woodhouse wrote:
> 
> @@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>         /* set return address to 0 if not preserving context */
>         pushq   $0
>         /* store the start address on the stack */
> -       pushq   %rdx
> +       pushq   start_address(%rip)
>  
>         /*
>          * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP

Oops, drop that part; I did that (and actually defined it, and stored
the value in it) just to test that everything was actually *working*,
since I think the rest of the data fields are only used for
preserve_context mode. Fixed in the tree at 

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads//kexec-lds

Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by David Woodhouse 1 year, 3 months ago
On Tue, 2024-11-05 at 18:47 -0800, H. Peter Anvin wrote:
> 
> Yes, the linker script needs to happen. 
> 
> This is a case of doing it right vs doing it quickly.

That may just tip it over the edge of how much work it's worth doing to
clean up the debug hack that I implemented for my own use, and make it
useful to others in the future. But I'll see what I can do.
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On 11/5/24 13:37, H. Peter Anvin wrote:
> What is the point of writing this code in assembly in the first place? A 
> much more logical thing to do is to just push the registers you haven't 
> pushed already onto the stack and call a C function to do the actual 
> dumping? It isn't like it is in any shape, way or form performance 
> critical.

arch/x86/boot/compressed/misc.c has some code that you can crib, both 
for writing to a text screen (not that useful anymore with EFI 
framebuffers) and serial port. If you factor it a little bit then you 
can probably even share the code directly.

(__putstr perhaps should have a __putchar() factored out of it?)

Then you can just do the obvious (have your assembly stub point %rdi to 
the base of the register dump; set the frame order to whatever you'd 
like, except rip/err/exc, or reverse the order if you prefer by changing 
the loop):

static inline __noreturn void die(void)
{
	while (1)
		asm volatile("hlt");
}

void dump_register_frame(const unsigned long frame[])
{
	static const char regnames[][5] = {
		"rax:", "rcx:", "rdx:", "rbx:",
		"rsp:", "rbp:", "rsi:", "rdi:",
		"r8: ", "r9: ", "r10:", "r11:",
		"r12:", "r13:", "r14:", "r15:",
		"cr2:", "Exc:", "Err:", "rip:"
	};

	for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
		__putstr(regnames[i]);
		__puthex(frame[i]);
		__putstr("\n");
	}

	/* Only return from int3 */
	if (frame[17] != 3)
		die();
}
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by Andy Shevchenko 1 year, 2 months ago
On Tue, Nov 05, 2024 at 01:58:40PM -0800, H. Peter Anvin wrote:
> On 11/5/24 13:37, H. Peter Anvin wrote:
> > What is the point of writing this code in assembly in the first place? A
> > much more logical thing to do is to just push the registers you haven't
> > pushed already onto the stack and call a C function to do the actual
> > dumping? It isn't like it is in any shape, way or form performance
> > critical.
> 
> arch/x86/boot/compressed/misc.c has some code that you can crib, both for
> writing to a text screen (not that useful anymore with EFI framebuffers) and
> serial port. If you factor it a little bit then you can probably even share
> the code directly.
> 
> (__putstr perhaps should have a __putchar() factored out of it?)
> 
> Then you can just do the obvious (have your assembly stub point %rdi to the
> base of the register dump; set the frame order to whatever you'd like,
> except rip/err/exc, or reverse the order if you prefer by changing the
> loop):
> 
> static inline __noreturn void die(void)
> {
> 	while (1)
> 		asm volatile("hlt");
> }
> 
> void dump_register_frame(const unsigned long frame[])
> {
> 	static const char regnames[][5] = {
> 		"rax:", "rcx:", "rdx:", "rbx:",
> 		"rsp:", "rbp:", "rsi:", "rdi:",
> 		"r8: ", "r9: ", "r10:", "r11:",
> 		"r12:", "r13:", "r14:", "r15:",
> 		"cr2:", "Exc:", "Err:", "rip:"
> 	};
> 
> 	for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
> 		__putstr(regnames[i]);
> 		__puthex(frame[i]);
> 		__putstr("\n");
> 	}
> 
> 	/* Only return from int3 */
> 	if (frame[17] != 3)
> 		die();
> }

+ a mil here!

Please, use existing early_serial_console infra. Also we have
debug_putaddr() / debug_puthex() / debug_putstr() which are
probably ones you want to use.

-- 
With Best Regards,
Andy Shevchenko
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On 11/5/24 13:58, H. Peter Anvin wrote:
> 
> Then you can just do the obvious (have your assembly stub point %rdi to 
> the base of the register dump; set the frame order to whatever you'd 
> like, except rip/err/exc, or reverse the order if you prefer by changing 
> the loop):
> 
> static inline __noreturn void die(void)
> {
>      while (1)
>          asm volatile("hlt");
> }
> 
> void dump_register_frame(const unsigned long frame[])
> {
>      static const char regnames[][5] = {
>          "rax:", "rcx:", "rdx:", "rbx:",
>          "rsp:", "rbp:", "rsi:", "rdi:",
>          "r8: ", "r9: ", "r10:", "r11:",
>          "r12:", "r13:", "r14:", "r15:",
>          "cr2:", "Exc:", "Err:", "rip:"
>      };
> 
>      for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
>          __putstr(regnames[i]);
>          __puthex(frame[i]);
>          __putstr("\n");
>      }
> 
>      /* Only return from int3 */
>      if (frame[17] != 3)
>          die();
> }
> 

I don't know if it is necessary, but you can do something like this to 
make absolutely sure you don't end up with non-relative symbol references:

static inline __constfunc void * sym_addr(const void *sym)
{
	void *addr;
	asm("lea %c1(%%rip),%0" : "=r" (addr) : "i" (sym));
	return addr;
}

Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On 11/5/24 14:27, H. Peter Anvin wrote:
> 
> I don't know if it is necessary, but you can do something like this to 
> make absolutely sure you don't end up with non-relative symbol references:
> 
> static inline __constfunc void * sym_addr(const void *sym)
> {
>      void *addr;
>      asm("lea %c1(%%rip),%0" : "=r" (addr) : "i" (sym));
>      return addr;
> }
> 

Compiling with "-fpic -fvisibility=hidden -mcmodel=medium" can also be 
used to suppress non-relative references. If you have external symbol 
references they at least *should* be emitted as GOTPCRELX relocations 
which the linker should relax to relative references; however, I would 
think that you really don't want any of those :)

	-hpa

Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
>On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>> 
>> +
>> +/* Print the byte in %bl, clobber %rax */
>> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> +       movb    %bl, %al
>> +       nop
>> +       andb    $0x0f, %al
>> +       addb    $0x30, %al
>> +       cmpb    $0x3a, %al
>> +       jb      1f
>> +       addb    $('a' - '0' - 10), %al
>> +1:     pr_char
>> +       ANNOTATE_UNRET_SAFE
>> +       ret
>> +SYM_CODE_END(pr_byte)
>> +
>
>Obviously that function name (and comment) are wrong; fixed in my tree.
>at
>https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
>
>This function (and also pr_qword) are also what objtool is complaining
>about:
>
>vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
>vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
>
>I don't quite see why, because pr_qword() quite blatantly calls
>pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
>pr_qword().
>
>But most of the objtool annotations I've added here were just to make
>it shut up and build, without much though. Peter, Josh, any chance you
>can help me fix it up please?
>
>It would also be really useful if objtool would let me have data inside
>a "code" segment, without complaining that it can't decode it as
>instructions — and without also failing to decode the first instruction
>of the *subsequent* function. I've put the GDT at the end to work
>around that, but it's a bit nasty.

code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by David Woodhouse 1 year, 3 months ago
On Tue, 2024-11-05 at 12:50 -0800, H. Peter Anvin wrote:
> On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
> > On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
> > > 
> > > +
> > > +/* Print the byte in %bl, clobber %rax */
> > > +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
> > > +       movb    %bl, %al
> > > +       nop
> > > +       andb    $0x0f, %al
> > > +       addb    $0x30, %al
> > > +       cmpb    $0x3a, %al
> > > +       jb      1f
> > > +       addb    $('a' - '0' - 10), %al
> > > +1:     pr_char
> > > +       ANNOTATE_UNRET_SAFE
> > > +       ret
> > > +SYM_CODE_END(pr_byte)
> > > +
> > 
> > Obviously that function name (and comment) are wrong; fixed in my tree.
> > at
> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
> > 
> > This function (and also pr_qword) are also what objtool is complaining
> > about:
> > 
> > vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
> > vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
> > 
> > I don't quite see why, because pr_qword() quite blatantly calls
> > pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
> > pr_qword().
> > 
> > But most of the objtool annotations I've added here were just to make
> > it shut up and build, without much though. Peter, Josh, any chance you
> > can help me fix it up please?
> > 
> > It would also be really useful if objtool would let me have data inside
> > a "code" segment, without complaining that it can't decode it as
> > instructions — and without also failing to decode the first instruction
> > of the *subsequent* function. I've put the GDT at the end to work
> > around that, but it's a bit nasty.
> 
> code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.

This is the relocate_kernel() function in
arch/x86/kernel/relocate_kernel_64.S

It's copied into a separate page, called (in its original location) as
a simple function from the kernel, changes %cr3 to set of identity-
mapped page tables and jumps to its *identity-mapped* address, then
copies all the right pages for kexec and jumps into the new kernel.

So it's all in a single page, and currently it plays nasty tricks to
store data after the code. Perhaps it *should* have its own code and
data sections and a linker script to keep them together...
Re: [RFC PATCH 6/7] x86/kexec: Debugging support: Dump registers on exception
Posted by H. Peter Anvin 1 year, 3 months ago
On November 5, 2024 1:29:54 PM PST, David Woodhouse <dwmw2@infradead.org> wrote:
>On Tue, 2024-11-05 at 12:50 -0800, H. Peter Anvin wrote:
>> On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
>> > On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>> > > 
>> > > +
>> > > +/* Print the byte in %bl, clobber %rax */
>> > > +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> > > +       movb    %bl, %al
>> > > +       nop
>> > > +       andb    $0x0f, %al
>> > > +       addb    $0x30, %al
>> > > +       cmpb    $0x3a, %al
>> > > +       jb      1f
>> > > +       addb    $('a' - '0' - 10), %al
>> > > +1:     pr_char
>> > > +       ANNOTATE_UNRET_SAFE
>> > > +       ret
>> > > +SYM_CODE_END(pr_byte)
>> > > +
>> > 
>> > Obviously that function name (and comment) are wrong; fixed in my tree.
>> > at
>> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
>> > 
>> > This function (and also pr_qword) are also what objtool is complaining
>> > about:
>> > 
>> > vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
>> > vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
>> > 
>> > I don't quite see why, because pr_qword() quite blatantly calls
>> > pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
>> > pr_qword().
>> > 
>> > But most of the objtool annotations I've added here were just to make
>> > it shut up and build, without much though. Peter, Josh, any chance you
>> > can help me fix it up please?
>> > 
>> > It would also be really useful if objtool would let me have data inside
>> > a "code" segment, without complaining that it can't decode it as
>> > instructions — and without also failing to decode the first instruction
>> > of the *subsequent* function. I've put the GDT at the end to work
>> > around that, but it's a bit nasty.
>> 
>> code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.
>
>This is the relocate_kernel() function in
>arch/x86/kernel/relocate_kernel_64.S
>
>It's copied into a separate page, called (in its original location) as
>a simple function from the kernel, changes %cr3 to set of identity-
>mapped page tables and jumps to its *identity-mapped* address, then
>copies all the right pages for kexec and jumps into the new kernel.
>
>So it's all in a single page, and currently it plays nasty tricks to
>store data after the code. Perhaps it *should* have its own code and
>data sections and a linker script to keep them together...

Yes, it should.