From: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/kernel/relocate_kernel_64.S | 80 ++++++++++++++++++++++++++--
1 file changed, 77 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 2a2a6e693e18..1c18cffe5229 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -398,6 +398,53 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
SYM_CODE_END(swap_pages)
#ifdef DEBUG
+/*
+ * This allows other types of serial ports to be used.
+ * - %al: Character to be printed (no clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+.macro pr_char
+ outb %al, %dx
+.endm
+
+/* Print the byte in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+1: pr_char
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_byte)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_byte
+ loop 1b
+ movb $'\n', %al
+ pr_char
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_qword)
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ pr_char
+ movb $\b, %al
+ pr_char
+ movb $\c, %al
+ pr_char
+ movb $\d, %al
+ pr_char
+ movq \r, %rbx
+ call pr_qword
+.endm
+
SYM_CODE_START_LOCAL_NOALIGN(exc_vectors)
/* Each of these is 6 bytes. */
.macro vec_err exc
@@ -437,11 +484,39 @@ SYM_CODE_END(exc_vectors)
SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
pushq %rax
+ pushq %rbx
+ pushq %rcx
pushq %rdx
+
movw $0x3f8, %dx
- movb $'A', %al
- outb %al, %dx
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', 32(%rsp)
+ print_reg 'E', 'r', 'r', ':', 40(%rsp)
+ print_reg 'r', 'i', 'p', ':', 48(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', 24(%rsp)
+ print_reg 'r', 'b', 'x', ':', 16(%rsp)
+ print_reg 'r', 'c', 'x', ':', 8(%rsp)
+ print_reg 'r', 'd', 'x', ':', (%rsp)
+
+ /* Other registers */
+ print_reg 'r', 's', 'i', ':', %rsi
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
popq %rdx
+ popq %rcx
+ popq %rbx
popq %rax
/* Only return from int3 */
@@ -454,7 +529,6 @@ SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
.Ldie:
hlt
jmp .Ldie
-
SYM_CODE_END(exc_handler)
.Lreloc_kernel_gdt:
--
2.44.0
On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>
> +
> +/* Print the byte in %bl, clobber %rax */
> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
> + movb %bl, %al
> + nop
> + andb $0x0f, %al
> + addb $0x30, %al
> + cmpb $0x3a, %al
> + jb 1f
> + addb $('a' - '0' - 10), %al
> +1: pr_char
> + ANNOTATE_UNRET_SAFE
> + ret
> +SYM_CODE_END(pr_byte)
> +
Obviously that function name (and comment) are wrong; fixed in my tree.
at
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
This function (and also pr_qword) are also what objtool is complaining
about:
vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
I don't quite see why, because pr_qword() quite blatantly calls
pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
pr_qword().
But most of the objtool annotations I've added here were just to make
it shut up and build, without much though. Peter, Josh, any chance you
can help me fix it up please?
It would also be really useful if objtool would let me have data inside
a "code" segment, without complaining that it can't decode it as
instructions — and without also failing to decode the first instruction
of the *subsequent* function. I've put the GDT at the end to work
around that, but it's a bit nasty.
Amazon Development Centre (London) Ltd. Registered in England and Wales with registration number 04543232 with its registered office at 1 Principal Place, Worship Street, London EC2A 2FA, United Kingdom.
On 11/5/24 12:38, Woodhouse, David wrote:
> On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>>
>> +
>> +/* Print the byte in %bl, clobber %rax */
>> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> + movb %bl, %al
>> + nop
>> + andb $0x0f, %al
>> + addb $0x30, %al
>> + cmpb $0x3a, %al
>> + jb 1f
>> + addb $('a' - '0' - 10), %al
>> +1: pr_char
>> + ANNOTATE_UNRET_SAFE
>> + ret
>> +SYM_CODE_END(pr_byte)
>> +
>
> Obviously that function name (and comment) are wrong; fixed in my tree.
> at
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
>
> This function (and also pr_qword) are also what objtool is complaining
> about:
>
> vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
> vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
>
> I don't quite see why, because pr_qword() quite blatantly calls
> pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
> pr_qword().
>
> But most of the objtool annotations I've added here were just to make
> it shut up and build, without much though. Peter, Josh, any chance you
> can help me fix it up please?
>
> It would also be really useful if objtool would let me have data inside
> a "code" segment, without complaining that it can't decode it as
> instructions — and without also failing to decode the first instruction
> of the *subsequent* function. I've put the GDT at the end to work
> around that, but it's a bit nasty.
>
Looking at your code, you have a much bigger problem here:
+/*
+ * This allows other types of serial ports to be used.
+ * - %al: Character to be printed (no clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+.macro pr_char
+ outb %al, %dx
+.endm
+
This will overflow your UART buffer very quickly since you are now
dumping a whole bunch of data. The URT buffer -- if you even have one
and it is enabled -- is only 16 bytes in a standard 16550A UART. In
older UARTs (or emulated older UARTs) you might not have a buffer at
all. To print more than a handful of bytes, you need to poll for the
THRE bit=1 (bit 5 of register 5).
What is the point of writing this code in assembly in the first place? A
much more logical thing to do is to just push the registers you haven't
pushed already onto the stack and call a C function to do the actual
dumping? It isn't like it is in any shape, way or form performance critical.
-hpa
On Tue, 2024-11-05 at 13:37 -0800, H. Peter Anvin wrote: > > Looking at your code, you have a much bigger problem here: > > +/* > + * This allows other types of serial ports to be used. > + * - %al: Character to be printed (no clobber %rax) > + * - %rdx: MMIO address or port. > + */ > +.macro pr_char > + outb %al, %dx > +.endm > + > > This will overflow your UART buffer very quickly since you are now > dumping a whole bunch of data. The URT buffer -- if you even have one > and it is enabled -- is only 16 bytes in a standard 16550A UART. In > older UARTs (or emulated older UARTs) you might not have a buffer at > all. To print more than a handful of bytes, you need to poll for the > THRE bit=1 (bit 5 of register 5). Emulated UARTs are generally fine because they don't really emulate the buffer at all. And when I originally wrote this it was purely a hack to debug an issue for myself, and used a different type of logging device altogether. But yeah, if this were to be used on bare metal 16550A it would indeed need to wait for space in the FIFO/THR. > What is the point of writing this code in assembly in the first place? A > much more logical thing to do is to just push the registers you haven't > pushed already onto the stack and call a C function to do the actual > dumping? It isn't like it is in any shape, way or form performance critical. If we fix it up to use a proper linker script, that's slightly more feasible. As things stand, it's only really possible to do it in the existing asm file. And it's only the core of the exception handler "function" which could be moved out to C; it didn't seem particularly worth bothering. Would be nice to have the IDT generated from C code *before* calling relocate_kernel() instead of inside relocate_kernel itself, perhaps, but I was also trying to keep the #define DEBUG version of the code fairly self-contained.
On November 5, 2024 6:43:44 PM PST, David Woodhouse <dwmw2@infradead.org> wrote: >On Tue, 2024-11-05 at 13:37 -0800, H. Peter Anvin wrote: >> >> Looking at your code, you have a much bigger problem here: >> >> +/* >> + * This allows other types of serial ports to be used. >> + * - %al: Character to be printed (no clobber %rax) >> + * - %rdx: MMIO address or port. >> + */ >> +.macro pr_char >> + outb %al, %dx >> +.endm >> + >> >> This will overflow your UART buffer very quickly since you are now >> dumping a whole bunch of data. The URT buffer -- if you even have one >> and it is enabled -- is only 16 bytes in a standard 16550A UART. In >> older UARTs (or emulated older UARTs) you might not have a buffer at >> all. To print more than a handful of bytes, you need to poll for the >> THRE bit=1 (bit 5 of register 5). > >Emulated UARTs are generally fine because they don't really emulate the >buffer at all. And when I originally wrote this it was purely a hack to >debug an issue for myself, and used a different type of logging device >altogether. > >But yeah, if this were to be used on bare metal 16550A it would indeed >need to wait for space in the FIFO/THR. > >> What is the point of writing this code in assembly in the first place? A >> much more logical thing to do is to just push the registers you haven't >> pushed already onto the stack and call a C function to do the actual >> dumping? It isn't like it is in any shape, way or form performance critical. > >If we fix it up to use a proper linker script, that's slightly more >feasible. As things stand, it's only really possible to do it in the >existing asm file. > >And it's only the core of the exception handler "function" which could >be moved out to C; it didn't seem particularly worth bothering. Would >be nice to have the IDT generated from C code *before* calling >relocate_kernel() instead of inside relocate_kernel itself, perhaps, >but I was also trying to keep the #define DEBUG version of the code >fairly self-contained. > > Yes, the linker script needs to happen. This is a case of doing it right vs doing it quickly.
On Tue, 2024-11-05 at 18:47 -0800, H. Peter Anvin wrote:
>
>
> Yes, the linker script needs to happen.
How does this look? Need to do 32-bit too, and actually test the
preserve_context case. And then port the debugging bits on top...
David Woodhouse (2):
x86/kexec: Use linker script for relocate_kernel page layout
x86/kexec: Add data section to relocate_kernel
arch/x86/include/asm/sections.h | 1 +
arch/x86/kernel/machine_kexec_64.c | 6 ++--
arch/x86/kernel/relocate_kernel_64.S | 64 ++++++++++++++++++++----------------
arch/x86/kernel/vmlinux.lds.S | 12 ++++++-
4 files changed, 51 insertions(+), 32 deletions(-)
From: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/include/asm/sections.h | 1 +
arch/x86/kernel/machine_kexec_64.c | 6 ++++--
arch/x86/kernel/relocate_kernel_64.S | 6 +-----
arch/x86/kernel/vmlinux.lds.S | 12 +++++++++++-
4 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 3fa87e5e11ab..30e8ee7006f9 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -5,6 +5,7 @@
#include <asm-generic/sections.h>
#include <asm/extable.h>
+extern char __relocate_kernel_start[], __relocate_kernel_end[];
extern char __brk_base[], __brk_limit[];
extern char __end_rodata_aligned[];
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9c9ac606893e..f873bef6eefd 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -156,7 +156,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
pmd_t *pmd;
pte_t *pte;
- vaddr = (unsigned long)relocate_kernel;
+ vaddr = (unsigned long)__relocate_kernel_start;
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
@@ -321,6 +321,8 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
unsigned long page_list[PAGES_NR];
unsigned int host_mem_enc_active;
int save_ftrace_enabled;
@@ -358,7 +360,7 @@ void machine_kexec(struct kimage *image)
}
control_page = page_address(image->control_code_page) + PAGE_SIZE;
- __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 2848d086ceb0..1efcbd340528 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -40,10 +40,8 @@
#define CP_PA_SWAP_PAGE DATA(0x28)
#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
- .text
- .align PAGE_SIZE
+ .section .text.relocate_kernel,"ax";
.code64
-SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -332,5 +330,3 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
int3
SYM_CODE_END(swap_pages)
- .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
-SYM_CODE_END(relocate_range);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b8c5741d2fb4..ad451371e179 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -95,7 +95,16 @@ const_pcpu_hot = pcpu_hot;
#define BSS_DECRYPTED
#endif
-
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL_TEXT \
+ . = ALIGN(PAGE_SIZE); \
+ __relocate_kernel_start = .; \
+ *(.text.relocate_kernel); \
+ *(.rodata.relocate_kernel); \
+ __relocate_kernel_end = .;
+#else
+#define KEXEC_RELOCATE_KERNEL_TEXT
+#endif
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(6); /* RW_ */
@@ -124,6 +133,7 @@ SECTIONS
/* bootstrapping code */
HEAD_TEXT
TEXT_TEXT
+ KEXEC_RELOCATE_KERNEL_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
--
2.44.0
From: David Woodhouse <dwmw@amazon.co.uk>
Now that it's handled sanely by a linker script we can have actual data,
and just use %rip-relative addressing to access it.
If we could call the *copy* instead of the original relocate_kernel in
the kernel text, then we could use %rip-relative addressing everywhere.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------
arch/x86/kernel/vmlinux.lds.S | 2 +-
2 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 1efcbd340528..577aa1672349 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -27,18 +27,28 @@
* ~ control_page + PAGE_SIZE are used as data storage and stack for
* jumping back
*/
-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+ .section .data.relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP DATA(0x0)
-#define CR0 DATA(0x8)
-#define CR3 DATA(0x10)
-#define CR4 DATA(0x18)
-
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
/* other data */
-#define CP_PA_TABLE_PAGE DATA(0x20)
-#define CP_PA_SWAP_PAGE DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+SYM_DATA_LOCAL(pa_table_page, .quad 0)
+SYM_DATA_LOCAL(pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+
+/*
+ * There are two physical copies of relocate_kernel(), one in the original
+ * Kernel text and the other copied to the control page. There is a virtual
+ * mapping of each, in the original kernel. It is the *original* which is
+ * called from machine_kexec(), largely becaose the copy isn't mapped as an
+ * executable page. Thus, this code cannot just use %rip-relative addressing
+ * until after the %cr3 change and the jump to identity_mapped(). Until
+ * then, some pointer arithmetic is required.
+ */
+#define DATA(x) (x - relocate_kernel)
.section .text.relocate_kernel,"ax";
.code64
@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushf
movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
- movq %rsp, RSP(%r11)
+ movq %rsp, DATA(saved_rsp)(%r11)
movq %cr0, %rax
- movq %rax, CR0(%r11)
+ movq %rax, DATA(saved_cr0)(%r11)
movq %cr3, %rax
- movq %rax, CR3(%r11)
+ movq %rax, DATA(saved_cr3)(%r11)
movq %cr4, %rax
- movq %rax, CR4(%r11)
+ movq %rax, DATA(saved_cr4)(%r11)
/* Save CR4. Required to enable the right paging mode later. */
movq %rax, %r13
@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movq PTR(PA_SWAP_PAGE)(%rsi), %r10
/* save some information for jumping back */
- movq %r9, CP_PA_TABLE_PAGE(%r11)
- movq %r10, CP_PA_SWAP_PAGE(%r11)
- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ movq %r9, DATA(pa_table_page)(%r11)
+ movq %r10, DATA(pa_swap_page)(%r11)
+ movq %rdi, DATA(pa_backup_pages_map)(%r11)
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
- pushq %rdx
+ pushq start_address(%rip)
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* get the re-entry point of the peer system */
movq 0(%rsp), %rbp
leaq relocate_kernel(%rip), %r8
- movq CP_PA_SWAP_PAGE(%r8), %r10
- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
- movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq pa_swap_page(%rip), %r10
+ movq pa_backup_pages_map(%rip), %rdi
+ movq pa_table_page(%rip), %rax
movq %rax, %cr3
lea PAGE_SIZE(%r8), %rsp
call swap_pages
@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
- movq RSP(%r8), %rsp
- movq CR4(%r8), %rax
+ movq saved_rsp(%rip), %rsp
+ movq saved_cr4(%rip), %rax
movq %rax, %cr4
- movq CR3(%r8), %rax
- movq CR0(%r8), %r8
+ movq saved_cr3(%rip), %rax
+ movq saved_cr0(%r8), %r8
movq %rax, %cr3
movq %r8, %cr0
movq %rbp, %rax
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ad451371e179..65f879b31a82 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
. = ALIGN(PAGE_SIZE); \
__relocate_kernel_start = .; \
*(.text.relocate_kernel); \
- *(.rodata.relocate_kernel); \
+ *(.data.relocate_kernel); \
__relocate_kernel_end = .;
#else
#define KEXEC_RELOCATE_KERNEL_TEXT
--
2.44.0
On November 8, 2024 6:22:41 AM GMT+01:00, David Woodhouse <dwmw2@infradead.org> wrote: >From: David Woodhouse <dwmw@amazon.co.uk> > >Now that it's handled sanely by a linker script we can have actual data, >and just use %rip-relative addressing to access it. > >If we could call the *copy* instead of the original relocate_kernel in >the kernel text, then we could use %rip-relative addressing everywhere. > >Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> >--- > arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------ > arch/x86/kernel/vmlinux.lds.S | 2 +- > 2 files changed, 35 insertions(+), 25 deletions(-) > >diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S >index 1efcbd340528..577aa1672349 100644 >--- a/arch/x86/kernel/relocate_kernel_64.S >+++ b/arch/x86/kernel/relocate_kernel_64.S >@@ -27,18 +27,28 @@ > * ~ control_page + PAGE_SIZE are used as data storage and stack for > * jumping back > */ >-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) > >+ .section .data.relocate_kernel,"a"; > /* Minimal CPU state */ >-#define RSP DATA(0x0) >-#define CR0 DATA(0x8) >-#define CR3 DATA(0x10) >-#define CR4 DATA(0x18) >- >+SYM_DATA_LOCAL(saved_rsp, .quad 0) >+SYM_DATA_LOCAL(saved_cr0, .quad 0) >+SYM_DATA_LOCAL(saved_cr3, .quad 0) >+SYM_DATA_LOCAL(saved_cr4, .quad 0) > /* other data */ >-#define CP_PA_TABLE_PAGE DATA(0x20) >-#define CP_PA_SWAP_PAGE DATA(0x28) >-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) >+SYM_DATA_LOCAL(pa_table_page, .quad 0) >+SYM_DATA_LOCAL(pa_swap_page, .quad 0) >+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) >+ >+/* >+ * There are two physical copies of relocate_kernel(), one in the original >+ * Kernel text and the other copied to the control page. There is a virtual >+ * mapping of each, in the original kernel. It is the *original* which is >+ * called from machine_kexec(), largely becaose the copy isn't mapped as an >+ * executable page. Thus, this code cannot just use %rip-relative addressing >+ * until after the %cr3 change and the jump to identity_mapped(). Until >+ * then, some pointer arithmetic is required. >+ */ >+#define DATA(x) (x - relocate_kernel) > > .section .text.relocate_kernel,"ax"; > .code64 >@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) > pushf > > movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 >- movq %rsp, RSP(%r11) >+ movq %rsp, DATA(saved_rsp)(%r11) > movq %cr0, %rax >- movq %rax, CR0(%r11) >+ movq %rax, DATA(saved_cr0)(%r11) > movq %cr3, %rax >- movq %rax, CR3(%r11) >+ movq %rax, DATA(saved_cr3)(%r11) > movq %cr4, %rax >- movq %rax, CR4(%r11) >+ movq %rax, DATA(saved_cr4)(%r11) > > /* Save CR4. Required to enable the right paging mode later. */ > movq %rax, %r13 >@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) > movq PTR(PA_SWAP_PAGE)(%rsi), %r10 > > /* save some information for jumping back */ >- movq %r9, CP_PA_TABLE_PAGE(%r11) >- movq %r10, CP_PA_SWAP_PAGE(%r11) >- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) >+ movq %r9, DATA(pa_table_page)(%r11) >+ movq %r10, DATA(pa_swap_page)(%r11) >+ movq %rdi, DATA(pa_backup_pages_map)(%r11) > > /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ > movq %rcx, %r11 >@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) > /* set return address to 0 if not preserving context */ > pushq $0 > /* store the start address on the stack */ >- pushq %rdx >+ pushq start_address(%rip) > > /* > * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP >@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) > /* get the re-entry point of the peer system */ > movq 0(%rsp), %rbp > leaq relocate_kernel(%rip), %r8 >- movq CP_PA_SWAP_PAGE(%r8), %r10 >- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi >- movq CP_PA_TABLE_PAGE(%r8), %rax >+ movq pa_swap_page(%rip), %r10 >+ movq pa_backup_pages_map(%rip), %rdi >+ movq pa_table_page(%rip), %rax > movq %rax, %cr3 > lea PAGE_SIZE(%r8), %rsp > call swap_pages >@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped) > SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) > UNWIND_HINT_END_OF_STACK > ANNOTATE_NOENDBR // RET target, above >- movq RSP(%r8), %rsp >- movq CR4(%r8), %rax >+ movq saved_rsp(%rip), %rsp >+ movq saved_cr4(%rip), %rax > movq %rax, %cr4 >- movq CR3(%r8), %rax >- movq CR0(%r8), %r8 >+ movq saved_cr3(%rip), %rax >+ movq saved_cr0(%r8), %r8 > movq %rax, %cr3 > movq %r8, %cr0 > movq %rbp, %rax >diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S >index ad451371e179..65f879b31a82 100644 >--- a/arch/x86/kernel/vmlinux.lds.S >+++ b/arch/x86/kernel/vmlinux.lds.S >@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot; > . = ALIGN(PAGE_SIZE); \ > __relocate_kernel_start = .; \ > *(.text.relocate_kernel); \ >- *(.rodata.relocate_kernel); \ >+ *(.data.relocate_kernel); \ > __relocate_kernel_end = .; > #else > #define KEXEC_RELOCATE_KERNEL_TEXT Looks good at first glance. I'm currently traveling so I haven't fully reviewed it though.
On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote: > > > --- a/arch/x86/kernel/vmlinux.lds.S > > +++ b/arch/x86/kernel/vmlinux.lds.S > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot; > > . = ALIGN(PAGE_SIZE); \ > > __relocate_kernel_start = .; \ > > *(.text.relocate_kernel); \ > > - *(.rodata.relocate_kernel); \ > > + *(.data.relocate_kernel); \ > > __relocate_kernel_end = .; > > #else > > #define KEXEC_RELOCATE_KERNEL_TEXT > > Looks good at first glance. I'm currently traveling so I haven't > fully reviewed it though. Turns out it doesn't help much. It's neater, obviously, but objtool still wants to disassemble the .data.relocate_kernel section after it gets included in the overall kernel text section. So once I rebase the debugging support on top and add the GDT¹ into the data section, it gets very unhappy: arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder bug, please report this. arch/x86/tools/insn_decoder_test: warning: ffffffff8219250a: eb fd 1f 00 00 00 00 00 00 00 ff ff 00 00 00 9a cf 00 ................ff ff 00 00 00 9a af 00 ff ff 00 00 00 92 cf 00 ................cc cc cc cc cc cc cc cc cc cc cc cc ............ jmp ffffffffarch/x86/tools/insn_decoder_test: warning: objdump says 23 bytes, but insn_get_length() says 2 arch/x86/tools/insn_decoder_test: error: malformed line 5236442: 82192509 <exc_handler+0x18e> It's also still complaining that pr_nybble() and pr_qword() are unreachable, although it doesn't say that of the exc_handler() function which blatantly calls them. Although I can look at that harder if I do factor it out to C code. ¹ https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote: > On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote: > > > > > --- a/arch/x86/kernel/vmlinux.lds.S > > > +++ b/arch/x86/kernel/vmlinux.lds.S > > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot; > > > . = ALIGN(PAGE_SIZE); \ > > > __relocate_kernel_start = .; \ > > > *(.text.relocate_kernel); \ > > > - *(.rodata.relocate_kernel); \ > > > + *(.data.relocate_kernel); \ Why are we having data in the middle of the text section? > > > __relocate_kernel_end = .; > > > #else > > > #define KEXEC_RELOCATE_KERNEL_TEXT > > > > Looks good at first glance. I'm currently traveling so I haven't > > fully reviewed it though. > > Turns out it doesn't help much. It's neater, obviously, but objtool > still wants to disassemble the .data.relocate_kernel section after it > gets included in the overall kernel text section. Objtool only decodes stuff that has SHF_EXECINSTR set. Why would your data sections have that on?
On Tue, 2024-11-12 at 11:14 +0100, Peter Zijlstra wrote: > On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote: > > On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote: > > > > > > > --- a/arch/x86/kernel/vmlinux.lds.S > > > > +++ b/arch/x86/kernel/vmlinux.lds.S > > > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot; > > > > . = ALIGN(PAGE_SIZE); \ > > > > __relocate_kernel_start = .; \ > > > > *(.text.relocate_kernel); \ > > > > - *(.rodata.relocate_kernel); \ > > > > + *(.data.relocate_kernel); \ > > Why are we having data in the middle of the text section? This is the relocate_kernel() page. It's the last thing the kernel calls on kexec. The kernel first takes a *copy* of it and places it into the identity mapping page tables which are set up for kexec. The relocate_kernel() function is then called at its *original* location in the kernel text. Before reloading %cr3, it stores some information which is going to become unavailable into its own page (currently using a bit of a nasty hack based on a hard-coded KEXEC_CONTROL_CODE_MAX_SIZE because we can't just use data symbol references). Then it reloads %cr3 and jumps to the identity-mapped copy of itself. Could we put .text.relocate_kernel and .data.relocate_kernel somewhere *other* than the main kernel text segment? Probably... it use use alternative instructions, but we could deal with that. And if we call from machine_kexec() directly into the *copy* (having marked it executable), maybe...?
On Tue, Nov 12, 2024 at 10:45:05AM +0000, David Woodhouse wrote:
> On Tue, 2024-11-12 at 11:14 +0100, Peter Zijlstra wrote:
> > On Tue, Nov 12, 2024 at 08:44:33AM +0000, David Woodhouse wrote:
> > > On Fri, 2024-11-08 at 12:26 +0100, H. Peter Anvin wrote:
> > > >
> > > > > --- a/arch/x86/kernel/vmlinux.lds.S
> > > > > +++ b/arch/x86/kernel/vmlinux.lds.S
> > > > > @@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot;
> > > > > . = ALIGN(PAGE_SIZE); \
> > > > > __relocate_kernel_start = .; \
> > > > > *(.text.relocate_kernel); \
> > > > > - *(.rodata.relocate_kernel); \
> > > > > + *(.data.relocate_kernel); \
> >
> > Why are we having data in the middle of the text section?
>
> This is the relocate_kernel() page. It's the last thing the kernel
> calls on kexec.
>
> The kernel first takes a *copy* of it and places it into the identity
> mapping page tables which are set up for kexec.
>
> The relocate_kernel() function is then called at its *original*
> location in the kernel text. Before reloading %cr3, it stores some
> information which is going to become unavailable into its own page
> (currently using a bit of a nasty hack based on a hard-coded
> KEXEC_CONTROL_CODE_MAX_SIZE because we can't just use data symbol
> references).
>
> Then it reloads %cr3 and jumps to the identity-mapped copy of itself.
>
> Could we put .text.relocate_kernel and .data.relocate_kernel somewhere
> *other* than the main kernel text segment? Probably... it use use
> alternative instructions, but we could deal with that. And if we call
> from machine_kexec() directly into the *copy* (having marked it
> executable), maybe...?
I fetched your branch and only saw a "RET before UNTRAIN" warning, did
you happen to already fix the other warnings up?
Something like the below fixes the warning I saw.
Though, maybe it's better to just tell objtool to keep its paws off of
the problematic functions by use of the STACK_FRAME_NON_STANDARD macro?
Then you could get rid of all the unwind hints and annotations.
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 6604f5d038aa..9ba10530b9c7 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4012,8 +4012,11 @@ static int validate_unret(struct objtool_file *file, struct instruction *insn)
return 0;
case INSN_RETURN:
- WARN_INSN(insn, "RET before UNTRAIN");
- return 1;
+ if (!insn->retpoline_safe) {
+ WARN_INSN(insn, "RET before UNTRAIN");
+ return 1;
+ }
+ break;
case INSN_NOP:
if (insn->retpoline_safe)
On 8 November 2024 03:26:58 GMT-08:00, "H. Peter Anvin" <hpa@zytor.com> wrote: >On November 8, 2024 6:22:41 AM GMT+01:00, David Woodhouse <dwmw2@infradead.org> wrote: >>From: David Woodhouse <dwmw@amazon.co.uk> >> >>Now that it's handled sanely by a linker script we can have actual data, >>and just use %rip-relative addressing to access it. >> >>If we could call the *copy* instead of the original relocate_kernel in >>the kernel text, then we could use %rip-relative addressing everywhere. >> >>Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> >>--- >> arch/x86/kernel/relocate_kernel_64.S | 58 ++++++++++++++++------------ >> arch/x86/kernel/vmlinux.lds.S | 2 +- >> 2 files changed, 35 insertions(+), 25 deletions(-) >> >>diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S >>index 1efcbd340528..577aa1672349 100644 >>--- a/arch/x86/kernel/relocate_kernel_64.S >>+++ b/arch/x86/kernel/relocate_kernel_64.S >>@@ -27,18 +27,28 @@ >> * ~ control_page + PAGE_SIZE are used as data storage and stack for >> * jumping back >> */ >>-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) >> >>+ .section .data.relocate_kernel,"a"; >> /* Minimal CPU state */ >>-#define RSP DATA(0x0) >>-#define CR0 DATA(0x8) >>-#define CR3 DATA(0x10) >>-#define CR4 DATA(0x18) >>- >>+SYM_DATA_LOCAL(saved_rsp, .quad 0) >>+SYM_DATA_LOCAL(saved_cr0, .quad 0) >>+SYM_DATA_LOCAL(saved_cr3, .quad 0) >>+SYM_DATA_LOCAL(saved_cr4, .quad 0) >> /* other data */ >>-#define CP_PA_TABLE_PAGE DATA(0x20) >>-#define CP_PA_SWAP_PAGE DATA(0x28) >>-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) >>+SYM_DATA_LOCAL(pa_table_page, .quad 0) >>+SYM_DATA_LOCAL(pa_swap_page, .quad 0) >>+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) >>+ >>+/* >>+ * There are two physical copies of relocate_kernel(), one in the original >>+ * Kernel text and the other copied to the control page. There is a virtual >>+ * mapping of each, in the original kernel. It is the *original* which is >>+ * called from machine_kexec(), largely becaose the copy isn't mapped as an >>+ * executable page. Thus, this code cannot just use %rip-relative addressing >>+ * until after the %cr3 change and the jump to identity_mapped(). Until >>+ * then, some pointer arithmetic is required. >>+ */ >>+#define DATA(x) (x - relocate_kernel) >> >> .section .text.relocate_kernel,"ax"; >> .code64 >>@@ -63,13 +73,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) >> pushf >> >> movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 >>- movq %rsp, RSP(%r11) >>+ movq %rsp, DATA(saved_rsp)(%r11) >> movq %cr0, %rax >>- movq %rax, CR0(%r11) >>+ movq %rax, DATA(saved_cr0)(%r11) >> movq %cr3, %rax >>- movq %rax, CR3(%r11) >>+ movq %rax, DATA(saved_cr3)(%r11) >> movq %cr4, %rax >>- movq %rax, CR4(%r11) >>+ movq %rax, DATA(saved_cr4)(%r11) >> >> /* Save CR4. Required to enable the right paging mode later. */ >> movq %rax, %r13 >>@@ -94,9 +104,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) >> movq PTR(PA_SWAP_PAGE)(%rsi), %r10 >> >> /* save some information for jumping back */ >>- movq %r9, CP_PA_TABLE_PAGE(%r11) >>- movq %r10, CP_PA_SWAP_PAGE(%r11) >>- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) >>+ movq %r9, DATA(pa_table_page)(%r11) >>+ movq %r10, DATA(pa_swap_page)(%r11) >>+ movq %rdi, DATA(pa_backup_pages_map)(%r11) >> >> /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ >> movq %rcx, %r11 >>@@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) >> /* set return address to 0 if not preserving context */ >> pushq $0 >> /* store the start address on the stack */ >>- pushq %rdx >>+ pushq start_address(%rip) >> >> /* >> * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP >>@@ -227,9 +237,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) >> /* get the re-entry point of the peer system */ >> movq 0(%rsp), %rbp >> leaq relocate_kernel(%rip), %r8 >>- movq CP_PA_SWAP_PAGE(%r8), %r10 >>- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi >>- movq CP_PA_TABLE_PAGE(%r8), %rax >>+ movq pa_swap_page(%rip), %r10 >>+ movq pa_backup_pages_map(%rip), %rdi >>+ movq pa_table_page(%rip), %rax >> movq %rax, %cr3 >> lea PAGE_SIZE(%r8), %rsp >> call swap_pages >>@@ -243,11 +253,11 @@ SYM_CODE_END(identity_mapped) >> SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) >> UNWIND_HINT_END_OF_STACK >> ANNOTATE_NOENDBR // RET target, above >>- movq RSP(%r8), %rsp >>- movq CR4(%r8), %rax >>+ movq saved_rsp(%rip), %rsp >>+ movq saved_cr4(%rip), %rax >> movq %rax, %cr4 >>- movq CR3(%r8), %rax >>- movq CR0(%r8), %r8 >>+ movq saved_cr3(%rip), %rax >>+ movq saved_cr0(%r8), %r8 >> movq %rax, %cr3 >> movq %r8, %cr0 >> movq %rbp, %rax >>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S >>index ad451371e179..65f879b31a82 100644 >>--- a/arch/x86/kernel/vmlinux.lds.S >>+++ b/arch/x86/kernel/vmlinux.lds.S >>@@ -100,7 +100,7 @@ const_pcpu_hot = pcpu_hot; >> . = ALIGN(PAGE_SIZE); \ >> __relocate_kernel_start = .; \ >> *(.text.relocate_kernel); \ >>- *(.rodata.relocate_kernel); \ >>+ *(.data.relocate_kernel); \ >> __relocate_kernel_end = .; >> #else >> #define KEXEC_RELOCATE_KERNEL_TEXT > >Looks good at first glance. I'm currently traveling so I haven't fully reviewed it though. Ta. That's good enough for me to go ahead and port the rest over. Is there a selftest for the preserve-context mode somewhere, with a payload that just does a "ret"?
On Fri, 2024-11-08 at 05:22 +0000, David Woodhouse wrote: > > @@ -128,7 +138,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) > /* set return address to 0 if not preserving context */ > pushq $0 > /* store the start address on the stack */ > - pushq %rdx > + pushq start_address(%rip) > > /* > * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP Oops, drop that part; I did that (and actually defined it, and stored the value in it) just to test that everything was actually *working*, since I think the rest of the data fields are only used for preserve_context mode. Fixed in the tree at https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads//kexec-lds
On Tue, 2024-11-05 at 18:47 -0800, H. Peter Anvin wrote: > > Yes, the linker script needs to happen. > > This is a case of doing it right vs doing it quickly. That may just tip it over the edge of how much work it's worth doing to clean up the debug hack that I implemented for my own use, and make it useful to others in the future. But I'll see what I can do.
On 11/5/24 13:37, H. Peter Anvin wrote:
> What is the point of writing this code in assembly in the first place? A
> much more logical thing to do is to just push the registers you haven't
> pushed already onto the stack and call a C function to do the actual
> dumping? It isn't like it is in any shape, way or form performance
> critical.
arch/x86/boot/compressed/misc.c has some code that you can crib, both
for writing to a text screen (not that useful anymore with EFI
framebuffers) and serial port. If you factor it a little bit then you
can probably even share the code directly.
(__putstr perhaps should have a __putchar() factored out of it?)
Then you can just do the obvious (have your assembly stub point %rdi to
the base of the register dump; set the frame order to whatever you'd
like, except rip/err/exc, or reverse the order if you prefer by changing
the loop):
static inline __noreturn void die(void)
{
while (1)
asm volatile("hlt");
}
void dump_register_frame(const unsigned long frame[])
{
static const char regnames[][5] = {
"rax:", "rcx:", "rdx:", "rbx:",
"rsp:", "rbp:", "rsi:", "rdi:",
"r8: ", "r9: ", "r10:", "r11:",
"r12:", "r13:", "r14:", "r15:",
"cr2:", "Exc:", "Err:", "rip:"
};
for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
__putstr(regnames[i]);
__puthex(frame[i]);
__putstr("\n");
}
/* Only return from int3 */
if (frame[17] != 3)
die();
}
On Tue, Nov 05, 2024 at 01:58:40PM -0800, H. Peter Anvin wrote:
> On 11/5/24 13:37, H. Peter Anvin wrote:
> > What is the point of writing this code in assembly in the first place? A
> > much more logical thing to do is to just push the registers you haven't
> > pushed already onto the stack and call a C function to do the actual
> > dumping? It isn't like it is in any shape, way or form performance
> > critical.
>
> arch/x86/boot/compressed/misc.c has some code that you can crib, both for
> writing to a text screen (not that useful anymore with EFI framebuffers) and
> serial port. If you factor it a little bit then you can probably even share
> the code directly.
>
> (__putstr perhaps should have a __putchar() factored out of it?)
>
> Then you can just do the obvious (have your assembly stub point %rdi to the
> base of the register dump; set the frame order to whatever you'd like,
> except rip/err/exc, or reverse the order if you prefer by changing the
> loop):
>
> static inline __noreturn void die(void)
> {
> while (1)
> asm volatile("hlt");
> }
>
> void dump_register_frame(const unsigned long frame[])
> {
> static const char regnames[][5] = {
> "rax:", "rcx:", "rdx:", "rbx:",
> "rsp:", "rbp:", "rsi:", "rdi:",
> "r8: ", "r9: ", "r10:", "r11:",
> "r12:", "r13:", "r14:", "r15:",
> "cr2:", "Exc:", "Err:", "rip:"
> };
>
> for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
> __putstr(regnames[i]);
> __puthex(frame[i]);
> __putstr("\n");
> }
>
> /* Only return from int3 */
> if (frame[17] != 3)
> die();
> }
+ a mil here!
Please, use existing early_serial_console infra. Also we have
debug_putaddr() / debug_puthex() / debug_putstr() which are
probably ones you want to use.
--
With Best Regards,
Andy Shevchenko
On 11/5/24 13:58, H. Peter Anvin wrote:
>
> Then you can just do the obvious (have your assembly stub point %rdi to
> the base of the register dump; set the frame order to whatever you'd
> like, except rip/err/exc, or reverse the order if you prefer by changing
> the loop):
>
> static inline __noreturn void die(void)
> {
> while (1)
> asm volatile("hlt");
> }
>
> void dump_register_frame(const unsigned long frame[])
> {
> static const char regnames[][5] = {
> "rax:", "rcx:", "rdx:", "rbx:",
> "rsp:", "rbp:", "rsi:", "rdi:",
> "r8: ", "r9: ", "r10:", "r11:",
> "r12:", "r13:", "r14:", "r15:",
> "cr2:", "Exc:", "Err:", "rip:"
> };
>
> for (size_t i = 0; i < ARRAY_SIZE(regnames); i++) {
> __putstr(regnames[i]);
> __puthex(frame[i]);
> __putstr("\n");
> }
>
> /* Only return from int3 */
> if (frame[17] != 3)
> die();
> }
>
I don't know if it is necessary, but you can do something like this to
make absolutely sure you don't end up with non-relative symbol references:
static inline __constfunc void * sym_addr(const void *sym)
{
void *addr;
asm("lea %c1(%%rip),%0" : "=r" (addr) : "i" (sym));
return addr;
}
On 11/5/24 14:27, H. Peter Anvin wrote:
>
> I don't know if it is necessary, but you can do something like this to
> make absolutely sure you don't end up with non-relative symbol references:
>
> static inline __constfunc void * sym_addr(const void *sym)
> {
> void *addr;
> asm("lea %c1(%%rip),%0" : "=r" (addr) : "i" (sym));
> return addr;
> }
>
Compiling with "-fpic -fvisibility=hidden -mcmodel=medium" can also be
used to suppress non-relative references. If you have external symbol
references they at least *should* be emitted as GOTPCRELX relocations
which the linker should relax to relative references; however, I would
think that you really don't want any of those :)
-hpa
On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
>On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>>
>> +
>> +/* Print the byte in %bl, clobber %rax */
>> +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> + movb %bl, %al
>> + nop
>> + andb $0x0f, %al
>> + addb $0x30, %al
>> + cmpb $0x3a, %al
>> + jb 1f
>> + addb $('a' - '0' - 10), %al
>> +1: pr_char
>> + ANNOTATE_UNRET_SAFE
>> + ret
>> +SYM_CODE_END(pr_byte)
>> +
>
>Obviously that function name (and comment) are wrong; fixed in my tree.
>at
>https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
>
>This function (and also pr_qword) are also what objtool is complaining
>about:
>
>vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
>vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
>
>I don't quite see why, because pr_qword() quite blatantly calls
>pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
>pr_qword().
>
>But most of the objtool annotations I've added here were just to make
>it shut up and build, without much though. Peter, Josh, any chance you
>can help me fix it up please?
>
>It would also be really useful if objtool would let me have data inside
>a "code" segment, without complaining that it can't decode it as
>instructions — and without also failing to decode the first instruction
>of the *subsequent* function. I've put the GDT at the end to work
>around that, but it's a bit nasty.
code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.
On Tue, 2024-11-05 at 12:50 -0800, H. Peter Anvin wrote:
> On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
> > On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
> > >
> > > +
> > > +/* Print the byte in %bl, clobber %rax */
> > > +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
> > > + movb %bl, %al
> > > + nop
> > > + andb $0x0f, %al
> > > + addb $0x30, %al
> > > + cmpb $0x3a, %al
> > > + jb 1f
> > > + addb $('a' - '0' - 10), %al
> > > +1: pr_char
> > > + ANNOTATE_UNRET_SAFE
> > > + ret
> > > +SYM_CODE_END(pr_byte)
> > > +
> >
> > Obviously that function name (and comment) are wrong; fixed in my tree.
> > at
> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
> >
> > This function (and also pr_qword) are also what objtool is complaining
> > about:
> >
> > vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
> > vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
> >
> > I don't quite see why, because pr_qword() quite blatantly calls
> > pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
> > pr_qword().
> >
> > But most of the objtool annotations I've added here were just to make
> > it shut up and build, without much though. Peter, Josh, any chance you
> > can help me fix it up please?
> >
> > It would also be really useful if objtool would let me have data inside
> > a "code" segment, without complaining that it can't decode it as
> > instructions — and without also failing to decode the first instruction
> > of the *subsequent* function. I've put the GDT at the end to work
> > around that, but it's a bit nasty.
>
> code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.
This is the relocate_kernel() function in
arch/x86/kernel/relocate_kernel_64.S
It's copied into a separate page, called (in its original location) as
a simple function from the kernel, changes %cr3 to set of identity-
mapped page tables and jumps to its *identity-mapped* address, then
copies all the right pages for kexec and jumps into the new kernel.
So it's all in a single page, and currently it plays nasty tricks to
store data after the code. Perhaps it *should* have its own code and
data sections and a linker script to keep them together...
On November 5, 2024 1:29:54 PM PST, David Woodhouse <dwmw2@infradead.org> wrote:
>On Tue, 2024-11-05 at 12:50 -0800, H. Peter Anvin wrote:
>> On November 5, 2024 12:38:10 PM PST, "Woodhouse, David" <dwmw@amazon.co.uk> wrote:
>> > On Sun, 2024-11-03 at 05:35 +0000, David Woodhouse wrote:
>> > >
>> > > +
>> > > +/* Print the byte in %bl, clobber %rax */
>> > > +SYM_CODE_START_LOCAL_NOALIGN(pr_byte)
>> > > + movb %bl, %al
>> > > + nop
>> > > + andb $0x0f, %al
>> > > + addb $0x30, %al
>> > > + cmpb $0x3a, %al
>> > > + jb 1f
>> > > + addb $('a' - '0' - 10), %al
>> > > +1: pr_char
>> > > + ANNOTATE_UNRET_SAFE
>> > > + ret
>> > > +SYM_CODE_END(pr_byte)
>> > > +
>> >
>> > Obviously that function name (and comment) are wrong; fixed in my tree.
>> > at
>> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/kexec-debug
>> >
>> > This function (and also pr_qword) are also what objtool is complaining
>> > about:
>> >
>> > vmlinux.o: warning: objtool: relocate_range+0x2f6: unreachable instruction
>> > vmlinux.o: warning: objtool: relocate_range+0x305: unreachable instruction
>> >
>> > I don't quite see why, because pr_qword() quite blatantly calls
>> > pr_nyblle(), as it's now named. And exc_handler() repeatedly calls
>> > pr_qword().
>> >
>> > But most of the objtool annotations I've added here were just to make
>> > it shut up and build, without much though. Peter, Josh, any chance you
>> > can help me fix it up please?
>> >
>> > It would also be really useful if objtool would let me have data inside
>> > a "code" segment, without complaining that it can't decode it as
>> > instructions — and without also failing to decode the first instruction
>> > of the *subsequent* function. I've put the GDT at the end to work
>> > around that, but it's a bit nasty.
>>
>> code in the data *section* or *segment*? Either is nasty, though. That's what .rodata is for.
>
>This is the relocate_kernel() function in
>arch/x86/kernel/relocate_kernel_64.S
>
>It's copied into a separate page, called (in its original location) as
>a simple function from the kernel, changes %cr3 to set of identity-
>mapped page tables and jumps to its *identity-mapped* address, then
>copies all the right pages for kexec and jumps into the new kernel.
>
>So it's all in a single page, and currently it plays nasty tricks to
>store data after the code. Perhaps it *should* have its own code and
>data sections and a linker script to keep them together...
Yes, it should.
© 2016 - 2026 Red Hat, Inc.