[PATCH] ppc/spapr: make fadump collect regs for maxcpus

Shivang Upadhyay posted 1 patch 1 month ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20260429065127.366813-1-shivangu@linux.ibm.com
Maintainers: Aditya Gupta <adityag@linux.ibm.com>, Sourabh Jain <sourabhjain@linux.ibm.com>, Nicholas Piggin <npiggin@gmail.com>, Harsh Prateek Bora <harshpb@linux.ibm.com>
There is a newer version of this series
hw/ppc/spapr_fadump.c | 57 +++++++++++++++++++++++++++++++++----------
1 file changed, 44 insertions(+), 13 deletions(-)
[PATCH] ppc/spapr: make fadump collect regs for maxcpus
Posted by Shivang Upadhyay 1 month ago
When a ppc machine is started with maxcpus greater than the number of
present CPUs (e.g., -smp cpus=2,maxcpus=8), firmware allocates the CPU
state data buffer based on maxcpus and advertises this size to the
kernel during fadump registration.

However, fadump currently fails to generate a vmcore during the
rebooted kernel because:

1. The CPU state buffer length does not match what firmware advertised
   (QEMU only populates entries for present CPUs, not maxcpus)
2. The kernel cannot find CPU data for all maxcpus slots

This patch adds placeholder entries for the unrealized cpus.
The placeholder entries contain only the CPUSTRT and CPUEND markers
with the CPU ID, which is sufficient for the kernel to recognize the
CPU slot exists even though the CPU was never realized.

This ensures the CPU state buffer size and structure matches what
firmware advertised to the kernel, allowing fadump to successfully
generate vmcore.

Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Harsh Prateek Bora <harshpb@linux.ibm.com>
Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
---
 hw/ppc/spapr_fadump.c | 57 +++++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 13 deletions(-)

diff --git a/hw/ppc/spapr_fadump.c b/hw/ppc/spapr_fadump.c
index 13cab0cfe1..b1a21ff115 100644
--- a/hw/ppc/spapr_fadump.c
+++ b/hw/ppc/spapr_fadump.c
@@ -309,6 +309,30 @@ static bool do_preserve_region(FadumpSection *region)
     return true;
 }
 
+/*
+ * Populate placeholder register entries for an unrealized CPU
+ *
+ * For CPUs that are not realized (cpu_index < maxcpus), we need to
+ * create minimal register entries containing only CPUSTRT and CPUEND
+ * markers to maintain the expected buffer structure.
+ *
+ * Returns pointer just past the placeholder entries, which can be used
+ * as the start address for the next CPU's register entries
+ */
+static FadumpRegEntry *populate_cpu_reg_entries_placeholder(int cpu_id,
+                                                             FadumpRegEntry *curr_reg_entry)
+{
+    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUSTRT"));
+    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
+    ++curr_reg_entry;
+
+    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUEND"));
+    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
+    ++curr_reg_entry;
+
+    return curr_reg_entry;
+}
+
 /*
  * Populate the passed CPUs register entries, in the buffer starting at
  * the argument 'curr_reg_entry'
@@ -450,7 +474,7 @@ static FadumpRegEntry *populate_cpu_reg_entries(CPUState *cpu,
  * callers), and sets the size of this buffer in the argument
  * 'cpu_state_len'
  */
-static void *get_cpu_state_data(uint64_t *cpu_state_len)
+static void *get_cpu_state_data(uint64_t *cpu_state_len, MachineState *ms)
 {
     FadumpRegSaveAreaHeader reg_save_hdr;
     g_autofree FadumpRegEntry *reg_entries = NULL;
@@ -459,15 +483,11 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
 
     uint32_t num_reg_entries;
     uint32_t reg_entries_size;
-    uint32_t num_cpus = 0;
+    uint32_t num_cpus = ms->smp.max_cpus;
 
     void *cpu_state_buffer = NULL;
     uint64_t offset = 0;
 
-    CPU_FOREACH(cpu) {
-        ++num_cpus;
-    }
-
     reg_save_hdr.version = cpu_to_be32(0);
     reg_save_hdr.magic_number =
         cpu_to_be64(fadump_str_to_u64("REGSAVE"));
@@ -484,10 +504,20 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
     /* Pointer to current CPU's registers */
     curr_reg_entry = reg_entries;
 
-    /* Populate register entries for all CPUs */
-    CPU_FOREACH(cpu) {
-        cpu_synchronize_state(cpu);
-        curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
+    /*
+     * Populate register entries for all CPU slots (0 to maxcpus-1)
+     */
+    for (int i = 0; i < num_cpus; i++) {
+        cpu = qemu_get_cpu(i);
+        if (cpu) {
+            /* CPU is realized - populate full register entries */
+            cpu_synchronize_state(cpu);
+            curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
+        } else {
+            /* CPU slot is empty - populate placeholder entries */
+            curr_reg_entry = populate_cpu_reg_entries_placeholder(i,
+                                                                   curr_reg_entry);
+        }
     }
 
     *cpu_state_len = 0;
@@ -526,7 +556,7 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
  * such as invalid destination address or non-fatal error errors likely
  * caused due to invalid parameters, return true and set region->error_flags
  */
-static bool do_populate_cpu_state(FadumpSection *region)
+static bool do_populate_cpu_state(FadumpSection *region, MachineState *ms)
 {
     uint64_t dest_addr = be64_to_cpu(region->destination_address);
     uint64_t cpu_state_len = 0;
@@ -541,7 +571,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
     attrs.user = 0;
     attrs.memory = 1;
 
-    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len);
+    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len, ms);
 
     io_result = address_space_write(default_as, dest_addr, attrs,
             cpu_state_buffer, cpu_state_len);
@@ -597,6 +627,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
 static bool fadump_preserve_mem(SpaprMachineState *spapr)
 {
     FadumpMemStruct *fdm = &spapr->registered_fdm;
+    MachineState *ms = MACHINE(spapr);
     uint16_t dump_num_sections, data_type;
 
     assert(spapr->fadump_registered);
@@ -627,7 +658,7 @@ static bool fadump_preserve_mem(SpaprMachineState *spapr)
 
         switch (data_type) {
         case FADUMP_CPU_STATE_DATA:
-            if (!do_populate_cpu_state(&fdm->rgn[i])) {
+            if (!do_populate_cpu_state(&fdm->rgn[i], ms)) {
                 qemu_log_mask(LOG_GUEST_ERROR,
                     "FADump: Failed to store CPU State Data");
                 fdm->header.dump_status_flag |=
-- 
2.53.0
Re: [PATCH] ppc/spapr: make fadump collect regs for maxcpus
Posted by Anushree Mathur 2 weeks, 1 day ago

On 29/04/26 12:21 PM, Shivang Upadhyay wrote:
> When a ppc machine is started with maxcpus greater than the number of
> present CPUs (e.g., -smp cpus=2,maxcpus=8), firmware allocates the CPU
> state data buffer based on maxcpus and advertises this size to the
> kernel during fadump registration.
>
> However, fadump currently fails to generate a vmcore during the
> rebooted kernel because:
>
> 1. The CPU state buffer length does not match what firmware advertised
>     (QEMU only populates entries for present CPUs, not maxcpus)
> 2. The kernel cannot find CPU data for all maxcpus slots
>
> This patch adds placeholder entries for the unrealized cpus.
> The placeholder entries contain only the CPUSTRT and CPUEND markers
> with the CPU ID, which is sufficient for the kernel to recognize the
> CPU slot exists even though the CPU was never realized.
>
> This ensures the CPU state buffer size and structure matches what
> firmware advertised to the kernel, allowing fadump to successfully
> generate vmcore.
>
> Cc: Aditya Gupta <adityag@linux.ibm.com>
> Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> Cc: Harsh Prateek Bora <harshpb@linux.ibm.com>
> Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> ---
>   hw/ppc/spapr_fadump.c | 57 +++++++++++++++++++++++++++++++++----------
>   1 file changed, 44 insertions(+), 13 deletions(-)
>
> diff --git a/hw/ppc/spapr_fadump.c b/hw/ppc/spapr_fadump.c
> index 13cab0cfe1..b1a21ff115 100644
> --- a/hw/ppc/spapr_fadump.c
> +++ b/hw/ppc/spapr_fadump.c
> @@ -309,6 +309,30 @@ static bool do_preserve_region(FadumpSection *region)
>       return true;
>   }
>
> +/*
> + * Populate placeholder register entries for an unrealized CPU
> + *
> + * For CPUs that are not realized (cpu_index < maxcpus), we need to
> + * create minimal register entries containing only CPUSTRT and CPUEND
> + * markers to maintain the expected buffer structure.
> + *
> + * Returns pointer just past the placeholder entries, which can be used
> + * as the start address for the next CPU's register entries
> + */
> +static FadumpRegEntry *populate_cpu_reg_entries_placeholder(int cpu_id,
> +                                                             FadumpRegEntry *curr_reg_entry)
> +{
> +    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUSTRT"));
> +    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
> +    ++curr_reg_entry;
> +
> +    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUEND"));
> +    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
> +    ++curr_reg_entry;
> +
> +    return curr_reg_entry;
> +}
> +
>   /*
>    * Populate the passed CPUs register entries, in the buffer starting at
>    * the argument 'curr_reg_entry'
> @@ -450,7 +474,7 @@ static FadumpRegEntry *populate_cpu_reg_entries(CPUState *cpu,
>    * callers), and sets the size of this buffer in the argument
>    * 'cpu_state_len'
>    */
> -static void *get_cpu_state_data(uint64_t *cpu_state_len)
> +static void *get_cpu_state_data(uint64_t *cpu_state_len, MachineState *ms)
>   {
>       FadumpRegSaveAreaHeader reg_save_hdr;
>       g_autofree FadumpRegEntry *reg_entries = NULL;
> @@ -459,15 +483,11 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>
>       uint32_t num_reg_entries;
>       uint32_t reg_entries_size;
> -    uint32_t num_cpus = 0;
> +    uint32_t num_cpus = ms->smp.max_cpus;
>
>       void *cpu_state_buffer = NULL;
>       uint64_t offset = 0;
>
> -    CPU_FOREACH(cpu) {
> -        ++num_cpus;
> -    }
> -
>       reg_save_hdr.version = cpu_to_be32(0);
>       reg_save_hdr.magic_number =
>           cpu_to_be64(fadump_str_to_u64("REGSAVE"));
> @@ -484,10 +504,20 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>       /* Pointer to current CPU's registers */
>       curr_reg_entry = reg_entries;
>
> -    /* Populate register entries for all CPUs */
> -    CPU_FOREACH(cpu) {
> -        cpu_synchronize_state(cpu);
> -        curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
> +    /*
> +     * Populate register entries for all CPU slots (0 to maxcpus-1)
> +     */
> +    for (int i = 0; i < num_cpus; i++) {
> +        cpu = qemu_get_cpu(i);
> +        if (cpu) {
> +            /* CPU is realized - populate full register entries */
> +            cpu_synchronize_state(cpu);
> +            curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
> +        } else {
> +            /* CPU slot is empty - populate placeholder entries */
> +            curr_reg_entry = populate_cpu_reg_entries_placeholder(i,
> +                                                                   curr_reg_entry);
> +        }
>       }
>
>       *cpu_state_len = 0;
> @@ -526,7 +556,7 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>    * such as invalid destination address or non-fatal error errors likely
>    * caused due to invalid parameters, return true and set region->error_flags
>    */
> -static bool do_populate_cpu_state(FadumpSection *region)
> +static bool do_populate_cpu_state(FadumpSection *region, MachineState *ms)
>   {
>       uint64_t dest_addr = be64_to_cpu(region->destination_address);
>       uint64_t cpu_state_len = 0;
> @@ -541,7 +571,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
>       attrs.user = 0;
>       attrs.memory = 1;
>
> -    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len);
> +    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len, ms);
>
>       io_result = address_space_write(default_as, dest_addr, attrs,
>               cpu_state_buffer, cpu_state_len);
> @@ -597,6 +627,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
>   static bool fadump_preserve_mem(SpaprMachineState *spapr)
>   {
>       FadumpMemStruct *fdm = &spapr->registered_fdm;
> +    MachineState *ms = MACHINE(spapr);
>       uint16_t dump_num_sections, data_type;
>
>       assert(spapr->fadump_registered);
> @@ -627,7 +658,7 @@ static bool fadump_preserve_mem(SpaprMachineState *spapr)
>
>           switch (data_type) {
>           case FADUMP_CPU_STATE_DATA:
> -            if (!do_populate_cpu_state(&fdm->rgn[i])) {
> +            if (!do_populate_cpu_state(&fdm->rgn[i], ms)) {
>                   qemu_log_mask(LOG_GUEST_ERROR,
>                       "FADump: Failed to store CPU State Data");
>                   fdm->header.dump_status_flag |=


Hi Shivang,
Thanks for working on it, but patch is fixing the issue only for thread 
as 1 for guest topology but breaking for 2 or 4 or 8 as threads in guest.

Putting the details about the issue here.

As I reported when maxcpus tag is present, fadump is resulting
into multiple Kernel OOPS along with rtas error message "rtas fadump: 
Dump taken by platform is incomplete (0)".

Kernel OOPS are like following. Some kernel OOPS are for vmalloc memory, 
some are for cpu alloc etc in continuous loops and in the end guest crashed.

[    7.976103][  T775] BUG: Unable to handle kernel data access at 
0x629cb8ee54efc16f
[    7.976260][  T775] Faulting instruction address: 0xc000000000643e7c
[    7.976357][  T775] Oops: Kernel access of bad area, sig: 11 [#1]

[    7.978143][  T775] GPR28: 0000000000010000 629cb8ee54efc16f 
0000000000000008 c000000002e56a00
[    7.979257][  T775] NIP [c000000000643e7c] 
__kmalloc_node_noprof+0x21c/0x650
[    7.979402][  T775] LR [c000000000643e64] 
__kmalloc_node_noprof+0x204/0x650
[    7.979504][  T775] Call Trace:
[    7.979565][  T775] [c0000003fee5b620] [c008000000000000] 
0xc008000000000000 (unreliable)
[    7.979688][  T775] [c0000003fee5b6b0] [c000000000614ae8] 
__vmalloc_node_range_noprof+0x208/0x990
[    7.979831][  T775] [c0000003fee5b830] [c000000000615564] 
__vmalloc_noprof+0x54/0x70
[    7.979956][  T775] [c0000003fee5b8a0] [c000000000494cec] 
bpf_prog_alloc_no_stats+0x5c/0x2c0
[    7.980098][  T775] [c0000003fee5b8f0] [c000000000495028] 
bpf_prog_alloc+0xd8/0x120
[    7.980220][  T775] [c0000003fee5b930] [c000000000eff194] 
bpf_prog_create_from_user+0x74/0x190
[    7.980378][  T775] [c0000003fee5b9a0] [c0000000003dd770] 
seccomp_set_mode_filter+0x190/0xd20
[    7.980524][  T775] [c0000003fee5ba40] [c00000000002e410] 
system_call_exception+0x130/0x2a0
[    7.980649][  T775] [c0000003fee5be50] [c00000000000cfdc] 
system_call_vectored_common+0x15c/0x2ec
[    7.980791][  T775] ---- interrupt: 3000 at 0x7fffb6d6b3e4



[    7.805704][    C1] BUG: Unable to handle kernel instruction fetch 
(NULL pointer?)
[    7.805866][    C1] Faulting instruction address: 0x00000000
[    7.805976][    C1] Oops: Kernel access of bad area, sig: 7 [#1]
[    7.811066][    C1] [c000000003f9bd40] [c0000000012262c0] 
cpuidle_enter_state+0x440/0x6f8
[    7.811069][    C1] [c000000003f9bde0] [c000000000e1ab80] 
cpuidle_enter+0x50/0x80
[    7.811074][    C1] [c000000003f9be20] [c0000000002a3d28] 
call_cpuidle+0x48/0x90
[    7.811077][    C1] [c000000003f9be40] [c0000000002ab8d0] 
do_idle+0x2f0/0x3a0
[    7.811080][    C1] [c000000003f9beb0] [c0000000002abc4c] 
cpu_startup_entry+0x4c/0x50
[    7.811082][    C1] [c000000003f9bee0] [c000000000058dbc] 
start_secondary+0x6bc/0xe30
[    7.811086][    C1] [c000000003f9bfe0] [c00000000000e158] 
start_secondary_prolog+0x10/0x14
[    7.811089][    C1] Code: XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX 
XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX 
XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
[    7.813545][    C1] ---[ end trace 0000000000000000 ]---
[    7.818134][    C1] pstore: backend (nvram) writing error (-1)




After applying the patch the issue is getting fixed only for the 
scenario where topology is having thread "1" for the guest,
but when I give threads as 2 or 4 or 8 for the guest topology it is 
still crashing same as above mentioned. So the patch is
fixing the issue for partial scenarios. Some scenarios are still breaking.

Here I changed threads to 2 or 4 or 8 (-smp 
8,maxcpus=64,sockets=2,dies=1,clusters=1,cores=32,threads=1)

Thanks,
Anushree Mathur

Re: [PATCH] ppc/spapr: make fadump collect regs for maxcpus
Posted by Sourabh Jain 3 weeks, 2 days ago

On 29/04/26 12:21, Shivang Upadhyay wrote:
> When a ppc machine is started with maxcpus greater than the number of
> present CPUs (e.g., -smp cpus=2,maxcpus=8), firmware allocates the CPU
> state data buffer based on maxcpus and advertises this size to the
> kernel during fadump registration.
>
> However, fadump currently fails to generate a vmcore during the
> rebooted kernel because:

It would be good if we can include the error log printed by the
kernel while processing the CPU state buffer in the commit message.
It may help others in the future.

The rest of the things look good to me.

Feel free to add:
Reviewed-by: Sourabh Jain <sourabhjain@linux.ibm.com>

- Sourabh Jain

>
> 1. The CPU state buffer length does not match what firmware advertised
>     (QEMU only populates entries for present CPUs, not maxcpus)
> 2. The kernel cannot find CPU data for all maxcpus slots
>
> This patch adds placeholder entries for the unrealized cpus.
> The placeholder entries contain only the CPUSTRT and CPUEND markers
> with the CPU ID, which is sufficient for the kernel to recognize the
> CPU slot exists even though the CPU was never realized.
>
> This ensures the CPU state buffer size and structure matches what
> firmware advertised to the kernel, allowing fadump to successfully
> generate vmcore.
>
> Cc: Aditya Gupta <adityag@linux.ibm.com>
> Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
> Cc: Harsh Prateek Bora <harshpb@linux.ibm.com>
> Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> ---
>   hw/ppc/spapr_fadump.c | 57 +++++++++++++++++++++++++++++++++----------
>   1 file changed, 44 insertions(+), 13 deletions(-)
>
> diff --git a/hw/ppc/spapr_fadump.c b/hw/ppc/spapr_fadump.c
> index 13cab0cfe1..b1a21ff115 100644
> --- a/hw/ppc/spapr_fadump.c
> +++ b/hw/ppc/spapr_fadump.c
> @@ -309,6 +309,30 @@ static bool do_preserve_region(FadumpSection *region)
>       return true;
>   }
>   
> +/*
> + * Populate placeholder register entries for an unrealized CPU
> + *
> + * For CPUs that are not realized (cpu_index < maxcpus), we need to
> + * create minimal register entries containing only CPUSTRT and CPUEND
> + * markers to maintain the expected buffer structure.
> + *
> + * Returns pointer just past the placeholder entries, which can be used
> + * as the start address for the next CPU's register entries
> + */
> +static FadumpRegEntry *populate_cpu_reg_entries_placeholder(int cpu_id,
> +                                                             FadumpRegEntry *curr_reg_entry)
> +{
> +    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUSTRT"));
> +    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
> +    ++curr_reg_entry;
> +
> +    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUEND"));
> +    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
> +    ++curr_reg_entry;
> +
> +    return curr_reg_entry;
> +}
> +
>   /*
>    * Populate the passed CPUs register entries, in the buffer starting at
>    * the argument 'curr_reg_entry'
> @@ -450,7 +474,7 @@ static FadumpRegEntry *populate_cpu_reg_entries(CPUState *cpu,
>    * callers), and sets the size of this buffer in the argument
>    * 'cpu_state_len'
>    */
> -static void *get_cpu_state_data(uint64_t *cpu_state_len)
> +static void *get_cpu_state_data(uint64_t *cpu_state_len, MachineState *ms)
>   {
>       FadumpRegSaveAreaHeader reg_save_hdr;
>       g_autofree FadumpRegEntry *reg_entries = NULL;
> @@ -459,15 +483,11 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>   
>       uint32_t num_reg_entries;
>       uint32_t reg_entries_size;
> -    uint32_t num_cpus = 0;
> +    uint32_t num_cpus = ms->smp.max_cpus;
>   
>       void *cpu_state_buffer = NULL;
>       uint64_t offset = 0;
>   
> -    CPU_FOREACH(cpu) {
> -        ++num_cpus;
> -    }
> -
>       reg_save_hdr.version = cpu_to_be32(0);
>       reg_save_hdr.magic_number =
>           cpu_to_be64(fadump_str_to_u64("REGSAVE"));
> @@ -484,10 +504,20 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>       /* Pointer to current CPU's registers */
>       curr_reg_entry = reg_entries;
>   
> -    /* Populate register entries for all CPUs */
> -    CPU_FOREACH(cpu) {
> -        cpu_synchronize_state(cpu);
> -        curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
> +    /*
> +     * Populate register entries for all CPU slots (0 to maxcpus-1)
> +     */
> +    for (int i = 0; i < num_cpus; i++) {
> +        cpu = qemu_get_cpu(i);
> +        if (cpu) {
> +            /* CPU is realized - populate full register entries */
> +            cpu_synchronize_state(cpu);
> +            curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
> +        } else {
> +            /* CPU slot is empty - populate placeholder entries */
> +            curr_reg_entry = populate_cpu_reg_entries_placeholder(i,
> +                                                                   curr_reg_entry);
> +        }
>       }
>   
>       *cpu_state_len = 0;
> @@ -526,7 +556,7 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
>    * such as invalid destination address or non-fatal error errors likely
>    * caused due to invalid parameters, return true and set region->error_flags
>    */
> -static bool do_populate_cpu_state(FadumpSection *region)
> +static bool do_populate_cpu_state(FadumpSection *region, MachineState *ms)
>   {
>       uint64_t dest_addr = be64_to_cpu(region->destination_address);
>       uint64_t cpu_state_len = 0;
> @@ -541,7 +571,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
>       attrs.user = 0;
>       attrs.memory = 1;
>   
> -    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len);
> +    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len, ms);
>   
>       io_result = address_space_write(default_as, dest_addr, attrs,
>               cpu_state_buffer, cpu_state_len);
> @@ -597,6 +627,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
>   static bool fadump_preserve_mem(SpaprMachineState *spapr)
>   {
>       FadumpMemStruct *fdm = &spapr->registered_fdm;
> +    MachineState *ms = MACHINE(spapr);
>       uint16_t dump_num_sections, data_type;
>   
>       assert(spapr->fadump_registered);
> @@ -627,7 +658,7 @@ static bool fadump_preserve_mem(SpaprMachineState *spapr)
>   
>           switch (data_type) {
>           case FADUMP_CPU_STATE_DATA:
> -            if (!do_populate_cpu_state(&fdm->rgn[i])) {
> +            if (!do_populate_cpu_state(&fdm->rgn[i], ms)) {
>                   qemu_log_mask(LOG_GUEST_ERROR,
>                       "FADump: Failed to store CPU State Data");
>                   fdm->header.dump_status_flag |=
[PATCH v2] ppc/spapr: make fadump collect regs for maxcpus
Posted by Shivang Upadhyay 1 week, 3 days ago
When a ppc machine is started with maxcpus greater than the number of
present CPUs (e.g., -smp cpus=2,maxcpus=8), firmware allocates the CPU
state data buffer based on maxcpus and advertises this size to the
kernel during fadump registration.

However, fadump currently fails to generate a vmcore during the
rebooted kernel because:

1. The CPU state buffer length does not match what firmware advertised
   (QEMU only populates entries for present CPUs, not maxcpus)
2. The kernel cannot find CPU data for all maxcpus slots

This patch adds placeholder entries for the unrealized cpus.
The placeholder entries contain only the CPUSTRT and CPUEND markers
with the CPU ID, which is sufficient for the kernel to recognize the
CPU slot exists even though the CPU was never realized.

This ensures the CPU state buffer size and structure matches what
firmware advertised to the kernel, allowing fadump to successfully
generate vmcore.

Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Harsh Prateek Bora <harshpb@linux.ibm.com>
Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
---

Changelog:
V2:
 * change reservation size from maxcpus to possible_cpu_arch_ids. (Thanks Anushree for test case).
 * added a small change regarding interger ceil calculation.

V1:
 * link: https://lore.kernel.org/all/20260429065127.366813-1-shivangu@linux.ibm.com/

---
 hw/ppc/spapr_fadump.c | 60 +++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/hw/ppc/spapr_fadump.c b/hw/ppc/spapr_fadump.c
index 13cab0cfe1..77339129ed 100644
--- a/hw/ppc/spapr_fadump.c
+++ b/hw/ppc/spapr_fadump.c
@@ -237,7 +237,7 @@ static bool do_preserve_region(FadumpSection *region)
         return false;
     }
 
-    num_chunks = ceil((src_len * 1.0f) / FADUMP_CHUNK_SIZE);
+    num_chunks = ((src_len + FADUMP_CHUNK_SIZE - 1) / FADUMP_CHUNK_SIZE);
     for (uint64_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
         /* Take minimum of bytes left to copy, and chunk size */
         uint64_t copy_len = MIN(
@@ -309,6 +309,30 @@ static bool do_preserve_region(FadumpSection *region)
     return true;
 }
 
+/*
+ * Populate placeholder register entries for an unrealized CPU
+ *
+ * For CPUs that are not realized (cpu_index < maxcpus), we need to
+ * create minimal register entries containing only CPUSTRT and CPUEND
+ * markers to maintain the expected buffer structure.
+ *
+ * Returns pointer just past the placeholder entries, which can be used
+ * as the start address for the next CPU's register entries
+ */
+static FadumpRegEntry *populate_cpu_reg_entries_placeholder(int cpu_id,
+                                                             FadumpRegEntry *curr_reg_entry)
+{
+    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUSTRT"));
+    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
+    ++curr_reg_entry;
+
+    curr_reg_entry->reg_id = cpu_to_be64(fadump_str_to_u64("CPUEND"));
+    curr_reg_entry->reg_value = cpu_to_be64(cpu_id & FADUMP_CPU_ID_MASK);
+    ++curr_reg_entry;
+
+    return curr_reg_entry;
+}
+
 /*
  * Populate the passed CPUs register entries, in the buffer starting at
  * the argument 'curr_reg_entry'
@@ -450,24 +474,21 @@ static FadumpRegEntry *populate_cpu_reg_entries(CPUState *cpu,
  * callers), and sets the size of this buffer in the argument
  * 'cpu_state_len'
  */
-static void *get_cpu_state_data(uint64_t *cpu_state_len)
+static void *get_cpu_state_data(uint64_t *cpu_state_len, MachineState *ms)
 {
     FadumpRegSaveAreaHeader reg_save_hdr;
     g_autofree FadumpRegEntry *reg_entries = NULL;
     FadumpRegEntry *curr_reg_entry;
     CPUState *cpu;
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
 
     uint32_t num_reg_entries;
     uint32_t reg_entries_size;
-    uint32_t num_cpus = 0;
+    uint32_t num_cpus = mc->possible_cpu_arch_ids(ms)->len;
 
     void *cpu_state_buffer = NULL;
     uint64_t offset = 0;
 
-    CPU_FOREACH(cpu) {
-        ++num_cpus;
-    }
-
     reg_save_hdr.version = cpu_to_be32(0);
     reg_save_hdr.magic_number =
         cpu_to_be64(fadump_str_to_u64("REGSAVE"));
@@ -484,10 +505,20 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
     /* Pointer to current CPU's registers */
     curr_reg_entry = reg_entries;
 
-    /* Populate register entries for all CPUs */
-    CPU_FOREACH(cpu) {
-        cpu_synchronize_state(cpu);
-        curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
+    /*
+     * Populate register entries for all CPU slots (0 to maxcpus-1)
+     */
+    for (int i = 0; i < num_cpus; i++) {
+        cpu = qemu_get_cpu(i);
+        if (cpu) {
+            /* CPU is realized - populate full register entries */
+            cpu_synchronize_state(cpu);
+            curr_reg_entry = populate_cpu_reg_entries(cpu, curr_reg_entry);
+        } else {
+            /* CPU slot is empty - populate placeholder entries */
+            curr_reg_entry = populate_cpu_reg_entries_placeholder(i,
+                                                                   curr_reg_entry);
+        }
     }
 
     *cpu_state_len = 0;
@@ -526,7 +557,7 @@ static void *get_cpu_state_data(uint64_t *cpu_state_len)
  * such as invalid destination address or non-fatal error errors likely
  * caused due to invalid parameters, return true and set region->error_flags
  */
-static bool do_populate_cpu_state(FadumpSection *region)
+static bool do_populate_cpu_state(FadumpSection *region, MachineState *ms)
 {
     uint64_t dest_addr = be64_to_cpu(region->destination_address);
     uint64_t cpu_state_len = 0;
@@ -541,7 +572,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
     attrs.user = 0;
     attrs.memory = 1;
 
-    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len);
+    cpu_state_buffer  = get_cpu_state_data(&cpu_state_len, ms);
 
     io_result = address_space_write(default_as, dest_addr, attrs,
             cpu_state_buffer, cpu_state_len);
@@ -597,6 +628,7 @@ static bool do_populate_cpu_state(FadumpSection *region)
 static bool fadump_preserve_mem(SpaprMachineState *spapr)
 {
     FadumpMemStruct *fdm = &spapr->registered_fdm;
+    MachineState *ms = MACHINE(spapr);
     uint16_t dump_num_sections, data_type;
 
     assert(spapr->fadump_registered);
@@ -627,7 +659,7 @@ static bool fadump_preserve_mem(SpaprMachineState *spapr)
 
         switch (data_type) {
         case FADUMP_CPU_STATE_DATA:
-            if (!do_populate_cpu_state(&fdm->rgn[i])) {
+            if (!do_populate_cpu_state(&fdm->rgn[i], ms)) {
                 qemu_log_mask(LOG_GUEST_ERROR,
                     "FADump: Failed to store CPU State Data");
                 fdm->header.dump_status_flag |=
-- 
2.53.0