Logic for preserving the CPU registers and memory regions has been done
in previous patches.
Write that data to the relevant memory addresses: PROC_DUMP_AREA for the
CPU registers, and MDRT for the preserved memory regions.
Also export the "mpipl-boot" device tree property, so that the kernel knows
it is a 'dump active' boot.
Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Aditya Gupta <adityag@linux.ibm.com>
---
hw/ppc/pnv.c | 45 +++++++++++++-
hw/ppc/pnv_mpipl.c | 140 +++++++++++++++++++++++++++++++++++++++++++
include/hw/ppc/pnv.h | 1 +
3 files changed, 185 insertions(+), 1 deletion(-)
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 3038b1626c54..0b94e0c7a19b 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -751,6 +751,8 @@ static void pnv_reset(MachineState *machine, ResetType type)
PnvMachineState *pnv = PNV_MACHINE(machine);
IPMIBmc *bmc;
void *fdt;
+ int node_offset;
+ bool is_next_boot_mpipl = false;
qemu_devices_reset(type);
@@ -781,7 +783,48 @@ static void pnv_reset(MachineState *machine, ResetType type)
_FDT((fdt_pack(fdt)));
}
- if (!pnv->mpipl_state.is_next_boot_mpipl) {
+ /*
+ * Only on success of writing MPIPL data will the next boot be provided
+ * "mpipl-boot" property in device tree
+ * Otherwise boot like a normal non-MPIPL boot
+ */
+ if (pnv->mpipl_state.is_next_boot_mpipl) {
+ /* Write the preserved MDRT and CPU State Data */
+ bool mpipl_write_succeeded = do_mpipl_write(pnv);
+
+ if (!mpipl_write_succeeded) {
+ is_next_boot_mpipl = false;
+ } else {
+ is_next_boot_mpipl = true;
+ }
+ }
+
+ /*
+ * If it's a MPIPL boot, add the "mpipl-boot" property, and reset the
+ * boolean for MPIPL boot for next boot
+ */
+ if (is_next_boot_mpipl) {
+ void *fdt_copy = g_malloc0(FDT_MAX_SIZE);
+
+ /* Create a writable copy of the fdt */
+ _FDT((fdt_open_into(fdt, fdt_copy, FDT_MAX_SIZE)));
+
+ node_offset = fdt_path_offset(fdt_copy, "/ibm,opal/dump");
+ _FDT((fdt_appendprop_u64(fdt_copy, node_offset, "mpipl-boot", 1)));
+
+ /* Update the fdt, and free the original fdt */
+ if (fdt != machine->fdt) {
+ /*
+ * Only free the fdt if it's not machine->fdt, to prevent
+ * double free, since we already free machine->fdt later
+ */
+ g_free(fdt);
+ }
+ fdt = fdt_copy;
+
+ /* This boot is an MPIPL, reset the boolean for next boot */
+ pnv->mpipl_state.is_next_boot_mpipl = false;
+ } else {
/*
* Set the "Thread Register State Entry Size", so that firmware can
* allocate enough memory to capture CPU state in the event of a
diff --git a/hw/ppc/pnv_mpipl.c b/hw/ppc/pnv_mpipl.c
index 37f498051254..2cf54a8ec306 100644
--- a/hw/ppc/pnv_mpipl.c
+++ b/hw/ppc/pnv_mpipl.c
@@ -20,6 +20,8 @@
(pnv->mpipl_state.skiboot_base + MDST_TABLE_OFF)
#define MDDT_TABLE_RELOCATED \
(pnv->mpipl_state.skiboot_base + MDDT_TABLE_OFF)
+#define MDRT_TABLE_RELOCATED \
+ (pnv->mpipl_state.skiboot_base + MDRT_TABLE_OFF)
#define PROC_DUMP_RELOCATED \
(pnv->mpipl_state.skiboot_base + PROC_DUMP_AREA_OFF)
@@ -319,6 +321,139 @@ static bool pnv_mpipl_preserve_cpu_state(PnvMachineState *pnv)
return true;
}
+/*
+ * Write the preserved CPU state into the buffer described by the Processor
+ * Dump Area (PROC_DUMP_AREA), then write the updated PROC_DUMP_AREA itself.
+ * Returns true on success, false (after logging) if the buffer is too small
+ * or any memory write fails.
+ */
+static bool pnv_mpipl_write_cpu_state(PnvMachineState *pnv)
+{
+    MpiplProcDumpArea *proc_area = &pnv->mpipl_state.proc_area;
+    MpiplPreservedCPUState *cpu_state = pnv->mpipl_state.cpu_states;
+    const uint32_t num_cpu_states = pnv->mpipl_state.num_cpu_states;
+    const size_t needed_size = num_cpu_states * sizeof(MpiplPreservedCPUState);
+    hwaddr next_regentries_hdr;
+    AddressSpace *default_as = &address_space_memory;
+    MemTxResult io_result;
+    /*
+     * Privileged memory access; the initializer also zeroes every other
+     * MemTxAttrs field instead of leaving them indeterminate.
+     */
+    MemTxAttrs attrs = { .user = 0, .memory = 1 };
+
+    if (be32_to_cpu(proc_area->alloc_size) < needed_size) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+            "MPIPL: Size of buffer allocated by skiboot (%u bytes) is not "
+            "enough to save all CPUs registers needed (%zu bytes)\n",
+            be32_to_cpu(proc_area->alloc_size), needed_size);
+
+        return false;
+    }
+
+    proc_area->version = PROC_DUMP_AREA_VERSION_P9;
+
+    /*
+     * This is the stride kernel/firmware should use to jump from a
+     * register entries header to next CPU's header
+     */
+    proc_area->thread_size = cpu_to_be32(sizeof(MpiplPreservedCPUState));
+
+    /* Write the header and register entries for each CPU */
+    next_regentries_hdr = be64_to_cpu(proc_area->alloc_addr) & (~HRMOR_BIT);
+    for (uint32_t i = 0; i < num_cpu_states; ++i) {
+        io_result = address_space_write(default_as, next_regentries_hdr, attrs,
+                        &cpu_state->hdr, sizeof(MpiplRegDataHdr));
+        if (io_result != MEMTX_OK) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                "MPIPL: Failed to write RegEntries Header\n");
+            return false;
+        }
+
+        io_result = address_space_write(default_as,
+                        next_regentries_hdr + sizeof(MpiplRegDataHdr), attrs,
+                        &cpu_state->reg_entries,
+                        NUM_REGS_PER_CPU * (sizeof(MpiplRegEntry)));
+        if (io_result != MEMTX_OK) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                "MPIPL: Failed to write Register Entries\n");
+            return false;
+        }
+
+        /*
+         * According to HDAT section:
+         * "15.3.1.5 Architected Register Data content":
+         *
+         * The next register entries header will be at current header +
+         * "Thread Register State Entry size"
+         *
+         * Note: proc_area.thread_size == sizeof(MpiplPreservedCPUState)
+         */
+        next_regentries_hdr += sizeof(MpiplPreservedCPUState);
+        ++cpu_state;
+    }
+
+    /* Point the destination address to the preserved memory region */
+    proc_area->dest_addr = proc_area->alloc_addr;
+    /* needed_size fits in 32 bits: it was checked against alloc_size above */
+    proc_area->act_size = cpu_to_be32(needed_size);
+
+    io_result = address_space_write(default_as, PROC_DUMP_AREA_OFF, attrs,
+                    proc_area, sizeof(MpiplProcDumpArea));
+    if (io_result != MEMTX_OK) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+            "MPIPL: Failed to write Processor Dump Area\n");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Write the preserved MDRT table, representing preserved memory regions,
+ * to MDRT_TABLE_RELOCATED (skiboot base + MDRT_TABLE_OFF).
+ * Returns true if everything went fine, else false for any error
+ */
+static bool pnv_mpipl_write_mdrt(PnvMachineState *pnv)
+{
+    MpiplPreservedState *state = &pnv->mpipl_state;
+    AddressSpace *default_as = &address_space_memory;
+    MemTxResult io_result;
+    /*
+     * Privileged memory access; the initializer also zeroes every other
+     * MemTxAttrs field instead of leaving them indeterminate.
+     */
+    MemTxAttrs attrs = { .user = 0, .memory = 1 };
+
+    /*
+     * Generally writes from platform during MPIPL don't go to a relocated
+     * skiboot address
+     *
+     * Though for MDRT we are doing so, as this is the address skiboot
+     * considers by default for MDRT
+     *
+     * MDRT/MDST/MDDT base addresses are actually meant to be shared by
+     * platform in SPIRA structures.
+     *
+     * Not implementing SPIRA as it increases complexity for no gains.
+     * Using the default address skiboot expects for MDRT, which is the
+     * relocated MDRT, hence writing to it
+     *
+     * Other tables like MDST/MDDT should not be written to relocated
+     * addresses, as skiboot will overwrite anything from SKIBOOT_BASE till
+     * SKIBOOT_BASE+SKIBOOT_SIZE (which is 0x30000000-0x31c00000 by default)
+     */
+    io_result = address_space_write(default_as, MDRT_TABLE_RELOCATED, attrs,
+                    state->mdrt_table,
+                    state->num_mdrt_entries * sizeof(MdrtTableEntry));
+    if (io_result != MEMTX_OK) {
+        qemu_log_mask(LOG_GUEST_ERROR, "MPIPL: Failed to write MDRT table\n");
+        return false;
+    }
+
+    return true;
+}
+
void do_mpipl_preserve(PnvMachineState *pnv)
{
pause_all_vcpus();
@@ -339,3 +474,8 @@ void do_mpipl_preserve(PnvMachineState *pnv)
*/
qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
}
+
+bool do_mpipl_write(PnvMachineState *pnv)
+{   /* MDRT first; CPU state is attempted only if the MDRT write succeeded */
+    return pnv_mpipl_write_mdrt(pnv) ? pnv_mpipl_write_cpu_state(pnv) : false;
+}
diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index 7d73629f112a..98fe10fb4f2e 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -295,5 +295,6 @@ void pnv_bmc_set_pnor(IPMIBmc *bmc, PnvPnor *pnor);
/* MPIPL helpers */
void do_mpipl_preserve(PnvMachineState *pnv);
+bool do_mpipl_write(PnvMachineState *pnv);
#endif /* PPC_PNV_H */
--
2.53.0
On 09/03/26 12:49, Aditya Gupta wrote:
> Logic for preserving the CPU registers and memory regions has been done
> in previous patches.
>
> Write those data at the relevant memory address, such as PROC_DUMP_AREA
> for CPU registers, and MDRT for preserved memory regions.
>
> Also export "mpipl-boot" device tree node, for kernel to know that it's
> a 'dump active' boot
>
> Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
> Signed-off-by: Aditya Gupta <adityag@linux.ibm.com>
> ---
> hw/ppc/pnv.c | 45 +++++++++++++-
> hw/ppc/pnv_mpipl.c | 140 +++++++++++++++++++++++++++++++++++++++++++
> include/hw/ppc/pnv.h | 1 +
> 3 files changed, 185 insertions(+), 1 deletion(-)
>
> diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> index 3038b1626c54..0b94e0c7a19b 100644
> --- a/hw/ppc/pnv.c
> +++ b/hw/ppc/pnv.c
> @@ -751,6 +751,8 @@ static void pnv_reset(MachineState *machine, ResetType type)
> PnvMachineState *pnv = PNV_MACHINE(machine);
> IPMIBmc *bmc;
> void *fdt;
> + int node_offset;
> + bool is_next_boot_mpipl = false;
>
> qemu_devices_reset(type);
>
> @@ -781,7 +783,48 @@ static void pnv_reset(MachineState *machine, ResetType type)
> _FDT((fdt_pack(fdt)));
> }
>
> - if (!pnv->mpipl_state.is_next_boot_mpipl) {
> + /*
> + * Only on success of writing MPIPL data will the next boot be provided
> + * "mpipl-boot" property in device tree
> + * Otherwise boot like a normal non-MPIPL boot
> + */
> + if (pnv->mpipl_state.is_next_boot_mpipl) {
> + /* Write the preserved MDRT and CPU State Data */
> + bool mpipl_write_succeeded = do_mpipl_write(pnv);
do_mpipl_write() returns bool and mpipl_write_succeeded is of type bool, yet
is_next_boot_mpipl is assigned through an if-else. Is that really needed?
> +
> + if (!mpipl_write_succeeded) {
> + is_next_boot_mpipl = false;
> + } else {
> + is_next_boot_mpipl = true;
> + }
> + }
> +
> + /*
> + * If it's a MPIPL boot, add the "mpipl-boot" property, and reset the
> + * boolean for MPIPL boot for next boot
> + */
> + if (is_next_boot_mpipl) {
> + void *fdt_copy = g_malloc0(FDT_MAX_SIZE);
> +
> + /* Create a writable copy of the fdt */
> + _FDT((fdt_open_into(fdt, fdt_copy, FDT_MAX_SIZE)));
> +
> + node_offset = fdt_path_offset(fdt_copy, "/ibm,opal/dump");
> + _FDT((fdt_appendprop_u64(fdt_copy, node_offset, "mpipl-boot", 1)));
What if node_offset is not valid (fdt_path_offset() returns a negative error code when the path is not found)? What is expected here?
> +
> + /* Update the fdt, and free the original fdt */
> + if (fdt != machine->fdt) {
> + /*
> + * Only free the fdt if it's not machine->fdt, to prevent
> + * double free, since we already free machine->fdt later
> + */
> + g_free(fdt);
> + }
> + fdt = fdt_copy;
> +
> + /* This boot is an MPIPL, reset the boolean for next boot */
> + pnv->mpipl_state.is_next_boot_mpipl = false;
> + } else {
> /*
> * Set the "Thread Register State Entry Size", so that firmware can
> * allocate enough memory to capture CPU state in the event of a
> diff --git a/hw/ppc/pnv_mpipl.c b/hw/ppc/pnv_mpipl.c
> index 37f498051254..2cf54a8ec306 100644
> --- a/hw/ppc/pnv_mpipl.c
> +++ b/hw/ppc/pnv_mpipl.c
> @@ -20,6 +20,8 @@
> (pnv->mpipl_state.skiboot_base + MDST_TABLE_OFF)
> #define MDDT_TABLE_RELOCATED \
> (pnv->mpipl_state.skiboot_base + MDDT_TABLE_OFF)
> +#define MDRT_TABLE_RELOCATED \
> + (pnv->mpipl_state.skiboot_base + MDRT_TABLE_OFF)
> #define PROC_DUMP_RELOCATED \
> (pnv->mpipl_state.skiboot_base + PROC_DUMP_AREA_OFF)
>
> @@ -319,6 +321,139 @@ static bool pnv_mpipl_preserve_cpu_state(PnvMachineState *pnv)
> return true;
> }
>
> +/*
> + * Write the preserved CPU state data in Processor Dump Area (PROC_DUMP_AREA)
> + *
> + * Returns true if everything went fine, else false for any error
> + */
> +static bool pnv_mpipl_write_cpu_state(PnvMachineState *pnv)
> +{
> + MpiplProcDumpArea *proc_area = &pnv->mpipl_state.proc_area;
> + MpiplPreservedCPUState *cpu_state = pnv->mpipl_state.cpu_states;
> + const uint32_t num_cpu_states = pnv->mpipl_state.num_cpu_states;
> + hwaddr next_regentries_hdr;
> + AddressSpace *default_as = &address_space_memory;
> + MemTxResult io_result;
> + MemTxAttrs attrs;
> +
> + /* Mark the memory transactions as privileged memory access */
> + attrs.user = 0;
> + attrs.memory = 1;
> +
> + if (be32_to_cpu(proc_area->alloc_size) <
> + (num_cpu_states * sizeof(MpiplPreservedCPUState))) {
> + qemu_log_mask(LOG_GUEST_ERROR,
> + "MPIPL: Size of buffer allocate by skiboot (%u bytes) is not"
> + "enough to save all CPUs registers needed (%zu bytes)",
> + be32_to_cpu(proc_area->alloc_size),
> + num_cpu_states * sizeof(MpiplPreservedCPUState));
> +
> + return false;
> + }
> +
> + proc_area->version = PROC_DUMP_AREA_VERSION_P9;
> +
> + /*
> + * This is the stride kernel/firmware should use to jump from a
> + * register entries header to next CPU's header
> + */
> + proc_area->thread_size = cpu_to_be32(sizeof(MpiplPreservedCPUState));
> +
> + /* Write the header and register entries for each CPU */
> + next_regentries_hdr = be64_to_cpu(proc_area->alloc_addr) & (~HRMOR_BIT);
> + for (int i = 0; i < num_cpu_states; ++i) {
> + io_result = address_space_write(default_as, next_regentries_hdr, attrs,
> + &cpu_state->hdr, sizeof(MpiplRegDataHdr));
> + if (io_result != MEMTX_OK) {
> + qemu_log_mask(LOG_GUEST_ERROR,
> + "MPIPL: Failed to write RegEntries Header\n");
> + return false;
> + }
> +
> + io_result = address_space_write(default_as,
> + next_regentries_hdr + sizeof(MpiplRegDataHdr), attrs,
> + &cpu_state->reg_entries,
> + NUM_REGS_PER_CPU * (sizeof(MpiplRegEntry)));
> + if (io_result != MEMTX_OK) {
> + qemu_log_mask(LOG_GUEST_ERROR,
> + "MPIPL: Failed to write Register Entries\n");
> + return false;
> + }
> +
> + /*
> + * According to HDAT section:
> + * "15.3.1.5 Architected Register Data content":
> + *
> + * The next register entries header will be at current header +
> + * "Thread Register State Entry size"
> + *
> + * Note: proc_area.thread_size == sizeof(MpiplPreservedCPUState)
> + */
> + next_regentries_hdr += sizeof(MpiplPreservedCPUState);
> + ++cpu_state;
> + }
> +
> + /* Point the destination address to the preserved memory region */
> + proc_area->dest_addr = proc_area->alloc_addr;
> + proc_area->act_size = cpu_to_be32(num_cpu_states *
> + sizeof(MpiplPreservedCPUState));
> +
> + io_result = address_space_write(default_as, PROC_DUMP_AREA_OFF, attrs,
> + proc_area, sizeof(MpiplProcDumpArea));
> + if (io_result != MEMTX_OK) {
> + qemu_log_mask(LOG_GUEST_ERROR,
> + "MPIPL: Failed to write Register Entries\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +/*
> + * Write the preserved MDRT table, representing preserved memory regions
> + *
> + * Returns true if everything went fine, else false for any error
> + */
> +static bool pnv_mpipl_write_mdrt(PnvMachineState *pnv)
> +{
> + MpiplPreservedState *state = &pnv->mpipl_state;
> + AddressSpace *default_as = &address_space_memory;
> + MemTxResult io_result;
> + MemTxAttrs attrs;
> +
> + /* Mark the memory transactions as privileged memory access */
> + attrs.user = 0;
> + attrs.memory = 1;
> +
> + /*
> + * Generally writes from platform during MPIPL don't go to a relocated
> + * skiboot address
> + *
> + * Though for MDRT we are doing so, as this is the address skiboot
> + * considers by default for MDRT
> + *
> + * MDRT/MDST/MDDT base addresses are actually meant to be shared by
> + * platform in SPIRA structures.
> + *
> + * Not implementing SPIRA as it increases complexity for no gains.
> + * Using the default address skiboot expects for MDRT, which is the
> + * relocated MDRT, hence writing to it
> + *
> + * Other tables like MDST/MDDT should not be written to relocated
> + * addresses, as skiboot will overwrite anything from SKIBOOT_BASE till
> + * SKIBOOT_BASE+SKIBOOT_SIZE (which is 0x30000000-0x31c00000 by default)
> + */
> + io_result = address_space_write(default_as, MDRT_TABLE_RELOCATED, attrs,
> + state->mdrt_table,
> + state->num_mdrt_entries * sizeof(MdrtTableEntry));
> + if (io_result != MEMTX_OK) {
> + qemu_log_mask(LOG_GUEST_ERROR, "MPIPL: Failed to write MDRT table\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> void do_mpipl_preserve(PnvMachineState *pnv)
> {
> pause_all_vcpus();
> @@ -339,3 +474,8 @@ void do_mpipl_preserve(PnvMachineState *pnv)
> */
> qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
> }
> +
> +bool do_mpipl_write(PnvMachineState *pnv)
> +{
> + return pnv_mpipl_write_mdrt(pnv) && pnv_mpipl_write_cpu_state(pnv);
> +}
> diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
> index 7d73629f112a..98fe10fb4f2e 100644
> --- a/include/hw/ppc/pnv.h
> +++ b/include/hw/ppc/pnv.h
> @@ -295,5 +295,6 @@ void pnv_bmc_set_pnor(IPMIBmc *bmc, PnvPnor *pnor);
>
> /* MPIPL helpers */
> void do_mpipl_preserve(PnvMachineState *pnv);
> +bool do_mpipl_write(PnvMachineState *pnv);
>
> #endif /* PPC_PNV_H */
© 2016 - 2026 Red Hat, Inc.