arch/powerpc/include/asm/smp.h | 1 +
arch/powerpc/kernel/smp.c | 1 +
arch/powerpc/platforms/pseries/smp.c | 29 +++++++++++++++++++++++++++-
3 files changed, 30 insertions(+), 1 deletion(-)
During DLPAR operations, the newly added CPUs start in halted mode.
The kernel then takes some time to initialize those CPUs internally and
start them using the "start-cpu" RTAS call. However, if a kexec crash
occurs in this window (before the new CPU has been initialized),
the kexec NMI will try to reset all other CPUs from the crashing CPU.
This leads to firmware starting the uninitialized CPUs as well.
This can cause the kdump kernel to hang during bring-up.
Sample Log:
[175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
[175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
[175993.028390][ T1502] --- interrupt: 3000
[ 5.519483][ T1] Processor 0 is stuck.
[ 11.089481][ T1] Processor 1 is stuck.
To fix this, only issue the system-reset hcall to CPUs that have
actually been started by the kernel.
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Srikar Dronamraju <srikar@linux.ibm.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Nysal Jan K.A. <nysal@linux.ibm.com>
Cc: Vishal Chourasia <vishalc@linux.ibm.com>
Cc: Ritesh Harjani <ritesh.list@gmail.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
---
Changelog:
V2:
* added set_crash_nmi_ipi to separate the crash case from other nmi_ipi
users
V1:
* https://lore.kernel.org/all/20251205142825.44698-1-shivangu@linux.ibm.com/
---
arch/powerpc/include/asm/smp.h | 1 +
arch/powerpc/kernel/smp.c | 1 +
arch/powerpc/platforms/pseries/smp.c | 29 +++++++++++++++++++++++++++-
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e41b9ea42122..cb74201f5674 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -47,6 +47,7 @@ struct smp_ops_t {
void (*cause_ipi)(int cpu);
#endif
int (*cause_nmi_ipi)(int cpu);
+ void (*set_crash_nmi_ipi)(void);
void (*probe)(void);
int (*kick_cpu)(int nr);
int (*prepare_cpu)(int nr);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3467f86fd78f..3390ee8adf79 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -594,6 +594,7 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
{
int cpu;
+ smp_ops->set_crash_nmi_ipi();
smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
if (kdump_in_progress() && crash_wake_offline) {
for_each_present_cpu(cpu) {
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index db99725e752b..c6c2baacca9a 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -51,6 +51,9 @@
*/
static cpumask_var_t of_spin_mask;
+
+static int crash_nmi_ipi;
+
/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
int smp_query_cpu_stopped(unsigned int pcpu)
{
@@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
ic_cause_ipi(cpu);
}
+static void pseries_set_crash_nmi_ipi(void)
+{
+ crash_nmi_ipi = 1;
+}
+
static int pseries_cause_nmi_ipi(int cpu)
{
int hwcpu;
+ int k, curcpu;
+ curcpu = smp_processor_id();
if (cpu == NMI_IPI_ALL_OTHERS) {
- hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+ if (crash_nmi_ipi) {
+ for_each_present_cpu(k) {
+ if (k != curcpu) {
+ hwcpu = get_hard_smp_processor_id(k);
+
+ /* it is possible that cpu is present,
+ * but not started yet.
+ */
+
+ if (paca_ptrs[hwcpu]->cpu_start == 1) {
+ plpar_signal_sys_reset(hwcpu);
+ }
+ }
+ }
+ return 1;
+ } else
+ hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
} else {
if (cpu < 0) {
WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
@@ -243,6 +269,7 @@ static struct smp_ops_t pseries_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
.cause_nmi_ipi = pseries_cause_nmi_ipi,
+ .set_crash_nmi_ipi = pseries_set_crash_nmi_ipi,
.probe = pSeries_smp_probe,
.prepare_cpu = pseries_smp_prepare_cpu,
.kick_cpu = smp_pSeries_kick_cpu,
--
2.53.0
* Shivang Upadhyay <shivangu@linux.ibm.com> [2026-03-30 11:52:06]:
> During DLPAR operations, the newly added CPUs start in halted mode.
> The kernel then takes some time to initialize those CPUs internally and
> start them using the "start-cpu" RTAS call. However, if a kexec crash
> occurs in this window (before the new CPU has been initialized),
> the kexec NMI will try to reset all other CPUs from the crashing CPU.
> This leads to firmware starting the uninitialized CPUs as well.
What would happen if a non-kexec-crash NMI was delivered while we were doing a
DLPAR operation, before the CPUs are initialized?
>
> This can cause the kdump kernel to hang during bring-up.
>
> Sample Log:
> [175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
> [175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
> [175993.028390][ T1502] --- interrupt: 3000
> [ 5.519483][ T1] Processor 0 is stuck.
> [ 11.089481][ T1] Processor 1 is stuck.
>
> To fix this, only issue the system-reset hcall to CPUs that have
> actually been started by the kernel.
>
> Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> Cc: Srikar Dronamraju <srikar@linux.ibm.com>
> Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
> Cc: Nysal Jan K.A. <nysal@linux.ibm.com>
> Cc: Vishal Chourasia <vishalc@linux.ibm.com>
> Cc: Ritesh Harjani <ritesh.list@gmail.com>
> Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> ---
> Changelog:
>
> V2:
> * added set_crash_nmi_ipi to saperate crash's case from other nmi_ipi
> users
>
> V1:
> * https://lore.kernel.org/all/20251205142825.44698-1-shivangu@linux.ibm.com/
> ---
> arch/powerpc/include/asm/smp.h | 1 +
> arch/powerpc/kernel/smp.c | 1 +
> arch/powerpc/platforms/pseries/smp.c | 29 +++++++++++++++++++++++++++-
> 3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index e41b9ea42122..cb74201f5674 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -47,6 +47,7 @@ struct smp_ops_t {
> void (*cause_ipi)(int cpu);
> #endif
> int (*cause_nmi_ipi)(int cpu);
> + void (*set_crash_nmi_ipi)(void);
> void (*probe)(void);
> int (*kick_cpu)(int nr);
> int (*prepare_cpu)(int nr);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 3467f86fd78f..3390ee8adf79 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -594,6 +594,7 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
> {
> int cpu;
>
> + smp_ops->set_crash_nmi_ipi();
Shouldn't we be checking that smp_ops->set_crash_nmi_ipi is non-NULL
before calling it? Don't we expect set_crash_nmi_ipi() to be NULL on PowerNV
after this patch? Or what would happen if set_crash_nmi_ipi() was called in
PowerNV code?
> smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
> if (kdump_in_progress() && crash_wake_offline) {
> for_each_present_cpu(cpu) {
> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
> index db99725e752b..c6c2baacca9a 100644
> --- a/arch/powerpc/platforms/pseries/smp.c
> +++ b/arch/powerpc/platforms/pseries/smp.c
> @@ -51,6 +51,9 @@
> */
> static cpumask_var_t of_spin_mask;
>
> +
Nit: Are we adding an unnecessary newline?
> +static int crash_nmi_ipi;
> +
> /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
> int smp_query_cpu_stopped(unsigned int pcpu)
> {
> @@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
> ic_cause_ipi(cpu);
> }
>
> +static void pseries_set_crash_nmi_ipi(void)
> +{
> + crash_nmi_ipi = 1;
> +}
> +
> static int pseries_cause_nmi_ipi(int cpu)
> {
> int hwcpu;
> + int k, curcpu;
>
> + curcpu = smp_processor_id();
> if (cpu == NMI_IPI_ALL_OTHERS) {
> - hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> + if (crash_nmi_ipi) {
> + for_each_present_cpu(k) {
> + if (k != curcpu) {
> + hwcpu = get_hard_smp_processor_id(k);
> +
> + /* it is possible that cpu is present,
> + * but not started yet.
> + */
> +
> + if (paca_ptrs[hwcpu]->cpu_start == 1) {
> + plpar_signal_sys_reset(hwcpu);
> + }
> + }
> + }
> + return 1;
> + } else
> + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> } else {
> if (cpu < 0) {
> WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
> @@ -243,6 +269,7 @@ static struct smp_ops_t pseries_smp_ops = {
> .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
> .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
> .cause_nmi_ipi = pseries_cause_nmi_ipi,
> + .set_crash_nmi_ipi = pseries_set_crash_nmi_ipi,
> .probe = pSeries_smp_probe,
> .prepare_cpu = pseries_smp_prepare_cpu,
> .kick_cpu = smp_pSeries_kick_cpu,
> --
> 2.53.0
>
--
Thanks and Regards
Srikar Dronamraju
Hi Srikar,
Thanks for reviewing.
On Tue, 2026-03-31 at 10:03 +0530, Srikar Dronamraju wrote:
> * Shivang Upadhyay <shivangu@linux.ibm.com> [2026-03-30 11:52:06]:
>
> > During DLPAR operations, the newly added CPUs start in halted mode.
> > The kernel then takes some time to initialize those CPUs internally
> > and
> > start them using the "start-cpu" RTAS call. However, if a kexec
> > crash
> > occurs in this window (before the new CPU has been initialized),
> > the kexec NMI will try to reset all other CPUs from the crashing
> > CPU.
> > This leads to firmware starting the uninitialized CPUs as well.
>
> What would happen if an non kexec crash nmi was delivered when we did
> a
> DLPAR operation and before the CPUs are initialized?
As per my understanding, a similar hang-like situation should happen
there as well. But I think this case would be highly unlikely. The crash
case is a little special, because it is called for offline CPUs also.
>
> >
> > This can cause the kdump kernel to hang during bring-up.
> >
> > Sample Log:
> > [175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
> > [175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
> > [175993.028390][ T1502] --- interrupt: 3000
> > [ 5.519483][ T1] Processor 0 is stuck.
> > [ 11.089481][ T1] Processor 1 is stuck.
> >
> > To fix this, only issue the system-reset hcall to CPUs that have
> > actually been started by the kernel.
> >
> > Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: Nicholas Piggin <npiggin@gmail.com>
> > Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> > Cc: Srikar Dronamraju <srikar@linux.ibm.com>
> > Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
> > Cc: Nysal Jan K.A. <nysal@linux.ibm.com>
> > Cc: Vishal Chourasia <vishalc@linux.ibm.com>
> > Cc: Ritesh Harjani <ritesh.list@gmail.com>
> > Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> > Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> > Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> > ---
> > Changelog:
> >
> > V2:
> > * added set_crash_nmi_ipi to saperate crash's case from other
> > nmi_ipi
> > users
> >
> > V1:
> > *
> > https://lore.kernel.org/all/20251205142825.44698-1-shivangu@linux.ibm.com/
> > ---
> > arch/powerpc/include/asm/smp.h | 1 +
> > arch/powerpc/kernel/smp.c | 1 +
> > arch/powerpc/platforms/pseries/smp.c | 29
> > +++++++++++++++++++++++++++-
> > 3 files changed, 30 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/powerpc/include/asm/smp.h
> > b/arch/powerpc/include/asm/smp.h
> > index e41b9ea42122..cb74201f5674 100644
> > --- a/arch/powerpc/include/asm/smp.h
> > +++ b/arch/powerpc/include/asm/smp.h
> > @@ -47,6 +47,7 @@ struct smp_ops_t {
> > void (*cause_ipi)(int cpu);
> > #endif
> > int (*cause_nmi_ipi)(int cpu);
> > + void (*set_crash_nmi_ipi)(void);
> > void (*probe)(void);
> > int (*kick_cpu)(int nr);
> > int (*prepare_cpu)(int nr);
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 3467f86fd78f..3390ee8adf79 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -594,6 +594,7 @@ void crash_send_ipi(void
> > (*crash_ipi_callback)(struct pt_regs *))
> > {
> > int cpu;
> >
> > + smp_ops->set_crash_nmi_ipi();
>
> Shouldn't we be checking got smp_ops->set_crash_nmi_ipi being non-
> NULL
> before calling. Dont we expect set_crash_nmi_ipi() to be NULL in
> PowerNV
> after this patch? Or what would happen if set_crash_nmi_ipi() was
> called in
> PowerNV code?
>
Yeah, that would be a bug; I'll fix this in the new patch. Thanks for
catching this.
> > smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback,
> > 1000000);
> > if (kdump_in_progress() && crash_wake_offline) {
> > for_each_present_cpu(cpu) {
> > diff --git a/arch/powerpc/platforms/pseries/smp.c
> > b/arch/powerpc/platforms/pseries/smp.c
> > index db99725e752b..c6c2baacca9a 100644
> > --- a/arch/powerpc/platforms/pseries/smp.c
> > +++ b/arch/powerpc/platforms/pseries/smp.c
> > @@ -51,6 +51,9 @@
> > */
> > static cpumask_var_t of_spin_mask;
> >
> > +
>
> Nit: Are we adding an unnecessary newline?
Ack.
>
> > +static int crash_nmi_ipi;
> > +
> > /* Query where a cpu is now. Return codes #defined in
> > plpar_wrappers.h */
> > int smp_query_cpu_stopped(unsigned int pcpu)
> > {
> > @@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
> > ic_cause_ipi(cpu);
> > }
> >
> > +static void pseries_set_crash_nmi_ipi(void)
> > +{
> > + crash_nmi_ipi = 1;
> > +}
> > +
> > static int pseries_cause_nmi_ipi(int cpu)
> > {
> > int hwcpu;
> > + int k, curcpu;
> >
> > + curcpu = smp_processor_id();
> > if (cpu == NMI_IPI_ALL_OTHERS) {
> > - hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> > + if (crash_nmi_ipi) {
> > + for_each_present_cpu(k) {
> > + if (k != curcpu) {
> > + hwcpu =
> > get_hard_smp_processor_id(k);
> > +
> > + /* it is possible that cpu
> > is present,
> > + * but not started yet.
> > + */
> > +
> > + if (paca_ptrs[hwcpu]-
> > >cpu_start == 1) {
> > + plpar_signal_sys_r
> > eset(hwcpu);
> > + }
> > + }
> > + }
> > + return 1;
> > + } else
> > + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> > } else {
> > if (cpu < 0) {
> > WARN_ONCE(true, "incorrect cpu parameter
> > %d", cpu);
> > @@ -243,6 +269,7 @@ static struct smp_ops_t pseries_smp_ops = {
> > .message_pass = NULL, /* Use
> > smp_muxed_ipi_message_pass */
> > .cause_ipi = NULL, /* Filled at runtime by
> > pSeries_smp_probe() */
> > .cause_nmi_ipi = pseries_cause_nmi_ipi,
> > + .set_crash_nmi_ipi = pseries_set_crash_nmi_ipi,
> > .probe = pSeries_smp_probe,
> > .prepare_cpu = pseries_smp_prepare_cpu,
> > .kick_cpu = smp_pSeries_kick_cpu,
> > --
> > 2.53.0
> >
Thanks
~Shivang.
Hi Shivang.
Few nitpicks.
On 3/30/26 11:52 AM, Shivang Upadhyay wrote:
> During DLPAR operations, the newly added CPUs start in halted mode.
> The kernel then takes some time to initialize those CPUs internally and
> start them using the "start-cpu" RTAS call. However, if a kexec crash
> occurs in this window (before the new CPU has been initialized),
> the kexec NMI will try to reset all other CPUs from the crashing CPU.
> This leads to firmware starting the uninitialized CPUs as well.
>
> This can cause the kdump kernel to hang during bring-up.
>
> Sample Log:
> [175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
> [175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
> [175993.028390][ T1502] --- interrupt: 3000
> [ 5.519483][ T1] Processor 0 is stuck.
> [ 11.089481][ T1] Processor 1 is stuck.
>
> To fix this, only issue the system-reset hcall to CPUs that have
> actually been started by the kernel.
>
> Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> Cc: Srikar Dronamraju <srikar@linux.ibm.com>
> Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
> Cc: Nysal Jan K.A. <nysal@linux.ibm.com>
> Cc: Vishal Chourasia <vishalc@linux.ibm.com>
> Cc: Ritesh Harjani <ritesh.list@gmail.com>
> Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> ---
> Changelog:
>
> V2:
> * added set_crash_nmi_ipi to saperate crash's case from other nmi_ipi
> users
>
> V1:
> * https://lore.kernel.org/all/20251205142825.44698-1-shivangu@linux.ibm.com/
> ---
> arch/powerpc/include/asm/smp.h | 1 +
> arch/powerpc/kernel/smp.c | 1 +
> arch/powerpc/platforms/pseries/smp.c | 29 +++++++++++++++++++++++++++-
> 3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index e41b9ea42122..cb74201f5674 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -47,6 +47,7 @@ struct smp_ops_t {
> void (*cause_ipi)(int cpu);
> #endif
> int (*cause_nmi_ipi)(int cpu);
> + void (*set_crash_nmi_ipi)(void);
> void (*probe)(void);
> int (*kick_cpu)(int nr);
> int (*prepare_cpu)(int nr);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 3467f86fd78f..3390ee8adf79 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -594,6 +594,7 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
> {
> int cpu;
>
> + smp_ops->set_crash_nmi_ipi();
> smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
> if (kdump_in_progress() && crash_wake_offline) {
> for_each_present_cpu(cpu) {
> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
> index db99725e752b..c6c2baacca9a 100644
> --- a/arch/powerpc/platforms/pseries/smp.c
> +++ b/arch/powerpc/platforms/pseries/smp.c
> @@ -51,6 +51,9 @@
> */
> static cpumask_var_t of_spin_mask;
>
> +
> +static int crash_nmi_ipi;
> +
> /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
> int smp_query_cpu_stopped(unsigned int pcpu)
> {
> @@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
> ic_cause_ipi(cpu);
> }
>
> +static void pseries_set_crash_nmi_ipi(void)
> +{
> + crash_nmi_ipi = 1;
> +}
> +
> static int pseries_cause_nmi_ipi(int cpu)
> {
> int hwcpu;
> + int k, curcpu;
Please follow inverted Christmas tree ordering for the variable declarations.
Can the block below be rewritten with fewer indentation levels?
One level of indentation could be removed like this:
if (cpu == NMI_IPI_ALL_OTHERS && crash_nmi_ipi)
hwcpu = <>
else if (cpu == NMI_IPI_ALL_OTHERS)
hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
else
<existing remaining logic>
>
> + curcpu = smp_processor_id();
> if (cpu == NMI_IPI_ALL_OTHERS) {
> - hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> + if (crash_nmi_ipi) {
> + for_each_present_cpu(k) {
> + if (k != curcpu) {
Maybe use the below to remove one more indentation level:
if ( k == curcpu)
continue;
> + hwcpu = get_hard_smp_processor_id(k);
> +
> + /* it is possible that cpu is present,
> + * but not started yet.
> + */
> +
> + if (paca_ptrs[hwcpu]->cpu_start == 1) {
It is either 1 or 0, so if (paca_ptrs[hwcpu]->cpu_start) is enough.
> + plpar_signal_sys_reset(hwcpu);
> + }
> + }
> + }
> + return 1;
> + } else
> + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> } else {
> if (cpu < 0) {
> WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
> @@ -243,6 +269,7 @@ static struct smp_ops_t pseries_smp_ops = {
> .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
> .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
> .cause_nmi_ipi = pseries_cause_nmi_ipi,
> + .set_crash_nmi_ipi = pseries_set_crash_nmi_ipi,
> .probe = pSeries_smp_probe,
> .prepare_cpu = pseries_smp_prepare_cpu,
> .kick_cpu = smp_pSeries_kick_cpu,
Hi Shrikanth, Thanks for your review.
On Mon, 2026-03-30 at 14:19 +0530, Shrikanth Hegde wrote:
> Hi Shivang.
>
> Few nitpicks.
>
> On 3/30/26 11:52 AM, Shivang Upadhyay wrote:
> > During DLPAR operations, the newly added CPUs start in halted mode.
> > The kernel then takes some time to initialize those CPUs internally
> > and
> > start them using the "start-cpu" RTAS call. However, if a kexec
> > crash
> > occurs in this window (before the new CPU has been initialized),
> > the kexec NMI will try to reset all other CPUs from the crashing
> > CPU.
> > This leads to firmware starting the uninitialized CPUs as well.
> >
> > This can cause the kdump kernel to hang during bring-up.
> >
> > Sample Log:
> > [175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
> > [175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
> > [175993.028390][ T1502] --- interrupt: 3000
> > [ 5.519483][ T1] Processor 0 is stuck.
> > [ 11.089481][ T1] Processor 1 is stuck.
> >
> > To fix this, only issue the system-reset hcall to CPUs that have
> > actually been started by the kernel.
> >
> > Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: Nicholas Piggin <npiggin@gmail.com>
> > Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> > Cc: Srikar Dronamraju <srikar@linux.ibm.com>
> > Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
> > Cc: Nysal Jan K.A. <nysal@linux.ibm.com>
> > Cc: Vishal Chourasia <vishalc@linux.ibm.com>
> > Cc: Ritesh Harjani <ritesh.list@gmail.com>
> > Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
> > Reported-by: Anushree Mathur <anushree.mathur@linux.vnet.ibm.com>
> > Signed-off-by: Shivang Upadhyay <shivangu@linux.ibm.com>
> > ---
> > Changelog:
> >
> > V2:
> > * added set_crash_nmi_ipi to saperate crash's case from other
> > nmi_ipi
> > users
> >
> > V1:
> > *
> > https://lore.kernel.org/all/20251205142825.44698-1-shivangu@linux.ibm.com/
> > ---
> > arch/powerpc/include/asm/smp.h | 1 +
> > arch/powerpc/kernel/smp.c | 1 +
> > arch/powerpc/platforms/pseries/smp.c | 29
> > +++++++++++++++++++++++++++-
> > 3 files changed, 30 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/powerpc/include/asm/smp.h
> > b/arch/powerpc/include/asm/smp.h
> > index e41b9ea42122..cb74201f5674 100644
> > --- a/arch/powerpc/include/asm/smp.h
> > +++ b/arch/powerpc/include/asm/smp.h
> > @@ -47,6 +47,7 @@ struct smp_ops_t {
> > void (*cause_ipi)(int cpu);
> > #endif
> > int (*cause_nmi_ipi)(int cpu);
> > + void (*set_crash_nmi_ipi)(void);
> > void (*probe)(void);
> > int (*kick_cpu)(int nr);
> > int (*prepare_cpu)(int nr);
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 3467f86fd78f..3390ee8adf79 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -594,6 +594,7 @@ void crash_send_ipi(void
> > (*crash_ipi_callback)(struct pt_regs *))
> > {
> > int cpu;
> >
> > + smp_ops->set_crash_nmi_ipi();
> > smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback,
> > 1000000);
> > if (kdump_in_progress() && crash_wake_offline) {
> > for_each_present_cpu(cpu) {
> > diff --git a/arch/powerpc/platforms/pseries/smp.c
> > b/arch/powerpc/platforms/pseries/smp.c
> > index db99725e752b..c6c2baacca9a 100644
> > --- a/arch/powerpc/platforms/pseries/smp.c
> > +++ b/arch/powerpc/platforms/pseries/smp.c
> > @@ -51,6 +51,9 @@
> > */
> > static cpumask_var_t of_spin_mask;
> >
> > +
> > +static int crash_nmi_ipi;
> > +
> > /* Query where a cpu is now. Return codes #defined in
> > plpar_wrappers.h */
> > int smp_query_cpu_stopped(unsigned int pcpu)
> > {
> > @@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
> > ic_cause_ipi(cpu);
> > }
> >
> > +static void pseries_set_crash_nmi_ipi(void)
> > +{
> > + crash_nmi_ipi = 1;
> > +}
> > +
> > static int pseries_cause_nmi_ipi(int cpu)
> > {
> > int hwcpu;
> > + int k, curcpu;
>
> Please follow inverted christmas tree for variables.
Ack.
>
>
> Can the below block be re-written with less indentations?
> One level indentation removal could be.
>
> if (cpu == NMI_IPI_ALL_OTHERS && crash_nmi_ipi)
> hwcpu = <>
> else if (cpu == NMI_IPI_ALL_OTHERS)
> hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> else
> <existing remaining logic>
>
> >
> > + curcpu = smp_processor_id();
> > if (cpu == NMI_IPI_ALL_OTHERS) {
> > - hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
> > + if (crash_nmi_ipi) {
> > + for_each_present_cpu(k) {
> > + if (k != curcpu) {
>
> Maybe below to reduce one more indentation level.
>
> if ( k == curcpu)
> continue;
>
Sure, I can rewrite it that way.
>
> > + hwcpu =
> > get_hard_smp_processor_id(k);
> > +
> > + /* it is possible that cpu
> > is present,
> > + * but not started yet.
> > + */
> > +
> > + if (paca_ptrs[hwcpu]-
> > >cpu_start == 1) {
>
> It either 1 or 0. So if(paca_ptrs[hwcpu]->cpu_start) is enough.
Ack.
~Shivang.
© 2016 - 2026 Red Hat, Inc.