[PATCH] arm64: kgdb: Ensure atomic single-step execution

Mengchen Li posted 1 patch 4 weeks ago
arch/arm64/kernel/kgdb.c | 49 ++++++++++++++++++++----------------------------
1 file changed, 20 insertions(+), 29 deletions(-)
[PATCH] arm64: kgdb: Ensure atomic single-step execution
Posted by Mengchen Li 4 weeks ago
The existing KGDB single-step handling on ARM64 is susceptible to
interference from external interrupts. If an interrupt arrives in the
narrow time window between the execution of the instruction under test
and the generation of the step exception, the CPU will vector to the
interrupt handler (e.g., el1_interrupt -> __el1_irq) instead of
triggering the debug exception immediately.

When the step exception is finally taken, the context is no longer that
of the original instruction. This causes the debugger to appear "stuck",
as it repeatedly tries to single-step through the interrupt handler's
code (e.g., irq_enter_rcu()) instead of the target code.

The fix is to make the single-step operation atomic by masking interrupts
for its duration:
1. Upon receiving a step ('s') request from GDB, save the current
   PSTATE and then mask IRQs by setting the PSTATE.I bit.
2. After the single-step exception is taken, in kgdb_single_step_handler(),
   disable the kernel's single-step mechanism and meticulously restore
   the original interrupt mask state from the saved PSTATE.

This guarantees the instruction is executed without interruption and the
debug exception is taken in the correct context.

As a result of this new approach, the following cleanups are also made:
- The global `kgdb_single_step` flag is removed, as state is now precisely
  managed by `kgdb_cpu_doing_single_step` and the interrupt mask.
- The logic to disable single-step and manage the flag in the 'c'ontinue
  case is removed, as it is rendered redundant.
- The call to `kernel_rewind_single_step()` is unnecessary and is removed.

Tested on OrangePi 3B (RK3566) via serial console (kgdboc);
allows reliable single-stepping with GDB where it previously failed.

Signed-off-by: Mengchen Li <mengchenli64@gmail.com>
---
 arch/arm64/kernel/kgdb.c | 49 ++++++++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 968324a..ee8a7e3 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -101,6 +101,8 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
 	{ "fpcr", 4, -1 },
 };
 
+static DEFINE_PER_CPU(unsigned int, kgdb_pstate);
+
 char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
 {
 	if (regno >= DBG_MAX_REG_NUM || regno < 0)
@@ -128,25 +130,15 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
 void
 sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
 {
-	struct cpu_context *cpu_context = &task->thread.cpu_context;
+	struct pt_regs *thread_regs;
 
 	/* Initialize to zero */
 	memset((char *)gdb_regs, 0, NUMREGBYTES);
 
-	gdb_regs[19] = cpu_context->x19;
-	gdb_regs[20] = cpu_context->x20;
-	gdb_regs[21] = cpu_context->x21;
-	gdb_regs[22] = cpu_context->x22;
-	gdb_regs[23] = cpu_context->x23;
-	gdb_regs[24] = cpu_context->x24;
-	gdb_regs[25] = cpu_context->x25;
-	gdb_regs[26] = cpu_context->x26;
-	gdb_regs[27] = cpu_context->x27;
-	gdb_regs[28] = cpu_context->x28;
-	gdb_regs[29] = cpu_context->fp;
-
-	gdb_regs[31] = cpu_context->sp;
-	gdb_regs[32] = cpu_context->pc;
+	thread_regs = task_pt_regs(task);
+	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES);
+	/* Special case for PSTATE */
+	dbg_get_reg(33, gdb_regs + GP_REG_BYTES, thread_regs);
 }
 
 void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
@@ -195,18 +187,12 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
 		 * over and over again.
 		 */
 		kgdb_arch_update_addr(linux_regs, remcom_in_buffer);
-		atomic_set(&kgdb_cpu_doing_single_step, -1);
-		kgdb_single_step =  0;
-
-		/*
-		 * Received continue command, disable single step
-		 */
-		if (kernel_active_single_step())
-			kernel_disable_single_step();
-
 		err = 0;
 		break;
 	case 's':
+		/* mask interrupts while single stepping */
+		__this_cpu_write(kgdb_pstate, linux_regs->pstate);
+		linux_regs->pstate |= (1 << 7);
 		/*
 		 * Update step address value with address passed
 		 * with step packet.
@@ -217,15 +203,12 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
 		 */
 		kgdb_arch_update_addr(linux_regs, remcom_in_buffer);
 		atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id());
-		kgdb_single_step =  1;
 
 		/*
 		 * Enable single step handling
 		 */
 		if (!kernel_active_single_step())
 			kernel_enable_single_step(linux_regs);
-		else
-			kernel_rewind_single_step(linux_regs);
 		err = 0;
 		break;
 	default:
@@ -252,9 +235,17 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_handler);
 
 int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr)
 {
-	if (!kgdb_single_step)
-		return DBG_HOOK_ERROR;
+	unsigned int pstate;
+
+	kernel_disable_single_step();
+	atomic_set(&kgdb_cpu_doing_single_step, -1);
 
+	/* restore interrupt mask status */
+	pstate = __this_cpu_read(kgdb_pstate);
+	if (pstate & (1 << 7))
+		regs->pstate |= (1 << 7);
+	else
+		regs->pstate &= ~(1 << 7);
 	kgdb_handle_exception(0, SIGTRAP, 0, regs);
 	return DBG_HOOK_HANDLED;
 }
-- 
2.7.4
Re: [PATCH] arm64: kgdb: Ensure atomic single-step execution
Posted by Mark Rutland 1 week, 1 day ago
On Thu, Sep 04, 2025 at 03:47:23PM +0800, Mengchen Li wrote:
> The existing KGDB single-step handling on ARM64 is susceptible to
> interference from external interrupts. If an interrupt arrives in the
> narrow time window between the execution of the instruction under test
> and the generation of the step exception, the CPU will vector to the
> interrupt handler (e.g., el1_interrupt -> __el1_irq) instead of
> triggering the debug exception immediately.
> 
> When the step exception is finally taken, the context is no longer that
> of the original instruction. This causes the debugger to appear "stuck",
> as it repeatedly tries to single-step through the interrupt handler's
> code (e.g., irq_enter_rcu()) instead of the target code.
> 
> The fix is to make the single-step operation atomic by masking interrupts
> for its duration:
> 1. Upon receiving a step ('s') request from GDB, save the current
>    PSTATE and then mask IRQs by setting the PSTATE.I bit.
> 2. After the single-step exception is taken, in kgdb_single_step_handler(),
>    disable the kernel's single-step mechanism and meticulously restore
>    the original interrupt mask state from the saved PSTATE.

I don't think that works:

* Anything which reads PSTATE/DAIF will see PSTATE.I set unexpectedly.
  For example, that will break irqflag tracing, since
  arch_irqs_disabled_flags() will return true in cases where it is
  expected to return false.

* That will break anything which sets PSTATE.I.
  For example, if stepping local_irq_disable(), the initial DAIF value
  might have DAIF.I clear. If that's restored *after*
  local_irq_disable() has set DAIF.I, then restoring the original DAIF.I
  value will erroneously unmask interrupts.

More generally:

* The DAIF.IF bits need to be handled atomically, for platforms where
  FIQ is used.

* Even if the DAIF.IF bits are set, It's still possible to take SError,
  and potentially other exceptions in future (e.g. PMU, NMI). I don't
  think we can only think about the DAIF.I or DAIF.IF bits.

If we want to do something here, I think we'd need to enlighten the
entry code with more comprehensive management of the singlestep state.
I'm not too keen on coupling that with KGDB.

> This guarantees the instruction is executed without interruption and the
> debug exception is taken in the correct context.
> 
> As a result of this new approach, the following cleanups are also made:
> - The global `kgdb_single_step` flag is removed, as state is now precisely
>   managed by `kgdb_cpu_doing_single_step` and the interrupt mask.
> - The logic to disable single-step and manage the flag in the 'c'ontinue
>   case is removed, as it is rendered redundant.
> - The call to `kernel_rewind_single_step()` is unnecessary and is removed.
> 
> Tested on OrangePi 3B (RK3566) via serial console (kgdboc);
> allows reliable single-stepping with GDB where it previously failed.
> 
> Signed-off-by: Mengchen Li <mengchenli64@gmail.com>
> ---
>  arch/arm64/kernel/kgdb.c | 49 ++++++++++++++++++++----------------------------
>  1 file changed, 20 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> index 968324a..ee8a7e3 100644
> --- a/arch/arm64/kernel/kgdb.c
> +++ b/arch/arm64/kernel/kgdb.c
> @@ -101,6 +101,8 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
>  	{ "fpcr", 4, -1 },
>  };
>  
> +static DEFINE_PER_CPU(unsigned int, kgdb_pstate);
> +
>  char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
>  {
>  	if (regno >= DBG_MAX_REG_NUM || regno < 0)
> @@ -128,25 +130,15 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
>  void
>  sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
>  {
> -	struct cpu_context *cpu_context = &task->thread.cpu_context;
> +	struct pt_regs *thread_regs;
>  
>  	/* Initialize to zero */
>  	memset((char *)gdb_regs, 0, NUMREGBYTES);
>  
> -	gdb_regs[19] = cpu_context->x19;
> -	gdb_regs[20] = cpu_context->x20;
> -	gdb_regs[21] = cpu_context->x21;
> -	gdb_regs[22] = cpu_context->x22;
> -	gdb_regs[23] = cpu_context->x23;
> -	gdb_regs[24] = cpu_context->x24;
> -	gdb_regs[25] = cpu_context->x25;
> -	gdb_regs[26] = cpu_context->x26;
> -	gdb_regs[27] = cpu_context->x27;
> -	gdb_regs[28] = cpu_context->x28;
> -	gdb_regs[29] = cpu_context->fp;
> -
> -	gdb_regs[31] = cpu_context->sp;
> -	gdb_regs[32] = cpu_context->pc;
> +	thread_regs = task_pt_regs(task);
> +	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES);
> +	/* Special case for PSTATE */
> +	dbg_get_reg(33, gdb_regs + GP_REG_BYTES, thread_regs);
>  }

The commit message doesn't explain anything about the behaviour for
sleeping threads, so it's not clear why this is changed at all.

The task_pt_regs() helper returns a pointer to the pt_regs for the
*userspace* context of a task. That doesn't represent the kernel
context, and that's meaningless for kthreads without a userspace
context.

Regardless of anything else, this is definitely wrong. It is at best
pointless.

[...]

> +		/* mask interrupts while single stepping */
> +		__this_cpu_write(kgdb_pstate, linux_regs->pstate);
> +		linux_regs->pstate |= (1 << 7);

As a general note, please don't open-code bit shifts like this.

IIUC this is PSR_I_BIT.

Mark.
Re: [PATCH] arm64: kgdb: Ensure atomic single-step execution
Posted by Will Deacon 1 week, 3 days ago
On Thu, Sep 04, 2025 at 03:47:23PM +0800, Mengchen Li wrote:
> The existing KGDB single-step handling on ARM64 is susceptible to
> interference from external interrupts. If an interrupt arrives in the
> narrow time window between the execution of the instruction under test
> and the generation of the step exception, the CPU will vector to the
> interrupt handler (e.g., el1_interrupt -> __el1_irq) instead of
> triggering the debug exception immediately.
> 
> When the step exception is finally taken, the context is no longer that
> of the original instruction. This causes the debugger to appear "stuck",
> as it repeatedly tries to single-step through the interrupt handler's
> code (e.g., irq_enter_rcu()) instead of the target code.
> 
> The fix is to make the single-step operation atomic by masking interrupts
> for its duration:
> 1. Upon receiving a step ('s') request from GDB, save the current
>    PSTATE and then mask IRQs by setting the PSTATE.I bit.
> 2. After the single-step exception is taken, in kgdb_single_step_handler(),
>    disable the kernel's single-step mechanism and meticulously restore
>    the original interrupt mask state from the saved PSTATE.
> 
> This guarantees the instruction is executed without interruption and the
> debug exception is taken in the correct context.
> 
> As a result of this new approach, the following cleanups are also made:
> - The global `kgdb_single_step` flag is removed, as state is now precisely
>   managed by `kgdb_cpu_doing_single_step` and the interrupt mask.
> - The logic to disable single-step and manage the flag in the 'c'ontinue
>   case is removed, as it is rendered redundant.
> - The call to `kernel_rewind_single_step()` is unnecessary and is removed.
> 
> Tested on OrangePi 3B (RK3566) via serial console (kgdboc);
> allows reliable single-stepping with GDB where it previously failed.
> 
> Signed-off-by: Mengchen Li <mengchenli64@gmail.com>
> ---
>  arch/arm64/kernel/kgdb.c | 49 ++++++++++++++++++++----------------------------
>  1 file changed, 20 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> index 968324a..ee8a7e3 100644
> --- a/arch/arm64/kernel/kgdb.c
> +++ b/arch/arm64/kernel/kgdb.c
> @@ -101,6 +101,8 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
>  	{ "fpcr", 4, -1 },
>  };
>  
> +static DEFINE_PER_CPU(unsigned int, kgdb_pstate);
> +
>  char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
>  {
>  	if (regno >= DBG_MAX_REG_NUM || regno < 0)
> @@ -128,25 +130,15 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
>  void
>  sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
>  {
> -	struct cpu_context *cpu_context = &task->thread.cpu_context;
> +	struct pt_regs *thread_regs;
>  
>  	/* Initialize to zero */
>  	memset((char *)gdb_regs, 0, NUMREGBYTES);
>  
> -	gdb_regs[19] = cpu_context->x19;
> -	gdb_regs[20] = cpu_context->x20;
> -	gdb_regs[21] = cpu_context->x21;
> -	gdb_regs[22] = cpu_context->x22;
> -	gdb_regs[23] = cpu_context->x23;
> -	gdb_regs[24] = cpu_context->x24;
> -	gdb_regs[25] = cpu_context->x25;
> -	gdb_regs[26] = cpu_context->x26;
> -	gdb_regs[27] = cpu_context->x27;
> -	gdb_regs[28] = cpu_context->x28;
> -	gdb_regs[29] = cpu_context->fp;
> -
> -	gdb_regs[31] = cpu_context->sp;
> -	gdb_regs[32] = cpu_context->pc;
> +	thread_regs = task_pt_regs(task);
> +	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES);

Why are you now copying the caller saved registers?

Will