[PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()

Josh Poimboeuf posted 39 patches 1 year ago
[PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Josh Poimboeuf 1 year ago
If TWA_NMI_CURRENT task work is queued from an NMI triggered while
running in __schedule() with IRQs disabled, task_work_set_notify_irq()
ends up inadvertently running on the next scheduled task.  So the
original task doesn't get its TIF_NOTIFY_RESUME flag set and the task
work may get delayed indefinitely, or may not get to run at all.

    __schedule()
	// disable irqs
	    <NMI>
		task_work_add(current, work, TWA_NMI_CURRENT);
	    </NMI>
	// current = next;
	// enable irqs
	    <IRQ>
		task_work_set_notify_irq()
		test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); // wrong task!
	    </IRQ>
	// original task skips task work on its next return to user (or exit!)

Fix it by storing the task pointer along with the irq_work struct and
passing that task to set_notify_resume().

Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode.")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 kernel/task_work.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 92024a8bfe12..f17447f69843 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -7,12 +7,23 @@
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
 #ifdef CONFIG_IRQ_WORK
+
+struct nmi_irq_work {
+	struct irq_work work;
+	struct task_struct *task;
+};
+
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+	struct nmi_irq_work *work = container_of(entry, struct nmi_irq_work, work);
+
+	set_notify_resume(work->task);
 }
-static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
-	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+
+static DEFINE_PER_CPU(struct nmi_irq_work, nmi_irq_work) = {
+	.work = IRQ_WORK_INIT_HARD(task_work_set_notify_irq),
+};
+
 #endif
 
 /**
@@ -65,15 +76,21 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 		if (!IS_ENABLED(CONFIG_IRQ_WORK))
 			return -EINVAL;
 #ifdef CONFIG_IRQ_WORK
+{
+		struct nmi_irq_work *irq_work = this_cpu_ptr(&nmi_irq_work);
+
 		head = task->task_works;
 		if (unlikely(head == &work_exited))
 			return -ESRCH;
 
-		if (!irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)))
+		if (!irq_work_queue(&irq_work->work))
 			return -EBUSY;
 
+		irq_work->task = current;
+
 		work->next = head;
 		task->task_works = work;
+}
 #endif
 		return 0;
 	}
@@ -109,11 +126,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL_NO_IPI:
 		__set_notify_signal(task);
 		break;
-#ifdef CONFIG_IRQ_WORK
-	case TWA_NMI_CURRENT:
-		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
-		break;
-#endif
 	default:
 		WARN_ON_ONCE(1);
 		break;
-- 
2.48.1
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Peter Zijlstra 1 year ago
On Tue, Jan 21, 2025 at 06:30:54PM -0800, Josh Poimboeuf wrote:
> If TWA_NMI_CURRENT task work is queued from an NMI triggered while
> running in __schedule() with IRQs disabled, task_work_set_notify_irq()
> ends up inadvertently running on the next scheduled task.  So the
> original task doesn't get its TIF_NOTIFY_RESUME flag set and the task
> work may get delayed indefinitely, or may not get to run at all.
> 
>     __schedule()
> 	// disable irqs
> 	    <NMI>
> 		task_work_add(current, work, TWA_NMI_CURRENT);
> 	    </NMI>
> 	// current = next;
> 	// enable irqs
> 	    <IRQ>
> 		task_work_set_notify_irq()
> 		test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); // wrong task!
> 	    </IRQ>
> 	// original task skips task work on its next return to user (or exit!)
> 
> Fix it by storing the task pointer along with the irq_work struct and
> passing that task to set_notify_resume().

So I'm a little confused, isn't something like this sufficient?

If we hit before schedule(), all just works as expected, if we hit after
schedule(), the task will already have the TIF flag set, and we'll hit
the return to user path once it gets scheduled again.

---
diff --git a/kernel/task_work.c b/kernel/task_work.c
index c969f1f26be5..155549c017b2 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
 #ifdef CONFIG_IRQ_WORK
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+	/*
+	 * no-op IPI
+	 *
+	 * TWA_NMI_CURRENT will already have set the TIF flag, all
+	 * this interrupt does it tickle the return-to-user path.
+	 */
 }
 static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
 	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
@@ -98,6 +103,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 		break;
 #ifdef CONFIG_IRQ_WORK
 	case TWA_NMI_CURRENT:
+		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
 		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
 		break;
 #endif
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Steven Rostedt 9 months, 3 weeks ago
On Wed, 22 Jan 2025 13:42:28 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> If we hit before schedule(), all just works as expected, if we hit after
> schedule(), the task will already have the TIF flag set, and we'll hit
> the return to user path once it gets scheduled again.
> 
> ---
> diff --git a/kernel/task_work.c b/kernel/task_work.c
> index c969f1f26be5..155549c017b2 100644
> --- a/kernel/task_work.c
> +++ b/kernel/task_work.c
> @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
>  #ifdef CONFIG_IRQ_WORK
>  static void task_work_set_notify_irq(struct irq_work *entry)
>  {
> -	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> +	/*
> +	 * no-op IPI
> +	 *
> +	 * TWA_NMI_CURRENT will already have set the TIF flag, all
> +	 * this interrupt does it tickle the return-to-user path.
> +	 */
>  }
>  static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
>  	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
> @@ -98,6 +103,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
>  		break;
>  #ifdef CONFIG_IRQ_WORK
>  	case TWA_NMI_CURRENT:
> +		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
>  		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
>  		break;
>  #endif

Does this patch replace patches 1 and 2?

If so, Peter, can you give me a Signed-off-by?

-- Steve
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Josh Poimboeuf 9 months, 3 weeks ago
On Tue, Apr 22, 2025 at 12:15:41PM -0400, Steven Rostedt wrote:
> On Wed, 22 Jan 2025 13:42:28 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > If we hit before schedule(), all just works as expected, if we hit after
> > schedule(), the task will already have the TIF flag set, and we'll hit
> > the return to user path once it gets scheduled again.
> > 
> > ---
> > diff --git a/kernel/task_work.c b/kernel/task_work.c
> > index c969f1f26be5..155549c017b2 100644
> > --- a/kernel/task_work.c
> > +++ b/kernel/task_work.c
> > @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
> >  #ifdef CONFIG_IRQ_WORK
> >  static void task_work_set_notify_irq(struct irq_work *entry)
> >  {
> > -	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> > +	/*
> > +	 * no-op IPI
> > +	 *
> > +	 * TWA_NMI_CURRENT will already have set the TIF flag, all
> > +	 * this interrupt does it tickle the return-to-user path.
> > +	 */
> >  }
> >  static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
> >  	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
> > @@ -98,6 +103,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
> >  		break;
> >  #ifdef CONFIG_IRQ_WORK
> >  	case TWA_NMI_CURRENT:
> > +		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> >  		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
> >  		break;
> >  #endif
> 
> Does this patch replace patches 1 and 2?

Indeed it does.

> If so, Peter, can you give me a Signed-off-by?
> 
> -- Steve

-- 
Josh
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Josh Poimboeuf 1 year ago
On Wed, Jan 22, 2025 at 01:42:28PM +0100, Peter Zijlstra wrote:
> So I'm a little confused, isn't something like this sufficient?
> 
> If we hit before schedule(), all just works as expected, if we hit after
> schedule(), the task will already have the TIF flag set, and we'll hit
> the return to user path once it gets scheduled again.
> 
> ---
> diff --git a/kernel/task_work.c b/kernel/task_work.c
> index c969f1f26be5..155549c017b2 100644
> --- a/kernel/task_work.c
> +++ b/kernel/task_work.c
> @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
>  #ifdef CONFIG_IRQ_WORK
>  static void task_work_set_notify_irq(struct irq_work *entry)
>  {
> -	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> +	/*
> +	 * no-op IPI
> +	 *
> +	 * TWA_NMI_CURRENT will already have set the TIF flag, all
> +	 * this interrupt does it tickle the return-to-user path.
> +	 */
>  }
>  static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
>  	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
> @@ -98,6 +103,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
>  		break;
>  #ifdef CONFIG_IRQ_WORK
>  	case TWA_NMI_CURRENT:
> +		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
>  		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
>  		break;
>  #endif

Yeah, that looks so much better...

The self-IPI is only needed when the NMI happened in user space, right?
Would it make sense to have an optimized version of that?

-- 
Josh
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Josh Poimboeuf 1 year ago
On Wed, Jan 22, 2025 at 01:03:42PM -0800, Josh Poimboeuf wrote:
> The self-IPI is only needed when the NMI happened in user space, right?
> Would it make sense to have an optimized version of that?

Actually, maybe not, that could be tricky if the NMI hits in the kernel
after task work runs.

-- 
Josh
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Peter Zijlstra 1 year ago
On Wed, Jan 22, 2025 at 02:14:30PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 01:03:42PM -0800, Josh Poimboeuf wrote:
> > The self-IPI is only needed when the NMI happened in user space, right?
> > Would it make sense to have an optimized version of that?
> 
> Actually, maybe not, that could be tricky if the NMI hits in the kernel
> after task work runs.

Right, I was going to say, lets keep this as simple as possible :-)
Re: [PATCH v4 02/39] task_work: Fix TWA_NMI_CURRENT race with __schedule()
Posted by Peter Zijlstra 1 year ago
On Tue, Jan 21, 2025 at 06:30:54PM -0800, Josh Poimboeuf wrote:

> @@ -109,11 +126,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
>  	case TWA_SIGNAL_NO_IPI:
>  		__set_notify_signal(task);
>  		break;
> -#ifdef CONFIG_IRQ_WORK
> -	case TWA_NMI_CURRENT:
> -		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
> -		break;
> -#endif
>  	default:
>  		WARN_ON_ONCE(1);
>  		break;

Shouldn't this go to in the previous patch?