If TWA_NMI_CURRENT task work is queued from an NMI triggered while
running in __schedule() with IRQs disabled, the irq_work doesn't get
processed until after the context switch, so task_work_set_notify_irq()
inadvertently runs with 'current' pointing at the next scheduled task.
The original task doesn't get its TIF_NOTIFY_RESUME flag set, and the
task work may get delayed indefinitely, or may not get to run at all:
  __schedule()
      // disable irqs
      <NMI>
          task_work_add(current, work, TWA_NMI_CURRENT);
      </NMI>
      // current = next;
      // enable irqs
      <IRQ>
          task_work_set_notify_irq()
              test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); // wrong task!
      </IRQ>

  // original task skips task work on its next return to user (or exit!)
Fix it by storing the task pointer along with the irq_work struct and
passing that task to set_notify_resume().
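For comparison, the same scenario with the fix applied (a sketch;
'irq_work' here is the per-CPU nmi_irq_work struct added by this
patch):

  __schedule()
      // disable irqs
      <NMI>
          task_work_add(current, work, TWA_NMI_CURRENT);
              // irq_work->task = current;
      </NMI>
      // current = next;
      // enable irqs
      <IRQ>
          task_work_set_notify_irq()
              set_notify_resume(irq_work->task); // original task
      </IRQ>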
Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode.")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 kernel/task_work.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 92024a8bfe12..f17447f69843 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -7,12 +7,23 @@
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
 #ifdef CONFIG_IRQ_WORK
+
+struct nmi_irq_work {
+	struct irq_work work;
+	struct task_struct *task;
+};
+
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+	struct nmi_irq_work *work = container_of(entry, struct nmi_irq_work, work);
+
+	set_notify_resume(work->task);
 }
-static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
-	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+
+static DEFINE_PER_CPU(struct nmi_irq_work, nmi_irq_work) = {
+	.work = IRQ_WORK_INIT_HARD(task_work_set_notify_irq),
+};
+
 #endif
 
 /**
@@ -65,15 +76,21 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 		if (!IS_ENABLED(CONFIG_IRQ_WORK))
 			return -EINVAL;
 
 #ifdef CONFIG_IRQ_WORK
+{
+		struct nmi_irq_work *irq_work = this_cpu_ptr(&nmi_irq_work);
+
 		head = task->task_works;
 		if (unlikely(head == &work_exited))
 			return -ESRCH;
 
-		if (!irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)))
+		if (!irq_work_queue(&irq_work->work))
 			return -EBUSY;
+		irq_work->task = current;
+
 		work->next = head;
 		task->task_works = work;
+}
 #endif
 		return 0;
 	}
@@ -109,11 +126,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL_NO_IPI:
 		__set_notify_signal(task);
 		break;
-#ifdef CONFIG_IRQ_WORK
-	case TWA_NMI_CURRENT:
-		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
-		break;
-#endif
 	default:
 		WARN_ON_ONCE(1);
 		break;
--
2.48.1