[PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe

Josh Poimboeuf posted 39 patches 1 year ago
[PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
Make unwind_deferred_request() NMI-safe so tracers in NMI context can
call it to get the cookie immediately rather than have to do the fragile
"schedule irq work and then call unwind_deferred_request()" dance.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 include/linux/entry-common.h          |   1 +
 include/linux/unwind_deferred.h       |   6 ++
 include/linux/unwind_deferred_types.h |   1 +
 kernel/unwind/deferred.c              | 106 ++++++++++++++++++++++----
 4 files changed, 98 insertions(+), 16 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fb2b27154fee..e9b8c145f480 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -363,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
+	unwind_exit_to_user_mode();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 741f409f0d1f..22269f4d2392 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -30,6 +30,11 @@ static __always_inline void unwind_enter_from_user_mode(void)
 	current->unwind_info.cookie = 0;
 }
 
+static __always_inline void unwind_exit_to_user_mode(void)
+{
+	current->unwind_info.cookie = 0;
+}
+
 #else /* !CONFIG_UNWIND_USER */
 
 static inline void unwind_task_init(struct task_struct *task) {}
@@ -40,6 +45,7 @@ static inline int unwind_deferred_request(struct task_struct *task, struct unwin
 static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
 
 static inline void unwind_enter_from_user_mode(void) {}
+static inline void unwind_exit_to_user_mode(void) {}
 
 #endif /* !CONFIG_UNWIND_USER */
 
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 6f71a06329fb..c535cca6534b 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -11,6 +11,7 @@ struct unwind_cache {
 struct unwind_task_info {
 	struct unwind_cache	cache;
 	u64			cookie;
+	u64			nmi_cookie;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 2f38055cce48..939c94abaa50 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -29,27 +29,49 @@ static u64 ctx_to_cookie(u64 cpu, u64 ctx)
 
 /*
  * Read the task context cookie, first initializing it if this is the first
- * call to get_cookie() since the most recent entry from user.
+ * call to get_cookie() since the most recent entry from user.  This has to be
+ * done carefully to coordinate with unwind_deferred_request_nmi().
  */
 static u64 get_cookie(struct unwind_task_info *info)
 {
 	u64 ctx_ctr;
 	u64 cookie;
-	u64 cpu;
 
 	guard(irqsave)();
 
-	cookie = info->cookie;
+	cookie = READ_ONCE(info->cookie);
 	if (cookie)
 		return cookie;
 
+	ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
 
-	cpu = raw_smp_processor_id();
-	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
-	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
+	/* Read ctx_ctr before info->nmi_cookie */
+	barrier();
+
+	cookie = READ_ONCE(info->nmi_cookie);
+	if (cookie) {
+		/*
+		 * This is the first call to get_cookie() since an NMI handler
+		 * first wrote it to info->nmi_cookie.  Sync it.
+		 */
+		WRITE_ONCE(info->cookie, cookie);
+		WRITE_ONCE(info->nmi_cookie, 0);
+		return cookie;
+	}
+
+	/*
+	 * Write info->cookie.  It's ok to race with an NMI here.  The value of
+	 * the cookie is based on ctx_ctr from before the NMI could have
+	 * incremented it.  The result will be the same even if cookie or
+	 * ctx_ctr end up getting written twice.
+	 */
+	cookie = ctx_to_cookie(raw_smp_processor_id(), ctx_ctr + 1);
+	WRITE_ONCE(info->cookie, cookie);
+	WRITE_ONCE(info->nmi_cookie, 0);
+	barrier();
+	__this_cpu_write(unwind_ctx_ctr, ctx_ctr + 1);
 
 	return cookie;
-
 }
 
 static void unwind_deferred_task_work(struct callback_head *head)
@@ -100,7 +122,52 @@ static void unwind_deferred_task_work(struct callback_head *head)
 
 do_callback:
 	work->func(work, &trace, cookie);
-	work->pending = 0;
+	WRITE_ONCE(work->pending, 0);
+}
+
+static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	bool inited_cookie = false;
+	int ret;
+
+	*cookie = info->cookie;
+	if (!*cookie) {
+		/*
+		 * This is the first unwind request since the most recent entry
+		 * from user.  Initialize the task cookie.
+		 *
+		 * Don't write to info->cookie directly, otherwise it may get
+		 * cleared if the NMI occurred in the kernel during early entry
+		 * or late exit before the task work gets to run.  Instead, use
+		 * info->nmi_cookie which gets synced later by get_cookie().
+		 */
+		if (!info->nmi_cookie) {
+			u64 cpu = raw_smp_processor_id();
+			u64 ctx_ctr;
+
+			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
+			info->nmi_cookie = ctx_to_cookie(cpu, ctx_ctr);
+
+			inited_cookie = true;
+		}
+
+		*cookie = info->nmi_cookie;
+
+	} else if (work->pending) {
+		return -EEXIST;
+	}
+
+	ret = task_work_add(current, &work->work, TWA_NMI_CURRENT);
+	if (ret) {
+		if (inited_cookie)
+			info->nmi_cookie = 0;
+		return ret;
+	}
+
+	work->pending = 1;
+
+	return 0;
 }
 
 /*
@@ -131,29 +198,36 @@ static void unwind_deferred_task_work(struct callback_head *head)
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	int pending;
 	int ret;
 
 	*cookie = 0;
 
-	if (WARN_ON_ONCE(in_nmi()))
-		return -EINVAL;
-
 	if (!current->mm || !user_mode(task_pt_regs(current)))
 		return -EINVAL;
 
+	if (in_nmi())
+		return unwind_deferred_request_nmi(work, cookie);
+
 	guard(irqsave)();
 
 	*cookie = get_cookie(info);
 
 	/* callback already pending? */
-	if (work->pending)
+	pending = READ_ONCE(work->pending);
+	if (pending)
 		return -EEXIST;
 
-	ret = task_work_add(current, &work->work, TWA_RESUME);
-	if (WARN_ON_ONCE(ret))
-		return ret;
+	/* Claim the work unless an NMI just now swooped in to do so. */
+	if (!try_cmpxchg(&work->pending, &pending, 1))
+		return -EEXIST;
 
-	work->pending = 1;
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, &work->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret)) {
+		WRITE_ONCE(work->pending, 0);
+		return ret;
+	}
 
 	return 0;
 }
-- 
2.48.1
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> +{
> +	struct unwind_task_info *info = &current->unwind_info;
> +	bool inited_cookie = false;
> +	int ret;
> +
> +	*cookie = info->cookie;
> +	if (!*cookie) {
> +		/*
> +		 * This is the first unwind request since the most recent entry
> +		 * from user.  Initialize the task cookie.
> +		 *
> +		 * Don't write to info->cookie directly, otherwise it may get
> +		 * cleared if the NMI occurred in the kernel during early entry
> +		 * or late exit before the task work gets to run.  Instead, use
> +		 * info->nmi_cookie which gets synced later by get_cookie().
> +		 */
> +		if (!info->nmi_cookie) {
> +			u64 cpu = raw_smp_processor_id();
> +			u64 ctx_ctr;
> +
> +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);

__this_cpu_inc_return() is *NOT* NMI safe IIRC.
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
On Wed, Jan 22, 2025 at 03:24:18PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> > +{
> > +	struct unwind_task_info *info = &current->unwind_info;
> > +	bool inited_cookie = false;
> > +	int ret;
> > +
> > +	*cookie = info->cookie;
> > +	if (!*cookie) {
> > +		/*
> > +		 * This is the first unwind request since the most recent entry
> > +		 * from user.  Initialize the task cookie.
> > +		 *
> > +		 * Don't write to info->cookie directly, otherwise it may get
> > +		 * cleared if the NMI occurred in the kernel during early entry
> > +		 * or late exit before the task work gets to run.  Instead, use
> > +		 * info->nmi_cookie which gets synced later by get_cookie().
> > +		 */
> > +		if (!info->nmi_cookie) {
> > +			u64 cpu = raw_smp_processor_id();
> > +			u64 ctx_ctr;
> > +
> > +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> 
> __this_cpu_inc_return() is *NOT* NMI safe IIRC.

Hm, I guess I was only looking at x86.

-- 
Josh
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Wed, Jan 22, 2025 at 02:52:57PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 03:24:18PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > > +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
> > > +{
> > > +	struct unwind_task_info *info = &current->unwind_info;
> > > +	bool inited_cookie = false;
> > > +	int ret;
> > > +
> > > +	*cookie = info->cookie;
> > > +	if (!*cookie) {
> > > +		/*
> > > +		 * This is the first unwind request since the most recent entry
> > > +		 * from user.  Initialize the task cookie.
> > > +		 *
> > > +		 * Don't write to info->cookie directly, otherwise it may get
> > > +		 * cleared if the NMI occurred in the kernel during early entry
> > > +		 * or late exit before the task work gets to run.  Instead, use
> > > +		 * info->nmi_cookie which gets synced later by get_cookie().
> > > +		 */
> > > +		if (!info->nmi_cookie) {
> > > +			u64 cpu = raw_smp_processor_id();
> > > +			u64 ctx_ctr;
> > > +
> > > +			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> > 
> > __this_cpu_inc_return() is *NOT* NMI safe IIRC.
> 
> Hm, I guess I was only looking at x86.

:-), right so x86 is rather special here, the various RISC platforms are
what you should be looking at, eg. arm64.
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:

> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 2f38055cce48..939c94abaa50 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -29,27 +29,49 @@ static u64 ctx_to_cookie(u64 cpu, u64 ctx)
>  
>  /*
>   * Read the task context cookie, first initializing it if this is the first
> - * call to get_cookie() since the most recent entry from user.
> + * call to get_cookie() since the most recent entry from user.  This has to be
> + * done carefully to coordinate with unwind_deferred_request_nmi().
>   */
>  static u64 get_cookie(struct unwind_task_info *info)
>  {
>  	u64 ctx_ctr;
>  	u64 cookie;
> -	u64 cpu;
>  
>  	guard(irqsave)();
>  
> -	cookie = info->cookie;
> +	cookie = READ_ONCE(info->cookie);
>  	if (cookie)
>  		return cookie;
>  
> +	ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
>  
> -	cpu = raw_smp_processor_id();
> -	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
> -	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
> +	/* Read ctx_ctr before info->nmi_cookie */
> +	barrier();
> +
> +	cookie = READ_ONCE(info->nmi_cookie);
> +	if (cookie) {
> +		/*
> +		 * This is the first call to get_cookie() since an NMI handler
> +		 * first wrote it to info->nmi_cookie.  Sync it.
> +		 */
> +		WRITE_ONCE(info->cookie, cookie);
> +		WRITE_ONCE(info->nmi_cookie, 0);
> +		return cookie;
> +	}
> +
> +	/*
> +	 * Write info->cookie.  It's ok to race with an NMI here.  The value of
> +	 * the cookie is based on ctx_ctr from before the NMI could have
> +	 * incremented it.  The result will be the same even if cookie or
> +	 * ctx_ctr end up getting written twice.
> +	 */
> +	cookie = ctx_to_cookie(raw_smp_processor_id(), ctx_ctr + 1);
> +	WRITE_ONCE(info->cookie, cookie);
> +	WRITE_ONCE(info->nmi_cookie, 0);
> +	barrier();
> +	__this_cpu_write(unwind_ctx_ctr, ctx_ctr + 1);
>  
>  	return cookie;
> -
>  }

Oh gawd. Can we please do something simple like:

	guard(irqsave)();
	cpu = raw_smp_processor_id();
	ctr = __this_cpu_read(unwind_ctx_cnt);
	cookie = READ_ONCE(current->unwind_info.cookie);
	do {
		if (cookie)
			return cookie;
		cookie = ctx_to_cookie(cpu, ctr+1);
	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
	__this_cpu_write(unwind_ctx_ctr, ctr+1);
	return cookie;
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> Oh gawd. Can we please do something simple like:
> 
> 	guard(irqsave)();
> 	cpu = raw_smp_processor_id();
> 	ctr = __this_cpu_read(unwind_ctx_cnt);

Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
one.

> 	cookie = READ_ONCE(current->unwind_info.cookie);
> 	do {
> 		if (cookie)
> 			return cookie;
> 		cookie = ctx_to_cookie(cpu, ctr+1);
> 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> 	return cookie;

I was trying to avoid the overhead of the cmpxchg.

But also, the nmi_cookie is still needed for the case where the NMI
arrives before info->cookie gets cleared by early entry-from-user.

-- 
Josh
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > Oh gawd. Can we please do something simple like:
> > 
> > 	guard(irqsave)();
> > 	cpu = raw_smp_processor_id();
> > 	ctr = __this_cpu_read(unwind_ctx_cnt);
> 
> Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
> one.

What for?

> > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > 	do {
> > 		if (cookie)
> > 			return cookie;
> > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> > 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> > 	return cookie;
> 
> I was trying to avoid the overhead of the cmpxchg.

We're going to be doing userspace stack unwinding, I don't think
overhead is a real concern.

> But also, the nmi_cookie is still needed for the case where the NMI
> arrives before info->cookie gets cleared by early entry-from-user.

So how about we clear cookie (and set nr_entries to -1) at
return-to-user, after we've done the work loop and have interrupts
disabled until we hit userspace.

Any NMI that hits there will have to cause another entry anyway.
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
On Thu, Jan 23, 2025 at 09:40:26AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> > On Wed, Jan 22, 2025 at 03:15:05PM +0100, Peter Zijlstra wrote:
> > > On Tue, Jan 21, 2025 at 06:31:22PM -0800, Josh Poimboeuf wrote:
> > > Oh gawd. Can we please do something simple like:
> > > 
> > > 	guard(irqsave)();
> > > 	cpu = raw_smp_processor_id();
> > > 	ctr = __this_cpu_read(unwind_ctx_cnt);
> > 
> > Don't you need a compiler barrier here?  __this_cpu_read() doesn't have
> > one.
> 
> What for?

Hm, I guess it's not needed for this one.

> > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > 	do {
> > > 		if (cookie)
> > > 			return cookie;
> > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));

Should not the 2nd argument be &zero?

> > > 	__this_cpu_write(unwind_ctx_ctr, ctr+1);
> > > 	return cookie;
> > But also, the nmi_cookie is still needed for the case where the NMI
> > arrives before info->cookie gets cleared by early entry-from-user.
> 
> So how about we clear cookie (and set nr_entries to -1) at

I think we could set nr_entries to 0 instead of -1?

> return-to-user, after we've done the work loop and have interrupts
> disabled until we hit userspace.
>
> Any NMI that hits there will have to cause another entry anyway.

But there's a cookie mismatch:

    // return-to-user: IRQs disabled
    <NMI>
	current->unwind_info.cookie = 0x1234
    </NMI>
    unwind_exit_to_user_mode()
	current->unwind_info.cookie = 0
    IRET
<IRQ>
    task_work()
        callback(@cookie=WRONG)

-- 
Josh
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Thu, Jan 23, 2025 at 11:48:07AM -0800, Josh Poimboeuf wrote:
> > > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > > 	do {
> > > > 		if (cookie)
> > > > 			return cookie;
> > > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> 
> Should not the 2nd argument be &zero?

This I suppose

	cookie = READ_ONCE(current->unwind_info.cookie);
	do {
		if (cookie)
			return cookie;
	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie,
				ctx_to_cookie(cpu, ctr+1)));
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
On Thu, Jan 23, 2025 at 11:17:34PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 23, 2025 at 11:48:07AM -0800, Josh Poimboeuf wrote:
> > > > > 	cookie = READ_ONCE(current->unwind_info.cookie);
> > > > > 	do {
> > > > > 		if (cookie)
> > > > > 			return cookie;
> > > > > 		cookie = ctx_to_cookie(cpu, ctr+1);
> > > > > 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie, cookie));
> > 
> > Should not the 2nd argument be &zero?
> 
> This I suppose
> 
> 	cookie = READ_ONCE(current->unwind_info.cookie);
> 	do {
> 		if (cookie)
> 			return cookie;
> 	} while (!try_cmpxchg64(&current->unwind_info.cookie, &cookie,
> 				ctx_to_cookie(cpu, ctr+1)));

Here's what I have at the moment.  unwind_deferred_request() is vastly
simplified thanks to your suggestions, no more NMI-specific muck.
work->pending is replaced with work->cookie so the task work can use it
in case it crossed the cookie-clearing boundary (return-to-user with
IRQs disabled).

static inline bool nmi_supported(void)
{
	return IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG);
}

/*
 * Read the task context cookie, initializing it if this is the first request
 * since the most recent entry from user.
 */
static u64 get_cookie(void)
{
	struct unwind_task_info *info = &current->unwind_info;
	u64 ctr, cookie, old_cookie = 0;
	unsigned int cpu;

	cookie = READ_ONCE(info->cookie);
	if (cookie)
		return cookie;

	guard(irqsave)();

	cpu = raw_smp_processor_id();
	ctr = __this_cpu_read(unwind_ctx_ctr);

	cookie = ctx_to_cookie(cpu, ++ctr);

	/* Make sure the cookie is non-zero. */
	if (!cookie)
		cookie = ctx_to_cookie(cpu, ++ctr);

	/* Write the task cookie unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg64(&info->cookie, &old_cookie, cookie))
		return old_cookie;

	/* Success, update the context counter. */
	__this_cpu_write(unwind_ctx_ctr, ctr);

	return cookie;
}

/*
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned cookie output is a unique identifer for the current task entry
 * context.  Its value will also be passed to the callback function.  It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context.  Each call will return the same cookie.  If the
 * callback is already pending, an error will be returned along with the
 * cookie.  If the callback is not pending because it has already been
 * previously called for the same entry context, it will be called again with
 * the same stack trace and cookie.
 *
 * Thus are three possible return scenarios:
 *
 *   * return != 0, *cookie == 0: the operation failed, no pending callback.
 *
 *   * return != 0, *cookie != 0: the callback is already pending. The cookie
 *     can still be used to correlate with the pending callback.
 *
 *   * return == 0, *cookie != 0: the callback queued successfully.  The
 *     callback is guaranteed to be called with the given cookie.
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	enum task_work_notify_mode mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
	u64 zero = 0;
	int ret;

	*cookie = 0;

	if (!current->mm || !user_mode(task_pt_regs(current)))
		return -EINVAL;

	if (!nmi_supported() && in_nmi())
		return -EINVAL;

	/* Return a valid @cookie even if the callback is already pending. */
	*cookie = READ_ONCE(work->cookie);
	if (*cookie)
		return -EEXIST;

	*cookie = get_cookie();

	/* Claim the work unless an NMI just now swooped in to do so. */
	if (!try_cmpxchg64(&work->cookie, &zero, *cookie))
		return -EEXIST;

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &work->work, mode);
	if (WARN_ON_ONCE(ret)) {
		WRITE_ONCE(work->cookie, 0);
		return ret;
	}

	return 0;
}

-- 
Josh
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Peter Zijlstra 1 year ago
On Thu, Jan 23, 2025 at 03:34:45PM -0800, Josh Poimboeuf wrote:

> Here's what I have at the moment.  unwind_deferred_request() is vastly
> simplified thanks to your suggestions, no more NMI-specific muck.
> work->pending is replaced with work->cookie so the task work can use it
> in case it crossed the cookie-clearing boundary (return-to-user with
> IRQs disabled).

That looks a lot saner, thanks! I'll be waiting for the next cycle, not
sure I can track all the changes in me head.
Re: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Posted by Josh Poimboeuf 1 year ago
On Thu, Jan 23, 2025 at 11:48:10AM -0800, Josh Poimboeuf wrote:
> On Thu, Jan 23, 2025 at 09:40:26AM +0100, Peter Zijlstra wrote:
> > On Wed, Jan 22, 2025 at 02:49:02PM -0800, Josh Poimboeuf wrote:
> > > But also, the nmi_cookie is still needed for the case where the NMI
> > > arrives before info->cookie gets cleared by early entry-from-user.
> > 
> > So how about we clear cookie (and set nr_entries to -1) at
> 
> I think we could set nr_entries to 0 instead of -1?
> 
> > return-to-user, after we've done the work loop and have interrupts
> > disabled until we hit userspace.
> >
> > Any NMI that hits there will have to cause another entry anyway.
> 
> But there's a cookie mismatch:
> 
>     // return-to-user: IRQs disabled
>     <NMI>
> 	current->unwind_info.cookie = 0x1234
>     </NMI>
>     unwind_exit_to_user_mode()
> 	current->unwind_info.cookie = 0
>     IRET
> <IRQ>
>     task_work()
>         callback(@cookie=WRONG)

Though, assuming we're keeping the unwind_work struct, there's a simpler
alternative to nmi_cookie: store the cookie in the unwind_work.  Then
the task work can just use that instead of current->unwind_info.cookie.

-- 
Josh