From: Josh Poimboeuf
To: x86@kernel.org
Cc: Peter Zijlstra, Steven Rostedt, Ingo Molnar, Arnaldo Carvalho de Melo,
    linux-kernel@vger.kernel.org, Indu Bhagat, Mark Rutland,
    Alexander Shishkin, Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
    linux-perf-users@vger.kernel.org, Mark Brown,
    linux-toolchains@vger.kernel.org, Jordan Rome, Sam James,
    linux-trace-kernel@vger.kernel.org, Andrii Nakryiko, Jens Remus,
    Mathieu Desnoyers, Florian Weimer, Andy Lutomirski, Masami Hiramatsu,
    Weinan Liu
Subject: [PATCH v4 30/39] unwind_user/deferred: Make unwind deferral requests NMI-safe
Date: Tue, 21 Jan 2025 18:31:22 -0800
Message-ID: <4ea47a9238cb726614f36a0aad2a545816442e57.1737511963.git.jpoimboe@kernel.org>

Make unwind_deferred_request() NMI-safe so tracers in NMI context can
call it to get the cookie immediately rather than have to do the
fragile "schedule irq work and then call unwind_deferred_request()"
dance.
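
As an illustration (not something this patch adds), a tracer's NMI
handler could now look roughly like the sketch below.  The handler
name and the unwind_work instance are hypothetical, and the work is
assumed to have been initialized and registered as described earlier
in the series:

  static struct unwind_work my_unwind_work;	/* hypothetical, registered elsewhere */

  static void my_tracer_nmi_handler(struct pt_regs *regs)
  {
  	u64 cookie;
  	int ret;

  	/*
  	 * Legal in NMI context with this patch: the cookie comes back
  	 * immediately, while the actual user stack unwind is deferred
  	 * to task work on the exit-to-user path.
  	 */
  	ret = unwind_deferred_request(&my_unwind_work, &cookie);
  	if (ret && ret != -EEXIST)
  		return;

  	/* Emit the kernel-side sample now, tagged with @cookie. */
  }
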
Signed-off-by: Josh Poimboeuf
---
 include/linux/entry-common.h          |   1 +
 include/linux/unwind_deferred.h       |   6 ++
 include/linux/unwind_deferred_types.h |   1 +
 kernel/unwind/deferred.c              | 106 ++++++++++++++++++++++----
 4 files changed, 98 insertions(+), 16 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fb2b27154fee..e9b8c145f480 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -363,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
+	unwind_exit_to_user_mode();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 741f409f0d1f..22269f4d2392 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -30,6 +30,11 @@ static __always_inline void unwind_enter_from_user_mode(void)
 	current->unwind_info.cookie = 0;
 }
 
+static __always_inline void unwind_exit_to_user_mode(void)
+{
+	current->unwind_info.cookie = 0;
+}
+
 #else /* !CONFIG_UNWIND_USER */
 
 static inline void unwind_task_init(struct task_struct *task) {}
@@ -40,6 +45,7 @@ static inline int unwind_deferred_request(struct task_struct *task, struct unwin
 static inline bool unwind_deferred_cancel(struct task_struct *task, struct unwind_work *work) { return false; }
 
 static inline void unwind_enter_from_user_mode(void) {}
+static inline void unwind_exit_to_user_mode(void) {}
 
 #endif /* !CONFIG_UNWIND_USER */
 
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 6f71a06329fb..c535cca6534b 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -11,6 +11,7 @@ struct unwind_cache {
 struct unwind_task_info {
 	struct unwind_cache	cache;
 	u64			cookie;
+	u64			nmi_cookie;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 2f38055cce48..939c94abaa50 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -29,27 +29,49 @@ static u64 ctx_to_cookie(u64 cpu, u64 ctx)
 
 /*
  * Read the task context cookie, first initializing it if this is the first
- * call to get_cookie() since the most recent entry from user.
+ * call to get_cookie() since the most recent entry from user.  This has to be
+ * done carefully to coordinate with unwind_deferred_request_nmi().
  */
 static u64 get_cookie(struct unwind_task_info *info)
 {
 	u64 ctx_ctr;
 	u64 cookie;
-	u64 cpu;
 
 	guard(irqsave)();
 
-	cookie = info->cookie;
+	cookie = READ_ONCE(info->cookie);
 	if (cookie)
 		return cookie;
 
+	ctx_ctr = __this_cpu_read(unwind_ctx_ctr);
 
-	cpu = raw_smp_processor_id();
-	ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
-	info->cookie = ctx_to_cookie(cpu, ctx_ctr);
+	/* Read ctx_ctr before info->nmi_cookie */
+	barrier();
+
+	cookie = READ_ONCE(info->nmi_cookie);
+	if (cookie) {
+		/*
+		 * This is the first call to get_cookie() since an NMI handler
+		 * first wrote it to info->nmi_cookie.  Sync it.
+		 */
+		WRITE_ONCE(info->cookie, cookie);
+		WRITE_ONCE(info->nmi_cookie, 0);
+		return cookie;
+	}
+
+	/*
+	 * Write info->cookie.  It's ok to race with an NMI here.  The value of
+	 * the cookie is based on ctx_ctr from before the NMI could have
+	 * incremented it.  The result will be the same even if cookie or
+	 * ctx_ctr end up getting written twice.
+	 */
+	cookie = ctx_to_cookie(raw_smp_processor_id(), ctx_ctr + 1);
+	WRITE_ONCE(info->cookie, cookie);
+	WRITE_ONCE(info->nmi_cookie, 0);
+	barrier();
+	__this_cpu_write(unwind_ctx_ctr, ctx_ctr + 1);
 
 	return cookie;
-
 }
 
 static void unwind_deferred_task_work(struct callback_head *head)
@@ -100,7 +122,52 @@ static void unwind_deferred_task_work(struct callback_head *head)
 
 do_callback:
 	work->func(work, &trace, cookie);
-	work->pending = 0;
+	WRITE_ONCE(work->pending, 0);
+}
+
+static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *cookie)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	bool inited_cookie = false;
+	int ret;
+
+	*cookie = info->cookie;
+	if (!*cookie) {
+		/*
+		 * This is the first unwind request since the most recent entry
+		 * from user.  Initialize the task cookie.
+		 *
+		 * Don't write to info->cookie directly, otherwise it may get
+		 * cleared if the NMI occurred in the kernel during early entry
+		 * or late exit before the task work gets to run.  Instead, use
+		 * info->nmi_cookie which gets synced later by get_cookie().
+		 */
+		if (!info->nmi_cookie) {
+			u64 cpu = raw_smp_processor_id();
+			u64 ctx_ctr;
+
+			ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
+			info->nmi_cookie = ctx_to_cookie(cpu, ctx_ctr);
+
+			inited_cookie = true;
+		}
+
+		*cookie = info->nmi_cookie;
+
+	} else if (work->pending) {
+		return -EEXIST;
+	}
+
+	ret = task_work_add(current, &work->work, TWA_NMI_CURRENT);
+	if (ret) {
+		if (inited_cookie)
+			info->nmi_cookie = 0;
+		return ret;
+	}
+
+	work->pending = 1;
+
+	return 0;
 }
 
 /*
@@ -131,29 +198,36 @@ static void unwind_deferred_task_work(struct callback_head *head)
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	int pending;
 	int ret;
 
 	*cookie = 0;
 
-	if (WARN_ON_ONCE(in_nmi()))
-		return -EINVAL;
-
 	if (!current->mm || !user_mode(task_pt_regs(current)))
 		return -EINVAL;
 
+	if (in_nmi())
+		return unwind_deferred_request_nmi(work, cookie);
+
 	guard(irqsave)();
 
 	*cookie = get_cookie(info);
 
 	/* callback already pending? */
-	if (work->pending)
+	pending = READ_ONCE(work->pending);
+	if (pending)
 		return -EEXIST;
 
-	ret = task_work_add(current, &work->work, TWA_RESUME);
-	if (WARN_ON_ONCE(ret))
-		return ret;
+	/* Claim the work unless an NMI just now swooped in to do so. */
+	if (!try_cmpxchg(&work->pending, &pending, 1))
+		return -EEXIST;
 
-	work->pending = 1;
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, &work->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret)) {
+		WRITE_ONCE(work->pending, 0);
+		return ret;
+	}
 
 	return 0;
 }
-- 
2.48.1
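
For completeness, the consumer side implied by the
work->func(work, &trace, cookie) call in unwind_deferred_task_work()
might look like the sketch below.  The callback name is made up, and
the struct unwind_stacktrace parameter type is an assumption based on
earlier patches in the series rather than something this patch defines:

  static void my_tracer_unwind_callback(struct unwind_work *work,
  				        struct unwind_stacktrace *trace,
  				        u64 cookie)
  {
  	/*
  	 * Runs as task work on the exit-to-user path, where it is safe
  	 * to fault in user pages.  @cookie matches the value handed back
  	 * by unwind_deferred_request() in the NMI, so the tracer can
  	 * attach this user stack trace to the sample it emitted earlier.
  	 */
  }

The request side records only the cookie in NMI context; all the
expensive and fault-prone work happens later in this callback, which
is the design point that makes the NMI-safe request path feasible.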