From nobody Sat Jun 13 00:24:03 2026 Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 82D9447B423 for ; Mon, 11 May 2026 18:25:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.181 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523922; cv=none; b=omdv0XeumG5EjnFnW17u+NSG3sBE/o8DRvD3f9OOrHuFcXwN0e5qfEKHcY5m7SS/oCLpBfjX/A02S199i5+eDTXsMSP49wtnCxZhUPYhE6mLacL5XPsvJ8J3jhDZPe4lEbQSFC46CwqZgX1W0j2OO93F+Glv9gTM09QRwupNulo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523922; c=relaxed/simple; bh=tlbRJPfl1ZZkEWC7hqJDh2bvR3yWdajgZ6vYRz4zjnk=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=iIC2k/OvySRy6i51xJNAOHozcqyrEV7qBdO1mZxAEL6TOfqKEiuz6udC3r5EgTgjdi0BSYKfV4WlMlLDtpJsHPGmNWh6ajJlHCQ1xRtZMDM/8gaZcPIIynxm4GpwyJS7bMCpn6z9dlFeH0pXLK986PdXAbctxU9NAMk7Y/rudwE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=B2HmG0NU; arc=none smtp.client-ip=91.218.175.181 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="B2HmG0NU" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523917; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=sJXtw30v4yT1aoXHrw8fkCEubjdJLYouJzBqKlxq+jQ=; b=B2HmG0NU0oMB4KuRavo42Tz0lgyuJSsUB7nvYoUS0cw/P4O/EtIMviO2A22ngKbjSCgaay pWuiq/QERPN/uUR/eAw+QD+e0yqZKPbuWfOaylSwkmoPbb5kAhJOpILnNpUmW+KuhSg+DI hnbpGoDg5EvfutcrWm10aPtv2GM2gxc= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 01/10] rv/da: fix monitor start ordering and memory ordering for monitoring flag Date: Tue, 12 May 2026 02:24:47 +0800 Message-Id: <8af5ba4bd93d2acb8a546e8e47ced974a87c1eb8.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang da_monitor_start() set monitoring=3D1 before calling da_monitor_init_hook(), which can race with the sched_switch handler: da_monitor_start() sched_switch handler ------------------------- --------------------------------- da_mon->monitoring =3D 1; if (da_monitoring(da_mon)) /* true */ ha_start_timer_ns(...); /* hrtimer->base =3D=3D NULL, crash = */ da_monitor_init_hook(da_mon); /* hrtimer_setup() sets base */ Fix the ordering and pair with release/acquire semantics: da_monitor_init_hook(da_mon); smp_store_release(&da_mon->monitoring, 1); /* da_monitor_start() */ return smp_load_acquire(&da_mon->monitoring); /* da_monitoring() */ On ARM64 a plain STR + LDR does not form a release-acquire pair, so the load can observe monitoring=3D1 while hrtimer->base is still NULL. The plain accesses are also data races under KCSAN. 
Use WRITE_ONCE for the monitoring=3D0 store in da_monitor_reset() to cover the reset path. Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor defini= tion via C macros") Signed-off-by: Wen Yang Reviewed-by: Gabriele Monaco --- include/rv/da_monitor.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 39765ff6f098..00ded3d5ab3f 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -82,7 +82,7 @@ static void react(enum states curr_state, enum events eve= nt) static inline void da_monitor_reset(struct da_monitor *da_mon) { da_monitor_reset_hook(da_mon); - da_mon->monitoring =3D 0; + WRITE_ONCE(da_mon->monitoring, 0); da_mon->curr_state =3D model_get_initial_state(); } =20 @@ -95,8 +95,9 @@ static inline void da_monitor_reset(struct da_monitor *da= _mon) static inline void da_monitor_start(struct da_monitor *da_mon) { da_mon->curr_state =3D model_get_initial_state(); - da_mon->monitoring =3D 1; da_monitor_init_hook(da_mon); + /* Pairs with smp_load_acquire in da_monitoring(). */ + smp_store_release(&da_mon->monitoring, 1); } =20 /* @@ -104,7 +105,8 @@ static inline void da_monitor_start(struct da_monitor *= da_mon) */ static inline bool da_monitoring(struct da_monitor *da_mon) { - return da_mon->monitoring; + /* Pairs with smp_store_release in da_monitor_start(). 
*/ + return smp_load_acquire(&da_mon->monitoring); } =20 /* --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-183.mta0.migadu.com (out-183.mta0.migadu.com [91.218.175.183]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 73ADD47CC86 for ; Mon, 11 May 2026 18:25:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.183 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523924; cv=none; b=cCl9EGWf09xvloSsS4/uFuEASKdo9oOtXo64clgMmW/UoAR7Ceh3ezuHFWMJCjcaCc8e6yJQKgoO1KN5R85w9j7qenO0xskbn+fSzPMYP7XOmZk5C7g1R7e0wAPzFU8AlrWkTZOtTdOLM5+Iwe5lnqTRK/DtPvbFN4N2HkWVhmI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523924; c=relaxed/simple; bh=e/86egzIPIJnJBpM7nA833Sd6LId+B75HRElGKUem60=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=G6qcdEJYdjI3AiQgV+TymyT/MrBsb3RcYDaiJCcGuu7hL9OuKT9vkuunrglpWFfiZn6dLuVANkY1btQ5HZofmn+LIQ2jk6iaOmMqQrZRxNySflgt/wHrqUUB0uTtRG7qHXkAoFG1DsitpTs0b5W+xtNNWKjEA8SccR+k+O88sDU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=X6cd78er; arc=none smtp.client-ip=91.218.175.183 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="X6cd78er" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523920; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=aOka89wKQYcbIpNURyHjkJzZlwApVr8sbGR1QlKrqvk=; b=X6cd78erysjpIfKHEOgBTvBYm2PkrrOb/EMt7bvOWRuT9NMYy9Cgdd06TdzqNhkJ4WVkl7 b2MiUQWDsAVcI0LIYvJmOkrxFk6b5Nwaklr73DCmxtAlUxCGEbeF+xG0tSqQ+gAq8fmYtV ufBBhyZ0nEfkdzLiDdm6LohpdlC2vMc= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 02/10] rv/da: fix per-task da_monitor_destroy() ordering and sync Date: Tue, 12 May 2026 02:24:48 +0800 Message-Id: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang The following two paths race: CPU 0 (disable_stall/__rv_disable_monitor) CPU 1 (wwnr probe handler) ------------------------------------------ ----------------------------- disable_stall() da_monitor_destroy() da_monitor_reset_all() <------ [task T: monitoring=3D0] da_monitor_start(&T->rv[n]) /* no timer_setup */ monitoring=3D1 <---- tracepoint_synchronize_unregister() // CPU 1 probe has already returned; sync returns Later, enable_stall() acquires the same slot and calls da_monitor_init(): da_monitor_reset_all() da_monitor_reset(&T->rv[slot]) // monitoring=3D1, timer.function=3D= =3D0 ha_monitor_reset_env() ha_cancel_timer() timer_delete(&ha_mon->timer) // ODEBUG: timer never initialised ODEBUG: assert_init not available (active state 0) object type: timer_list Call trace: timer_delete <- da_monitor_reset_all <- enable_stall Call tracepoint_synchronize_unregister() inside da_monitor_destroy() before 
da_monitor_reset_all(). The unregister_trace_xxx() calls in the monitor's disable() have already disconnected the tracepoints; the sync here drains any handler still in flight, so no new monitoring=3D1 can appear after da_monitor_reset_all() clears the slot. Also fix the slot release ordering: release the slot only after reset_all() to avoid accessing rv[] with an out-of-bounds index. Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type") Signed-off-by: Wen Yang --- include/rv/da_monitor.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 00ded3d5ab3f..d04bb3229c75 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -304,6 +304,20 @@ static int da_monitor_init(void) =20 /* * da_monitor_destroy - return the allocated slot + * + * Call tracepoint_synchronize_unregister() before reset_all() to close + * the race where an in-flight non-HA probe handler sets monitoring=3D1 + * (without calling timer_setup()) after da_monitor_reset_all() has + * already cleared the slot but before the caller's own sync completes. + * Without this barrier, an HA_TIMER_WHEEL monitor that later acquires + * the same slot would call timer_delete() on a never-initialised + * timer_list, triggering ODEBUG warnings. + * + * Note: tracepoint_synchronize_unregister() is a system-wide barrier + * that waits for all CPUs to finish any in-flight tracepoint handlers. + * The caller's own __rv_disable_monitor() issues a second sync after + * returning from disable(); that redundant call is harmless on the + * infrequent admin (enable/disable) path. 
*/ static inline void da_monitor_destroy(void) { @@ -311,10 +325,10 @@ static inline void da_monitor_destroy(void) WARN_ONCE(1, "Disabling a disabled monitor: " __stringify(MONITOR_NAME)); return; } + tracepoint_synchronize_unregister(); + da_monitor_reset_all(); rv_put_task_monitor_slot(task_mon_slot); task_mon_slot =3D RV_PER_TASK_MONITOR_INIT; - - da_monitor_reset_all(); } =20 #elif RV_MON_TYPE =3D=3D RV_MON_PER_OBJ --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-171.mta0.migadu.com (out-171.mta0.migadu.com [91.218.175.171]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 24BBD47B423 for ; Mon, 11 May 2026 18:25:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.171 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523926; cv=none; b=ek/0iq9fl3whKCnKwoOlztp80OcMkd3w/9wP8MRo0DohIRrK7sPdfJKCbuvaNt7VmgHkns8OMFEg1Oz6LtZCvR3XquQ7GCEUr2rzsFBhkq6w8bFnyYs7BwkcQywJPUsIEoe5SVYcwLfZ6K9vNQXasfZ2etfv2jip3kwHNtk1yns= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523926; c=relaxed/simple; bh=x0N3Y2QenibRrFuO3zhAz0CXTVqKQaqnYOVYAIRkZ5g=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=cL3TQf+bnI6zPm+L2lx86tqlCaDWYEc8yEhj1J3uYe2mDZoZWyj2iPs5pIH6ngG/uVQ+oHmGo3DApTfXpngC5jUIMOB6FHP7VIgY+YLFrn5C1/Kovt25M6vUU++Bh8vLxmOfCfA9ji3QxYPEPaPkgR699YLaPnjLgcJAU8dWfvE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=a90M5aGG; arc=none smtp.client-ip=91.218.175.171 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev 
Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="a90M5aGG" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523923; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=X15e0nlHXpxQx401W4O6S+WSF172wMdASqEezgkf+IU=; b=a90M5aGGZF8zuIg8yOmnsadCo9ca3ya/GpYjw38DbjyYjcAcFYWTmaPa0TC2bcMpJrbuhm Jl3SyK6A+0c8lr6Qkj4pu/OCKz7Xo/W+n5gxlAtlpFycj3qBnqSe5P1h2csNxXnHUakLT+ G/Ea4nB55raT6WCSZUAzjJvQ6u7hRGU= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 03/10] selftests/verification: fix verificationtest-ktap for out-of-tree execution Date: Tue, 12 May 2026 02:24:49 +0800 Message-Id: <7368ee25b1b45c92beb14c05be366b71da585ca4.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang verificationtest-ktap used a CWD-relative path (../ftrace/ftracetest) and a relative argument (../verification) for --rv. This works when the shell changes into the verification directory first, but breaks when the script is invoked directly - e.g. by the kselftest runner or vng - because the working directory is the kernel source root, not the script's own directory. Fix this by computing the script's directory from $0 with cd/dirname/pwd and using absolute paths for both the ftracetest invocation and the --rv argument. 
Also export the directory to PATH so that check_requires in the ftracetest framework can locate helper binaries. Signed-off-by: Wen Yang --- tools/testing/selftests/verification/verificationtest-ktap | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/verification/verificationtest-ktap b/t= ools/testing/selftests/verification/verificationtest-ktap index 18f7fe324e2f..456b8578a307 100755 --- a/tools/testing/selftests/verification/verificationtest-ktap +++ b/tools/testing/selftests/verification/verificationtest-ktap @@ -5,4 +5,6 @@ # # Copyright (C) Arm Ltd., 2023 =20 -../ftrace/ftracetest -K -v --rv ../verification +dir=3D$(cd "$(dirname "$0")" && pwd) +export PATH=3D"$dir:$PATH" +"$dir/../ftrace/ftracetest" -K -v --rv "$dir" --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-170.mta0.migadu.com (out-170.mta0.migadu.com [91.218.175.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 92E6347D922 for ; Mon, 11 May 2026 18:25:27 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.170 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523929; cv=none; b=uzyqMXOxn0Twco67tTb59JVqHGW1tEq/SDL49qrVFg6xxy09xQbhGEGFZl/gTK2VlAaxo2KD/UGgDT75OG+yPwQNuAAd4Cs9HNwqiR8ta2iDdeUQu7xBlK2K7ATuuc1O5L1UBwvEEQ5ANXZKoItx/SO152LDErwISPtncol6NLc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523929; c=relaxed/simple; bh=v9qQ+smEFVXRv9Ckmq8lKzhougf8HNPeZpMZ9y1mzBA=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version:Content-Type; b=gCu/srtizbZ/4myW+3oqOJG8dgZiNzOVr2SlQ0/RSJyWXjASPmsuS7wCQMkLG6DPXz5yDkoHQiyjjcb+1GM4SAcy5I1pUDJ/z134RbMU6GjFqz38kCDDU9yZrhWNF6SH74FJaOHVVr/8zsk7cb6ELl1rwByHAOWVRLvJQHObrO8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=Z5lt11u9; arc=none smtp.client-ip=91.218.175.170 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="Z5lt11u9" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523925; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=IGSMUK6c+xA+ZoX0w0HHk4pXxa4vBX3tXNiW0TECRoE=; b=Z5lt11u9mdTA05Nr9sh5ybGIVkrI51+lGzzQJMYHNLuplG1TRIMM0KqEzkD97KHdOBMxbR hCyrn0VLA78ze2lvGIQf6OmrvDE84w5hImQ6UihrNp8DZJAbC/hruhM46M84MNO3GV4y4u Uzvfw1wK/i/KTPmdlWzoVzIT518DoMw= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors Date: Tue, 12 May 2026 02:24:50 +0800 Message-Id: <2774332570ee823be60cfe84ba85e9573b4df478.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT From: Wen Yang da_create_empty_storage() uses kmalloc_nolock(), which requires CONFIG_HAVE_ALIGNED_STRUCT_PAGE; on UML and some PREEMPT_RT configurations it always returns NULL. 
Calling kmalloc from scheduler tracepoint handlers also adds unwanted latency and can fail under memory pressure. Add da_monitor_init_prealloc(N) as an opt-in alternative to da_monitor_init(). It allocates N da_monitor_storage slots with GFP_KERNEL up-front and manages them on a LIFO free-stack protected by a spinlock, so da_create_or_get() never calls kmalloc on the hot path. Monitors that do not call da_monitor_init_prealloc() are unaffected. Signed-off-by: Wen Yang --- include/rv/da_monitor.h | 208 +++++++++++++++++++++++++++++++++++----- 1 file changed, 186 insertions(+), 22 deletions(-) diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index d04bb3229c75..7d6f62766251 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -433,18 +433,6 @@ static inline da_id_type da_get_id(struct da_monitor *= da_mon) return container_of(da_mon, struct da_monitor_storage, rv.da_mon)->id; } =20 -/* - * da_create_or_get - create the per-object storage if not already there - * - * This needs a lookup so should be guarded by RCU, the condition is check= ed - * directly in da_create_storage() - */ -static inline void da_create_or_get(da_id_type id, monitor_target target) -{ - guard(rcu)(); - da_create_storage(id, target, da_get_monitor(id, target)); -} - /* * da_fill_empty_storage - store the target in a pre-allocated storage * @@ -475,15 +463,121 @@ static inline monitor_target da_get_target_by_id(da_= id_type id) return mon_storage->target; } =20 +/* + * Per-object pool state. + * + * Zero-initialised by default (storage =3D=3D NULL =E2=9F=B9 kmalloc mode= ). A monitor + * opts into pool mode by calling da_monitor_init_prealloc(N) instead of + * da_monitor_init(), which sets storage to a non-NULL kcalloc'd array. + * + * Because every field is wrapped in this struct and the struct itself is a + * per-TU static, each monitor that includes this header gets a completely + * independent pool. A kmalloc monitor (e.g. nomiss) and a pool monitor + * (e.g. 
tlob) therefore coexist without any interference. + * + * da_pool_return_cb runs from softirq on non-PREEMPT_RT, so irqsave is + * required to prevent deadlock with task-context callers. On PREEMPT_RT + * it runs from an rcuc kthread where spinlock_t is a sleeping lock. + */ +struct da_per_obj_pool { + struct da_monitor_storage *storage; /* non-NULL =E2=9F=B9 pool mode */ + struct da_monitor_storage **free; /* kmalloc'd pointer stack */ + unsigned int free_top; + spinlock_t lock; +}; + +static struct da_per_obj_pool da_pool =3D { + .lock =3D __SPIN_LOCK_UNLOCKED(da_pool.lock), +}; + +static void da_pool_return_cb(struct rcu_head *head) +{ + struct da_monitor_storage *ms =3D + container_of(head, struct da_monitor_storage, rcu); + unsigned long flags; + + spin_lock_irqsave(&da_pool.lock, flags); + da_pool.free[da_pool.free_top++] =3D ms; + spin_unlock_irqrestore(&da_pool.lock, flags); +} + +/* Pops a slot from the pre-allocated pool; returns -ENOSPC if exhausted. = */ +static inline int da_create_or_get_pool(da_id_type id, monitor_target targ= et) +{ + struct da_monitor_storage *mon_storage; + unsigned long flags; + + spin_lock_irqsave(&da_pool.lock, flags); + if (!da_pool.free_top) { + spin_unlock_irqrestore(&da_pool.lock, flags); + return -ENOSPC; + } + mon_storage =3D da_pool.free[--da_pool.free_top]; + spin_unlock_irqrestore(&da_pool.lock, flags); + + mon_storage->id =3D id; + mon_storage->target =3D target; + guard(rcu)(); + hash_add_rcu(da_monitor_ht, &mon_storage->node, id); + return 0; +} + +/* + * Tries da_create_storage() first (lock-free via kmalloc_nolock); falls b= ack + * to kmalloc(GFP_KERNEL). Must be called from task context. + */ +static inline int da_create_or_get_kmalloc(da_id_type id, monitor_target t= arget) +{ + struct da_monitor_storage *mon_storage; + + scoped_guard(rcu) { + if (da_create_storage(id, target, da_get_monitor(id, target))) + return 0; + } + + /* + * da_create_storage() failed because kmalloc_nolock() returned NULL. 
+ * Allocate with GFP_KERNEL outside the RCU read section: GFP_KERNEL + * may sleep for memory reclaim, which is illegal while the RCU read + * lock is held (preemption disabled on !PREEMPT_RT). + */ + mon_storage =3D kmalloc_obj(*mon_storage, GFP_KERNEL | __GFP_ZERO); + if (!mon_storage) + return -ENOMEM; + mon_storage->id =3D id; + mon_storage->target =3D target; + + /* + * Re-check for a concurrent insertion before linking: another + * caller may have succeeded while we slept in kmalloc(). + * Discard our allocation and let the winner's entry stand. + */ + scoped_guard(rcu) { + if (da_get_monitor(id, target)) { + kfree(mon_storage); + return 0; + } + hash_add_rcu(da_monitor_ht, &mon_storage->node, id); + } + return 0; +} + +/* Create the per-object storage if not already there. */ +static inline int da_create_or_get(da_id_type id, monitor_target target) +{ + if (da_pool.storage) + return da_create_or_get_pool(id, target); + return da_create_or_get_kmalloc(id, target); +} + /* * da_destroy_storage - destroy the per-object storage * - * The caller is responsible to synchronise writers, either with locks or - * implicitly. For instance, if da_destroy_storage is called at sched_exit= and - * da_create_storage can never occur after that, it's safe to call this wi= thout - * locks. - * This function includes an RCU read-side critical section to synchronise - * against da_monitor_destroy(). + * Pool mode: removes from hash and returns the slot via call_rcu(). + * Kmalloc mode: removes from hash and frees via kfree_rcu(). + * + * Includes an RCU read-side critical section to synchronise against + * da_monitor_destroy(). 
*/ static inline void da_destroy_storage(da_id_type id) { @@ -491,15 +585,17 @@ static inline void da_destroy_storage(da_id_type id) =20 guard(rcu)(); mon_storage =3D __da_get_mon_storage(id); - if (!mon_storage) return; da_monitor_reset_hook(&mon_storage->rv.da_mon); hash_del_rcu(&mon_storage->node); - kfree_rcu(mon_storage, rcu); + if (da_pool.storage) + call_rcu(&mon_storage->rcu, da_pool_return_cb); + else + kfree_rcu(mon_storage, rcu); } =20 -static void da_monitor_reset_all(void) +static __maybe_unused void da_monitor_reset_all(void) { struct da_monitor_storage *mon_storage; int bkt; @@ -510,13 +606,65 @@ static void da_monitor_reset_all(void) rcu_read_unlock(); } =20 +/* + * da_monitor_init_prealloc - initialise with a pre-allocated storage pool + * + * Allocates @prealloc_count storage slots up-front so that da_create_or_g= et() + * and da_destroy_storage() never call kmalloc/kfree. Must be called inst= ead + * of da_monitor_init() for monitors that require pool mode. + */ +static inline int da_monitor_init_prealloc(unsigned int prealloc_count) +{ + hash_init(da_monitor_ht); + + da_pool.storage =3D kcalloc(prealloc_count, sizeof(*da_pool.storage), + GFP_KERNEL); + if (!da_pool.storage) + return -ENOMEM; + + da_pool.free =3D kmalloc_array(prealloc_count, sizeof(*da_pool.free), + GFP_KERNEL); + if (!da_pool.free) { + kfree(da_pool.storage); + da_pool.storage =3D NULL; + return -ENOMEM; + } + + da_pool.free_top =3D 0; + for (unsigned int i =3D 0; i < prealloc_count; i++) + da_pool.free[da_pool.free_top++] =3D &da_pool.storage[i]; + return 0; +} + +/* + * da_monitor_init - initialise in kmalloc mode (no pre-allocation) + */ static inline int da_monitor_init(void) { hash_init(da_monitor_ht); return 0; } =20 -static inline void da_monitor_destroy(void) +static inline void da_monitor_destroy_pool(void) +{ + WARN_ON_ONCE(!hash_empty(da_monitor_ht)); + /* + * Wait for all in-flight da_pool_return_cb() callbacks to + * complete before freeing da_pool.free. 
synchronize_rcu() is + * not sufficient: it only waits for callbacks registered before + * it was called, but call_rcu() from concurrent da_destroy_storage() + * calls may have been enqueued later. rcu_barrier() drains every + * pending callback. + */ + rcu_barrier(); + kfree(da_pool.storage); + da_pool.storage =3D NULL; + kfree(da_pool.free); + da_pool.free =3D NULL; + da_pool.free_top =3D 0; +} + +static inline void da_monitor_destroy_kmalloc(void) { struct da_monitor_storage *mon_storage; struct hlist_node *tmp; @@ -534,6 +682,22 @@ static inline void da_monitor_destroy(void) } } =20 +/* + * da_monitor_destroy - tear down the per-object monitor + * + * Pool mode: the hash must already be empty (caller must have drained all + * tasks first); calls rcu_barrier() to drain all pending da_pool_return_c= b() + * callbacks before freeing pool arrays. + * Kmalloc mode: drains any remaining entries after synchronize_rcu(). + */ +static inline void da_monitor_destroy(void) +{ + if (da_pool.storage) + da_monitor_destroy_pool(); + else + da_monitor_destroy_kmalloc(); +} + /* * Allow the per-object monitors to run allocation manually, necessary if = the * start condition is in a context problematic for allocation (e.g. schedu= ling). 
--=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-180.mta0.migadu.com (out-180.mta0.migadu.com [91.218.175.180]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 03A4947D949 for ; Mon, 11 May 2026 18:25:29 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.180 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523932; cv=none; b=LZbl/d3d8cIdvJRiSJWwBEBecRxZRaX7xFPhd+gXaPpgpX/i3thG6XmLrgTSGeqVpZEbt6y7XHd/N2GQopyEU6By5ruGCYBhflWsNUrkGCfJ2nipL59+Af+wtYgT5XzKLc+qSnpmVOvNXs9IT74permObvyzSEdmGl4BjF73mcs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523932; c=relaxed/simple; bh=DuyicdAoTi0cNa2W8wNKtUeMrh1PpbT38NLcpkBaZZw=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version:Content-Type; b=Ot2gY8rVDSucxP6FTjCDabD9rI02uD+GszMgheU/NT/YB6NUUHKTT0Ad5dWuBClMCnqyrtv27JLRe51a8E4q0AtxtjCs5uDVRwB6P3RabQP2I7wgVHFFItX3cjiqogaItf8K1SC0T+twywpwJsdPfK70XkDp5cs79j3jDVXiQ7Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=nxPOlTOU; arc=none smtp.client-ip=91.218.175.180 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="nxPOlTOU" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523928; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=+G4nCRUX4qX9mDfWjYwDvHpd39WgYIEQeRFwqTqoLSw=; b=nxPOlTOUDXyPKD80O5m8PRLjEjvsEV8xaT5BVULsG+klGsOIihJMqMRXox4mLQ/0d/n5b9 U2RZc5YFnZwcayhwl+fV7Jc0uw+EkWm3edvTSjam4HR75VXoZz+W5pftYSlDo4P3Qc1+eo URv59pTg110vzhKgj9xVz+nHru1NJe0= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors Date: Tue, 12 May 2026 02:24:51 +0800 Message-Id: <72f9f3e5cdda96262d9362db272bc4bcccabf3fc.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT From: Wen Yang Introduce rv_uprobe, a thin wrapper around uprobe_consumer providing rv_uprobe_attach_path(), rv_uprobe_attach(), and rv_uprobe_detach() for RV monitors. An opaque priv pointer is forwarded unchanged to entry/return handlers so monitors can carry per-binding state (e.g. a latency threshold) to the hot path without any global lookup. rv_uprobe_detach() is fully synchronous (nosync + sync + path_put + kfree), closing the use-after-free window present in open-coded patterns where kfree() precedes uprobe_unregister_sync(). 
Signed-off-by: Wen Yang --- include/rv/rv_uprobe.h | 87 ++++++++++++++++++++ kernel/trace/rv/Kconfig | 4 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/rv_uprobe.c | 153 ++++++++++++++++++++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 include/rv/rv_uprobe.h create mode 100644 kernel/trace/rv/rv_uprobe.c diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h new file mode 100644 index 000000000000..084cdb36a2ff --- /dev/null +++ b/include/rv/rv_uprobe.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generic uprobe infrastructure for RV monitors. + * + */ + +#ifndef _RV_UPROBE_H +#define _RV_UPROBE_H + +#include +#include + +struct pt_regs; + +/** + * struct rv_uprobe - a single uprobe registered on behalf of an RV monitor + * + * @offset: byte offset within the ELF binary where the probe is install= ed + * @priv: monitor-private pointer; set at attach time, never touched by + * this layer; passed unchanged to entry_fn / ret_fn + * @path: resolved path of the probed binary (read-only after attach); + * callers may use path.dentry for identity comparisons + * + * The implementation fields (uprobe_consumer, uprobe handle, callbacks) a= re + * private to rv_uprobe.c and are not exposed here; monitors must not acce= ss + * them directly. + */ +struct rv_uprobe { + /* public: read-only after rv_uprobe_attach*() */ + loff_t offset; + void *priv; + struct path path; +}; + +/** + * rv_uprobe_attach_path - register an uprobe given an already-resolved pa= th + * @path: path of the target binary; rv_uprobe takes its own reference + * @offset: byte offset within the binary + * @entry_fn: called on probe hit (entry); may be NULL + * @ret_fn: called on function return (uretprobe); may be NULL + * @priv: opaque pointer forwarded to callbacks unchanged + * + * Use this variant when the caller has already resolved the path (e.g. to + * register multiple probes on the same binary with a single kern_path cal= l). 
+ * The inode is derived internally via d_real_inode(), so inode and path a= re + * always consistent. + * + * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure. + */ +struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset, + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data), + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data), + void *priv); + +/** + * rv_uprobe_attach - resolve binpath and register an uprobe + * @binpath: absolute path to the target binary + * @offset: byte offset within the binary + * @entry_fn: called on probe hit (entry); may be NULL + * @ret_fn: called on function return (uretprobe); may be NULL + * @priv: opaque pointer forwarded to callbacks unchanged + * + * Resolves binpath via kern_path(), then delegates to rv_uprobe_attach_pa= th(). + * + * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure. + */ +struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset, + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data), + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data), + void *priv); + +/** + * rv_uprobe_detach - synchronously unregister an uprobe and free it + * @p: probe to detach; may be NULL (no-op) + * + * Calls uprobe_unregister_nosync(), then uprobe_unregister_sync() to wait + * for any in-progress handler to finish, then releases the path reference + * and frees the rv_uprobe struct. The caller's priv data is NOT freed. + * + * Safe to call from process context only (uprobe_unregister_sync() may + * schedule). 
+ */ +void rv_uprobe_detach(struct rv_uprobe *p); + +#endif /* _RV_UPROBE_H */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 3884b14df375..e2e0033a00b9 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -59,6 +59,10 @@ config RV_PER_TASK_MONITORS This option configures the maximum number of per-task RV monitors that = can run simultaneously. =20 +config RV_UPROBE + bool + depends on RV && UPROBES + source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" =20 diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 94498da35b37..f139b904bea3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_RV_MON_STALL) +=3D monitors/stall/stall.o obj-$(CONFIG_RV_MON_DEADLINE) +=3D monitors/deadline/deadline.o obj-$(CONFIG_RV_MON_NOMISS) +=3D monitors/nomiss/nomiss.o # Add new monitors here +obj-$(CONFIG_RV_UPROBE) +=3D rv_uprobe.o obj-$(CONFIG_RV_REACTORS) +=3D rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) +=3D reactor_printk.o obj-$(CONFIG_RV_REACT_PANIC) +=3D reactor_panic.o diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c new file mode 100644 index 000000000000..bc28399cfd4b --- /dev/null +++ b/kernel/trace/rv/rv_uprobe.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic uprobe infrastructure for RV monitors. + * + */ +#include +#include +#include +#include +#include +#include + +/* + * Private extension of struct rv_uprobe. Allocated by rv_uprobe_attach*() + * and returned to callers as &impl->pub. 
+ */ +struct rv_uprobe_impl { + struct rv_uprobe pub; /* must be first; callers hold &pub */ + struct uprobe_consumer uc; + struct uprobe *uprobe; + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data); + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data); +}; + +static int rv_uprobe_handler(struct uprobe_consumer *uc, + struct pt_regs *regs, __u64 *data) +{ + struct rv_uprobe_impl *impl =3D container_of(uc, struct rv_uprobe_impl, u= c); + + if (impl->entry_fn) + return impl->entry_fn(&impl->pub, regs, data); + return 0; +} + +static int rv_uprobe_ret_handler(struct uprobe_consumer *uc, + unsigned long func, + struct pt_regs *regs, __u64 *data) +{ + struct rv_uprobe_impl *impl =3D container_of(uc, struct rv_uprobe_impl, u= c); + + if (impl->ret_fn) + return impl->ret_fn(&impl->pub, func, regs, data); + return 0; +} + +static struct rv_uprobe * +__rv_uprobe_attach(struct inode *inode, struct path *path, loff_t offset, + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *dat= a), + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data), + void *priv) +{ + struct rv_uprobe_impl *impl; + int ret; + + if (!entry_fn && !ret_fn) + return ERR_PTR(-EINVAL); + + impl =3D kzalloc_obj(*impl, GFP_KERNEL); + if (!impl) + return ERR_PTR(-ENOMEM); + + impl->pub.offset =3D offset; + impl->pub.priv =3D priv; + impl->entry_fn =3D entry_fn; + impl->ret_fn =3D ret_fn; + path_get(path); + impl->pub.path =3D *path; + + if (entry_fn) + impl->uc.handler =3D rv_uprobe_handler; + if (ret_fn) + impl->uc.ret_handler =3D rv_uprobe_ret_handler; + + impl->uprobe =3D uprobe_register(inode, offset, 0, &impl->uc); + if (IS_ERR(impl->uprobe)) { + ret =3D PTR_ERR(impl->uprobe); + path_put(&impl->pub.path); + kfree(impl); + return ERR_PTR(ret); + } + + return &impl->pub; +} + +/** + * rv_uprobe_attach_path - register an uprobe given an already-resolved pa= th + */ +struct rv_uprobe 
*rv_uprobe_attach_path(struct path *path, loff_t offset, + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data), + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data), + void *priv) +{ + struct inode *inode =3D d_real_inode(path->dentry); + + return __rv_uprobe_attach(inode, path, offset, entry_fn, ret_fn, priv); +} +EXPORT_SYMBOL_GPL(rv_uprobe_attach_path); + +/** + * rv_uprobe_attach - resolve binpath and register an uprobe + */ +struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset, + int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data), + int (*ret_fn)(struct rv_uprobe *p, unsigned long func, + struct pt_regs *regs, __u64 *data), + void *priv) +{ + struct rv_uprobe *p; + struct path path; + int ret; + + ret =3D kern_path(binpath, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + if (!d_is_reg(path.dentry)) { + path_put(&path); + return ERR_PTR(-EINVAL); + } + + p =3D rv_uprobe_attach_path(&path, offset, entry_fn, ret_fn, priv); + path_put(&path); + return p; +} +EXPORT_SYMBOL_GPL(rv_uprobe_attach); + +/** + * rv_uprobe_detach - synchronously unregister an uprobe and free it + */ +void rv_uprobe_detach(struct rv_uprobe *p) +{ + struct rv_uprobe_impl *impl; + + if (!p) + return; + + impl =3D container_of(p, struct rv_uprobe_impl, pub); + uprobe_unregister_nosync(impl->uprobe, &impl->uc); + /* + * uprobe_unregister_sync() is a global barrier: it waits for all + * in-flight uprobe handlers across the entire system to complete, + * not just handlers for this probe. This is intentional =E2=80=94 it + * guarantees that no handler touching impl->pub.priv is running by + * the time we return, even if the caller immediately frees priv. 
+ */ + uprobe_unregister_sync(); + path_put(&p->path); + kfree(impl); +} +EXPORT_SYMBOL_GPL(rv_uprobe_detach); --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-172.mta0.migadu.com (out-172.mta0.migadu.com [91.218.175.172]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2671647D95F for ; Mon, 11 May 2026 18:25:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523934; cv=none; b=GqLDojpZL84ocCER/3EadqAX2YUx8qR6ugc2PlcORatgjJ1PqwoA+HXg2FO0jeveYQ8nWGiGc+vH/EKQXrJMBrNRHDbAyup2I0P3hpTmzJULk8HpfibEHsVodsp8aKgJQrmI6ByxGbOx/++DAG6+gHfbkS+Tl3Udb6Pv7Ms1Bzo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523934; c=relaxed/simple; bh=1US6IxmZ7Cw2Pv456ZtYpFmro3G70vVpFzCpn/ff0YI=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version:Content-Type; b=iw7qejDnvI09uYvm2Z5n6fRmlaC8Ubrgti1+ldsynfoLvmBtBmROPn4N2QuW72SK1PL5vi6q/pkqzXkiwlwrdCwzbvZVtqmCKNUuyxv9A4qIFAWjg9n9B5AOcTLWEp1P+/xGBosTo5NJdHmqvp8LxjpO/B+nGpeO1p8iJ7W4vM0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=MPkB4SGg; arc=none smtp.client-ip=91.218.175.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="MPkB4SGg" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523930; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=aqhXGiNES/GeSzECBAFP4Cw9YkFTgSPnFcrP4KJFfFA=; b=MPkB4SGgdiDjMKXrx+PsWBux/SPLjsV9LL/e5j3tvJszFI+4pgQJLTBUS9cUXT2iyQUf7p xyPxNqyjfMjm2GiGVPTu6QVm6p8cE8BuVtgnPp0yOsSRKOhyw15lY5EtLZ4CjSby7gZfdF ZArSw9M9Bm0457/8KCE/4ktKdN6eAxY= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks Date: Tue, 12 May 2026 02:24:52 +0800 Message-Id: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT From: Wen Yang rvgen rejects a state invariant when its env is never reset on any state-transition edge. This prevents expressing monitors where a clock tracks the full monitoring window =E2=80=94 reset once at object creation, active in all states. Allow reset() annotations on the __init_STATE -> STATE arrow. automata.py adds listed envs to the new env_init_started set (and to env_stored so the HA framework allocates per-object storage). dot2k.py uses env_init_started for three purposes: - Generate a handle_monitor_start() skeleton that resets the env and arms the timer after the caller sets up DA storage and initial state. - Guard ha_inv_to_guard calls with !ha_monitor_env_invalid() for these envs: a concurrent DA event between da_handle_start_event() and ha_reset_env() would otherwise store U64_MAX - BUDGET as the guard anchor, silently disabling enforcement. 
- Always generate ha_verify_guards() for monitors with invariants, providing a stable extension point for future per-event guards. Models without __init resets (e.g. stall.dot) are unaffected. Signed-off-by: Wen Yang --- tools/verification/rvgen/rvgen/automata.py | 26 ++++++ tools/verification/rvgen/rvgen/dot2k.py | 100 +++++++++++++++++++-- 2 files changed, 119 insertions(+), 7 deletions(-) diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verificatio= n/rvgen/rvgen/automata.py index b9f8149f7118..178a1a4ffd8a 100644 --- a/tools/verification/rvgen/rvgen/automata.py +++ b/tools/verification/rvgen/rvgen/automata.py @@ -69,15 +69,41 @@ class Automata: self.states, self.initial_state, self.final_states =3D self.__get_= state_variables() self.env_types =3D {} self.env_stored =3D set() + self.env_init_started =3D set() self.constraint_vars =3D set() self.self_loop_reset_events =3D set() self.events, self.envs =3D self.__get_event_variables() + self.__parse_init_resets() self.function, self.constraints =3D self.__create_matrix() self.events_start, self.events_start_run =3D self.__store_init_eve= nts() self.env_stored =3D sorted(self.env_stored) + self.env_init_started =3D sorted(self.env_init_started) self.constraint_vars =3D sorted(self.constraint_vars) self.self_loop_reset_events =3D sorted(self.self_loop_reset_events) =20 + def __parse_init_resets(self) -> None: + """Parse reset() annotations on the __init_STATE -> STATE arrow. + + Adds each listed env to env_stored (HA framework allocates per-obj= ect + storage) and env_init_started (ha2k generates handle_monitor_start= ()). 
+ """ + init_prefix =3D f'"{self.init_marker}' + for line in map(str.lstrip, self.__dot_lines): + if not line.startswith(init_prefix): + continue + split_line =3D line.split() + if len(split_line) < 3 or split_line[1] !=3D "->": + continue + if "label" not in line: + continue + label =3D "".join(split_line[split_line.index("label") + 2:-1]= ).replace('"', '') + for part in label.split(";"): + reset_m =3D self.constraint_reset.search(part.strip()) + if reset_m: + env =3D reset_m["env"] + self.env_stored.add(env) + self.env_init_started.add(env) + def __get_model_name(self) -> str: basename =3D ntpath.basename(self.__dot_path) if not basename.endswith(".dot") and not basename.endswith(".gv"): diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/r= vgen/rvgen/dot2k.py index e6f476b903b0..e8066260c0af 100644 --- a/tools/verification/rvgen/rvgen/dot2k.py +++ b/tools/verification/rvgen/rvgen/dot2k.py @@ -366,7 +366,18 @@ f"""static inline void ha_convert_inv_guard(struct ha_= monitor *ha_mon, conf_g =3D [e for s, e in conflict_guards if s =3D=3D state] if not conf_i and not conf_g: continue - buff.append(f"\t{_else}if (curr_state =3D=3D {self.states[stat= e]}{self.enum_suffix})") + + state_name =3D f"{self.states[state]}{self.enum_suffix}" + env_full =3D self.__get_constraint_env(constr) + env_bare =3D env_full[:-len(self.enum_suffix)] + if env_bare in self.env_init_started: + # env_store is ENV_INVALID_VALUE until handle_monitor_star= t(); + # skip ha_inv_to_guard during the init race window. 
+ cont =3D "\t\t " if _else else "\t " + buff.append(f"\t{_else}if (curr_state =3D=3D {state_name} = &&") + buff.append(f"{cont}!ha_monitor_env_invalid(ha_mon, {env_f= ull}))") + else: + buff.append(f"\t{_else}if (curr_state =3D=3D {state_name})= ") =20 buff.append(f"\t\t{self.__start_to_conv(constr)};") _else =3D "else " @@ -376,16 +387,22 @@ f"""static inline void ha_convert_inv_guard(struct ha= _monitor *ha_mon, =20 def __fill_verify_guards_func(self) -> list[str]: buff =3D [] - if not self.guards: + # Always generate for monitors with invariants: stable extension + # point for future guard conditions. + if not self.guards and not self.invariants: return [] =20 buff.append( f"""static inline bool ha_verify_guards(struct ha_monitor *ha_mon, \t\t\t\t enum {self.enum_states_def} curr_state, enum {self.enum_events= _def} event, \t\t\t\t enum {self.enum_states_def} next_state, u64 time_ns) -{{ -\tbool res =3D true; -""") +{{""") + + if not self.guards: + buff.append("\treturn true;\n}\n") + return buff + + buff.append("\tbool res =3D true;\n") =20 _else =3D "" for edge, constr in sorted(self.guards.items()): @@ -522,7 +539,7 @@ f"""static bool ha_verify_constraint(struct ha_monitor = *ha_mon, buff.append("\tha_convert_inv_guard(ha_mon, curr_state, event,= " "next_state, time_ns);\n") =20 - if self.guards: + if self.guards or self.invariants: buff.append("\tif (!ha_verify_guards(ha_mon, curr_state, event= , " "next_state, time_ns))\n\t\treturn false;\n") =20 @@ -599,8 +616,77 @@ f"""static bool ha_verify_constraint(struct ha_monitor= *ha_mon, buff.append("}\n") return buff =20 + def __fill_init_start_helper(self) -> list[str]: + """Generate handle_monitor_start() for envs reset on the __init ar= row. + + env_store is invalid inside da_handle_start_event(); this helper m= ust + be called after DA storage is allocated and initial state is set. 
+ """ + if not self.env_init_started: + return [] + + # Collect the ha_start_timer call for each init-started env from t= he + # first state invariant that references it. + timer_calls: dict[str, str] =3D {} + for env in self.env_init_started: + env_full =3D f"{env}{self.enum_suffix}" + for constr in self.invariants.values(): + if env_full in constr: + timer_calls[env] =3D constr + break + + buff =3D [] + buff.append( +"""/* + * handle_monitor_start - reset per-object clock(s) and arm the timer. + * + * env_store is invalid inside da_handle_start_event(); call this helper + * after allocating DA storage and setting the initial DA state. + * + * XXX: replace the placeholders with the actual logic for your monitor. + */""") + + if self.monitor_type =3D=3D "per_obj": + buff.append("static int handle_monitor_start(int id, monitor_t= arget t)") + buff.append("{") + buff.append("\tstruct ha_monitor *ha_mon;") + buff.append("\tu64 time_ns =3D ktime_get_ns();\n") + buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_= get(id, t) */") + buff.append("\t/* XXX: set initial DA state, e.g. da_handle_st= art_event(id, t, ) */") + buff.append("\tha_mon =3D /* XXX: retrieve ha_monitor for (id,= t) */;") + elif self.monitor_type =3D=3D "per_task": + buff.append("static int handle_monitor_start(struct task_struc= t *p)") + buff.append("{") + buff.append("\tstruct ha_monitor *ha_mon;") + buff.append("\tu64 time_ns =3D ktime_get_ns();\n") + buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_= get(p->pid, p) */") + buff.append("\t/* XXX: set initial DA state, e.g. 
da_handle_st= art_event(p->pid, p, ) */") + buff.append("\tha_mon =3D /* XXX: retrieve ha_monitor for p */= ;") + else: + buff.append("static int handle_monitor_start(void)") + buff.append("{") + buff.append("\tstruct ha_monitor *ha_mon;") + buff.append("\tu64 time_ns =3D ktime_get_ns();\n") + buff.append("\tha_mon =3D /* XXX: retrieve global ha_monitor *= /;") + + buff.append("\tif (!ha_mon)") + buff.append("\t\treturn -ENOENT;") + + for env in self.env_init_started: + buff.append(f"\tha_reset_env(ha_mon, {env}{self.enum_suffix}, = time_ns);") + if env in timer_calls: + buff.append(f"\t{timer_calls[env]};") + else: + buff.append(f"\t/* XXX: arm timer for {env} */") + + buff.append("\treturn 0;") + buff.append("}\n") + return buff + def _fill_hybrid_definitions(self) -> list[str]: - return self.__fill_hybrid_get_reset_functions() + self.__fill_cons= tr_func() + return (self.__fill_hybrid_get_reset_functions() + + self.__fill_init_start_helper() + + self.__fill_constr_func()) =20 def _fill_timer_type(self) -> list: if self.invariants: --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-171.mta0.migadu.com (out-171.mta0.migadu.com [91.218.175.171]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 034DF47DF95 for ; Mon, 11 May 2026 18:25:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.171 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523936; cv=none; b=fkYcHUhK20kIEJYYLKW1Obqsgbbewtln8N0y1OSsno2BiJpVzzpMqvAnbGPKa9V4JWCPrbQalY0TMe854f2shsMZFSotQZjnEfYEDUwE/9a+LBOLRjdPrqZ8NzP56U5YN1yvGXy+egLEBE+Onvkoa684obu4spuzg7lGEXeL6hA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523936; c=relaxed/simple; bh=P4iAu2y1WYRKIWrbM8V+pf5m9s4kdbFvRct0dVmz3mo=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; 
b=Ml7PwcKXcWAR6AocXCYtIqZ58JxlP2NmOERP9KOyNJdEI2ISvBWTPyoLLxsMSzTGoTJLgn06pi+7zwNmKWeF9GpEYrQJ1z/vSVNEzELRksoOaP2W/9wyp5rErnLV1B+x86vC+Oc1i8FmxcYOVefPlqEgq+CeDobtHd/ax5hSNMA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=TLT0uH/e; arc=none smtp.client-ip=91.218.175.171 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="TLT0uH/e" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523933; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=ltI/qJggpko9gP6TfxD/MZJvsOySJfuz1/BGvhgs6fE=; b=TLT0uH/eGuM7vfWexGAXC3IuIJjuyuEtUv4XeAL+1BP5hHGpvxQCAP8fBakPUVMY3xmYeG 90NSzrs30vM0BqQsqngxkKUtObrQmojul7bFcHqj5DssQjz6mJq3J5FssBCvQG5f/nDYMy XJrHOUeZNdd1DgPRzbvkrQoeshx3Yjo= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file Date: Tue, 12 May 2026 02:24:53 +0800 Message-Id: <5ed278ec33f219bb646a5bd37d6b55da7dcbbd8a.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang Add the Graphviz 
DOT specification for the tlob (task latency over budget) hybrid automaton. The model defines three states: running (initial), waiting (in the scheduler runqueue), and sleeping (blocked on a resource), with the transitions: running --(sleep)-------> sleeping running --(preempt)-----> waiting sleeping --(wakeup)------> waiting waiting --(switch_in)--> running A single clock invariant clk_elapsed < BUDGET_NS() is active in all three states. The HA framework enforces it via a per-task hrtimer; expiry emits error_env_tlob and resets the monitor automatically. Suggested-by: Gabriele Monaco =20 Signed-off-by: Wen Yang --- MAINTAINERS | 3 +++ tools/verification/models/tlob.dot | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tools/verification/models/tlob.dot diff --git a/MAINTAINERS b/MAINTAINERS index 74c86cf9bc65..beb7224d08ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23317,7 +23317,10 @@ S: Maintained F: Documentation/trace/rv/ F: include/linux/rv.h F: include/rv/ +F: include/uapi/linux/rv.h F: kernel/trace/rv/ +F: samples/rv/ +F: tools/testing/selftests/rv/ F: tools/testing/selftests/verification/ F: tools/verification/ =20 diff --git a/tools/verification/models/tlob.dot b/tools/verification/models= /tlob.dot new file mode 100644 index 000000000000..8421b1120e80 --- /dev/null +++ b/tools/verification/models/tlob.dot @@ -0,0 +1,21 @@ +digraph state_automaton { + center =3D true; + size =3D "7,11"; + {node [shape =3D plaintext, style=3Dinvis, label=3D""] "__init_running"}; + {node [shape =3D ellipse] "running"}; + {node [shape =3D plaintext] "running"}; + {node [shape =3D plaintext] "waiting"}; + {node [shape =3D plaintext] "sleeping"}; + "__init_running" -> "running" [ label =3D "reset(clk_elapsed)" ]; + "running" [label =3D "running\nclk_elapsed < BUDGET_NS()", color =3D gr= een3]; + "waiting" [label =3D "waiting\nclk_elapsed < BUDGET_NS()"]; + "sleeping" [label =3D "sleeping\nclk_elapsed < BUDGET_NS()"]; + "running" -> "sleeping" 
[ label =3D "sleep" ]; + "running" -> "waiting" [ label =3D "preempt" ]; + "waiting" -> "running" [ label =3D "switch_in" ]; + "sleeping" -> "waiting" [ label =3D "wakeup" ]; + { rank =3D min ; + "__init_running"; + "running"; + } +} --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-178.mta0.migadu.com (out-178.mta0.migadu.com [91.218.175.178]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 775F5480321; Mon, 11 May 2026 18:25:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.178 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523943; cv=none; b=OUya/3bZGhoFnRSRQIgdEOIZzz9Ndcrc6kqKPaP68dHNWc4rO1gSQ0f+WAdRBBTbAj0syYfwfxcbCGSXdT8OGhLf+n7iurxy+IIs0fucIBftwLlUB9A8r7CYfpX99bTj7MkBta2h7Kp8A71TBqFxfrVs8KDraETKxktx2zSt0CM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523943; c=relaxed/simple; bh=LmXUHbU6cbDA6Eed+Umo2RqH8Cuh8Rvdqjvdo3OPYR8=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version:Content-Type; b=NsDUhSZ96KKyY7A1J2xkSDyetMUHsu0EfzdSURsfUdTn8McXngpP+QotB2orP9N8657r4NHOTzFgZ6CBy7zONyCLaBL1EIAhn11haJ48wNdefNy7QAzvsmGRvgwTuyEDsBF9uk2MJw5EsEXzQkhg09gHJZ09Os8LigqoxG/b2BA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=A84Sc0D2; arc=none smtp.client-ip=91.218.175.178 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="A84Sc0D2" X-Report-Abuse: Please report any abuse attempt to 
abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523936; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=vbXPDLSKtqFOaaJYQqNvzeH7KyO9chy4JcjNbljuZk4=; b=A84Sc0D2iQc090nXAtyOOQbYB1Iz/RNn/X0Yip6zYWIiJlEYf5U6l8Hd+cYAfPiZOSILw8 3RIf0A56MRpWfpQ+EjQwuilLIhxnKL854qB/Jw8Olr97lsR5oV9dZh9kn8WTKJnd8KIU2g kBssbZh7/I6vUGCIP/A9K53H1KuWNxM= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor Date: Tue, 12 May 2026 02:24:54 +0800 Message-Id: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT From: Wen Yang Introduce tlob (task latency over budget), a per-task hybrid-automaton RV monitor that measures elapsed time (CLOCK_MONOTONIC) across a user-delimited code section and fires an error_env_tlob tracepoint when the elapsed time exceeds a configurable per-invocation budget. The monitor is built on RV_MON_PER_OBJ with HA_TIMER_HRTIMER. Three states track the scheduler status of the monitored task: running --(sleep)-------> sleeping running --(preempt)-----> waiting sleeping --(wakeup)------> waiting waiting --(switch_in)--> running A single clock invariant clk_elapsed < BUDGET_NS() is active in all three states. The budget hrtimer is rearmed on each DA transition for the remaining budget, keeping the absolute deadline fixed at start_time + BUDGET_NS. Per-task state is stored in the DA framework's hash table keyed by task->pid. 
Storage is pre-allocated by tlob_start_task() with GFP_KERNEL via da_create_or_get() before the scheduler tracepoints can fire, using DA_SKIP_AUTO_ALLOC so that no kmalloc occurs on the tracepoint hot path. This avoids both the kmalloc_nolock() restriction (requires HAVE_ALIGNED_STRUCT_PAGE) and latency issues under PREEMPT_RT. Nested monitoring is handled by nest_depth: tlob_start_task() on an already-monitored pid returns -EEXIST and increments nest_depth without disturbing the outer window; only the outermost tlob_stop_task() performs real cleanup. Two userspace interfaces are provided. The ioctl interface exposes in-process self-instrumentation via /dev/rv with TLOB_IOCTL_TRACE_START and TLOB_IOCTL_TRACE_STOP. The uprobe interface enables external monitoring of unmodified binaries via tracefs: echo "p PATH:OFFSET_START OFFSET_STOP threshold=3DUS" \ > /sys/kernel/tracing/rv/monitors/tlob/monitor Violations are reported via error_env_tlob (HA clock-invariant) regardless of which interface triggered them. 
Suggested-by: Gabriele Monaco =20 Signed-off-by: Wen Yang --- Documentation/trace/rv/index.rst | 1 + Documentation/trace/rv/monitor_tlob.rst | 213 ++++ include/linux/rv.h | 45 + include/rv/automata.h | 15 + include/rv/ha_monitor.h | 33 +- include/rv/rv_uprobe.h | 32 + include/uapi/linux/rv.h | 86 ++ kernel/trace/rv/Kconfig | 2 + kernel/trace/rv/Makefile | 4 +- kernel/trace/rv/monitors/tlob/Kconfig | 69 ++ kernel/trace/rv/monitors/tlob/tlob.c | 1307 ++++++++++++++++++++ kernel/trace/rv/monitors/tlob/tlob.h | 171 +++ kernel/trace/rv/monitors/tlob/tlob_trace.h | 58 + kernel/trace/rv/rv.c | 38 + kernel/trace/rv/rv.h | 2 + kernel/trace/rv/rv_chardev.c | 201 +++ kernel/trace/rv/rv_trace.h | 1 + kernel/trace/rv/rv_uprobe.c | 46 +- tools/include/uapi/linux/rv.h | 86 ++ 19 files changed, 2400 insertions(+), 10 deletions(-) create mode 100644 Documentation/trace/rv/monitor_tlob.rst create mode 100644 include/uapi/linux/rv.h create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h create mode 100644 kernel/trace/rv/rv_chardev.c create mode 100644 tools/include/uapi/linux/rv.h diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/inde= x.rst index 29769f06bb0f..1501545b5f08 100644 --- a/Documentation/trace/rv/index.rst +++ b/Documentation/trace/rv/index.rst @@ -16,5 +16,6 @@ Runtime Verification monitor_wwnr.rst monitor_sched.rst monitor_rtapp.rst + monitor_tlob.rst monitor_stall.rst monitor_deadline.rst diff --git a/Documentation/trace/rv/monitor_tlob.rst b/Documentation/trace/= rv/monitor_tlob.rst new file mode 100644 index 000000000000..91b592630b3f --- /dev/null +++ b/Documentation/trace/rv/monitor_tlob.rst @@ -0,0 +1,213 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +Monitor tlob +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D + +- Name: tlob - task latency over budget +- Type: per-object hybrid automaton (RV_MON_PER_OBJ) +- Author: Wen Yang + +Description +----------- + +The tlob monitor tracks per-task elapsed wall-clock time (CLOCK_MONOTONIC, +spanning running, waiting, and sleeping states) and reports a violation wh= en +the monitored task exceeds a configurable per-invocation budget threshold. + +The monitor implements a three-state hybrid automaton with a single clock +environment variable ``clk_elapsed``. The clock invariant +``clk_elapsed < BUDGET_NS()`` is active in all three states; when it is +violated the HA timer fires and the framework emits ``error_env_tlob`` +then calls ``da_monitor_reset()`` automatically:: + + | (initial, via task_start) + v + +--------------+ + | running | <-----------+ + +--------------+ | + | | | + sleep preempt switch_in + | | | + v v | + +---------+ +---------+ | + | sleeping| | waiting | -------+ + +---------+ +---------+ + | ^ + +---wakeup---+ + + Key transitions: + running --(sleep)------> sleeping (task blocks waiting for a resour= ce) + running --(preempt)----> waiting (task preempted, back in runqueue) + sleeping --(wakeup)-----> waiting (resource available, enters runqu= eue) + waiting --(switch_in)--> running (scheduler picks task, back on CP= U) + + ``task_start`` calls ``da_handle_start_event()`` with the synthetic event + ``switch_in_tlob`` to force the initial DA state to ``running`` (since + ``switch_in`` transitions waiting=E2=86=92running), then resets ``clk_el= apsed`` and + arms the budget timer directly via ``ha_reset_clk_ns()`` + ``ha_start_ti= mer_ns()``. + ``task_stop`` cancels the HA timer synchronously via + ``ha_cancel_timer_sync()`` then calls ``da_monitor_reset()`` directly. 
+ +The non-running condition (monitor not yet started or reset after a +stop/violation) is handled implicitly by the RV framework +(``da_mon->monitoring =3D=3D 0``) =E2=80=94 it is not an explicit DA state. + +Per-task state lives in ``struct tlob_task_state`` which is stored as +``monitor_target`` in the framework's ``da_monitor_storage``, indexed by +pid. The per-invocation ``threshold_us`` is read via +``ha_get_target(ha_mon)->threshold_us`` inside the HA constraint functions, +following the same pattern as the ``nomiss`` monitor. + +Usage +----- + +tracefs interface (uprobe-based external monitoring) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``monitor`` tracefs file instruments an unmodified binary via uprobes. +The format follows the ftrace ``uprobe_events`` convention (``PATH:OFFSET`` +for the probe location, ``key=3Dvalue`` for configuration parameters):: + + p PATH:OFFSET_START OFFSET_STOP threshold=3DUS + +The uprobe at ``OFFSET_START`` fires ``tlob_start_task()``; the uprobe at +``OFFSET_STOP`` fires ``tlob_stop_task()``. Both offsets are ELF file +offsets of entry points in ``PATH``. ``PATH`` may contain ``:``; the last +``:`` in the ``PATH:OFFSET_START`` token is the separator. + +To remove a binding, use ``-PATH:OFFSET_START``:: + + echo 1 > /sys/kernel/tracing/rv/monitors/tlob/enable + + echo "p /usr/bin/myapp:0x12a0 0x12f0 threshold=3D5000" \ + > /sys/kernel/tracing/rv/monitors/tlob/monitor + + # Remove a binding + echo "-/usr/bin/myapp:0x12a0" > /sys/kernel/tracing/rv/monitors/tlob/mon= itor + + # List registered bindings + cat /sys/kernel/tracing/rv/monitors/tlob/monitor + + # Read violations from the trace buffer + cat /sys/kernel/tracing/trace + +ioctl self-instrumentation (/dev/rv) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``/dev/rv`` is a shared RV character device. Before using any monitor-spe= cific +ioctl, the fd must be bound to a monitor via ``RV_IOCTL_BIND_MONITOR``. 
E= ach +open fd has independent per-fd monitoring state:: + + int fd =3D open("/dev/rv", O_RDWR); + + /* Bind this fd to the tlob monitor. */ + struct rv_bind_args bind =3D { .monitor_name =3D "tlob" }; + ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind); + + struct tlob_start_args args =3D { + .threshold_us =3D 50000, /* 50 ms in microseconds */ + }; + ioctl(fd, TLOB_IOCTL_TRACE_START, &args); + + /* ... code path under observation ... */ + + int ret =3D ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL); + /* ret =3D=3D 0: within budget */ + /* ret =3D=3D -EOVERFLOW: budget exceeded */ + + close(fd); + +``TRACE_STOP`` returns ``-EOVERFLOW`` whenever the budget was exceeded. +The HA timer calls ``da_monitor_reset()`` (storage remains); the +synchronous ``ha_cancel_timer_sync()`` in ``tlob_stop_task()`` ensures the +callback has completed before checking ``da_monitoring()``. + +Violation events +~~~~~~~~~~~~~~~~ + +Budget violations are always reported via the ``error_env_tlob`` RV +tracepoint (HA clock-invariant violation), regardless of which interface +triggered them:: + + cat /sys/kernel/tracing/trace + +To capture violations in a file:: + + trace-cmd record -e error_env_tlob & + # ... run workload ... + trace-cmd report + +tracefs files +------------- + +The following files are created under +``/sys/kernel/tracing/rv/monitors/tlob/``: + +``enable`` (rw) + Write ``1`` to enable the monitor; write ``0`` to disable it. + +``desc`` (ro) + Human-readable description of the monitor. + +``monitor`` (rw) + Write ``p PATH:OFFSET_START OFFSET_STOP threshold=3DUS`` + to bind two entry uprobes. Write ``-PATH:OFFSET_START`` to remove a + binding. Read to list registered bindings in the same format. + +Kernel API +---------- + +.. kernel-doc:: kernel/trace/rv/monitors/tlob/tlob.c + :functions: tlob_start_task tlob_stop_task + +``tlob_start_task(task, threshold_us)`` + Begin monitoring *task* with a total latency budget of *threshold_us* + microseconds. 
Allocates per-task state, sets initial DA state to + ``running``, resets ``clk_elapsed``, and arms the HA budget timer. + Returns 0, -ENODEV (monitor disabled), -ERANGE (zero threshold), + -EALREADY (already monitoring), -ENOSPC (at capacity), or -ENOMEM. + +``tlob_stop_task(task)`` + Stop monitoring *task*. Synchronously cancels the HA timer via + ``ha_cancel_timer_sync()``, checks ``da_monitoring()`` to determine outc= ome. + Returns 0 (clean stop, within budget), -EOVERFLOW (budget was exceeded), + -ESRCH (not monitored), or -EAGAIN (concurrent stop racing). + +Design notes +------------ + +State transitions are driven by two tracepoints: + +- ``sched_switch``: ``prev_state =3D=3D 0`` (``TASK_RUNNING``, preempted, + stays on runqueue) =E2=86=92 running=E2=86=92waiting; ``prev_state !=3D = 0`` (voluntarily + blocked, leaves runqueue) =E2=86=92 running=E2=86=92sleeping; ``next`` p= ointer =E2=86=92 + waiting=E2=86=92running. +- ``sched_wakeup``: task moves back onto the runqueue =E2=86=92 sleeping= =E2=86=92waiting. + +No ``waiting =E2=86=92 sleeping`` edge exists because a task can only block +itself while executing on CPU. ``try_to_wake_up()`` is also a no-op +when ``__state =3D=3D TASK_RUNNING``, so ``sched_wakeup`` never fires while +the task is in ``waiting`` state. + +Limitations: + +- The initial DA state is always ``running``, set by feeding the synthetic + event ``switch_in_tlob`` to ``da_handle_start_event()``. Monitoring a n= on-current + task that is already in waiting or sleeping state at call time misclassi= fies + the first interval as ``running_ns``. +- ``TASK_STOPPED`` and ``TASK_TRACED`` carry ``prev_state !=3D 0`` and are + therefore counted as ``sleeping_ns``, indistinguishable from + I/O-blocked time. +- ``sched_wakeup_new`` is not hooked. In practice this is not an issue + because ``tlob_start_task`` is always called from a running context. + +Specification +------------- + +Graphviz DOT file in tools/verification/models/tlob.dot. 
+ +KUnit tests under ``kernel/trace/rv/monitors/tlob/tlob_kunit.c`` +(CONFIG_TLOB_KUNIT_TEST). + +User-space integration tests under ``tools/testing/selftests/verification/= `` +(requires CONFIG_RV_MON_TLOB=3Dy and root). diff --git a/include/linux/rv.h b/include/linux/rv.h index 541ba404926a..1ea91bb3f1c2 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -21,6 +21,13 @@ #include #include =20 +/* Forward declaration: poll_table is only needed by rv_chardev_ops::poll. + * Avoid pulling in <linux/poll.h> from rv.h =E2=80=94 that header is incl= uded by + * sched.h, and poll.h =E2=86=92 fs.h =E2=86=92 rcupdate.h creates a heade= r-ordering cycle + * with migrate_disable() on UML/non-SMP targets. + */ +struct poll_table_struct; + /* * Deterministic automaton per-object variables. */ @@ -158,6 +165,44 @@ int rv_register_monitor(struct rv_monitor *monitor, st= ruct rv_monitor *parent); int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); =20 +/** + * struct rv_chardev_ops - per-monitor callbacks for the /dev/rv chardev + * + * Monitors that want to expose an ioctl self-instrumentation interface + * register an instance of this struct with rv_chardev_register_monitor(). + * + * @owner: Module that owns this ops struct. Set to THIS_MODULE. + * The chardev holds a module reference for every bound fd so + * the module cannot be unloaded while any fd remains open. + * @bind: Called when userspace issues RV_IOCTL_BIND_MONITOR. Should + * allocate and return per-fd private data (opaque pointer), or + * ERR_PTR(errno) on failure. + * @ioctl: Called for every monitor-specific ioctl after binding. @priv + * is the pointer returned by @bind. + * @poll: Optional. Called from the fd's poll() / epoll_wait() path. + * Should call poll_wait(@file, wq, @wait) on the monitor's inte= rnal + * wait queue and return the current event mask (EPOLLIN | EPOLL= RDNORM + * when an event is pending, 0 otherwise). If NULL, poll() alwa= ys + * returns 0 (no events).
+ * @release: Called when the fd is closed. Must free @priv. + */ +struct rv_chardev_ops { + struct module *owner; + void *(*bind)(void); + long (*ioctl)(void *priv, unsigned int cmd, unsigned long arg); + __poll_t (*poll)(void *priv, struct file *file, struct poll_table_struct = *wait); + void (*release)(void *priv); +}; + +int rv_chardev_register_monitor(const char *name, + const struct rv_chardev_ops *ops); +void rv_chardev_unregister_monitor(const char *name); + +#if IS_ENABLED(CONFIG_KUNIT) +void rv_kunit_monitoring_on(void); +void rv_kunit_monitoring_off(void); +#endif + #ifdef CONFIG_RV_REACTORS int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); diff --git a/include/rv/automata.h b/include/rv/automata.h index 4a4eb40cf09a..ae819638d85a 100644 --- a/include/rv/automata.h +++ b/include/rv/automata.h @@ -41,6 +41,21 @@ static char *model_get_event_name(enum events event) return RV_AUTOMATON_NAME.event_names[event]; } =20 +/* + * model_get_timer_event_name - label used when the HA timer fires (no eve= nt). + * + * Monitors may define MONITOR_TIMER_EVENT_NAME before including the model + * header to give the timer-fired violation a semantically meaningful label + * (e.g. "budget_exceeded" for tlob). Defaults to "none". 
+ */ +#ifndef MONITOR_TIMER_EVENT_NAME +#define MONITOR_TIMER_EVENT_NAME "none" +#endif +static inline char *model_get_timer_event_name(void) +{ + return MONITOR_TIMER_EVENT_NAME; +} + /* * model_get_initial_state - return the automaton's initial state */ diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h index d59507e8cb30..dfc993774089 100644 --- a/include/rv/ha_monitor.h +++ b/include/rv/ha_monitor.h @@ -28,6 +28,7 @@ static inline void ha_monitor_init_env(struct da_monitor = *da_mon); static inline void ha_monitor_reset_env(struct da_monitor *da_mon); static inline void ha_setup_timer(struct ha_monitor *ha_mon); static inline bool ha_cancel_timer(struct ha_monitor *ha_mon); +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon); static bool ha_monitor_handle_constraint(struct da_monitor *da_mon, enum states curr_state, enum events event, @@ -35,7 +36,10 @@ static bool ha_monitor_handle_constraint(struct da_monit= or *da_mon, da_id_type id); #define da_monitor_event_hook ha_monitor_handle_constraint #define da_monitor_init_hook ha_monitor_init_env +/* Allow monitors to override da_monitor_reset_hook before including this = header. */ +#ifndef da_monitor_reset_hook #define da_monitor_reset_hook ha_monitor_reset_env +#endif =20 #include #include @@ -70,7 +74,7 @@ static void ha_react(enum states curr_state, enum events = event, char *env) rv_react(&rv_this, "rv: monitor %s does not allow event %s on state %s with env %s\n", __stringify(MONITOR_NAME), - event =3D=3D EVENT_NONE ? EVENT_NONE_LBL : model_get_event_name(event), + event =3D=3D EVENT_NONE ? 
model_get_timer_event_name() : model_get_even= t_name(event), model_get_state_name(curr_state), env); } =20 @@ -246,7 +250,7 @@ static inline void __ha_monitor_timer_callback(struct h= a_monitor *ha_mon) ha_get_env_string(&env_string, ha_mon, time_ns); ha_react(curr_state, EVENT_NONE, env_string.buffer); ha_trace_error_env(ha_mon, model_get_state_name(curr_state), - EVENT_NONE_LBL, env_string.buffer, + model_get_timer_event_name(), env_string.buffer, da_get_id(&ha_mon->da_mon)); =20 da_monitor_reset(&ha_mon->da_mon); @@ -412,6 +416,14 @@ static inline bool ha_cancel_timer(struct ha_monitor *= ha_mon) { return timer_delete(&ha_mon->timer); } +/* + * ha_cancel_timer_sync - Cancel the timer, blocking until any running + * callback has completed. + */ +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) +{ + timer_delete_sync(&ha_mon->timer); +} #elif HA_TIMER_TYPE =3D=3D HA_TIMER_HRTIMER /* * Helper functions to handle the monitor timer. @@ -432,12 +444,12 @@ static enum hrtimer_restart ha_monitor_timer_callback= (struct hrtimer *hrtimer) static inline void ha_setup_timer(struct ha_monitor *ha_mon) { hrtimer_setup(&ha_mon->hrtimer, ha_monitor_timer_callback, - CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); } static inline void ha_start_timer_ns(struct ha_monitor *ha_mon, enum envs = env, u64 expire, u64 time_ns) { - int mode =3D HRTIMER_MODE_REL_HARD; + int mode =3D HRTIMER_MODE_REL_SOFT; u64 passed =3D ha_invariant_passed_ns(ha_mon, env, expire, time_ns); =20 if (RV_MON_TYPE =3D=3D RV_MON_PER_CPU) @@ -463,6 +475,18 @@ static inline bool ha_cancel_timer(struct ha_monitor *= ha_mon) { return hrtimer_try_to_cancel(&ha_mon->hrtimer) =3D=3D 1; } +/* + * ha_cancel_timer_sync - Cancel the timer, blocking until any running + * callback has completed. + * + * Use in teardown paths (e.g. stop_task) where the caller must know the + * callback has finished before inspecting or freeing monitor state. 
+ * Must not be called from atomic context or within the timer callback. + */ +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) +{ + hrtimer_cancel(&ha_mon->hrtimer); +} #else /* HA_TIMER_NONE */ /* * Start function is intentionally not defined, monitors using timers must @@ -473,6 +497,7 @@ static inline bool ha_cancel_timer(struct ha_monitor *h= a_mon) { return false; } +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) { } #endif =20 #endif diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h index 084cdb36a2ff..9106c5c9275e 100644 --- a/include/rv/rv_uprobe.h +++ b/include/rv/rv_uprobe.h @@ -79,9 +79,41 @@ struct rv_uprobe *rv_uprobe_attach(const char *binpath, = loff_t offset, * for any in-progress handler to finish, then releases the path reference * and frees the rv_uprobe struct. The caller's priv data is NOT freed. * + * When removing a single probe, prefer this over the three-phase API. * Safe to call from process context only (uprobe_unregister_sync() may * schedule). */ void rv_uprobe_detach(struct rv_uprobe *p); =20 +/** + * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting + * @p: probe to dequeue; may be NULL (no-op) + * + * Removes the uprobe from the uprobe subsystem but does NOT wait for + * in-flight handlers to complete. The caller must call rv_uprobe_sync() + * before calling rv_uprobe_free() on the same probe. + * + * Use this to batch multiple deregistrations before a single rv_uprobe_sy= nc(). + */ +void rv_uprobe_unregister_nosync(struct rv_uprobe *p); + +/** + * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete + * + * Global barrier: waits for every in-flight uprobe handler across the sys= tem + * to finish. Call once after a batch of rv_uprobe_unregister_nosync() ca= lls + * and before any rv_uprobe_free() call. 
+ */ +void rv_uprobe_sync(void); + +/** + * rv_uprobe_free - release resources of a previously deregistered probe + * @p: probe to free; may be NULL (no-op) + * + * Releases the path reference and frees the rv_uprobe struct. Must only + * be called after rv_uprobe_sync() has returned. The caller's priv data + * is NOT freed. + */ +void rv_uprobe_free(struct rv_uprobe *p); + #endif /* _RV_UPROBE_H */ diff --git a/include/uapi/linux/rv.h b/include/uapi/linux/rv.h new file mode 100644 index 000000000000..a34e5426393b --- /dev/null +++ b/include/uapi/linux/rv.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * UAPI definitions for Runtime Verification (RV) monitors. + * + * All RV monitors that expose an ioctl self-instrumentation interface + * share the magic byte RV_IOC_MAGIC ('r'). + * + * Usage examples and design rationale are in: + * Documentation/trace/rv/monitor_tlob.rst + */ + +#ifndef _UAPI_LINUX_RV_H +#define _UAPI_LINUX_RV_H + +#include +#include + +/* Magic byte shared by all RV monitor ioctls. */ +#define RV_IOC_MAGIC 'r' + +/* Maximum monitor name length (including NUL terminator). */ +#define RV_MONITOR_NAME_MAX 32 + +/* Generic /dev/rv ioctls (ioctl number 0 is reserved for the core) */ + +/** + * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR + * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob"). + */ +struct rv_bind_args { + char monitor_name[RV_MONITOR_NAME_MAX]; +}; + +/* + * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor. + * + * Must be called once after open() and before any monitor-specific ioctl. + * + * Returns 0 on success. + * Returns -EBUSY if this fd is already bound to a monitor. + * Returns -ENOENT if the requested monitor is not registered. + * Returns -ENOMEM on allocation failure.
+ */ +#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args) + +/* tlob: task latency over budget monitor (ioctl numbers 1=E2=80=9315) */ + +/** + * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START + * @threshold_us: Total latency budget for this window, in microseconds. + * Must be greater than zero. Both on-CPU and off-CPU time + * (including runqueue wait) count toward this budget. + */ +struct tlob_start_args { + __u64 threshold_us; +}; + +/* + * TLOB_IOCTL_TRACE_START - begin monitoring the calling task. + * + * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC, + * so both on-CPU and off-CPU time count toward the budget). + * + * Returns 0 on success. + * Returns -EEXIST if TRACE_START was already called on this fd. + * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked. + * Returns -ENOMEM on allocation failure. + * Returns -ENODEV if the tlob monitor is not enabled. + * Returns -ERANGE if threshold_us is 0. + */ +#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_arg= s) + +/* + * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task. + * + * Returns 0 if within budget. + * Returns -EOVERFLOW if the latency budget was exceeded. + * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd. + * + * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the + * budget is exceeded. The caller may then issue TRACE_STOP to retrieve t= he + * result, or simply close the fd to clean up. 
+ */ +#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2) + +#endif /* _UAPI_LINUX_RV_H */ diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index e2e0033a00b9..1c36939db8e5 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -87,6 +87,8 @@ source "kernel/trace/rv/monitors/deadline/Kconfig" source "kernel/trace/rv/monitors/nomiss/Kconfig" # Add new deadline monitors here =20 +source "kernel/trace/rv/monitors/tlob/Kconfig" + # Add new monitors here =20 config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index f139b904bea3..8a5b5c84aff9 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -2,7 +2,7 @@ =20 ccflags-y +=3D -I $(src) # needed for trace events =20 -obj-$(CONFIG_RV) +=3D rv.o +obj-$(CONFIG_RV) +=3D rv.o rv_chardev.o obj-$(CONFIG_RV_MON_WIP) +=3D monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) +=3D monitors/wwnr/wwnr.o obj-$(CONFIG_RV_MON_SCHED) +=3D monitors/sched/sched.o @@ -17,6 +17,8 @@ obj-$(CONFIG_RV_MON_STS) +=3D monitors/sts/sts.o obj-$(CONFIG_RV_MON_NRP) +=3D monitors/nrp/nrp.o obj-$(CONFIG_RV_MON_SSSW) +=3D monitors/sssw/sssw.o obj-$(CONFIG_RV_MON_OPID) +=3D monitors/opid/opid.o +obj-$(CONFIG_RV_MON_TLOB) +=3D monitors/tlob/tlob.o +obj-$(CONFIG_TLOB_KUNIT_TEST) +=3D monitors/tlob/tlob_kunit.o obj-$(CONFIG_RV_MON_STALL) +=3D monitors/stall/stall.o obj-$(CONFIG_RV_MON_DEADLINE) +=3D monitors/deadline/deadline.o obj-$(CONFIG_RV_MON_NOMISS) +=3D monitors/nomiss/nomiss.o diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monito= rs/tlob/Kconfig new file mode 100644 index 000000000000..82e521891496 --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/Kconfig @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_TLOB + depends on RV + select RV_UPROBE + select HA_MON_EVENTS_ID + bool "tlob monitor" + help + Enable the tlob (task latency over budget) monitor. 
This monitor + tracks the elapsed time (CLOCK_MONOTONIC) of a marked code path + within a task (including both on-CPU and off-CPU time) and reports + a violation when the elapsed time exceeds a configurable budget. + + The monitor uses a three-state hybrid automaton (running, waiting, + sleeping) stored per object using RV_MON_PER_OBJ. A single HA + clock invariant (clk_elapsed < BUDGET_NS) is enforced in all three + states via a per-task hrtimer. + + States: running (initial, on-CPU), waiting (in runqueue, off-CPU), + sleeping (blocked on resource, off-CPU). + Key transitions: + running --(sleep)------> sleeping + running --(preempt)----> waiting + sleeping --(wakeup)-----> waiting + waiting --(switch_in)--> running + task_start calls da_handle_start_event() to set the initial state, + then arms the budget timer directly via ha_reset_clk_ns() + + ha_start_timer_ns(). task_stop cancels the timer synchronously via + ha_cancel_timer_sync() then calls da_monitor_reset(). + + Two userspace interfaces are provided: + + tracefs uprobe binding (external, unmodified binaries): + echo "p PATH:OFFSET_START OFFSET_STOP threshold=3DUS" \ + > /sys/kernel/tracing/rv/monitors/tlob/monitor + The uprobe at offset_start fires tlob_start_task(); the uprobe at + offset_stop fires tlob_stop_task(). Both are plain entry uprobes + so a mistyped offset cannot corrupt the call stack. + + /dev/rv ioctl (in-process self-instrumentation): + ioctl(fd, TLOB_IOCTL_TRACE_START, &args); + do_critical_work(); + ret =3D ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL); + /* ret =3D=3D -EOVERFLOW when budget exceeded */ + Allows conditional monitoring, sub-function granularity, and + inline reaction to violations without polling the trace buffer. + + Up to TLOB_MAX_MONITORED tasks may be monitored simultaneously. + + Violations are always reported via the standard error_env_tlob RV + tracepoint regardless of which interface triggered them.
The + tracefs interface requires only tracefs write permissions, avoiding + the CAP_BPF privilege needed for equivalent eBPF-based approaches. + + For further information, see: + Documentation/trace/rv/monitor_tlob.rst + +config TLOB_KUNIT_TEST + tristate "KUnit tests for tlob monitor" if !KUNIT_ALL_TESTS + depends on RV_MON_TLOB && KUNIT + default KUNIT_ALL_TESTS + help + Enable KUnit in-kernel unit tests for the tlob RV monitor. + + Tests cover automaton state transitions, the start/stop task + interface, scheduler context-switch accounting, and the uprobe + format string parser. + + Say Y or M here to run the tlob KUnit test suite; otherwise say N. diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitor= s/tlob/tlob.c new file mode 100644 index 000000000000..475e972ae9aa --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/tlob.c @@ -0,0 +1,1307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tlob: task latency over budget monitor + * + * Track the elapsed wall-clock time of a marked code path and detect when + * a monitored task exceeds its per-task latency budget. CLOCK_MONOTONIC + * is used so both on-CPU and off-CPU time count toward the budget. + * + * On a budget violation, two tracepoints are emitted from the hrtimer + * callback: error_env_tlob signals the violation, and detail_env_tlob + * provides a per-state time breakdown (running_ns, waiting_ns, sleeping_n= s) + * that pinpoints whether the overrun occurred in running, waiting, or sle= eping state. + * + * The monitor uses RV_MON_PER_OBJ: per-task state (struct tlob_task_state) + * is stored as monitor_target in the framework's hash table. + * + * One HA clock invariant is enforced: + * clk_elapsed < BUDGET_NS() (active in all states) + * + * task_start uses da_handle_start_event() to set the initial state, then + * calls ha_reset_clk_ns() + ha_start_timer_ns() directly to initialise the + * clock and arm the budget timer. No synthetic event is needed. 
+ * The HA timer is cancelled synchronously by ha_cancel_timer_sync() in + * tlob_stop_task(). + * + * Copyright (C) 2026 Wen Yang + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../rv.h" + +#define MODULE_NAME "tlob" + +#include +#include + +/* + * Per-fd private data; one instance per open /dev/rv fd. + * monitoring: set while TRACE_START is active; cleared at TRACE_STOP. + * budget_exceeded: set by hrtimer callback; read at TRACE_STOP to report + * -EOVERFLOW even when cleanup was claimed by a concurrent stop_all or + * a task-exit handler. + */ +struct tlob_fpriv { + struct task_struct *task; + bool monitoring; + bool budget_exceeded; +}; + +/* + * Per-task latency monitoring state. One instance per monitoring window. + * Stored as monitor_target in da_monitor_storage; freed via call_rcu. + */ +struct tlob_task_state { + struct task_struct *task; /* via get_task_struct */ + u64 threshold_us; /* budget in microseconds */ + + /* 1 =3D cleanup claimed; ha_setup_invariants won't restart the timer. */ + atomic_t stopping; + + /* Serialises the ns accumulators; held briefly (hardirq-safe). */ + raw_spinlock_t entry_lock; + u64 running_ns; /* time in running state */ + u64 waiting_ns; /* time in waiting state */ + u64 sleeping_ns; /* time in sleeping state */ + ktime_t last_ts; + + /* store-release in TRACE_START ioctl, load-acquire in reset_notify. */ + struct tlob_fpriv *fpriv; + + struct rcu_head rcu; /* for call_rcu() teardown */ +}; + +#define RV_MON_TYPE RV_MON_PER_OBJ +#define HA_TIMER_TYPE HA_TIMER_HRTIMER +/* Pool mode: da_handle_start_event uses da_fill_empty_storage, not kmallo= c. */ +#define DA_SKIP_AUTO_ALLOC + +/* Type for da_monitor_storage.target; must be defined before the includes= . */ +typedef struct tlob_task_state *monitor_target; + +/* Forward-declared so da_monitor_reset_hook works before ha_monitor.h. 
*/ +static inline void tlob_reset_notify(struct da_monitor *da_mon); +#define da_monitor_reset_hook tlob_reset_notify + +/* + * When the hrtimer fires (budget elapsed), the HA framework emits + * error_env_tlob with this label instead of the generic "none". + */ +#define MONITOR_TIMER_EVENT_NAME "budget_exceeded" + +#include "tlob.h" +#include + +/* + * Called from da_monitor_reset() on both normal stop and hrtimer expiry. + * On violation (stopping=3D=3D0), emits detail_env_tlob. + */ +static inline void tlob_reset_notify(struct da_monitor *da_mon) +{ + struct ha_monitor *ha_mon =3D to_ha_monitor(da_mon); + struct tlob_task_state *ws; + + ha_monitor_reset_env(da_mon); + + ws =3D ha_get_target(ha_mon); + if (!ws) + return; + + /* + * Emit per-state breakdown on budget violation only. + * stopping=3D=3D0: timer callback owns this path (genuine overrun). + * stopping=3D=3D1: normal stop claimed ownership first; skip. + */ + if (!atomic_read(&ws->stopping)) { + unsigned int curr_state =3D READ_ONCE(da_mon->curr_state); + u64 running_ns, waiting_ns, sleeping_ns, partial_ns; + struct tlob_fpriv *fp; + unsigned long flags; + + /* + * Snapshot accumulators; partial_ns covers curr_state time + * not yet folded in (transition-out pending). + */ + raw_spin_lock_irqsave(&ws->entry_lock, flags); + partial_ns =3D ktime_get_ns() - ktime_to_ns(ws->last_ts); + running_ns =3D ws->running_ns + + (curr_state =3D=3D running_tlob ? partial_ns : 0); + waiting_ns =3D ws->waiting_ns + + (curr_state =3D=3D waiting_tlob ? partial_ns : 0); + sleeping_ns =3D ws->sleeping_ns + + (curr_state =3D=3D sleeping_tlob ? partial_ns : 0); + raw_spin_unlock_irqrestore(&ws->entry_lock, flags); + + trace_detail_env_tlob(da_get_id(da_mon), ws->threshold_us, + running_ns, waiting_ns, sleeping_ns); + + /* + * Latch violation in the fd so TRACE_STOP can return -EOVERFLOW + * even if a concurrent stop_all or task-exit handler claims + * cleanup first. Pairs with smp_store_release in TRACE_START. 
+ */ + fp =3D smp_load_acquire(&ws->fpriv); + if (fp) + WRITE_ONCE(fp->budget_exceeded, true); + } +} + +#define BUDGET_US(ha_mon) (ha_get_target(ha_mon)->threshold_us) +#define BUDGET_NS(ha_mon) (BUDGET_US(ha_mon) * 1000ULL) + +/* HA constraint functions (called by ha_monitor_handle_constraint) */ + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 t= ime_ns) +{ + if (env =3D=3D clk_elapsed_tlob) + return ha_get_clk_ns(ha_mon, env, time_ns); + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_tlob env, u6= 4 time_ns) +{ + if (env =3D=3D clk_elapsed_tlob) + ha_reset_clk_ns(ha_mon, env, time_ns); +} + +/* + * ha_verify_invariants - clk_elapsed < BUDGET_NS must hold in all states. + */ +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state =3D=3D running_tlob) + return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns); + else if (curr_state =3D=3D sleeping_tlob) + return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns); + else if (curr_state =3D=3D waiting_tlob) + return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns); + return true; +} + +/* + * Convert invariant (deadline) to guard (reset anchor) on state transitio= ns. + * Skip if uninitialised (ENV_INVALID_VALUE): the race between + * da_handle_start_event() and ha_reset_clk_ns() would give U64_MAX - BUDG= ET_NS. 
+ */ +static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state =3D=3D next_state) + return; + if (curr_state =3D=3D running_tlob && + !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob)) + ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); + else if (curr_state =3D=3D sleeping_tlob && + !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob)) + ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); + else if (curr_state =3D=3D waiting_tlob && + !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob)) + ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); +} + +/* No per-event guard conditions for tlob; invariants suffice. */ +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + return true; +} + +/* + * Arm or cancel the HA budget timer on state transitions. + * Guard on stopping: sched_switch events can arrive after ha_cancel_timer= _sync, + * restarting the timer and triggering an ODEBUG "activate active" splat. 
+ */ +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state =3D=3D curr_state) + return; + if (next_state =3D=3D running_tlob) { + if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping)) + ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); + } else if (next_state =3D=3D sleeping_tlob) { + if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping)) + ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); + } else if (next_state =3D=3D waiting_tlob) { + if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping)) + ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns); + } else if (curr_state =3D=3D running_tlob) + ha_cancel_timer(ha_mon); + else if (curr_state =3D=3D waiting_tlob) + ha_cancel_timer(ha_mon); + else if (curr_state =3D=3D sleeping_tlob) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns); + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static struct kmem_cache *tlob_state_cache; + +static atomic_t tlob_num_monitored =3D ATOMIC_INIT(0); + +/* Uprobe binding list; protected by tlob_uprobe_mutex. */ +static LIST_HEAD(tlob_uprobe_list); +static DEFINE_MUTEX(tlob_uprobe_mutex); + +/* + * Serialises duplicate-check + da_create_or_get() to prevent two concurre= nt + * callers for the same pid from both inserting into the hash table. + */ +static DEFINE_MUTEX(tlob_start_mutex); + +/* + * Counts open /dev/rv fds plus one synthetic ref held while enabled. 
+ * __tlob_destroy_monitor() drops the synthetic ref and waits for zero + * before teardown, preventing kmem_cache_zalloc() on a destroyed cache. + */ +static refcount_t tlob_fd_refcount =3D REFCOUNT_INIT(0); +static DECLARE_COMPLETION(tlob_fd_released); + +/* Per-uprobe-binding state: a start + stop probe pair for one binary regi= on. */ +struct tlob_uprobe_binding { + struct list_head list; + u64 threshold_us; + char binpath[TLOB_MAX_PATH]; + loff_t offset_start; + loff_t offset_stop; + struct rv_uprobe *start_probe; + struct rv_uprobe *stop_probe; +}; + +/* RCU callback: free the slab once no readers remain. */ +static void tlob_free_rcu(struct rcu_head *head) +{ + struct tlob_task_state *ws =3D + container_of(head, struct tlob_task_state, rcu); + kmem_cache_free(tlob_state_cache, ws); +} + +/* + * handle_sched_switch - advance the DA on every context switch. + * + * Generates three DA events: + * prev, prev_state !=3D 0 -> sleep_tlob (running -> sleeping) + * prev, prev_state =3D=3D 0 -> preempt_tlob (running -> waiting) + * next -> switch_in_tlob (waiting -> running) + */ +static void handle_sched_switch(void *data, bool preempt_unused, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + struct tlob_task_state *ws; + unsigned long flags; + bool do_prev =3D false, do_next =3D false; + bool prev_preempted; + ktime_t now; + + rcu_read_lock(); + + ws =3D da_get_target_by_id(prev->pid); + if (ws) { + raw_spin_lock_irqsave(&ws->entry_lock, flags); + now =3D ktime_get(); + ws->running_ns +=3D ktime_to_ns(ktime_sub(now, ws->last_ts)); + ws->last_ts =3D now; + /* prev_state =3D=3D 0: TASK_RUNNING (preempted); !=3D 0: sleeping. 
*/ + prev_preempted =3D (prev_state =3D=3D 0); + do_prev =3D true; + raw_spin_unlock_irqrestore(&ws->entry_lock, flags); + } + + ws =3D da_get_target_by_id(next->pid); + if (ws) { + raw_spin_lock_irqsave(&ws->entry_lock, flags); + now =3D ktime_get(); + ws->waiting_ns +=3D ktime_to_ns(ktime_sub(now, ws->last_ts)); + ws->last_ts =3D now; + do_next =3D true; + raw_spin_unlock_irqrestore(&ws->entry_lock, flags); + } + + rcu_read_unlock(); + + if (do_prev) + da_handle_event(prev->pid, NULL, + prev_preempted ? preempt_tlob : sleep_tlob); + if (do_next) + da_handle_event(next->pid, NULL, switch_in_tlob); +} + +/* + * handle_sched_wakeup - sleeping -> waiting transition. + * + * try_to_wake_up() skips TASK_RUNNING tasks, so this never fires for a + * task already in running or waiting state. + */ +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + struct tlob_task_state *ws; + unsigned long flags; + bool found =3D false; + + rcu_read_lock(); + ws =3D da_get_target_by_id(p->pid); + if (ws) { + ktime_t now =3D ktime_get(); + + raw_spin_lock_irqsave(&ws->entry_lock, flags); + ws->sleeping_ns +=3D ktime_to_ns(ktime_sub(now, ws->last_ts)); + ws->last_ts =3D now; + raw_spin_unlock_irqrestore(&ws->entry_lock, flags); + found =3D true; + } + rcu_read_unlock(); + + if (found) + da_handle_event(p->pid, NULL, wakeup_tlob); +} + +/* + * handle_sched_process_exit - clean up if a task exits without TRACE_STOP. + * + * Called in do_exit() context; the task still has a valid pid here. + */ +static void handle_sched_process_exit(void *data, struct task_struct *p, + bool group_dead) +{ + struct tlob_task_state *ws; + bool found =3D false; + + rcu_read_lock(); + ws =3D da_get_target_by_id(p->pid); + found =3D !!ws; + rcu_read_unlock(); + + if (found) + tlob_stop_task(p); +} + + + +/** + * tlob_start_task - begin monitoring @task with budget @threshold_us us. + * @task: Task to monitor; may be current or another task. 
+ * @threshold_us: Latency budget in microseconds (wall-clock; running + wa= iting + sleeping). > 0. + * + * Returns 0, -ENODEV, -EALREADY, -ENOSPC, or -ENOMEM. + */ +int tlob_start_task(struct task_struct *task, u64 threshold_us) +{ + struct tlob_task_state *ws_existing; + struct tlob_task_state *ws; + struct da_monitor *da_mon; + struct ha_monitor *ha_mon; + u64 now_ns; + int ret; + + if (!da_monitor_enabled()) + return -ENODEV; + + if (threshold_us =3D=3D 0) + return -ERANGE; + + /* Serialise duplicate-check + da_create_or_get for the same pid. */ + guard(mutex)(&tlob_start_mutex); + + rcu_read_lock(); + ws_existing =3D da_get_target_by_id(task->pid); + if (ws_existing) { + rcu_read_unlock(); + return -EALREADY; + } + rcu_read_unlock(); + + ws =3D kmem_cache_zalloc(tlob_state_cache, GFP_KERNEL); + if (!ws) + return -ENOMEM; + + ws->task =3D task; + get_task_struct(task); + ws->threshold_us =3D threshold_us; + ws->last_ts =3D ktime_get(); + raw_spin_lock_init(&ws->entry_lock); + + /* Claim a pool slot (no kmalloc; DA_SKIP_AUTO_ALLOC + prealloc). */ + ret =3D da_create_or_get(task->pid, ws); + if (ret) { + put_task_struct(task); + kmem_cache_free(tlob_state_cache, ws); + return ret; + } + + atomic_inc(&tlob_num_monitored); + + /* Hold RCU across handle + timer setup to keep da_mon valid. */ + rcu_read_lock(); + da_handle_start_event(task->pid, ws, switch_in_tlob); + da_mon =3D da_get_monitor(task->pid, NULL); + if (unlikely(!da_mon)) { + /* Slot registered; missing da_mon means concurrent destroy. 
*/ + rcu_read_unlock(); + da_destroy_storage(task->pid); + atomic_dec(&tlob_num_monitored); + put_task_struct(task); + kmem_cache_free(tlob_state_cache, ws); + return -ENOMEM; + } + ha_mon =3D to_ha_monitor(da_mon); + now_ns =3D ktime_get_ns(); + ha_reset_env(ha_mon, clk_elapsed_tlob, now_ns); + ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), now_ns); + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(tlob_start_task); + +/** + * tlob_stop_task - stop monitoring @task. + * @task: Task to stop. + * + * CAS on ws->stopping (0->1) under RCU claims cleanup ownership; + * the winner cancels the timer synchronously and frees all resources. + * + * Returns 0, -EOVERFLOW (budget exceeded), -ESRCH (not monitored), + * or -EAGAIN (concurrent caller claimed cleanup). + */ +int tlob_stop_task(struct task_struct *task) +{ + struct da_monitor *da_mon; + struct ha_monitor *ha_mon; + struct tlob_task_state *ws; + bool budget_exceeded; + + rcu_read_lock(); + ws =3D da_get_target_by_id(task->pid); + if (!ws) { + rcu_read_unlock(); + return -ESRCH; + } + + da_mon =3D da_get_monitor(task->pid, NULL); + if (unlikely(!da_mon)) { + /* ws in hash but da_mon gone; internal inconsistency. */ + rcu_read_unlock(); + WARN_ON_ONCE(1); + return -ESRCH; + } + + ha_mon =3D to_ha_monitor(da_mon); + + /* + * CAS (0->1) claims cleanup ownership under RCU (ws guaranteed valid). + * _release pairs with atomic_read_acquire in ha_setup_invariants. + */ + if (atomic_cmpxchg_release(&ws->stopping, 0, 1) !=3D 0) { + rcu_read_unlock(); + return -EAGAIN; + } + + rcu_read_unlock(); + + /* Wait for in-flight timer callback before reading da_monitoring. */ + ha_cancel_timer_sync(ha_mon); + + /* Timer fired first -> budget exceeded; otherwise reset normally. 
*/ + rcu_read_lock(); + budget_exceeded =3D !da_monitoring(da_mon); + if (!budget_exceeded) + da_monitor_reset(da_mon); + rcu_read_unlock(); + da_destroy_storage(task->pid); + atomic_dec(&tlob_num_monitored); + + put_task_struct(ws->task); + call_rcu(&ws->rcu, tlob_free_rcu); + return budget_exceeded ? -EOVERFLOW : 0; +} +EXPORT_SYMBOL_GPL(tlob_stop_task); + +static void tlob_stop_all(void) +{ + struct da_monitor_storage *ms; + pid_t pids[TLOB_MAX_MONITORED]; + int bkt, n =3D 0; + + /* Snapshot pids under RCU; re-derive ws under a fresh lock below. */ + rcu_read_lock(); + hash_for_each_rcu(da_monitor_ht, bkt, ms, node) { + if (ms->target && n < TLOB_MAX_MONITORED) + pids[n++] =3D ms->id; + } + rcu_read_unlock(); + + for (int i =3D 0; i < n; i++) { + pid_t pid =3D pids[i]; + struct da_monitor *da_mon; + struct ha_monitor *ha_mon; + struct tlob_task_state *ws; + + rcu_read_lock(); + da_mon =3D da_get_monitor(pid, NULL); + if (!da_mon) { + /* Cleaned up by tlob_stop_task or exit handler. */ + rcu_read_unlock(); + continue; + } + + ws =3D da_get_target(da_mon); + ha_mon =3D to_ha_monitor(da_mon); + + /* CAS (0->1) claims ownership; skip if another caller won. */ + if (atomic_cmpxchg_release(&ws->stopping, 0, 1) !=3D 0) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + + ha_cancel_timer_sync(ha_mon); + + scoped_guard(rcu) { + da_monitor_reset(da_mon); + } + da_destroy_storage(pid); + atomic_dec(&tlob_num_monitored); + put_task_struct(ws->task); + call_rcu(&ws->rcu, tlob_free_rcu); + } +} + +static int tlob_uprobe_entry_handler(struct rv_uprobe *p, struct pt_regs *= regs, + __u64 *data) +{ + struct tlob_uprobe_binding *b =3D p->priv; + + tlob_start_task(current, b->threshold_us); + return 0; +} + +static int tlob_uprobe_stop_handler(struct rv_uprobe *p, struct pt_regs *r= egs, + __u64 *data) +{ + tlob_stop_task(current); + return 0; +} + +/* + * Register start + stop entry uprobes for a binding. + * Called with tlob_uprobe_mutex held. 
+ */ +static int tlob_add_uprobe(u64 threshold_us, const char *binpath, + loff_t offset_start, loff_t offset_stop) +{ + struct tlob_uprobe_binding *b, *tmp_b; + char pathbuf[TLOB_MAX_PATH]; + struct path path; + char *canon; + int ret; + + if (binpath[0] !=3D '/') + return -EINVAL; + + b =3D kzalloc_obj(*b, GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->threshold_us =3D threshold_us; + b->offset_start =3D offset_start; + b->offset_stop =3D offset_stop; + + ret =3D kern_path(binpath, LOOKUP_FOLLOW, &path); + if (ret) + goto err_free; + + if (!d_is_reg(path.dentry)) { + ret =3D -EINVAL; + goto err_path; + } + + /* Reject duplicate start offset for the same binary. */ + list_for_each_entry(tmp_b, &tlob_uprobe_list, list) { + if (tmp_b->offset_start =3D=3D offset_start && + tmp_b->start_probe->path.dentry =3D=3D path.dentry) { + ret =3D -EEXIST; + goto err_path; + } + } + + canon =3D d_path(&path, pathbuf, sizeof(pathbuf)); + if (IS_ERR(canon)) { + ret =3D PTR_ERR(canon); + goto err_path; + } + strscpy(b->binpath, canon, sizeof(b->binpath)); + + /* Both probes share b (priv) and path; attach_path refs path itself. 
*/ + b->start_probe =3D rv_uprobe_attach_path(&path, offset_start, + tlob_uprobe_entry_handler, NULL, b); + if (IS_ERR(b->start_probe)) { + ret =3D PTR_ERR(b->start_probe); + b->start_probe =3D NULL; + goto err_path; + } + + b->stop_probe =3D rv_uprobe_attach_path(&path, offset_stop, + tlob_uprobe_stop_handler, NULL, b); + if (IS_ERR(b->stop_probe)) { + ret =3D PTR_ERR(b->stop_probe); + b->stop_probe =3D NULL; + goto err_start; + } + + path_put(&path); + list_add_tail(&b->list, &tlob_uprobe_list); + return 0; + +err_start: + rv_uprobe_detach(b->start_probe); +err_path: + path_put(&path); +err_free: + kfree(b); + return ret; +} + +static int tlob_remove_uprobe_by_key(loff_t offset_start, const char *binp= ath) +{ + struct tlob_uprobe_binding *b, *tmp; + struct path remove_path; + int ret; + + ret =3D kern_path(binpath, LOOKUP_FOLLOW, &remove_path); + if (ret) + return ret; + + ret =3D -ENOENT; + list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) { + if (b->offset_start !=3D offset_start) + continue; + if (b->start_probe->path.dentry !=3D remove_path.dentry) + continue; + list_del(&b->list); + rv_uprobe_detach(b->start_probe); + rv_uprobe_detach(b->stop_probe); + kfree(b); + ret =3D 0; + break; + } + + path_put(&remove_path); + return ret; +} + +static void tlob_remove_all_uprobes(void) +{ + struct tlob_uprobe_binding *b, *tmp; + LIST_HEAD(pending); + + mutex_lock(&tlob_uprobe_mutex); + list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) { + list_move(&b->list, &pending); + rv_uprobe_unregister_nosync(b->start_probe); + rv_uprobe_unregister_nosync(b->stop_probe); + } + mutex_unlock(&tlob_uprobe_mutex); + + if (list_empty(&pending)) + return; + + /* + * One global barrier for all probes dequeued above; no new handlers + * for any of them can fire after this returns. 
+ */ + rv_uprobe_sync(); + + list_for_each_entry_safe(b, tmp, &pending, list) { + rv_uprobe_free(b->start_probe); + rv_uprobe_free(b->stop_probe); + kfree(b); + } +} + +static ssize_t tlob_monitor_read(struct file *file, + char __user *ubuf, + size_t count, loff_t *ppos) +{ + const int line_sz =3D TLOB_MAX_PATH + 128; + struct tlob_uprobe_binding *b; + char *buf, *p; + int n =3D 0, buf_sz, pos =3D 0; + ssize_t ret; + + mutex_lock(&tlob_uprobe_mutex); + list_for_each_entry(b, &tlob_uprobe_list, list) + n++; + + buf_sz =3D (n ? n : 1) * line_sz + 1; + buf =3D kmalloc(buf_sz, GFP_KERNEL); + if (!buf) { + mutex_unlock(&tlob_uprobe_mutex); + return -ENOMEM; + } + + list_for_each_entry(b, &tlob_uprobe_list, list) { + p =3D b->binpath; + pos +=3D scnprintf(buf + pos, buf_sz - pos, + "p %s:0x%llx 0x%llx threshold=3D%llu\n", + p, + (unsigned long long)b->offset_start, + (unsigned long long)b->offset_stop, + b->threshold_us); + } + mutex_unlock(&tlob_uprobe_mutex); + + ret =3D simple_read_from_buffer(ubuf, count, ppos, buf, pos); + kfree(buf); + return ret; +} + +/* + * Parse "p PATH:OFFSET_START OFFSET_STOP threshold=3DUS". + * PATH may contain ':'; the last ':' separates path from offset. + * Returns 0 or -EINVAL. + */ +static int tlob_parse_uprobe_line(char *buf, u64 *thr_out, + char **path_out, + loff_t *start_out, loff_t *stop_out) +{ + unsigned long long thr =3D 0, stop_val =3D 0; + long long start_val; + char *p, *path_token, *token, *colon; + bool got_stop =3D false, got_thr =3D false; + int n; + + /* Must start with "p " */ + if (buf[0] !=3D 'p' || buf[1] !=3D ' ') + return -EINVAL; + + p =3D buf + 2; + while (*p =3D=3D ' ') + p++; + + /* First space-delimited token is PATH:OFFSET_START */ + path_token =3D strsep(&p, " \t"); + if (!path_token || !*path_token) + return -EINVAL; + + /* Split at last ':' to handle paths that contain ':'. 
*/ + colon =3D strrchr(path_token, ':'); + if (!colon || colon - path_token < 2) + return -EINVAL; + *colon =3D '\0'; + + if (path_token[0] !=3D '/') + return -EINVAL; + + n =3D 0; + if (sscanf(colon + 1, "%lli%n", &start_val, &n) !=3D 1 || n =3D=3D 0) + return -EINVAL; + if (start_val < 0) + return -EINVAL; + + /* Remaining tokens: OFFSET_STOP threshold=3DUS */ + while (p && (token =3D strsep(&p, " \t")) !=3D NULL) { + if (!*token) + continue; + if (strncmp(token, "threshold=3D", 10) =3D=3D 0) { + if (kstrtoull(token + 10, 0, &thr)) + return -EINVAL; + got_thr =3D true; + } else if (!got_stop) { + long long sv; + + n =3D 0; + if (sscanf(token, "%lli%n", &sv, &n) !=3D 1 || n =3D=3D 0) + return -EINVAL; + if (sv < 0) + return -EINVAL; + stop_val =3D (unsigned long long)sv; + got_stop =3D true; + } else { + return -EINVAL; + } + } + + if (!got_stop || !got_thr || thr =3D=3D 0) + return -EINVAL; + if (start_val =3D=3D (long long)stop_val) + return -EINVAL; + + *thr_out =3D thr; + *path_out =3D path_token; + *start_out =3D (loff_t)start_val; + *stop_out =3D (loff_t)stop_val; + return 0; +} + +/* Parse "-PATH:OFFSET_START" (ftrace uprobe_events removal convention). 
*/ +static int tlob_parse_remove_line(char *buf, char **path_out, loff_t *star= t_out) +{ + char *binpath, *colon; + long long off; + int n =3D 0; + + if (buf[0] !=3D '-') + return -EINVAL; + binpath =3D buf + 1; + if (binpath[0] !=3D '/') + return -EINVAL; + colon =3D strrchr(binpath, ':'); + if (!colon || colon - binpath < 2) + return -EINVAL; + *colon =3D '\0'; + if (sscanf(colon + 1, "%lli%n", &off, &n) !=3D 1 || n =3D=3D 0) + return -EINVAL; + *path_out =3D binpath; + *start_out =3D (loff_t)off; + return 0; +} + +VISIBLE_IF_KUNIT int tlob_create_or_delete_uprobe(char *buf) +{ + loff_t offset_start, offset_stop; + u64 threshold_us; + char *binpath; + int ret; + + if (buf[0] =3D=3D '-') { + ret =3D tlob_parse_remove_line(buf, &binpath, &offset_start); + if (ret) + return ret; + mutex_lock(&tlob_uprobe_mutex); + ret =3D tlob_remove_uprobe_by_key(offset_start, binpath); + mutex_unlock(&tlob_uprobe_mutex); + return ret; + } + ret =3D tlob_parse_uprobe_line(buf, &threshold_us, &binpath, + &offset_start, &offset_stop); + if (ret) + return ret; + mutex_lock(&tlob_uprobe_mutex); + ret =3D tlob_add_uprobe(threshold_us, binpath, offset_start, offset_stop); + mutex_unlock(&tlob_uprobe_mutex); + return ret; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_create_or_delete_uprobe); + +static ssize_t tlob_monitor_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + char buf[TLOB_MAX_PATH + 128]; + + if (count >=3D sizeof(buf)) + return -EINVAL; + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + buf[count] =3D '\0'; + if (count > 0 && buf[count - 1] =3D=3D '\n') + buf[count - 1] =3D '\0'; + return tlob_create_or_delete_uprobe(buf) ?: (ssize_t)count; +} + +static const struct file_operations tlob_monitor_fops =3D { + .open =3D simple_open, + .read =3D tlob_monitor_read, + .write =3D tlob_monitor_write, + .llseek =3D noop_llseek, +}; + +static int __tlob_init_monitor(void) +{ + int retval; + + tlob_state_cache =3D kmem_cache_create("tlob_task_state", + 
sizeof(struct tlob_task_state), + 0, 0, NULL); + if (!tlob_state_cache) + return -ENOMEM; + + atomic_set(&tlob_num_monitored, 0); + + retval =3D da_monitor_init_prealloc(TLOB_MAX_MONITORED); + if (retval) { + kmem_cache_destroy(tlob_state_cache); + tlob_state_cache =3D NULL; + return retval; + } + + /* Synthetic reference: held while the monitor is enabled. */ + reinit_completion(&tlob_fd_released); + refcount_set(&tlob_fd_refcount, 1); + + rv_this.enabled =3D 1; + return 0; +} + +static void __tlob_destroy_monitor(void) +{ + rv_this.enabled =3D 0; + /* + * Remove uprobes first so stop_task can't race with tlob_stop_all(). + * rv_uprobe_sync() inside ensures all in-flight handlers have finished. + */ + tlob_remove_all_uprobes(); + tlob_stop_all(); + /* Wait for tlob_free_rcu and da_pool_return_cb before pool teardown. */ + synchronize_rcu(); + + /* + * Drop the synthetic ref and wait for all open fds to close before + * teardown; prevents kmem_cache_zalloc() on the destroyed cache. + */ + if (!refcount_dec_and_test(&tlob_fd_refcount)) + wait_for_completion(&tlob_fd_released); + + da_monitor_destroy(); + kmem_cache_destroy(tlob_state_cache); + tlob_state_cache =3D NULL; +} + +/* KUnit wrappers that acquire rv_interface_lock around monitor init/destr= oy. */ +#if IS_ENABLED(CONFIG_KUNIT) +int tlob_init_monitor(void) +{ + int ret; + + mutex_lock(&rv_interface_lock); + ret =3D __tlob_init_monitor(); + mutex_unlock(&rv_interface_lock); + return ret; +} +EXPORT_SYMBOL_GPL(tlob_init_monitor); + +void tlob_destroy_monitor(void) +{ + mutex_lock(&rv_interface_lock); + __tlob_destroy_monitor(); + mutex_unlock(&rv_interface_lock); +} +EXPORT_SYMBOL_GPL(tlob_destroy_monitor); + +int tlob_num_monitored_read(void) +{ + return atomic_read(&tlob_num_monitored); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read); + +/* Tracepoint probes for KUnit; rv_trace.h is only included here. 
*/ +static struct tlob_captured_event tlob_kunit_last_event; +static struct tlob_captured_error_env tlob_kunit_last_error_env; +static atomic_t tlob_kunit_event_cnt =3D ATOMIC_INIT(0); +static atomic_t tlob_kunit_error_env_cnt =3D ATOMIC_INIT(0); + +static void tlob_kunit_event_probe(void *data, int id, char *state, char *= event, + char *next_state, bool final_state) +{ + tlob_kunit_last_event.id =3D id; + strscpy(tlob_kunit_last_event.state, state, + sizeof(tlob_kunit_last_event.state)); + strscpy(tlob_kunit_last_event.event, event, + sizeof(tlob_kunit_last_event.event)); + strscpy(tlob_kunit_last_event.next_state, next_state, + sizeof(tlob_kunit_last_event.next_state)); + tlob_kunit_last_event.final_state =3D final_state; + atomic_inc(&tlob_kunit_event_cnt); +} + +static void tlob_kunit_error_env_probe(void *data, int id, char *state, + char *event, char *env) +{ + tlob_kunit_last_error_env.id =3D id; + strscpy(tlob_kunit_last_error_env.state, state, + sizeof(tlob_kunit_last_error_env.state)); + strscpy(tlob_kunit_last_error_env.event, event, + sizeof(tlob_kunit_last_error_env.event)); + strscpy(tlob_kunit_last_error_env.env, env, + sizeof(tlob_kunit_last_error_env.env)); + atomic_inc(&tlob_kunit_error_env_cnt); +} + +int tlob_register_kunit_probes(void) +{ + int ret; + + atomic_set(&tlob_kunit_event_cnt, 0); + atomic_set(&tlob_kunit_error_env_cnt, 0); + + ret =3D register_trace_event_tlob(tlob_kunit_event_probe, NULL); + if (ret) + return ret; + ret =3D register_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL); + if (ret) { + unregister_trace_event_tlob(tlob_kunit_event_probe, NULL); + return ret; + } + return 0; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes); + +void tlob_unregister_kunit_probes(void) +{ + unregister_trace_event_tlob(tlob_kunit_event_probe, NULL); + unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL); + tracepoint_synchronize_unregister(); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes); + +int 
tlob_event_count_read(void) +{ + return atomic_read(&tlob_kunit_event_cnt); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_read); + +void tlob_event_count_reset(void) +{ + atomic_set(&tlob_kunit_event_cnt, 0); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_reset); + +int tlob_error_env_count_read(void) +{ + return atomic_read(&tlob_kunit_error_env_cnt); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_read); + +void tlob_error_env_count_reset(void) +{ + atomic_set(&tlob_kunit_error_env_cnt, 0); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset); + +const struct tlob_captured_event *tlob_last_event_read(void) +{ + return &tlob_kunit_last_event; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_last_event_read); + +const struct tlob_captured_error_env *tlob_last_error_env_read(void) +{ + return &tlob_kunit_last_error_env; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read); + +#endif /* CONFIG_KUNIT */ + +VISIBLE_IF_KUNIT int tlob_enable_hooks(void) +{ + rv_attach_trace_probe("tlob", sched_switch, handle_sched_switch); + rv_attach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("tlob", sched_process_exit, handle_sched_process_ex= it); + return 0; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_enable_hooks); + +VISIBLE_IF_KUNIT void tlob_disable_hooks(void) +{ + rv_detach_trace_probe("tlob", sched_switch, handle_sched_switch); + rv_detach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("tlob", sched_process_exit, handle_sched_process_ex= it); +} +EXPORT_SYMBOL_IF_KUNIT(tlob_disable_hooks); + +static int enable_tlob(void) +{ + int retval; + + retval =3D __tlob_init_monitor(); + if (retval) + return retval; + + return tlob_enable_hooks(); +} + +static void disable_tlob(void) +{ + tlob_disable_hooks(); + __tlob_destroy_monitor(); +} + +static struct rv_monitor rv_this =3D { + .name =3D "tlob", + .description =3D "Per-task latency-over-budget monitor.", + .enable =3D enable_tlob, + .disable =3D disable_tlob, + .reset =3D da_monitor_reset_all, 
+ .enabled =3D 0, +}; + +static void *tlob_chardev_bind(void) +{ + struct tlob_fpriv *fp; + + fp =3D kzalloc_obj(*fp, GFP_KERNEL); + if (!fp) + return ERR_PTR(-ENOMEM); + + /* Pin cache/pool for fd lifetime; balanced in tlob_chardev_release. + * If the synthetic ref has already been dropped (__tlob_destroy_monitor + * ran to completion), reject the bind so the caller gets ENODEV instead + * of corrupting a zero refcount. + */ + if (!refcount_inc_not_zero(&tlob_fd_refcount)) { + kfree(fp); + return ERR_PTR(-ENODEV); + } + return fp; +} + +static void tlob_chardev_release(void *priv) +{ + struct tlob_fpriv *fp =3D priv; + + if (fp->monitoring) { + /* All return values are safe on close. */ + (void)tlob_stop_task(fp->task); + put_task_struct(fp->task); + } + + kfree(fp); + + /* Release fd's pin; if last, wake __tlob_destroy_monitor. */ + if (refcount_dec_and_test(&tlob_fd_refcount)) + complete(&tlob_fd_released); +} + +static long tlob_chardev_ioctl(void *priv, unsigned int cmd, unsigned long= arg) +{ + struct tlob_fpriv *fp =3D priv; + struct tlob_start_args args; + struct task_struct *task; + int ret; + + switch (cmd) { + case TLOB_IOCTL_TRACE_START: + if (fp->monitoring) + return -EALREADY; + + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) + return -EFAULT; + + ret =3D tlob_start_task(current, args.threshold_us); + if (ret) + return ret; + + fp->task =3D current; + get_task_struct(current); + fp->budget_exceeded =3D false; + + /* Link fd so hrtimer callback can latch budget_exceeded. */ + scoped_guard(rcu) { + struct tlob_task_state *ws =3D da_get_target_by_id(current->pid); + + if (ws) + smp_store_release(&ws->fpriv, fp); + } + + fp->monitoring =3D true; + return 0; + + case TLOB_IOCTL_TRACE_STOP: + if (!fp->monitoring) + return -EINVAL; + + task =3D fp->task; + fp->monitoring =3D false; + fp->task =3D NULL; + + ret =3D tlob_stop_task(task); + put_task_struct(task); + + /* + * -EOVERFLOW: budget exceeded; propagate to caller. 
+ * -EAGAIN: concurrent stop_all claimed cleanup; fall through to + * budget_exceeded latch set by the hrtimer callback. + * -ESRCH: task exited before TRACE_STOP (process-exit handler + * claimed cleanup); same latch applies. Not an internal error. + */ + if (ret =3D=3D -EAGAIN || ret =3D=3D -ESRCH) + return READ_ONCE(fp->budget_exceeded) ? -EOVERFLOW : 0; + return ret; + + default: + return -ENOTTY; + } +} + +static const struct rv_chardev_ops tlob_chardev_ops =3D { + .owner =3D THIS_MODULE, + .bind =3D tlob_chardev_bind, + .ioctl =3D tlob_chardev_ioctl, + .release =3D tlob_chardev_release, +}; + +static int __init register_tlob(void) +{ + int ret; + + ret =3D rv_chardev_register_monitor("tlob", &tlob_chardev_ops); + if (ret) + return ret; + + ret =3D rv_register_monitor(&rv_this, NULL); + if (ret) { + rv_chardev_unregister_monitor("tlob"); + return ret; + } + + if (rv_this.root_d) { + if (!tracefs_create_file("monitor", 0644, rv_this.root_d, NULL, + &tlob_monitor_fops)) { + rv_unregister_monitor(&rv_this); + rv_chardev_unregister_monitor("tlob"); + return -ENOMEM; + } + } + + return 0; +} + +static void __exit unregister_tlob(void) +{ + rv_chardev_unregister_monitor("tlob"); + rv_unregister_monitor(&rv_this); +} + +module_init(register_tlob); +module_exit(unregister_tlob); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Wen Yang "); +MODULE_DESCRIPTION("tlob: task latency over budget per-task monitor."); diff --git a/kernel/trace/rv/monitors/tlob/tlob.h b/kernel/trace/rv/monitor= s/tlob/tlob.h new file mode 100644 index 000000000000..71c1735d27d2 --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/tlob.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RV_TLOB_H +#define _RV_TLOB_H + +/* + * C representation of the tlob hybrid automaton. 
+ * + * Three-state HA following sched_stat / wwnr monitor naming conventions: + * + * running (initial) - task is executing on CPU [sched_stat: r= untime] + * waiting - task is in runqueue, awaiting CPU [sched_stat: = wait ] + * sleeping - task is blocked, awaiting resource[sched_stat: = sleep ] + * + * Events (derived from sched_switch / sched_wakeup tracepoints): + * sleep - sched_switch, prev_state !=3D 0 running =E2=86=92 slee= ping + * preempt - sched_switch, prev_state =3D=3D 0 running =E2=86=92 wa= iting + * wakeup - sched_wakeup sleeping =E2=86=92 waiting + * switch_in - sched_switch, next =3D=3D task waiting =E2=86=92 ru= nning + * + * One HA clock invariant: + * clk_elapsed < BUDGET_NS() active in all states (total latency budge= t) + * + * task_start and task_stop are NOT DA events: + * task_start calls da_handle_start_event() to set initial state, then + * ha_reset_clk_ns() + ha_start_timer_ns() to initialise the clock and a= rm + * the timer directly. + * task_stop calls hrtimer_cancel() + da_monitor_reset() directly. + * + * For the format description see: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#include +#include + +#define MONITOR_NAME tlob + +enum states_tlob { + running_tlob, + waiting_tlob, + sleeping_tlob, + state_max_tlob, +}; + +#define INVALID_STATE state_max_tlob + +enum events_tlob { + sleep_tlob, + preempt_tlob, + wakeup_tlob, + switch_in_tlob, + event_max_tlob, +}; + +/* + * HA environment variable: clk_elapsed is the only clock. + * It measures wall-clock time since task_start and is active in all state= s. 
+ */ +enum envs_tlob { + clk_elapsed_tlob, + env_max_tlob, + env_max_stored_tlob =3D env_max_tlob, +}; + +_Static_assert(env_max_stored_tlob <=3D MAX_HA_ENV_LEN, "Not enough slots"= ); +#define HA_CLK_NS + +struct automaton_tlob { + char *state_names[state_max_tlob]; + char *event_names[event_max_tlob]; + char *env_names[env_max_tlob]; + unsigned char function[state_max_tlob][event_max_tlob]; + unsigned char initial_state; + bool final_states[state_max_tlob]; +}; + +static const struct automaton_tlob automaton_tlob =3D { + .state_names =3D { + "running", + "waiting", + "sleeping", + }, + .event_names =3D { + "sleep", + "preempt", + "wakeup", + "switch_in", + }, + .env_names =3D { + "clk_elapsed", + }, + .function =3D { + /* running */ + { + sleeping_tlob, /* sleep (sched_switch, prev_state !=3D 0) */ + waiting_tlob, /* preempt (sched_switch, prev_state =3D=3D 0) */ + INVALID_STATE, /* wakeup (TASK_RUNNING can't be woken) */ + INVALID_STATE, /* switch_in (already on CPU) */ + }, + /* waiting */ + { + INVALID_STATE, /* sleep (not on CPU) */ + INVALID_STATE, /* preempt (not on CPU) */ + INVALID_STATE, /* wakeup (already TASK_RUNNING) */ + running_tlob, /* switch_in */ + }, + /* sleeping */ + { + INVALID_STATE, /* sleep (already sleeping) */ + INVALID_STATE, /* preempt (not on CPU) */ + waiting_tlob, /* wakeup */ + INVALID_STATE, /* switch_in (must go through waiting first) */ + }, + }, + .initial_state =3D running_tlob, + .final_states =3D { 1, 0, 0 }, +}; + +/* Maximum number of concurrently monitored tasks. */ +#define TLOB_MAX_MONITORED 64U + +/* Maximum binary path length for uprobe binding. 
*/ +#define TLOB_MAX_PATH 256 + +/* Exported to ioctl/uprobe layers and KUnit */ +int tlob_start_task(struct task_struct *task, u64 threshold_us); +int tlob_stop_task(struct task_struct *task); + +#if IS_ENABLED(CONFIG_KUNIT) +int tlob_init_monitor(void); +void tlob_destroy_monitor(void); +int tlob_enable_hooks(void); +void tlob_disable_hooks(void); +int tlob_create_or_delete_uprobe(char *buf); +int tlob_num_monitored_read(void); + +struct tlob_captured_event { + int id; + char state[16]; + char event[16]; + char next_state[16]; + bool final_state; +}; + +struct tlob_captured_error_env { + int id; + char state[16]; + char event[16]; + char env[64]; +}; + +struct tlob_captured_detail { + int pid; + u64 threshold_us; + u64 running_ns; + u64 waiting_ns; + u64 sleeping_ns; +}; + +int tlob_register_kunit_probes(void); +void tlob_unregister_kunit_probes(void); +int tlob_event_count_read(void); +void tlob_event_count_reset(void); +int tlob_error_env_count_read(void); +void tlob_error_env_count_reset(void); +const struct tlob_captured_event *tlob_last_event_read(void); +const struct tlob_captured_error_env *tlob_last_error_env_read(void); +const struct tlob_captured_detail *tlob_last_detail_read(void); +#endif /* CONFIG_KUNIT */ + +#endif /* _RV_TLOB_H */ diff --git a/kernel/trace/rv/monitors/tlob/tlob_trace.h b/kernel/trace/rv/m= onitors/tlob/tlob_trace.h new file mode 100644 index 000000000000..08d34e1b0ab8 --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/tlob_trace.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h for tlob tracepoints. + * + * event_tlob and error_tlob are defined on the event_da_monitor_id and + * error_da_monitor_id classes, following the same pattern as nomiss. + * error_env_tlob carries the environment variable name that caused the + * clock-invariant violation (budget exceeded). + * The id field carries the pid of the monitored task. 
+ */ + +#ifdef CONFIG_RV_MON_TLOB +/* id is the pid of the monitored task */ +DEFINE_EVENT(event_da_monitor_id, event_tlob, + TP_PROTO(int id, char *state, char *event, char *next_state, bool fi= nal_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_tlob, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_tlob, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); + +/* + * detail_env_tlob - per-state time breakdown emitted alongside error_env_= tlob. + * + * Fired once per budget violation, immediately after error_env_tlob, from + * the hrtimer callback (hardirq context). The three _ns fields sum to + * approximately threshold_us * 1000; any rounding comes from the partial + * time accumulated in the current state since the last transition. + */ +TRACE_EVENT(detail_env_tlob, + TP_PROTO(int pid, u64 threshold_us, + u64 running_ns, u64 waiting_ns, u64 sleeping_ns), + TP_ARGS(pid, threshold_us, running_ns, waiting_ns, sleeping_ns), + TP_STRUCT__entry( + __field(int, pid) + __field(u64, threshold_us) + __field(u64, running_ns) + __field(u64, waiting_ns) + __field(u64, sleeping_ns) + ), + TP_fast_assign( + __entry->pid =3D pid; + __entry->threshold_us =3D threshold_us; + __entry->running_ns =3D running_ns; + __entry->waiting_ns =3D waiting_ns; + __entry->sleeping_ns =3D sleeping_ns; + ), + TP_printk("pid=3D%d threshold_us=3D%llu running_ns=3D%llu waiting_ns=3D%l= lu sleeping_ns=3D%llu", + __entry->pid, __entry->threshold_us, + __entry->running_ns, __entry->waiting_ns, + __entry->sleeping_ns) +); +#endif /* CONFIG_RV_MON_TLOB */ diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index ee4e68102f17..a45c4763dbe5 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -142,10 +142,17 @@ #include #include #include +#include =20 #ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS #include 
+ +#ifdef CONFIG_RV_MON_TLOB +EXPORT_TRACEPOINT_SYMBOL_GPL(error_tlob); +EXPORT_TRACEPOINT_SYMBOL_GPL(event_tlob); +EXPORT_TRACEPOINT_SYMBOL_GPL(error_env_tlob); +#endif #endif =20 #include "rv.h" @@ -696,6 +703,33 @@ static void turn_monitoring_on(void) WRITE_ONCE(monitoring_on, true); } =20 +#if IS_ENABLED(CONFIG_KUNIT) +/** + * rv_kunit_monitoring_on - enable the global monitoring_on flag for KUnit= tests. + * + * KUnit test suite_init functions must call this before initialising any + * monitor, mirroring the turn_monitoring_on() call in rv_init_interface(). + * The matching rv_kunit_monitoring_off() must be called in suite_exit to + * restore the flag so that test suites do not interfere with each other. + */ +void rv_kunit_monitoring_on(void) +{ + turn_monitoring_on(); +} +EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_on); + +/** + * rv_kunit_monitoring_off - disable the global monitoring_on flag for KUn= it tests. + * + * Must be called in suite_exit to restore global state after rv_kunit_mon= itoring_on(). 
+ */ +void rv_kunit_monitoring_off(void) +{ + turn_monitoring_off(); +} +EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_off); +#endif /* CONFIG_KUNIT */ + static void turn_monitoring_on_with_reset(void) { lockdep_assert_held(&rv_interface_lock); @@ -846,6 +880,10 @@ int __init rv_init_interface(void) if (retval) return 1; =20 + retval =3D rv_chardev_init(); + if (retval) + return 1; + turn_monitoring_on(); =20 rv_root.root_dir =3D no_free_ptr(root_dir); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 2c0f51ff9d5c..82c9a2b57596 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -31,6 +31,8 @@ int rv_enable_monitor(struct rv_monitor *mon); bool rv_is_container_monitor(struct rv_monitor *mon); bool rv_is_nested_monitor(struct rv_monitor *mon); =20 +int rv_chardev_init(void); + #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root); int init_rv_reactors(struct dentry *root_dir); diff --git a/kernel/trace/rv/rv_chardev.c b/kernel/trace/rv/rv_chardev.c new file mode 100644 index 000000000000..1fba1642ebc1 --- /dev/null +++ b/kernel/trace/rv/rv_chardev.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "rv.h" + +static_assert(MAX_RV_MONITOR_NAME_SIZE =3D=3D RV_MONITOR_NAME_MAX, + "RV internal and UAPI monitor name size constants must match"); + +struct rv_fd_priv { + const struct rv_chardev_ops *ops; + void *monitor_priv; +}; + +struct rv_chardev_entry { + char name[MAX_RV_MONITOR_NAME_SIZE]; + const struct rv_chardev_ops *ops; + struct list_head list; +}; + +/* Protected by rv_interface_lock (from rv.h / rv.c). */ +static LIST_HEAD(rv_chardev_list); + +/** + * rv_chardev_register_monitor - expose a monitor via /dev/rv + * @name: Monitor name, must match the rv_monitor .name field. + * @ops: Callbacks providing bind / ioctl / release. 
+ * + * Returns 0 on success, -EINVAL if @name is too long, -EEXIST if @name is + * already registered, -ENOMEM on OOM. + */ +int rv_chardev_register_monitor(const char *name, + const struct rv_chardev_ops *ops) +{ + struct rv_chardev_entry *e, *existing; + + if (strlen(name) >=3D MAX_RV_MONITOR_NAME_SIZE) + return -EINVAL; + + e =3D kmalloc_obj(*e, GFP_KERNEL); + if (!e) + return -ENOMEM; + + strscpy(e->name, name, sizeof(e->name)); + e->ops =3D ops; + + guard(mutex)(&rv_interface_lock); + list_for_each_entry(existing, &rv_chardev_list, list) { + if (strcmp(existing->name, name) =3D=3D 0) { + kfree(e); + return -EEXIST; + } + } + list_add_tail(&e->list, &rv_chardev_list); + return 0; +} +EXPORT_SYMBOL_GPL(rv_chardev_register_monitor); + +/** + * rv_chardev_unregister_monitor - remove a monitor from the /dev/rv regis= try + * @name: Monitor name previously passed to rv_chardev_register_monitor(). + * + * Existing bound fds remain valid; their ops pointer is stable until the + * fd is closed. The caller must ensure no new binds to this monitor can + * succeed after unregistration =E2=80=94 typically by unregistering befor= e unloading + * the module that provides the ops. 
+ */ +void rv_chardev_unregister_monitor(const char *name) +{ + struct rv_chardev_entry *e, *tmp; + + guard(mutex)(&rv_interface_lock); + list_for_each_entry_safe(e, tmp, &rv_chardev_list, list) { + if (strcmp(e->name, name) =3D=3D 0) { + list_del(&e->list); + kfree(e); + return; + } + } +} +EXPORT_SYMBOL_GPL(rv_chardev_unregister_monitor); + +static int rv_dev_open(struct inode *inode, struct file *file) +{ + struct rv_fd_priv *fp; + + fp =3D kzalloc_obj(*fp, GFP_KERNEL); + if (!fp) + return -ENOMEM; + + file->private_data =3D fp; + return 0; +} + +static int rv_dev_release(struct inode *inode, struct file *file) +{ + struct rv_fd_priv *fp =3D file->private_data; + + if (fp->ops) { + fp->ops->release(fp->monitor_priv); + module_put(fp->ops->owner); + } + kfree(fp); + return 0; +} + +static int rv_bind_monitor(struct rv_fd_priv *fp, const char __user *uarg) +{ + const struct rv_chardev_ops *ops =3D NULL; + struct rv_bind_args args; + void *priv; + + if (fp->ops) + return -EBUSY; + + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + args.monitor_name[RV_MONITOR_NAME_MAX - 1] =3D '\0'; + + /* + * Pin the owning module while the list entry is still valid under + * rv_interface_lock, preventing a concurrent rmmod from completing + * between lookup and reference acquisition. bind() may sleep + * (GFP_KERNEL inside), so it runs after the lock is dropped. 
+ */ + scoped_guard(mutex, &rv_interface_lock) { + struct rv_chardev_entry *e; + + list_for_each_entry(e, &rv_chardev_list, list) { + if (strcmp(e->name, args.monitor_name) !=3D 0) + continue; + if (!try_module_get(e->ops->owner)) + return -ENODEV; + ops =3D e->ops; + break; + } + } + + if (!ops) + return -ENOENT; + + priv =3D ops->bind(); + if (IS_ERR(priv)) { + module_put(ops->owner); + return PTR_ERR(priv); + } + + fp->ops =3D ops; + fp->monitor_priv =3D priv; + return 0; +} + +static long rv_dev_ioctl(struct file *file, unsigned int cmd, unsigned lon= g arg) +{ + struct rv_fd_priv *fp =3D file->private_data; + + if (cmd =3D=3D RV_IOCTL_BIND_MONITOR) + return rv_bind_monitor(fp, (const char __user *)arg); + + if (!fp->ops) + return -ENXIO; + + return fp->ops->ioctl(fp->monitor_priv, cmd, arg); +} + +static __poll_t rv_dev_poll(struct file *file, poll_table *wait) +{ + struct rv_fd_priv *fp =3D file->private_data; + + if (!fp->ops || !fp->ops->poll) + return 0; + + return fp->ops->poll(fp->monitor_priv, file, wait); +} + +static const struct file_operations rv_dev_fops =3D { + .owner =3D THIS_MODULE, + .open =3D rv_dev_open, + .release =3D rv_dev_release, + .unlocked_ioctl =3D rv_dev_ioctl, + .compat_ioctl =3D rv_dev_ioctl, + .poll =3D rv_dev_poll, +}; + +static struct miscdevice rv_miscdev =3D { + .minor =3D MISC_DYNAMIC_MINOR, + .name =3D "rv", + .fops =3D &rv_dev_fops, +}; + +int __init rv_chardev_init(void) +{ + return misc_register(&rv_miscdev); +} diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 9622c269789c..a4bc215c1f15 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -189,6 +189,7 @@ DECLARE_EVENT_CLASS(error_env_da_monitor_id, =20 #include #include +#include // Add new monitors based on CONFIG_HA_MON_EVENTS_ID here =20 #endif diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c index bc28399cfd4b..1ba7b80c1d87 100644 --- a/kernel/trace/rv/rv_uprobe.c +++ b/kernel/trace/rv/rv_uprobe.c 
@@ -132,13 +132,10 @@ EXPORT_SYMBOL_GPL(rv_uprobe_attach); */ void rv_uprobe_detach(struct rv_uprobe *p) { - struct rv_uprobe_impl *impl; - if (!p) return; =20 - impl =3D container_of(p, struct rv_uprobe_impl, pub); - uprobe_unregister_nosync(impl->uprobe, &impl->uc); + rv_uprobe_unregister_nosync(p); /* * uprobe_unregister_sync() is a global barrier: it waits for all * in-flight uprobe handlers across the entire system to complete, @@ -146,8 +143,47 @@ void rv_uprobe_detach(struct rv_uprobe *p) * guarantees that no handler touching impl->pub.priv is running by * the time we return, even if the caller immediately frees priv. */ + rv_uprobe_sync(); + rv_uprobe_free(p); +} +EXPORT_SYMBOL_GPL(rv_uprobe_detach); + +/** + * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting + */ +void rv_uprobe_unregister_nosync(struct rv_uprobe *p) +{ + struct rv_uprobe_impl *impl; + + if (!p) + return; + + impl =3D container_of(p, struct rv_uprobe_impl, pub); + uprobe_unregister_nosync(impl->uprobe, &impl->uc); +} +EXPORT_SYMBOL_GPL(rv_uprobe_unregister_nosync); + +/** + * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete + */ +void rv_uprobe_sync(void) +{ uprobe_unregister_sync(); +} +EXPORT_SYMBOL_GPL(rv_uprobe_sync); + +/** + * rv_uprobe_free - release resources of a previously deregistered probe + */ +void rv_uprobe_free(struct rv_uprobe *p) +{ + struct rv_uprobe_impl *impl; + + if (!p) + return; + + impl =3D container_of(p, struct rv_uprobe_impl, pub); path_put(&p->path); kfree(impl); } -EXPORT_SYMBOL_GPL(rv_uprobe_detach); +EXPORT_SYMBOL_GPL(rv_uprobe_free); diff --git a/tools/include/uapi/linux/rv.h b/tools/include/uapi/linux/rv.h new file mode 100644 index 000000000000..a34e5426393b --- /dev/null +++ b/tools/include/uapi/linux/rv.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * UAPI definitions for Runtime Verification (RV) monitors. 
+ * + * All RV monitors that expose an ioctl self-instrumentation interface + * share the magic byte RV_IOC_MAGIC ('r'). + * + * Usage examples and design rationale are in: + * Documentation/trace/rv/monitor_tlob.rst + */ + +#ifndef _UAPI_LINUX_RV_H +#define _UAPI_LINUX_RV_H + +#include +#include + +/* Magic byte shared by all RV monitor ioctls. */ +#define RV_IOC_MAGIC 'r' + +/* Maximum monitor name length (including NUL terminator). */ +#define RV_MONITOR_NAME_MAX 32 + +/* Generic /dev/rv ioctls (ioctl numbers 0=E2=80=9315 are reserved for the= core) */ + +/** + * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR + * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob"). + */ +struct rv_bind_args { + char monitor_name[RV_MONITOR_NAME_MAX]; +}; + +/* + * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor. + * + * Must be called once after open() and before any monitor-specific ioctl. + * + * Returns 0 on success. + * Returns -EBUSY if this fd is already bound to a monitor. + * Returns -ENOENT if the requested monitor is not registered. + * Returns -ENOMEM on allocation failure. + */ +#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args) + +/* tlob: task latency over budget monitor (ioctl numbers 1=E2=80=9315) */ + +/** + * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START + * @threshold_us: Total latency budget for this window, in microseconds. + * Must be greater than zero. Both on-CPU and off-CPU time + * (including runqueue wait) count toward this budget. + */ +struct tlob_start_args { + __u64 threshold_us; +}; + +/* + * TLOB_IOCTL_TRACE_START - begin monitoring the calling task. + * + * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC, + * so both on-CPU and off-CPU time count toward the budget). + * + * Returns 0 on success. + * Returns -EEXIST if TRACE_START was already called on this fd. + * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked. 
+ * Returns -ENOMEM on allocation failure. + * Returns -ENODEV if the tlob monitor is not enabled. + * Returns -ERANGE if threshold_us is 0. + */ +#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_arg= s) + +/* + * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task. + * + * Returns 0 if within budget. + * Returns -EOVERFLOW if the latency budget was exceeded. + * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd. + * + * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the + * budget is exceeded. The caller may then issue TRACE_STOP to retrieve t= he + * result, or simply close the fd to clean up. + */ +#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2) + +#endif /* _UAPI_LINUX_RV_H */ --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-186.mta0.migadu.com (out-186.mta0.migadu.com [91.218.175.186]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9CEA447D936 for ; Mon, 11 May 2026 18:25:40 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.186 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523943; cv=none; b=IGa0jMn5xvsogj94Wi9ieA5lvhO8IhPR5Xxa00u9266eeks/7ku0nwoqZWSMRLqiJRzU2K+dqTAdsDanO0anN8ssA+VFobGy9sisvMM0NoNfTsTMCcNbtHTQxgHCzI3lRMhLJPE1qvzHAoQylIRYqB1i3fF+jq40yNLoUylzJ9c= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523943; c=relaxed/simple; bh=6NCq3f94HELO7dKV0BotLdVgPwX8Hbeag/Wl6kZNkhI=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=fRf2pTZfCL96KeFiujPYc67AZUiZVR9qDhCi2btp6CHG1IkAKM29P2GEVdPfPIaGV/9cFCNyUuVZZ2NFgV86K055XAhzZep5k6Mv44XT7BfSxFMEah6h7uRfxMPwuxyrJ9FY6ta9BgQ0K+byKCJIiMu7dGAdjbNqs8Hq2Cn0/H8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass 
smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=v69wOhYA; arc=none smtp.client-ip=91.218.175.186 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="v69wOhYA" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523938; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=bEys0L8JCN/AQ4YVzhBi2+9J8yJLK8cUJc5a2PfqJDs=; b=v69wOhYAMM21DElOfLcYWFfzwQ+2XkpFQucIUwWYEglKpj01A1C1p3V7SZ10+mxujCMvZB RVnRBbo8tdAgSuyK4HQkscRXJzHTZL9s7Lxsio5HI/vSgepexcwkYWeZJGKuDS881+DGBd azNroMf2tXVio3DuD9nQRrn+nytqDEM= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor Date: Tue, 12 May 2026 02:24:55 +0800 Message-Id: In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang Add five KUnit test suites gated behind CONFIG_TLOB_KUNIT_TEST (depends on RV_MON_TLOB && KUNIT; default KUNIT_ALL_TESTS) with a .kunitconfig fragment for the kunit.py runner. tlob_task_api tests the start/stop API, error returns (-EEXIST, -ESRCH, -EOVERFLOW, -ENOSPC, -ERANGE). tlob_sched_integration covers context-switch accounting and monitoring a kthread. 
tlob_uprobe_format exercises the uprobe line parser. tlob_trace_output checks event_tlob and error_env_tlob field layout. tlob_violation_react verifies error_env_tlob fires once on budget expiry and zero times when the budget is not exceeded. Suggested-by: Gabriele Monaco =20 Signed-off-by: Wen Yang --- kernel/trace/rv/monitors/tlob/.kunitconfig | 5 + kernel/trace/rv/monitors/tlob/tlob.c | 26 + kernel/trace/rv/monitors/tlob/tlob_kunit.c | 881 +++++++++++++++++++++ 3 files changed, 912 insertions(+) create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c diff --git a/kernel/trace/rv/monitors/tlob/.kunitconfig b/kernel/trace/rv/m= onitors/tlob/.kunitconfig new file mode 100644 index 000000000000..977c58601ab7 --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/.kunitconfig @@ -0,0 +1,5 @@ +CONFIG_FTRACE=3Dy +CONFIG_KUNIT=3Dy +CONFIG_RV=3Dy +CONFIG_RV_MON_TLOB=3Dy +CONFIG_TLOB_KUNIT_TEST=3Dy diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitor= s/tlob/tlob.c index 475e972ae9aa..90e7035a0b55 100644 --- a/kernel/trace/rv/monitors/tlob/tlob.c +++ b/kernel/trace/rv/monitors/tlob/tlob.c @@ -1024,6 +1024,7 @@ EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read); /* Tracepoint probes for KUnit; rv_trace.h is only included here. 
*/ static struct tlob_captured_event tlob_kunit_last_event; static struct tlob_captured_error_env tlob_kunit_last_error_env; +static struct tlob_captured_detail tlob_kunit_last_detail; static atomic_t tlob_kunit_event_cnt =3D ATOMIC_INIT(0); static atomic_t tlob_kunit_error_env_cnt =3D ATOMIC_INIT(0); =20 @@ -1054,6 +1055,17 @@ static void tlob_kunit_error_env_probe(void *data, i= nt id, char *state, atomic_inc(&tlob_kunit_error_env_cnt); } =20 +static void tlob_kunit_detail_probe(void *data, int pid, u64 threshold_us, + u64 running_ns, u64 waiting_ns, + u64 sleeping_ns) +{ + tlob_kunit_last_detail.pid =3D pid; + tlob_kunit_last_detail.threshold_us =3D threshold_us; + tlob_kunit_last_detail.running_ns =3D running_ns; + tlob_kunit_last_detail.waiting_ns =3D waiting_ns; + tlob_kunit_last_detail.sleeping_ns =3D sleeping_ns; +} + int tlob_register_kunit_probes(void) { int ret; @@ -1069,6 +1081,12 @@ int tlob_register_kunit_probes(void) unregister_trace_event_tlob(tlob_kunit_event_probe, NULL); return ret; } + ret =3D register_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL); + if (ret) { + unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL); + unregister_trace_event_tlob(tlob_kunit_event_probe, NULL); + return ret; + } return 0; } EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes); @@ -1077,6 +1095,7 @@ void tlob_unregister_kunit_probes(void) { unregister_trace_event_tlob(tlob_kunit_event_probe, NULL); unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL); + unregister_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL); tracepoint_synchronize_unregister(); } EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes); @@ -1105,6 +1124,7 @@ void tlob_error_env_count_reset(void) } EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset); =20 + const struct tlob_captured_event *tlob_last_event_read(void) { return &tlob_kunit_last_event; @@ -1117,6 +1137,12 @@ const struct tlob_captured_error_env *tlob_last_erro= r_env_read(void) } 
EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read); =20 +const struct tlob_captured_detail *tlob_last_detail_read(void) +{ + return &tlob_kunit_last_detail; +} +EXPORT_SYMBOL_IF_KUNIT(tlob_last_detail_read); + #endif /* CONFIG_KUNIT */ =20 VISIBLE_IF_KUNIT int tlob_enable_hooks(void) diff --git a/kernel/trace/rv/monitors/tlob/tlob_kunit.c b/kernel/trace/rv/m= onitors/tlob/tlob_kunit.c new file mode 100644 index 000000000000..ed2e7c7abaf8 --- /dev/null +++ b/kernel/trace/rv/monitors/tlob/tlob_kunit.c @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for the tlob RV monitor. + * + * tlob_task_api: start/stop lifecycle, error paths, violations. + * tlob_sched_integration: per-state accounting across real context switch= es. + * tlob_uprobe_format: uprobe binding format; add/remove acceptance an= d rejection. + * tlob_trace_output: trace event format for event_tlob, error_env_tl= ob. + * tlob_violation_react: error count per budget expiry; per-state breakd= own. + * + * tlob_add_uprobe() duplicate-(binary, offset_start) constraint is not co= vered + * here: kern_path() requires a real filesystem; see selftests instead. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tlob.h" + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +/* + * Kthread cleanup guard: registers a kunit action that stops a kthread on + * test exit, even when a KUNIT_ASSERT fires before normal teardown code r= uns. + * + * Caller must call get_task_struct() before registering the guard. + * Set guard->task =3D NULL before normal-path teardown to prevent double-= stop. + * Pass the completion to unblock on early exit, or NULL if not needed. 
+ */ +struct tlob_kthread_guard { + struct task_struct *task; + struct completion *unblock; +}; + +static void kthread_guard_fn(void *arg) +{ + struct tlob_kthread_guard *g =3D arg; + + if (!g->task) + return; + if (g->unblock) + complete(g->unblock); + kthread_stop(g->task); + put_task_struct(g->task); +} + +static struct tlob_kthread_guard * +tlob_guard_kthread(struct kunit *test, struct task_struct *task, + struct completion *unblock) +{ + struct tlob_kthread_guard *g; + + g =3D kunit_kzalloc(test, sizeof(*g), GFP_KERNEL); + if (!g) + return NULL; + g->task =3D task; + g->unblock =3D unblock; + if (kunit_add_action_or_reset(test, kthread_guard_fn, g)) + return NULL; + return g; +} + +/* Suite 1: task API - lifecycle, error paths, violations. */ + +/* Basic start/stop cycle */ +static void tlob_start_stop_ok(struct kunit *test) +{ + int ret; + + ret =3D tlob_start_task(current, 10000000ULL); + KUNIT_ASSERT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0); + KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0); +} + +/* Double start must return -EALREADY; double stop must return -ESRCH. */ +static void tlob_double_start(struct kunit *test) +{ + KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0); + KUNIT_EXPECT_EQ(test, tlob_start_task(current, 10000000ULL), -EALREADY); + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0); + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH); + KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0); +} + +/* Stop without start must return -ESRCH. */ +static void tlob_stop_without_start(struct kunit *test) +{ + tlob_stop_task(current); + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH); + KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0); +} + +/* threshold_us =3D=3D 0 is invalid and must return -ERANGE. 
*/ +static void tlob_zero_threshold(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, tlob_start_task(current, 0), -ERANGE); +} + +/* 1 us budget: timer almost certainly fires before tlob_stop_task(). */ +static void tlob_immediate_deadline(struct kunit *test) +{ + int ret =3D tlob_start_task(current, 1); + + KUNIT_ASSERT_EQ(test, ret, 0); + udelay(100); + /* timer fired -> -EOVERFLOW; if we won the race, 0 is also valid */ + ret =3D tlob_stop_task(current); + KUNIT_EXPECT_TRUE(test, ret =3D=3D 0 || ret =3D=3D -EOVERFLOW); + KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0); +} + +/* + * kthreads provide distinct task_structs; fill to TLOB_MAX_MONITORED, + * then verify -ENOSPC. + */ +struct tlob_waiter_ctx { + struct completion start; + struct completion done; +}; + +static int tlob_waiter_fn(void *arg) +{ + struct tlob_waiter_ctx *ctx =3D arg; + + wait_for_completion(&ctx->start); + complete(&ctx->done); + return 0; +} + +static void tlob_enospc(struct kunit *test) +{ + struct tlob_waiter_ctx *ctxs; + struct task_struct **threads; + int i, ret; + + ctxs =3D kunit_kcalloc(test, TLOB_MAX_MONITORED, + sizeof(*ctxs), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ctxs); + + threads =3D kunit_kcalloc(test, TLOB_MAX_MONITORED, + sizeof(*threads), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, threads); + + KUNIT_ASSERT_EQ(test, tlob_num_monitored_read(), 0); + + for (i =3D 0; i < TLOB_MAX_MONITORED; i++) { + init_completion(&ctxs[i].start); + init_completion(&ctxs[i].done); + + threads[i] =3D kthread_run(tlob_waiter_fn, &ctxs[i], + "tlob_waiter_%d", i); + if (IS_ERR(threads[i])) { + KUNIT_FAIL(test, "kthread_run failed at i=3D%d", i); + threads[i] =3D NULL; + goto cleanup; + } + get_task_struct(threads[i]); + + ret =3D tlob_start_task(threads[i], 10000000ULL); + if (ret !=3D 0) { + KUNIT_FAIL(test, "tlob_start_task failed at i=3D%d: %d", + i, ret); + put_task_struct(threads[i]); + complete(&ctxs[i].start); + threads[i] =3D NULL; + goto cleanup; + } + } + + ret =3D 
tlob_start_task(current, 10000000ULL); + KUNIT_EXPECT_EQ(test, ret, -ENOSPC); + +cleanup: + /* cancel monitoring and unblock first, then wait for full exit */ + for (i =3D 0; i < TLOB_MAX_MONITORED; i++) { + if (!threads[i]) + break; + tlob_stop_task(threads[i]); + complete(&ctxs[i].start); + } + for (i =3D 0; i < TLOB_MAX_MONITORED; i++) { + if (!threads[i]) + break; + kthread_stop(threads[i]); + put_task_struct(threads[i]); + } +} + +/* + * Holder kthread holds a mutex for 80 ms; arm a 10 ms budget, burn ~1 ms + * on-CPU, then block on the mutex; timer fires while sleeping -> -EOVERFL= OW. + */ +struct tlob_holder_ctx { + struct mutex lock; + struct completion ready; + unsigned int hold_ms; +}; + +static int tlob_holder_fn(void *arg) +{ + struct tlob_holder_ctx *ctx =3D arg; + + mutex_lock(&ctx->lock); + complete(&ctx->ready); + msleep(ctx->hold_ms); + mutex_unlock(&ctx->lock); + return 0; +} + +static void tlob_deadline_fires_sleeping(struct kunit *test) +{ + struct tlob_holder_ctx *ctx; + struct tlob_kthread_guard *guard; + struct task_struct *holder; + ktime_t t0; + int ret; + + ctx =3D kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ctx); + ctx->hold_ms =3D 80; + mutex_init(&ctx->lock); + init_completion(&ctx->ready); + + holder =3D kthread_run(tlob_holder_fn, ctx, "tlob_holder_kunit"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder); + get_task_struct(holder); + + guard =3D tlob_guard_kthread(test, holder, NULL); + KUNIT_ASSERT_NOT_NULL(test, guard); + + wait_for_completion(&ctx->ready); + + ret =3D tlob_start_task(current, 10000); + KUNIT_ASSERT_EQ(test, ret, 0); + + t0 =3D ktime_get(); + while (ktime_us_delta(ktime_get(), t0) < 1000) + cpu_relax(); + + /* block on mutex: running->sleeping; timer fires while sleeping */ + mutex_lock(&ctx->lock); + mutex_unlock(&ctx->lock); + + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW); + + guard->task =3D NULL; + kthread_stop(holder); + put_task_struct(holder); +} + +/* + * yield() 
triggers a preempt sched_switch (prev_state=3D=3D0): running->w= aiting. + * Busy-spin 50 ms so the 2 ms budget fires regardless of scheduler timing. + */ +static void tlob_deadline_fires_waiting(struct kunit *test) +{ + ktime_t t0; + int ret; + + ret =3D tlob_start_task(current, 2000); + KUNIT_ASSERT_EQ(test, ret, 0); + + yield(); + + t0 =3D ktime_get(); + while (ktime_us_delta(ktime_get(), t0) < 50000) + cpu_relax(); + + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW); +} + +/* Arm a 1 ms budget and busy-spin for 50 ms; timer fires in running state= . */ +static void tlob_deadline_fires_running(struct kunit *test) +{ + ktime_t t0; + int ret; + + ret =3D tlob_start_task(current, 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + t0 =3D ktime_get(); + while (ktime_us_delta(ktime_get(), t0) < 50000) + cpu_relax(); + + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW); +} + +/* Start three tasks, reinit monitor, verify all entries are gone. */ +static int tlob_dummy_fn(void *arg) +{ + wait_for_completion((struct completion *)arg); + return 0; +} + +static void tlob_reinit_clears_all(struct kunit *test) +{ + struct completion *done1, *done2; + struct tlob_kthread_guard *guard1, *guard2; + struct task_struct *t1, *t2; + int ret; + + done1 =3D kunit_kzalloc(test, sizeof(*done1), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, done1); + done2 =3D kunit_kzalloc(test, sizeof(*done2), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, done2); + + init_completion(done1); + init_completion(done2); + + t1 =3D kthread_run(tlob_dummy_fn, done1, "tlob_dummy1"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t1); + get_task_struct(t1); + guard1 =3D tlob_guard_kthread(test, t1, done1); + KUNIT_ASSERT_NOT_NULL(test, guard1); + + t2 =3D kthread_run(tlob_dummy_fn, done2, "tlob_dummy2"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t2); + get_task_struct(t2); + guard2 =3D tlob_guard_kthread(test, t2, done2); + KUNIT_ASSERT_NOT_NULL(test, guard2); + + KUNIT_ASSERT_EQ(test, tlob_start_task(current, 
10000000ULL), 0); + KUNIT_ASSERT_EQ(test, tlob_start_task(t1, 10000000ULL), 0); + KUNIT_ASSERT_EQ(test, tlob_start_task(t2, 10000000ULL), 0); + + tlob_destroy_monitor(); + ret =3D tlob_init_monitor(); + KUNIT_ASSERT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH); + KUNIT_EXPECT_EQ(test, tlob_stop_task(t1), -ESRCH); + KUNIT_EXPECT_EQ(test, tlob_stop_task(t2), -ESRCH); + + /* null guards before teardown to prevent double-stop */ + guard1->task =3D NULL; + guard2->task =3D NULL; + complete(done1); + complete(done2); + kthread_stop(t1); + kthread_stop(t2); + put_task_struct(t1); + put_task_struct(t2); +} + +static int tlob_task_api_suite_init(struct kunit_suite *suite) +{ + rv_kunit_monitoring_on(); + return tlob_init_monitor(); +} + +static void tlob_task_api_suite_exit(struct kunit_suite *suite) +{ + tlob_destroy_monitor(); + rv_kunit_monitoring_off(); +} + +static void tlob_task_api_exit(struct kunit *test) +{ + /* + * tlob_stop_task() returns pool slots via call_rcu (da_pool_return_cb). + * Wait for all pending callbacks so each test starts with a full pool. + */ + rcu_barrier(); +} + +static struct kunit_case tlob_task_api_cases[] =3D { + KUNIT_CASE(tlob_start_stop_ok), + KUNIT_CASE(tlob_double_start), + KUNIT_CASE(tlob_stop_without_start), + KUNIT_CASE(tlob_zero_threshold), + KUNIT_CASE(tlob_immediate_deadline), + KUNIT_CASE(tlob_enospc), + KUNIT_CASE(tlob_deadline_fires_sleeping), + KUNIT_CASE(tlob_deadline_fires_waiting), + KUNIT_CASE(tlob_deadline_fires_running), + KUNIT_CASE(tlob_reinit_clears_all), + {} +}; + +static struct kunit_suite tlob_task_api_suite =3D { + .name =3D "tlob_task_api", + .suite_init =3D tlob_task_api_suite_init, + .suite_exit =3D tlob_task_api_suite_exit, + .exit =3D tlob_task_api_exit, + .test_cases =3D tlob_task_api_cases, +}; + +/* Suite 2: sched integration - per-state ns accounting. 
*/ + +struct tlob_ping_ctx { + struct completion ping; + struct completion pong; +}; + +static int tlob_ping_fn(void *arg) +{ + struct tlob_ping_ctx *ctx =3D arg; + + wait_for_completion(&ctx->ping); + complete(&ctx->pong); + return 0; +} + +/* Force two context switches and verify stop returns 0 (within budget). */ +static void tlob_sched_switch_accounting(struct kunit *test) +{ + struct tlob_ping_ctx *ctx; + struct tlob_kthread_guard *guard; + struct task_struct *peer; + int ret; + + ctx =3D kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ctx); + init_completion(&ctx->ping); + init_completion(&ctx->pong); + + peer =3D kthread_run(tlob_ping_fn, ctx, "tlob_ping_kunit"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, peer); + get_task_struct(peer); + + guard =3D tlob_guard_kthread(test, peer, &ctx->ping); + KUNIT_ASSERT_NOT_NULL(test, guard); + + ret =3D tlob_start_task(current, 5000000ULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + /* complete(ping) -> peer runs, forcing a context switch out and back */ + complete(&ctx->ping); + wait_for_completion(&ctx->pong); + + ret =3D tlob_stop_task(current); + KUNIT_EXPECT_EQ(test, ret, 0); + + guard->task =3D NULL; + kthread_stop(peer); + put_task_struct(peer); +} + +/* start/stop monitoring a kthread other than current */ +static int tlob_block_fn(void *arg) +{ + struct completion *done =3D arg; + + msleep(20); + complete(done); + return 0; +} + +static void tlob_monitor_other_task(struct kunit *test) +{ + struct completion *done; + struct tlob_kthread_guard *guard; + struct task_struct *target; + int ret; + + done =3D kunit_kzalloc(test, sizeof(*done), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, done); + init_completion(done); + + target =3D kthread_run(tlob_block_fn, done, "tlob_target_kunit"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, target); + get_task_struct(target); + + guard =3D tlob_guard_kthread(test, target, NULL); + KUNIT_ASSERT_NOT_NULL(test, guard); + + ret =3D tlob_start_task(target, 5000000ULL); + 
KUNIT_ASSERT_EQ(test, ret, 0); + + wait_for_completion(done); + + /* 5 s budget won't fire in 20 ms; 0 or -EOVERFLOW are both valid */ + ret =3D tlob_stop_task(target); + KUNIT_EXPECT_TRUE(test, ret =3D=3D 0 || ret =3D=3D -EOVERFLOW); + + guard->task =3D NULL; + kthread_stop(target); + put_task_struct(target); +} + +static int tlob_sched_suite_init(struct kunit_suite *suite) +{ + rv_kunit_monitoring_on(); + return tlob_init_monitor(); +} + +static void tlob_sched_suite_exit(struct kunit_suite *suite) +{ + tlob_destroy_monitor(); + rv_kunit_monitoring_off(); +} + +static struct kunit_case tlob_sched_integration_cases[] =3D { + KUNIT_CASE(tlob_sched_switch_accounting), + KUNIT_CASE(tlob_monitor_other_task), + {} +}; + +static struct kunit_suite tlob_sched_integration_suite =3D { + .name =3D "tlob_sched_integration", + .suite_init =3D tlob_sched_suite_init, + .suite_exit =3D tlob_sched_suite_exit, + .test_cases =3D tlob_sched_integration_cases, +}; + +/* Suite 3: uprobe binding format - add/remove acceptance and rejection. */ + +static const char * const tlob_format_valid[] =3D { + "p /usr/bin/myapp:4768 4848 threshold=3D5000", + "p /usr/bin/myapp:0x12a0 0x12f0 threshold=3D10000", + "p /opt/my:app/bin:0x100 0x200 threshold=3D1000", +}; + +static const char * const tlob_format_invalid[] =3D { + /* add: malformed */ + "p /usr/bin/myapp:0x100 0x200 threshold=3D0", + "p :0x100 0x200 threshold=3D5000", + "p /usr/bin/myapp:0x100 threshold=3D5000", + "p /usr/bin/myapp:-1 0x200 threshold=3D5000", + "p /usr/bin/myapp:0x100 0x200", + "p /usr/bin/myapp:0x100 0x100 threshold=3D5000", + /* remove: malformed */ + "-usr/bin/myapp:0x100", + "-/usr/bin/myapp", + "-/:0x100", + "-/usr/bin/myapp:abc", +}; + +/* + * Valid add lines return -ENOENT (path does not exist in the test environ= ment) + * rather than 0; a non-(-EINVAL) return confirms the format was accepted. 
+ */ +static void tlob_format_accepted(struct kunit *test) +{ + char buf[128]; + int i; + + for (i =3D 0; i < ARRAY_SIZE(tlob_format_valid); i++) { + strscpy(buf, tlob_format_valid[i], sizeof(buf)); + KUNIT_EXPECT_NE(test, tlob_create_or_delete_uprobe(buf), -EINVAL); + } +} + +static void tlob_format_rejected(struct kunit *test) +{ + char buf[128]; + int i; + + for (i =3D 0; i < ARRAY_SIZE(tlob_format_invalid); i++) { + strscpy(buf, tlob_format_invalid[i], sizeof(buf)); + KUNIT_EXPECT_EQ(test, tlob_create_or_delete_uprobe(buf), -EINVAL); + } +} + +static struct kunit_case tlob_uprobe_format_cases[] =3D { + KUNIT_CASE(tlob_format_accepted), + KUNIT_CASE(tlob_format_rejected), + {} +}; + +static struct kunit_suite tlob_uprobe_format_suite =3D { + .name =3D "tlob_uprobe_format", + .test_cases =3D tlob_uprobe_format_cases, +}; + +/* Suite 4: trace output - verify event_tlob and error_env_tlob field valu= es. */ + +static void tlob_trace_event_format(struct kunit *test) +{ + const struct tlob_captured_event *ev; + int pid =3D current->pid; + int ret; + + tlob_event_count_reset(); + ret =3D tlob_start_task(current, 5000000ULL); + KUNIT_ASSERT_EQ(test, ret, 0); + + /* sleep/wakeup/switch_in: running->sleeping->waiting->running */ + msleep(20); + + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0); + + KUNIT_EXPECT_GE(test, tlob_event_count_read(), 3); + + ev =3D tlob_last_event_read(); + KUNIT_EXPECT_EQ(test, ev->id, pid); + KUNIT_EXPECT_STREQ(test, ev->state, "waiting"); + KUNIT_EXPECT_STREQ(test, ev->event, "switch_in"); + KUNIT_EXPECT_STREQ(test, ev->next_state, "running"); + KUNIT_EXPECT_TRUE(test, ev->final_state); +} + +static void tlob_trace_error_env_format(struct kunit *test) +{ + const struct tlob_captured_error_env *err; + ktime_t t0; + int pid =3D current->pid; + int ret; + + tlob_error_env_count_reset(); + ret =3D tlob_start_task(current, 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + t0 =3D ktime_get(); + while (ktime_us_delta(ktime_get(), t0) < 50000) + 
cpu_relax(); + + tlob_stop_task(current); + + KUNIT_ASSERT_GE(test, tlob_error_env_count_read(), 1); + + err =3D tlob_last_error_env_read(); + KUNIT_EXPECT_EQ(test, err->id, pid); + KUNIT_EXPECT_STREQ(test, err->state, "running"); + KUNIT_EXPECT_STREQ(test, err->event, "budget_exceeded"); + KUNIT_EXPECT_TRUE(test, strncmp(err->env, "clk_elapsed=3D", 12) =3D=3D 0); +} + +static int tlob_trace_suite_init(struct kunit_suite *suite) +{ + int ret; + + rv_kunit_monitoring_on(); + ret =3D tlob_init_monitor(); + if (ret) + goto err_mon_off; + ret =3D tlob_register_kunit_probes(); + if (ret) + goto err_destroy; + ret =3D tlob_enable_hooks(); + if (ret) + goto err_probes; + return 0; + +err_probes: + tlob_unregister_kunit_probes(); +err_destroy: + tlob_destroy_monitor(); +err_mon_off: + rv_kunit_monitoring_off(); + return ret; +} + +static void tlob_trace_suite_exit(struct kunit_suite *suite) +{ + tlob_disable_hooks(); + tlob_unregister_kunit_probes(); + tlob_destroy_monitor(); + rv_kunit_monitoring_off(); +} + +static struct kunit_case tlob_trace_output_cases[] =3D { + KUNIT_CASE(tlob_trace_event_format), + KUNIT_CASE(tlob_trace_error_env_format), + {} +}; + +static struct kunit_suite tlob_trace_output_suite =3D { + .name =3D "tlob_trace_output", + .suite_init =3D tlob_trace_suite_init, + .suite_exit =3D tlob_trace_suite_exit, + .test_cases =3D tlob_trace_output_cases, +}; + +/* + * Suite 5: violation reaction - complement to Suite 4. + * Suite 4 checks trace field values; Suite 5 checks semantics: + * error count per budget expiry and per-state ns breakdown. 
+ */ + +/* generous budget; usleep forces state transitions; no error must fire */ +static void tlob_no_error_within_budget(struct kunit *test) +{ + tlob_error_env_count_reset(); + tlob_event_count_reset(); + + KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0); + usleep_range(5000, 10000); + KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0); + KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 0); + KUNIT_EXPECT_GE(test, tlob_event_count_read(), 2); +} + +/* busy-spin 50 ms >> 1 ms budget; running_ns must dominate */ +static void tlob_detail_running_dominates(struct kunit *test) +{ + const struct tlob_captured_detail *d; + u64 total_ns; + ktime_t t0; + int ret; + + tlob_error_env_count_reset(); + + ret =3D tlob_start_task(current, 1000); + KUNIT_ASSERT_EQ(test, ret, 0); + + t0 =3D ktime_get(); + while (ktime_us_delta(ktime_get(), t0) < 50000) + cpu_relax(); + + tlob_stop_task(current); + + KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1); + d =3D tlob_last_detail_read(); + KUNIT_EXPECT_EQ(test, d->pid, current->pid); + KUNIT_EXPECT_EQ(test, d->threshold_us, 1000ULL); + total_ns =3D d->running_ns + d->waiting_ns + d->sleeping_ns; + KUNIT_EXPECT_GE(test, total_ns, 1000ULL * 1000); + KUNIT_EXPECT_GT(test, d->running_ns, d->sleeping_ns + d->waiting_ns); +} + +struct tlob_hog_ctx { + int spin_ms; +}; + +static int tlob_hog_fn(void *arg) +{ + struct tlob_hog_ctx *ctx =3D arg; + ktime_t t0 =3D ktime_get(); + + while (!kthread_should_stop() && + ktime_ms_delta(ktime_get(), t0) < ctx->spin_ms) + cpu_relax(); + return 0; +} + +/* + * SCHED_FIFO kthread bound to the same CPU preempts the monitored task + * (sched_switch prev_state =3D=3D 0: running->waiting) and holds the CPU = for + * 80 ms >> 10 ms budget, guaranteeing the timer fires in waiting state. 
+ */ +static void tlob_detail_waiting_dominates(struct kunit *test) +{ + struct tlob_hog_ctx *ctx; + struct task_struct *hog; + struct tlob_kthread_guard *guard; + const struct tlob_captured_detail *d; + struct sched_param param =3D { .sched_priority =3D MAX_RT_PRIO - 1 }; + int ret; + + tlob_error_env_count_reset(); + + ctx =3D kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ctx); + ctx->spin_ms =3D 80; + + hog =3D kthread_create(tlob_hog_fn, ctx, "tlob_s5_hog"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, hog); + get_task_struct(hog); + + kthread_bind(hog, smp_processor_id()); + sched_setscheduler_nocheck(hog, SCHED_FIFO, &param); + + guard =3D tlob_guard_kthread(test, hog, NULL); + KUNIT_ASSERT_NOT_NULL(test, guard); + + ret =3D tlob_start_task(current, 10000); /* 10 ms budget */ + KUNIT_ASSERT_EQ(test, ret, 0); + + wake_up_process(hog); + yield(); /* sched_switch prev_state =3D=3D 0: running->waiting */ + + tlob_stop_task(current); + + KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1); + d =3D tlob_last_detail_read(); + KUNIT_EXPECT_EQ(test, d->sleeping_ns, 0ULL); + KUNIT_EXPECT_GT(test, d->waiting_ns, d->running_ns + d->sleeping_ns); + + guard->task =3D NULL; + kthread_stop(hog); + put_task_struct(hog); +} + +/* block on mutex for 80 ms >> 10 ms budget; sleeping_ns must dominate */ +static void tlob_detail_sleeping_dominates(struct kunit *test) +{ + struct tlob_holder_ctx *ctx; + struct tlob_kthread_guard *guard; + struct task_struct *holder; + const struct tlob_captured_detail *d; + int ret; + + tlob_error_env_count_reset(); + + ctx =3D kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ctx); + ctx->hold_ms =3D 80; + mutex_init(&ctx->lock); + init_completion(&ctx->ready); + + holder =3D kthread_run(tlob_holder_fn, ctx, "tlob_s5_detail"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder); + get_task_struct(holder); + + guard =3D tlob_guard_kthread(test, holder, NULL); + KUNIT_ASSERT_NOT_NULL(test, guard); + + 
wait_for_completion(&ctx->ready); + + ret =3D tlob_start_task(current, 10000); + KUNIT_ASSERT_EQ(test, ret, 0); + + mutex_lock(&ctx->lock); + mutex_unlock(&ctx->lock); + + tlob_stop_task(current); + + KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1); + d =3D tlob_last_detail_read(); + KUNIT_EXPECT_GT(test, d->sleeping_ns, d->running_ns + d->waiting_ns); + + guard->task =3D NULL; + kthread_stop(holder); + put_task_struct(holder); +} + +static int tlob_violation_suite_init(struct kunit_suite *suite) +{ + int ret; + + rv_kunit_monitoring_on(); + ret =3D tlob_init_monitor(); + if (ret) + goto err_mon_off; + ret =3D tlob_register_kunit_probes(); + if (ret) + goto err_destroy; + ret =3D tlob_enable_hooks(); + if (ret) + goto err_probes; + return 0; + +err_probes: + tlob_unregister_kunit_probes(); +err_destroy: + tlob_destroy_monitor(); +err_mon_off: + rv_kunit_monitoring_off(); + return ret; +} + +static void tlob_violation_suite_exit(struct kunit_suite *suite) +{ + tlob_disable_hooks(); + tlob_unregister_kunit_probes(); + tlob_destroy_monitor(); + rv_kunit_monitoring_off(); +} + +static struct kunit_case tlob_violation_react_cases[] =3D { + KUNIT_CASE(tlob_no_error_within_budget), + KUNIT_CASE(tlob_detail_running_dominates), + KUNIT_CASE(tlob_detail_sleeping_dominates), + KUNIT_CASE(tlob_detail_waiting_dominates), + {} +}; + +static struct kunit_suite tlob_violation_react_suite =3D { + .name =3D "tlob_violation_react", + .suite_init =3D tlob_violation_suite_init, + .suite_exit =3D tlob_violation_suite_exit, + .test_cases =3D tlob_violation_react_cases, +}; + +kunit_test_suites(&tlob_task_api_suite, + &tlob_sched_integration_suite, + &tlob_uprobe_format_suite, + &tlob_trace_output_suite, + &tlob_violation_react_suite); + +MODULE_DESCRIPTION("KUnit tests for the tlob RV monitor"); +MODULE_LICENSE("GPL"); --=20 2.25.1 From nobody Sat Jun 13 00:24:03 2026 Received: from out-189.mta0.migadu.com (out-189.mta0.migadu.com [91.218.175.189]) (using TLSv1.2 with cipher 
ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8CEB547B43E for ; Mon, 11 May 2026 18:25:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=91.218.175.189 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523947; cv=none; b=tSHkOib/SH9hCy6Y8xu/YHxL8Xgr019U8u+5NcjrURttoyHbBO5/JYFrz82fBbSWJi3yXdYenB8kT87R2yz00Pvwl6nD6oLTtAuJ/b28K9x17cSAvJlMkmc+UVqqBntgcCcpeEnVV8ozquZ6FDBpvq0tT39qp55bnC1jEHRrsHs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1778523947; c=relaxed/simple; bh=jt7wFLfwZv/YriPWtL7DTGSVa1XLxA321mI14+fGzOI=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=HclNzg1fNothnDcoYSBCua3J0a3Vy3U6tgBqmBS+1CtNwKTkBG3Boj+Uo/ZM3nhiZtg2OU/C9CpAKe5gtejFLKIsrp2NqIksFDUFEyJTLnJkOdPjTXyF8yh891/+bovgbQFk1nnap2dicwD4X3WuU4D1PAnCfCnXR5dZJQacB2A= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev; spf=pass smtp.mailfrom=linux.dev; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b=TFJcfB/N; arc=none smtp.client-ip=91.218.175.189 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.dev Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.dev Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev header.b="TFJcfB/N" X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. 
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1778523941; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=+NnBxbxr+omcJSdpV1L1cGtFbHhIz2Q1hlzpeMQFQCM=; b=TFJcfB/NGQZ21EkrYhgR4xnWPXcaP7TqAIpqaEFAg+pG4Lf7mTw2MBk17tmMnvsftNMf0k vOuhB0mOjOgRprPmUQuUBxcOH+pjFNvlRr8ksP3u/3swDz5O8uCNopuXGCT4luxsLnOsbm h+U3xD52YuXfHPON9xo5wW37nmicBpE= From: wen.yang@linux.dev To: Gabriele Monaco , Steven Rostedt Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org, Wen Yang Subject: [RFC PATCH v2 10/10] selftests/verification: add tlob selftests Date: Tue, 12 May 2026 02:24:56 +0800 Message-Id: <8148267505ef90175b6b69e1ffb3aa560ff42d35.1778522945.git.wen.yang@linux.dev> In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Content-Type: text/plain; charset="utf-8" From: Wen Yang Add selftest coverage for the tlob RV monitor in tools/testing/selftests/verification/. Two helper binaries are built by tlob/Makefile: tlob_ioctl for the ioctl interface (/dev/rv) and tlob_target for the uprobe tests. The top-level Makefile delegates to tlob/ via a generic MONITOR_SUBDIRS pattern so monitor-specific build details stay within each monitor's own subdirectory. Eight test files cover the tracefs control interface (tracefs.tc), the ioctl self-instrumentation interface (ioctl.tc, 8 scenarios), and the uprobe external monitoring interface (uprobe_bind.tc, uprobe_violation.tc, uprobe_no_event.tc, uprobe_multi.tc, uprobe_detail_sleeping.tc, uprobe_detail_waiting.tc). 
Tested on x86_64 with vng (virtme-ng): TAP version 13 1..12 ok 1 Test monitor enable/disable ok 2 Test monitor reactor setting ok 3 Check available monitors ok 4 Test wwnr monitor with printk reactor ok 5 Test tlob ioctl self-instrumentation (within/over-budget, error path= s) ok 6 Test tlob monitor tracefs interface (enable/disable and files) ok 7 uprobe binding: visible in monitor file, removable, duplicate offset= rejected ok 8 uprobe detail sleeping: sleeping_ns dominates when task blocks betwe= en probes ok 9 uprobe detail waiting: waiting_ns dominates when task is preempted b= etween probes ok 10 Two bindings on same binary with different offsets and budgets fire= independently ok 11 Verify no spurious error_env_tlob events without an active uprobe b= inding ok 12 uprobe violation: error_env_tlob and detail_env_tlob fire with corr= ect fields # Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0 Suggested-by: Gabriele Monaco =20 Signed-off-by: Wen Yang --- tools/testing/selftests/verification/Makefile | 21 +- .../verification/test.d/tlob/ioctl.tc | 36 + .../verification/test.d/tlob/tracefs.tc | 17 + .../verification/test.d/tlob/uprobe_bind.tc | 34 + .../test.d/tlob/uprobe_detail_sleeping.tc | 47 ++ .../test.d/tlob/uprobe_detail_waiting.tc | 60 ++ .../verification/test.d/tlob/uprobe_multi.tc | 60 ++ .../test.d/tlob/uprobe_no_event.tc | 19 + .../test.d/tlob/uprobe_violation.tc | 60 ++ .../selftests/verification/tlob/Makefile | 21 + .../selftests/verification/tlob/tlob_ioctl.c | 626 ++++++++++++++++++ .../selftests/verification/tlob/tlob_target.c | 138 ++++ 12 files changed, 1138 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/verification/test.d/tlob/ioctl.= tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/tracef= s.tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _bind.tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _detail_sleeping.tc create mode 
100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _detail_waiting.tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _multi.tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _no_event.tc create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe= _violation.tc create mode 100644 tools/testing/selftests/verification/tlob/Makefile create mode 100644 tools/testing/selftests/verification/tlob/tlob_ioctl.c create mode 100644 tools/testing/selftests/verification/tlob/tlob_target.c diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/= selftests/verification/Makefile index aa8790c22a71..b5584fd3762d 100644 --- a/tools/testing/selftests/verification/Makefile +++ b/tools/testing/selftests/verification/Makefile @@ -1,8 +1,27 @@ # SPDX-License-Identifier: GPL-2.0 -all: =20 TEST_PROGS :=3D verificationtest-ktap TEST_FILES :=3D test.d settings EXTRA_CLEAN :=3D $(OUTPUT)/logs/* =20 +# Subdirectories that provide helper binaries for the test runner. +# Each entry must contain a Makefile that accepts OUTDIR=3D and deposits +# its binaries there; verificationtest-ktap adds OUTDIR to PATH so +# the ftracetest require-checks resolve the binaries by name. +MONITOR_SUBDIRS :=3D tlob + include ../lib.mk + +# Build and clean each monitor subdirectory. 
+all: $(patsubst %,_build_%,$(MONITOR_SUBDIRS)) + +clean: $(patsubst %,_clean_%,$(MONITOR_SUBDIRS)) + +.PHONY: $(patsubst %,_build_%,$(MONITOR_SUBDIRS)) \ + $(patsubst %,_clean_%,$(MONITOR_SUBDIRS)) + +$(patsubst %,_build_%,$(MONITOR_SUBDIRS)): _build_%: + $(MAKE) -C $* OUTDIR=3D"$(OUTPUT)" TOOLS_INCLUDES=3D"$(TOOLS_INCLUDES)" + +$(patsubst %,_clean_%,$(MONITOR_SUBDIRS)): _clean_%: + $(MAKE) -C $* OUTDIR=3D"$(OUTPUT)" clean diff --git a/tools/testing/selftests/verification/test.d/tlob/ioctl.tc b/to= ols/testing/selftests/verification/test.d/tlob/ioctl.tc new file mode 100644 index 000000000000..54ae249af9a6 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/ioctl.tc @@ -0,0 +1,36 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test tlob ioctl self-instrumentation (within/over-budget, e= rror paths) +# requires: tlob:monitor tlob_ioctl:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) + +[ -c /dev/rv ] || exit_unsupported + +echo 1 > monitors/tlob/enable + +# within budget: 50 ms threshold, 10 ms workload +"$TLOB_HELPER" within_budget + +# over budget in running state: 1 ms threshold, 100 ms busy-spin +"$TLOB_HELPER" over_budget_running + +# over budget in sleeping state: 3 ms threshold, 50 ms sleep +"$TLOB_HELPER" over_budget_sleeping + +# over budget in waiting state: 1 us threshold, sched_yield +"$TLOB_HELPER" over_budget_waiting + +# error paths +"$TLOB_HELPER" double_start +"$TLOB_HELPER" stop_no_start + +# per-thread isolation +"$TLOB_HELPER" multi_thread + +# bind against disabled monitor must return ENODEV, not crash +echo 0 > monitors/tlob/enable +"$TLOB_HELPER" not_enabled +echo 1 > monitors/tlob/enable + +echo 0 > monitors/tlob/enable diff --git a/tools/testing/selftests/verification/test.d/tlob/tracefs.tc b/= tools/testing/selftests/verification/test.d/tlob/tracefs.tc new file mode 100644 index 000000000000..5d1e7cc02498 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/tracefs.tc @@ -0,0 
+1,17 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test tlob monitor tracefs interface (enable/disable and fil= es) +# requires: tlob:monitor + +check_requires monitors/tlob/enable monitors/tlob/desc monitors/tlob/monit= or + +# enable / disable via the enable file +echo 1 > monitors/tlob/enable +grep -q 1 monitors/tlob/enable +echo "tlob" >> enabled_monitors +grep -q tlob enabled_monitors + +echo 0 > monitors/tlob/enable +grep -q 0 monitors/tlob/enable +echo "!tlob" >> enabled_monitors +! grep -q "^tlob$" enabled_monitors diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.t= c b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc new file mode 100644 index 000000000000..41e20d593855 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc @@ -0,0 +1,34 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test uprobe binding (visible in monitor file, removable, du= plicate rejected) +# requires: tlob:monitor tlob_ioctl:program tlob_target:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) +UPROBE_TARGET=3D$(command -v tlob_target) +TLOB_MONITOR=3Dmonitors/tlob/monitor + +busy_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work = 2>/dev/null) +stop_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_= done 2>/dev/null) +[ -n "$busy_offset" ] || exit_unsupported +[ -n "$stop_offset" ] || exit_unsupported + +"$UPROBE_TARGET" 30000 & +busy_pid=3D$! +sleep 0.05 + +echo 1 > monitors/tlob/enable +echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=3D5000000= " > "$TLOB_MONITOR" + +# Binding must appear in monitor file with canonical hex-offset format. +grep -qE "^p ${UPROBE_TARGET}:0x[0-9a-f]+ 0x[0-9a-f]+ threshold=3D[0-9]+$"= "$TLOB_MONITOR" +grep -q "threshold=3D5000000" "$TLOB_MONITOR" + +# Duplicate offset_start must be rejected. +! 
echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=3D9999"= > "$TLOB_MONITOR" 2>/dev/null + +# Remove the binding; it must no longer appear. +echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" +! grep -q "^p .*:0x${busy_offset#0x} " "$TLOB_MONITOR" + +kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true +echo 0 > monitors/tlob/enable diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail= _sleeping.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_deta= il_sleeping.tc new file mode 100644 index 000000000000..2b8656e0fef1 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleepi= ng.tc @@ -0,0 +1,47 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test uprobe detail sleeping (sleeping_ns dominates when tas= k blocks between probes) +# requires: tlob:monitor tlob_ioctl:program tlob_target:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) +UPROBE_TARGET=3D$(command -v tlob_target) +TLOB_MONITOR=3Dmonitors/tlob/monitor + +start_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_wor= k 2>/dev/null) +stop_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work= _done 2>/dev/null) +[ -n "$start_offset" ] || exit_unsupported +[ -n "$stop_offset" ] || exit_unsupported + +"$UPROBE_TARGET" 5000 sleep & +busy_pid=3D$! +sleep 0.05 + +echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 1 > /sys/kernel/tracing/tracing_on +echo 1 > monitors/tlob/enable +echo > /sys/kernel/tracing/trace + +# 50 ms budget; task sleeps 200 ms per iteration -> sleeping_ns dominates. 
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=3D50000"= > "$TLOB_MONITOR" + +found=3D0; i=3D0 +while [ "$i" -lt 30 ]; do + sleep 0.1 + grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=3D1; break= ; } + i=3D$((i+1)) +done + +echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null +kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true +echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 0 > monitors/tlob/enable + +[ "$found" =3D "1" ] + +line=3D$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1) +running=3D$(echo "$line" | sed 's/.*running_ns=3D\([0-9]*\).*/\1/') +waiting=3D$(echo "$line" | sed 's/.*waiting_ns=3D\([0-9]*\).*/\1/') +sleeping=3D$(echo "$line" | sed 's/.*sleeping_ns=3D\([0-9]*\).*/\1/') +[ "$sleeping" -gt "$((running + waiting))" ] + +echo > /sys/kernel/tracing/trace diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail= _waiting.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detai= l_waiting.tc new file mode 100644 index 000000000000..0705854f24df --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waitin= g.tc @@ -0,0 +1,60 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test uprobe detail waiting (waiting_ns dominates when task = is preempted between probes) +# requires: tlob:monitor tlob_ioctl:program tlob_target:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) +UPROBE_TARGET=3D$(command -v tlob_target) +TLOB_MONITOR=3Dmonitors/tlob/monitor + +command -v chrt > /dev/null || exit_unsupported +command -v taskset > /dev/null || exit_unsupported + +start_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_w= ork 2>/dev/null) +stop_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_wo= rk_done 2>/dev/null) +[ -n "$start_offset" ] || exit_unsupported +[ -n "$stop_offset" ] || exit_unsupported + +cpu=3D0 + +echo 1 > 
/sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 1 > /sys/kernel/tracing/tracing_on +echo 1 > monitors/tlob/enable +echo > /sys/kernel/tracing/trace + +# Register probe before the target starts so the start uprobe fires on the +# first entry to tlob_preempt_work. Budget: 500 ms. +echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=3D500000= " > "$TLOB_MONITOR" + +# Target starts; start probe fires on tlob_preempt_work entry. +taskset -c "$cpu" "$UPROBE_TARGET" 5000 preempt & +busy_pid=3D$! +sleep 0.05 + +# RT hog on the same CPU preempts the target; target stays in waiting state +# (runnable, off-CPU) until the budget expires -> waiting_ns dominates. +chrt -f 99 taskset -c "$cpu" sh -c 'while true; do :; done' 2>/dev/null & +hog_pid=3D$! + +found=3D0; i=3D0 +while [ "$i" -lt 30 ]; do + sleep 0.1 + grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=3D1; break= ; } + i=3D$((i+1)) +done + +echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null +kill "$hog_pid" 2>/dev/null; wait "$hog_pid" 2>/dev/null || true +kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true +echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 0 > monitors/tlob/enable + +[ "$found" =3D "1" ] + +line=3D$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1) +running=3D$(echo "$line" | sed 's/.*running_ns=3D\([0-9]*\).*/\1/') +sleeping=3D$(echo "$line" | sed 's/.*sleeping_ns=3D\([0-9]*\).*/\1/') +waiting=3D$(echo "$line" | sed 's/.*waiting_ns=3D\([0-9]*\).*/\1/') +[ "$waiting" -gt "$((running + sleeping))" ] + +echo > /sys/kernel/tracing/trace diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.= tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc new file mode 100644 index 000000000000..c4b8f7108ae9 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc @@ -0,0 +1,60 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: 
Test two uprobe bindings on same binary (different offsets = fire independently) +# requires: tlob:monitor tlob_ioctl:program tlob_target:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) +UPROBE_TARGET=3D$(command -v tlob_target) +TLOB_MONITOR=3Dmonitors/tlob/monitor + +busy_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work = 2>/dev/null) +busy_stop=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_do= ne 2>/dev/null) +sleep_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_wor= k 2>/dev/null) +sleep_stop=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work_= done 2>/dev/null) +[ -n "$busy_offset" ] || exit_unsupported +[ -n "$busy_stop" ] || exit_unsupported +[ -n "$sleep_offset" ] || exit_unsupported +[ -n "$sleep_stop" ] || exit_unsupported + +"$UPROBE_TARGET" 30000 & # busy mode: tlob_busy_work fires every 200= ms +busy_pid=3D$! +"$UPROBE_TARGET" 30000 sleep & # sleep mode: tlob_sleep_work fires every 2= 00 ms +sleep_pid=3D$! +sleep 0.05 + +echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 1 > /sys/kernel/tracing/tracing_on +echo 1 > monitors/tlob/enable +echo > /sys/kernel/tracing/trace + +# Binding A: 5 s budget on the busy probe - must not fire in 200 ms loops. +echo "p ${UPROBE_TARGET}:${busy_offset} ${busy_stop} threshold=3D5000000" = > "$TLOB_MONITOR" +# Binding B: 10 us budget on the sleep probe - fires on first invocation. +echo "p ${UPROBE_TARGET}:${sleep_offset} ${sleep_stop} threshold=3D10" > "= $TLOB_MONITOR" + +# Wait up to 2 s for error_env_tlob from binding B. 
+found=3D0; i=3D0 +while [ "$i" -lt 20 ]; do + sleep 0.1 + grep -q "error_env_tlob" /sys/kernel/tracing/trace && { found=3D1; break;= } + i=3D$((i+1)) +done + +echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null +echo "-${UPROBE_TARGET}:${sleep_offset}" > "$TLOB_MONITOR" 2>/dev/null +kill "$sleep_pid" 2>/dev/null; wait "$sleep_pid" 2>/dev/null || true +kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true + +echo 0 > monitors/tlob/enable +echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable + +[ "$found" =3D "1" ] +# error_env_tlob payload: label and clock variable must be present. +grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "bud= get_exceeded" +grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "clk= _elapsed=3D" +# detail_env_tlob must appear alongside the error. +grep -q "detail_env_tlob" /sys/kernel/tracing/trace + +echo > /sys/kernel/tracing/trace diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_no_eve= nt.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc new file mode 100644 index 000000000000..4a74853346e3 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc @@ -0,0 +1,19 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test no spurious error_env_tlob events without an active up= robe binding +# requires: tlob:monitor tlob_ioctl:program + +TLOB_MONITOR=3Dmonitors/tlob/monitor + +echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo 1 > /sys/kernel/tracing/tracing_on +echo 1 > monitors/tlob/enable +echo > /sys/kernel/tracing/trace + +sleep 0.5 + +! 
grep -q "error_env_tlob" /sys/kernel/tracing/trace + +echo 0 > monitors/tlob/enable +echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo > /sys/kernel/tracing/trace diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_violat= ion.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.= tc new file mode 100644 index 000000000000..624fdb950f6b --- /dev/null +++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc @@ -0,0 +1,60 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test uprobe violation (error_env_tlob and detail_env_tlob f= ire with correct fields) +# requires: tlob:monitor tlob_ioctl:program tlob_target:program + +TLOB_HELPER=3D$(command -v tlob_ioctl) +UPROBE_TARGET=3D$(command -v tlob_target) +TLOB_MONITOR=3Dmonitors/tlob/monitor + +busy_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work = 2>/dev/null) +stop_offset=3D$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_= done 2>/dev/null) +[ -n "$busy_offset" ] || exit_unsupported +[ -n "$stop_offset" ] || exit_unsupported + +"$UPROBE_TARGET" 30000 & +busy_pid=3D$! +sleep 0.05 + +echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 1 > /sys/kernel/tracing/tracing_on +echo 1 > monitors/tlob/enable +echo > /sys/kernel/tracing/trace + +# 10 us budget - fires almost immediately; task is busy-spinning on-CPU. 
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=3D10" > "= $TLOB_MONITOR" + +# wait up to 2 s for detail_env_tlob +found=3D0; i=3D0 +while [ "$i" -lt 20 ]; do + sleep 0.1 + grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=3D1; break= ; } + i=3D$((i+1)) +done + +echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null +kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true +echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable +echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable +echo 0 > monitors/tlob/enable + +[ "$found" =3D "1" ] + +# error_env_tlob event label must be budget_exceeded +grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "bud= get_exceeded" + +# detail_env_tlob must have all five fields with the correct threshold +line=3D$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1) +echo "$line" | grep -q "pid=3D" +echo "$line" | grep -q "threshold_us=3D10" +echo "$line" | grep -q "running_ns=3D" +echo "$line" | grep -q "waiting_ns=3D" +echo "$line" | grep -q "sleeping_ns=3D" + +# Busy-spin keeps the task on-CPU: running_ns must exceed sleeping_ns. +running=3D$(echo "$line" | sed 's/.*running_ns=3D\([0-9]*\).*/\1/') +sleeping=3D$(echo "$line" | sed 's/.*sleeping_ns=3D\([0-9]*\).*/\1/') +[ "$running" -gt "$sleeping" ] + +echo > /sys/kernel/tracing/trace diff --git a/tools/testing/selftests/verification/tlob/Makefile b/tools/tes= ting/selftests/verification/tlob/Makefile new file mode 100644 index 000000000000..1bedf946cb34 --- /dev/null +++ b/tools/testing/selftests/verification/tlob/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +# Builds tlob selftest helper binaries. +# +# Invoked by ../Makefile; pass OUTDIR to control the output directory +# and TOOLS_INCLUDES for the in-tree UAPI -isystem flag. + +OUTDIR ?=3D $(CURDIR)/.. 
+CFLAGS +=3D $(TOOLS_INCLUDES) + +.PHONY: all +all: $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target + +$(OUTDIR)/tlob_ioctl: tlob_ioctl.c + $(CC) $(CFLAGS) -o $@ $< -lpthread + +$(OUTDIR)/tlob_target: tlob_target.c + $(CC) $(CFLAGS) -o $@ $< + +.PHONY: clean +clean: + $(RM) $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target diff --git a/tools/testing/selftests/verification/tlob/tlob_ioctl.c b/tools= /testing/selftests/verification/tlob/tlob_ioctl.c new file mode 100644 index 000000000000..abb4e2e80a2c --- /dev/null +++ b/tools/testing/selftests/verification/tlob/tlob_ioctl.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tlob_ioctl.c - ioctl test driver and ELF utility for tlob selftests + * + * Usage: tlob_ioctl [args...] + * + * not_enabled - TRACE_START without monitor enabled -> ENODEV + * within_budget - sleep within budget -> 0 + * over_budget_running - busy-spin past budget -> EOVERFLOW + * over_budget_sleeping - sleep past budget -> EOVERFLOW + * over_budget_waiting - sched_yield into waiting state -> EOVERFLOW + * double_start - two starts without stop -> EALREADY + * stop_no_start - stop without start -> EINVAL + * multi_thread - two fds: thread A within budget, thread B over + * bench - TRACE_START/STOP latency (TAP output, always p= asses) + * sym_offset - print ELF file offset of symbol + * + * Exit: 0 =3D pass, 1 =3D fail, 2 =3D skip (device not available). 
+ */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int rv_fd =3D -1; + +static int open_rv(void) +{ + struct rv_bind_args bind =3D { .monitor_name =3D "tlob" }; + + rv_fd =3D open("/dev/rv", O_RDWR); + if (rv_fd < 0) { + fprintf(stderr, "open /dev/rv: %s\n", strerror(errno)); + return -1; + } + if (ioctl(rv_fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) { + fprintf(stderr, "bind tlob: %s\n", strerror(errno)); + close(rv_fd); + rv_fd =3D -1; + return -1; + } + return 0; +} + +static void busy_spin_us(unsigned long us) +{ + struct timespec start, now; + unsigned long elapsed; + + clock_gettime(CLOCK_MONOTONIC, &start); + do { + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed =3D (unsigned long)(now.tv_sec - start.tv_sec) + * 1000000000UL + + (unsigned long)(now.tv_nsec - start.tv_nsec); + } while (elapsed < us * 1000UL); +} + +static int trace_start(uint64_t threshold_us) +{ + struct tlob_start_args args =3D { + .threshold_us =3D threshold_us, + }; + + return ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args); +} + +static int trace_stop(void) +{ + return ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL); +} + +/* Synchronous TRACE_START / TRACE_STOP tests */ + +/* Bind to a disabled monitor must return ENODEV without crashing */ +static int test_not_enabled(void) +{ + struct rv_bind_args bind =3D { .monitor_name =3D "tlob" }; + int fd; + int ret; + + fd =3D open("/dev/rv", O_RDWR); + if (fd < 0) { + fprintf(stderr, "open /dev/rv: %s\n", strerror(errno)); + return 2; /* skip */ + } + + ret =3D ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind); + close(fd); + + if (ret =3D=3D 0) { + fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got success\n"); + return 1; + } + if (errno !=3D ENODEV) { + fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got %s\n", + strerror(errno)); + return 1; + } + return 0; +} + +static int test_within_budget(void) +{ + int 
ret; + + /* 50 ms budget */ + if (trace_start(50000) < 0) { + fprintf(stderr, "TRACE_START: %s\n", strerror(errno)); + return 1; + } + usleep(10000); /* 10 ms */ + ret =3D trace_stop(); + if (ret !=3D 0) { + fprintf(stderr, "TRACE_STOP: expected 0, got %d errno=3D%s\n", + ret, strerror(errno)); + return 1; + } + return 0; +} + +static int test_over_budget_running(void) +{ + int ret; + + /* 1 ms budget */ + if (trace_start(1000) < 0) { + fprintf(stderr, "TRACE_START: %s\n", strerror(errno)); + return 1; + } + busy_spin_us(100000); /* 100 ms */ + ret =3D trace_stop(); + if (ret =3D=3D 0) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n"); + return 1; + } + if (errno !=3D EOVERFLOW) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n", + strerror(errno)); + return 1; + } + return 0; +} + +static int test_over_budget_sleeping(void) +{ + int ret; + + /* 3 ms budget */ + if (trace_start(3000) < 0) { + fprintf(stderr, "TRACE_START: %s\n", strerror(errno)); + return 1; + } + usleep(50000); /* 50 ms; sleeping time counts toward budget */ + ret =3D trace_stop(); + if (ret =3D=3D 0) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n"); + return 1; + } + if (errno !=3D EOVERFLOW) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n", + strerror(errno)); + return 1; + } + return 0; +} + +static int test_over_budget_waiting(void) +{ + int ret; + + /* 1 us budget */ + if (trace_start(1) < 0) { + fprintf(stderr, "TRACE_START: %s\n", strerror(errno)); + return 1; + } + sched_yield(); /* running -> waiting -> running */ + busy_spin_us(10); /* 10 us >> 1 us budget; hrtimer fires during spin */ + ret =3D trace_stop(); + if (ret =3D=3D 0) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n"); + return 1; + } + if (errno !=3D EOVERFLOW) { + fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n", + strerror(errno)); + return 1; + } + return 0; +} + +/* Error-handling tests */ + +static int test_double_start(void) +{ + int ret; + + /* 
10 s: large enough the hrtimer won't fire during the test */ + if (trace_start(10000000ULL) < 0) { + fprintf(stderr, "first TRACE_START: %s\n", strerror(errno)); + return 1; + } + ret =3D trace_start(10000000); + if (ret =3D=3D 0) { + fprintf(stderr, "second TRACE_START: expected EALREADY, got 0\n"); + trace_stop(); + return 1; + } + if (errno !=3D EALREADY) { + fprintf(stderr, "second TRACE_START: expected EALREADY, got %s\n", + strerror(errno)); + trace_stop(); + return 1; + } + trace_stop(); + return 0; +} + +static int test_stop_no_start(void) +{ + int ret; + + /* Ensure clean state: ignore error from a stale entry */ + trace_stop(); + + ret =3D trace_stop(); + if (ret =3D=3D 0) { + fprintf(stderr, "TRACE_STOP: expected EINVAL, got 0\n"); + return 1; + } + if (errno !=3D EINVAL) { + fprintf(stderr, "TRACE_STOP: expected EINVAL, got %s\n", + strerror(errno)); + return 1; + } + return 0; +} + +/* Two threads, each with its own fd: A within budget, B over budget. */ + +struct mt_thread_args { + uint64_t threshold_us; + unsigned long workload_us; + int busy; + int expect_eoverflow; + int result; +}; + +static void *mt_thread_fn(void *arg) +{ + struct mt_thread_args *a =3D arg; + struct tlob_start_args args =3D { .threshold_us =3D a->threshold_us }; + struct rv_bind_args bind =3D { .monitor_name =3D "tlob" }; + int fd; + int ret; + + fd =3D open("/dev/rv", O_RDWR); + if (fd < 0) { + fprintf(stderr, "thread open /dev/rv: %s\n", strerror(errno)); + a->result =3D 1; + return NULL; + } + if (ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) { + fprintf(stderr, "thread bind tlob: %s\n", strerror(errno)); + close(fd); + a->result =3D 1; + return NULL; + } + + ret =3D ioctl(fd, TLOB_IOCTL_TRACE_START, &args); + if (ret < 0) { + fprintf(stderr, "thread TRACE_START: %s\n", strerror(errno)); + close(fd); + a->result =3D 1; + return NULL; + } + + if (a->busy) + busy_spin_us(a->workload_us); + else + usleep(a->workload_us); + + ret =3D ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL); + if 
(a->expect_eoverflow) { + if (ret =3D=3D 0 || errno !=3D EOVERFLOW) { + fprintf(stderr, "thread: expected EOVERFLOW, got ret=3D%d errno=3D%s\n", + ret, strerror(errno)); + close(fd); + a->result =3D 1; + return NULL; + } + } else { + if (ret !=3D 0) { + fprintf(stderr, "thread: expected 0, got ret=3D%d errno=3D%s\n", + ret, strerror(errno)); + close(fd); + a->result =3D 1; + return NULL; + } + } + close(fd); + a->result =3D 0; + return NULL; +} + +static int test_multi_thread(void) +{ + pthread_t ta, tb; + struct mt_thread_args a =3D { + .threshold_us =3D 20000, /* 20 ms */ + .workload_us =3D 5000, /* 5 ms sleep -> within budget */ + .busy =3D 0, + .expect_eoverflow =3D 0, + }; + struct mt_thread_args b =3D { + .threshold_us =3D 3000, /* 3 ms */ + .workload_us =3D 30000, /* 30 ms spin -> over budget */ + .busy =3D 1, + .expect_eoverflow =3D 1, + }; + + pthread_create(&ta, NULL, mt_thread_fn, &a); + pthread_create(&tb, NULL, mt_thread_fn, &b); + pthread_join(ta, NULL); + pthread_join(tb, NULL); + + return (a.result || b.result) ? 1 : 0; +} + +/* + * Benchmark TRACE_START, TRACE_STOP, and round-trip ioctls. + * Output uses TAP '#' prefix; always returns 0. 
+ */ +#define BENCH_WARMUP 32 +#define BENCH_N 1000 + +static long long timespec_diff_ns(const struct timespec *a, + const struct timespec *b) +{ + return (long long)(b->tv_sec - a->tv_sec) * 1000000000LL + + (b->tv_nsec - a->tv_nsec); +} + +static int test_bench(void) +{ + struct tlob_start_args args =3D { + .threshold_us =3D 10000000ULL, /* 10 s */ + }; + struct timespec t0, t1; + long long total_start_ns =3D 0, total_stop_ns =3D 0, total_rt_ns =3D 0; + int i; + + /* warm up */ + for (i =3D 0; i < BENCH_WARMUP; i++) { + if (ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args) =3D=3D 0) + ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL); + } + + /* start only */ + for (i =3D 0; i < BENCH_N; i++) { + clock_gettime(CLOCK_MONOTONIC, &t0); + ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args); + clock_gettime(CLOCK_MONOTONIC, &t1); + total_start_ns +=3D timespec_diff_ns(&t0, &t1); + ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL); + } + + /* stop only */ + for (i =3D 0; i < BENCH_N; i++) { + ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args); + clock_gettime(CLOCK_MONOTONIC, &t0); + ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL); + clock_gettime(CLOCK_MONOTONIC, &t1); + total_stop_ns +=3D timespec_diff_ns(&t0, &t1); + } + + /* round-trip */ + clock_gettime(CLOCK_MONOTONIC, &t0); + for (i =3D 0; i < BENCH_N; i++) { + ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args); + ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL); + } + clock_gettime(CLOCK_MONOTONIC, &t1); + total_rt_ns =3D timespec_diff_ns(&t0, &t1); + + printf("# start ioctl only: %lld ns/iter (N=3D%d, includes syscall)\= n", + total_start_ns / BENCH_N, BENCH_N); + printf("# stop ioctl only: %lld ns/iter (N=3D%d, includes syscall)\= n", + total_stop_ns / BENCH_N, BENCH_N); + printf("# start+stop roundtrip: %lld ns/iter (N=3D%d, includes 2 syscall= s)\n", + total_rt_ns / BENCH_N, BENCH_N); + return 0; +} + +/* + * Print the ELF file offset of <symname> in <binary>. Walks .symtab + * (falling back to .dynsym) and converts vaddr to file offset via PT_LOAD. 
+ * Supports 32- and 64-bit ELF. + */ +static int sym_offset(const char *binary, const char *symname) +{ + int fd; + struct stat st; + void *map; + Elf64_Ehdr *ehdr; + Elf32_Ehdr *ehdr32; + int is64; + uint64_t sym_vaddr =3D 0; + int found =3D 0; + uint64_t file_offset =3D 0; + + fd =3D open(binary, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open %s: %s\n", binary, strerror(errno)); + return 1; + } + if (fstat(fd, &st) < 0) { + close(fd); + return 1; + } + map =3D mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + if (map =3D=3D MAP_FAILED) { + fprintf(stderr, "mmap: %s\n", strerror(errno)); + return 1; + } + + ehdr =3D (Elf64_Ehdr *)map; + ehdr32 =3D (Elf32_Ehdr *)map; + if (st.st_size < 4 || + ehdr->e_ident[EI_MAG0] !=3D ELFMAG0 || + ehdr->e_ident[EI_MAG1] !=3D ELFMAG1 || + ehdr->e_ident[EI_MAG2] !=3D ELFMAG2 || + ehdr->e_ident[EI_MAG3] !=3D ELFMAG3) { + fprintf(stderr, "%s: not an ELF file\n", binary); + munmap(map, (size_t)st.st_size); + return 1; + } + is64 =3D (ehdr->e_ident[EI_CLASS] =3D=3D ELFCLASS64); + + if (is64) { + Elf64_Shdr *shdrs =3D (Elf64_Shdr *)((char *)map + ehdr->e_shoff); + Elf64_Shdr *shstrtab_hdr =3D &shdrs[ehdr->e_shstrndx]; + const char *shstrtab =3D (char *)map + shstrtab_hdr->sh_offset; + int si; + + /* prefer .symtab; fall back to .dynsym */ + for (int pass =3D 0; pass < 2 && !found; pass++) { + const char *target =3D pass ? 
".dynsym" : ".symtab"; + + for (si =3D 0; si < ehdr->e_shnum && !found; si++) { + Elf64_Shdr *sh =3D &shdrs[si]; + const char *name =3D shstrtab + sh->sh_name; + + if (strcmp(name, target) !=3D 0) + continue; + + Elf64_Shdr *strtab_sh =3D &shdrs[sh->sh_link]; + const char *strtab =3D (char *)map + strtab_sh->sh_offset; + Elf64_Sym *syms =3D (Elf64_Sym *)((char *)map + sh->sh_offset); + uint64_t nsyms =3D sh->sh_size / sizeof(Elf64_Sym); + uint64_t j; + + for (j =3D 0; j < nsyms; j++) { + if (strcmp(strtab + syms[j].st_name, symname) =3D=3D 0) { + sym_vaddr =3D syms[j].st_value; + found =3D 1; + break; + } + } + } + } + + if (!found) { + fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary); + munmap(map, (size_t)st.st_size); + return 1; + } + + /* Convert vaddr to file offset via PT_LOAD segments */ + Elf64_Phdr *phdrs =3D (Elf64_Phdr *)((char *)map + ehdr->e_phoff); + int pi; + + for (pi =3D 0; pi < ehdr->e_phnum; pi++) { + Elf64_Phdr *ph =3D &phdrs[pi]; + + if (ph->p_type !=3D PT_LOAD) + continue; + if (sym_vaddr >=3D ph->p_vaddr && + sym_vaddr < ph->p_vaddr + ph->p_filesz) { + file_offset =3D sym_vaddr - ph->p_vaddr + ph->p_offset; + break; + } + } + } else { + /* 32-bit ELF */ + Elf32_Shdr *shdrs =3D (Elf32_Shdr *)((char *)map + ehdr32->e_shoff); + Elf32_Shdr *shstrtab_hdr =3D &shdrs[ehdr32->e_shstrndx]; + const char *shstrtab =3D (char *)map + shstrtab_hdr->sh_offset; + int si; + uint32_t sym_vaddr32 =3D 0; + + for (int pass =3D 0; pass < 2 && !found; pass++) { + const char *target =3D pass ? 
".dynsym" : ".symtab"; + + for (si =3D 0; si < ehdr32->e_shnum && !found; si++) { + Elf32_Shdr *sh =3D &shdrs[si]; + const char *name =3D shstrtab + sh->sh_name; + + if (strcmp(name, target) !=3D 0) + continue; + + Elf32_Shdr *strtab_sh =3D &shdrs[sh->sh_link]; + const char *strtab =3D (char *)map + strtab_sh->sh_offset; + Elf32_Sym *syms =3D (Elf32_Sym *)((char *)map + sh->sh_offset); + uint32_t nsyms =3D sh->sh_size / sizeof(Elf32_Sym); + uint32_t j; + + for (j =3D 0; j < nsyms; j++) { + if (strcmp(strtab + syms[j].st_name, symname) =3D=3D 0) { + sym_vaddr32 =3D syms[j].st_value; + found =3D 1; + break; + } + } + } + } + + if (!found) { + fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary); + munmap(map, (size_t)st.st_size); + return 1; + } + + Elf32_Phdr *phdrs =3D (Elf32_Phdr *)((char *)map + ehdr32->e_phoff); + int pi; + + for (pi =3D 0; pi < ehdr32->e_phnum; pi++) { + Elf32_Phdr *ph =3D &phdrs[pi]; + + if (ph->p_type !=3D PT_LOAD) + continue; + if (sym_vaddr32 >=3D ph->p_vaddr && + sym_vaddr32 < ph->p_vaddr + ph->p_filesz) { + file_offset =3D sym_vaddr32 - ph->p_vaddr + ph->p_offset; + break; + } + } + sym_vaddr =3D sym_vaddr32; + } + + munmap(map, (size_t)st.st_size); + + if (!file_offset && sym_vaddr) { + fprintf(stderr, "could not map vaddr 0x%lx to file offset\n", + (unsigned long)sym_vaddr); + return 1; + } + + printf("0x%lx\n", (unsigned long)file_offset); + return 0; +} + +int main(int argc, char *argv[]) +{ + int rc; + + if (argc < 2) { + fprintf(stderr, "Usage: %s [args...]\n", argv[0]); + return 1; + } + + /* sym_offset does not need /dev/rv */ + if (strcmp(argv[1], "sym_offset") =3D=3D 0) { + if (argc < 4) { + fprintf(stderr, "Usage: %s sym_offset \n", + argv[0]); + return 1; + } + return sym_offset(argv[2], argv[3]); + } + + /* not_enabled: monitor is disabled; bind must return ENODEV without open= _rv() */ + if (strcmp(argv[1], "not_enabled") =3D=3D 0) + return test_not_enabled(); + + if (open_rv() < 0) + return 2; /* skip */ + + if 
(strcmp(argv[1], "bench") =3D=3D 0) + rc =3D test_bench(); + else if (strcmp(argv[1], "within_budget") =3D=3D 0) + rc =3D test_within_budget(); + else if (strcmp(argv[1], "over_budget_running") =3D=3D 0) + rc =3D test_over_budget_running(); + else if (strcmp(argv[1], "over_budget_sleeping") =3D=3D 0) + rc =3D test_over_budget_sleeping(); + else if (strcmp(argv[1], "over_budget_waiting") =3D=3D 0) + rc =3D test_over_budget_waiting(); + else if (strcmp(argv[1], "double_start") =3D=3D 0) + rc =3D test_double_start(); + else if (strcmp(argv[1], "stop_no_start") =3D=3D 0) + rc =3D test_stop_no_start(); + else if (strcmp(argv[1], "multi_thread") =3D=3D 0) + rc =3D test_multi_thread(); + else { + fprintf(stderr, "Unknown test: %s\n", argv[1]); + rc =3D 1; + } + + close(rv_fd); + return rc; +} diff --git a/tools/testing/selftests/verification/tlob/tlob_target.c b/tool= s/testing/selftests/verification/tlob/tlob_target.c new file mode 100644 index 000000000000..0fdbc575d71d --- /dev/null +++ b/tools/testing/selftests/verification/tlob/tlob_target.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tlob_target.c - uprobe target binary for tlob selftests. + * + * Provides three start/stop probe pairs, each designed to exercise a + * different dominant component of the detail_env_tlob ns breakdown: + * + * tlob_busy_work / tlob_busy_work_done - busy-spin: running_ns do= minates + * tlob_sleep_work / tlob_sleep_work_done - nanosleep: sleeping_ns d= ominates + * tlob_preempt_work / tlob_preempt_work_done - busy-spin: waiting_ns do= minates + * (needs an RT competitor = on the same CPU) + * + * Usage: tlob_target <duration_ms> [mode] + * + * mode is one of: busy (default), sleep, preempt. + * Loops in 200 ms iterations until <duration_ms> has elapsed + * (0 =3D run for ~24 hours). 
+ */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +static inline int timespec_before(const struct timespec *a, + const struct timespec *b) +{ + return a->tv_sec < b->tv_sec || + (a->tv_sec =3D=3D b->tv_sec && a->tv_nsec < b->tv_nsec); +} + +static void timespec_add_ms(struct timespec *ts, unsigned long ms) +{ + ts->tv_sec +=3D ms / 1000; + ts->tv_nsec +=3D (long)(ms % 1000) * 1000000L; + if (ts->tv_nsec >=3D 1000000000L) { + ts->tv_sec++; + ts->tv_nsec -=3D 1000000000L; + } +} + +/* stop probe; noinline keeps the entry point visible to uprobes */ +noinline void tlob_busy_work_done(void) +{ + /* empty: uprobe fires on entry */ +} + +/* start probe; busy-spin so running_ns dominates */ +noinline void tlob_busy_work(unsigned long duration_ns) +{ + struct timespec start, now; + unsigned long elapsed; + + clock_gettime(CLOCK_MONOTONIC, &start); + do { + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed =3D (unsigned long)(now.tv_sec - start.tv_sec) + * 1000000000UL + + (unsigned long)(now.tv_nsec - start.tv_nsec); + } while (elapsed < duration_ns); + + tlob_busy_work_done(); +} + +/* stop probe; noinline keeps the entry point visible to uprobes */ +noinline void tlob_sleep_work_done(void) +{ + /* empty: uprobe fires on entry */ +} + +/* start probe; nanosleep so sleeping_ns dominates */ +noinline void tlob_sleep_work(unsigned long duration_ms) +{ + struct timespec ts =3D { + .tv_sec =3D duration_ms / 1000, + .tv_nsec =3D (long)(duration_ms % 1000) * 1000000L, + }; + nanosleep(&ts, NULL); + tlob_sleep_work_done(); +} + +/* stop probe; noinline keeps the entry point visible to uprobes */ +noinline void tlob_preempt_work_done(void) +{ + /* empty: uprobe fires on entry */ +} + +/* + * start probe; busy-spin so an RT competitor on the same CPU drives + * waiting_ns (prev_state=3D=3D0 -> preempt event, task stays runnable off= -CPU). 
+ */ +noinline void tlob_preempt_work(unsigned long duration_ms) +{ + struct timespec start, now; + unsigned long elapsed; + + clock_gettime(CLOCK_MONOTONIC, &start); + do { + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed =3D (unsigned long)(now.tv_sec - start.tv_sec) + * 1000000000UL + + (unsigned long)(now.tv_nsec - start.tv_nsec); + } while (elapsed < duration_ms * 1000000UL); + + tlob_preempt_work_done(); +} + +int main(int argc, char *argv[]) +{ + unsigned long duration_ms =3D 0; + const char *mode =3D "busy"; + struct timespec deadline, now; + + if (argc >=3D 2) + duration_ms =3D strtoul(argv[1], NULL, 10); + if (argc >=3D 3) + mode =3D argv[2]; + + clock_gettime(CLOCK_MONOTONIC, &deadline); + timespec_add_ms(&deadline, duration_ms ? duration_ms : 86400000UL); + + do { + if (strcmp(mode, "sleep") =3D=3D 0) + tlob_sleep_work(200); + else if (strcmp(mode, "preempt") =3D=3D 0) + tlob_preempt_work(200); + else + tlob_busy_work(200 * 1000000UL); + clock_gettime(CLOCK_MONOTONIC, &now); + } while (timespec_before(&now, &deadline)); + + return 0; +} --=20 2.25.1