Add a boot self test that can catch spurious coverage from interrupts.
The coverage callback filters out interrupt code, but only after the
handler updates preempt count. Some code periodically leaks out
of that section and leads to spurious coverage.
Add a best-effort (but simple) test that is likely to catch such bugs.
If the test is enabled on CI systems that use KCOV, they should catch
any issues fast.
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: syzkaller@googlegroups.com
---
Changed since v1:
- renamed KCOV_TEST to KCOV_SELFTEST
- improved the config description
- loop for exactly 300ms in the test
In my local testing w/o the previous fix,
it immediately produced the following splat:
kcov: running selftest
BUG: TASK stack guard page was hit at ffffc90000147ff8
Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
...
kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
sysvec_call_function+0x15/0xb0
asm_sysvec_call_function+0x1a/0x20
kcov_init+0xe4/0x130
do_one_initcall+0xbc/0x470
kernel_init_freeable+0x4fc/0x930
kernel_init+0x1c/0x2b0
---
kernel/kcov.c | 31 +++++++++++++++++++++++++++++++
lib/Kconfig.debug | 8 ++++++++
2 files changed, 39 insertions(+)
diff --git a/kernel/kcov.c b/kernel/kcov.c
index c3124f6d5536..72a5bf55107f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/hashtable.h>
#include <linux/init.h>
+#include <linux/jiffies.h>
#include <linux/kmsan-checks.h>
#include <linux/mm.h>
#include <linux/preempt.h>
@@ -1057,6 +1058,32 @@ u64 kcov_common_handle(void)
}
EXPORT_SYMBOL(kcov_common_handle);
+#ifdef CONFIG_KCOV_SELFTEST
+static void __init selftest(void)
+{
+ unsigned long start;
+
+ pr_err("running self test\n");
+ /*
+ * Test that interrupts don't produce spurious coverage.
+ * The coverage callback filters out interrupt code, but only
+ * after the handler updates preempt count. Some code periodically
+ * leaks out of that section and leads to spurious coverage.
+ * It's hard to call the actual interrupt handler directly,
+ * so we just loop here for a bit waiting for a timer interrupt.
+ * We set kcov_mode to enable tracing, but don't setup the area,
+ * so any attempt to trace will crash. Note: we must not call any
+ * potentially traced functions in this region.
+ */
+ start = jiffies;
+ current->kcov_mode = KCOV_MODE_TRACE_PC;
+ while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
+ ;
+ current->kcov_mode = 0;
+ pr_err("done running self test\n");
+}
+#endif
+
static int __init kcov_init(void)
{
int cpu;
@@ -1076,6 +1103,10 @@ static int __init kcov_init(void)
*/
debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+#ifdef CONFIG_KCOV_SELFTEST
+ selftest();
+#endif
+
return 0;
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 59b6765d86b8..695a437a52d9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2171,6 +2171,14 @@ config KCOV_IRQ_AREA_SIZE
soft interrupts. This specifies the size of those areas in the
number of unsigned long words.
+config KCOV_SELFTEST
+ bool "Perform short selftests on boot"
+ depends on KCOV
+ help
+ Run short KCOV coverage collection selftests on boot.
+ On test failure, causes the kernel to panic. Recommended to be
+ enabled, ensuring critical functionality works as intended.
+
menuconfig RUNTIME_TESTING_MENU
bool "Runtime Testing"
default y
--
2.45.2.505.gda0bf45e8d-goog
On Tue, Jun 11, 2024 at 09:50:31AM +0200, Dmitry Vyukov wrote:
> Add a boot self test that can catch sprious coverage from interrupts.
> The coverage callback filters out interrupt code, but only after the
> handler updates preempt count. Some code periodically leaks out
> of that section and leads to spurious coverage.
> Add a best-effort (but simple) test that is likely to catch such bugs.
> If the test is enabled on CI systems that use KCOV, they should catch
> any issues fast.
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Reviewed-by: Alexander Potapenko <glider@google.com>
> Cc: x86@kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: syzkaller@googlegroups.com
>
> ---
>
> Changed since v1:
> - renamed KCOV_TEST to KCOV_SELFTEST
> - improved the config description
> - loop for exactly 300ms in the test
>
> In my local testing w/o the previous fix,
> it immidiatly produced the following splat:
>
> kcov: running selftest
> BUG: TASK stack guard page was hit at ffffc90000147ff8
> Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
> ...
> kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
> sysvec_call_function+0x15/0xb0
> asm_sysvec_call_function+0x1a/0x20
> kcov_init+0xe4/0x130
> do_one_initcall+0xbc/0x470
> kernel_init_freeable+0x4fc/0x930
> kernel_init+0x1c/0x2b0
So I'm not entirely sure how the above BUG comes about, nor how this
selftest tickles it. Could you elaborate?
I've found check_kcov_mode() which has this !in_task() clause, but I'm
not entirely sure how failing that leads to the above mentioned failure.
> ---
> kernel/kcov.c | 31 +++++++++++++++++++++++++++++++
> lib/Kconfig.debug | 8 ++++++++
> 2 files changed, 39 insertions(+)
>
> diff --git a/kernel/kcov.c b/kernel/kcov.c
> index c3124f6d5536..72a5bf55107f 100644
> --- a/kernel/kcov.c
> +++ b/kernel/kcov.c
> @@ -11,6 +11,7 @@
> #include <linux/fs.h>
> #include <linux/hashtable.h>
> #include <linux/init.h>
> +#include <linux/jiffies.h>
> #include <linux/kmsan-checks.h>
> #include <linux/mm.h>
> #include <linux/preempt.h>
> @@ -1057,6 +1058,32 @@ u64 kcov_common_handle(void)
> }
> EXPORT_SYMBOL(kcov_common_handle);
>
> +#ifdef CONFIG_KCOV_SELFTEST
> +static void __init selftest(void)
> +{
> + unsigned long start;
> +
> + pr_err("running self test\n");
> + /*
> + * Test that interrupts don't produce spurious coverage.
> + * The coverage callback filters out interrupt code, but only
> + * after the handler updates preempt count. Some code periodically
> + * leaks out of that section and leads to spurious coverage.
> + * It's hard to call the actual interrupt handler directly,
> + * so we just loop here for a bit waiting for a timer interrupt.
> + * We set kcov_mode to enable tracing, but don't setup the area,
> + * so any attempt to trace will crash. Note: we must not call any
> + * potentially traced functions in this region.
> + */
> + start = jiffies;
> + current->kcov_mode = KCOV_MODE_TRACE_PC;
barrier();
> + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
> + ;
barrier();
> + current->kcov_mode = 0;
> + pr_err("done running self test\n");
> +}
> +#endif
On Wed, 19 Jun 2024 at 13:13, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jun 11, 2024 at 09:50:31AM +0200, Dmitry Vyukov wrote:
> > Add a boot self test that can catch sprious coverage from interrupts.
> > The coverage callback filters out interrupt code, but only after the
> > handler updates preempt count. Some code periodically leaks out
> > of that section and leads to spurious coverage.
> > Add a best-effort (but simple) test that is likely to catch such bugs.
> > If the test is enabled on CI systems that use KCOV, they should catch
> > any issues fast.
> >
> > Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > Reviewed-by: Alexander Potapenko <glider@google.com>
> > Cc: x86@kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Cc: syzkaller@googlegroups.com
> >
> > ---
> >
> > Changed since v1:
> > - renamed KCOV_TEST to KCOV_SELFTEST
> > - improved the config description
> > - loop for exactly 300ms in the test
> >
> > In my local testing w/o the previous fix,
> > it immidiatly produced the following splat:
> >
> > kcov: running selftest
> > BUG: TASK stack guard page was hit at ffffc90000147ff8
> > Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
> > ...
> > kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
> > sysvec_call_function+0x15/0xb0
> > asm_sysvec_call_function+0x1a/0x20
> > kcov_init+0xe4/0x130
> > do_one_initcall+0xbc/0x470
> > kernel_init_freeable+0x4fc/0x930
> > kernel_init+0x1c/0x2b0
>
> So I'm not entirely sure how the above BUG comes about, nor how this
> selftest tickles it. Could you elaborate?
>
> I've found check_kcov_mode() which has this !in_task() clause, but I'm
> not entirely sure how failing that leads to the above mentioned failure.
I've tried to explain it in the test comment, maybe I need to improve it:
+ * We set kcov_mode to enable tracing, but don't setup the area,
+ * so any attempt to trace will crash. Note: we must not call any
+ * potentially traced functions in this region.
Basically, we setup current task kcov in a way that any attempt to
trace in __sanitizer_cov_trace_pc() will crash, and then just loop
waiting for interrupts.
A more legit way to achieve the same would be to properly setup kcov
for tracing from within the kernel, then call outermost interrupt
entry function, then check we traced nothing. But that would require a
non-trivial amount of new complex code, and e.g. the top-most
interrupt entry function is not exported and is arch-specific.
> > ---
> > kernel/kcov.c | 31 +++++++++++++++++++++++++++++++
> > lib/Kconfig.debug | 8 ++++++++
> > 2 files changed, 39 insertions(+)
> >
> > diff --git a/kernel/kcov.c b/kernel/kcov.c
> > index c3124f6d5536..72a5bf55107f 100644
> > --- a/kernel/kcov.c
> > +++ b/kernel/kcov.c
> > @@ -11,6 +11,7 @@
> > #include <linux/fs.h>
> > #include <linux/hashtable.h>
> > #include <linux/init.h>
> > +#include <linux/jiffies.h>
> > #include <linux/kmsan-checks.h>
> > #include <linux/mm.h>
> > #include <linux/preempt.h>
> > @@ -1057,6 +1058,32 @@ u64 kcov_common_handle(void)
> > }
> > EXPORT_SYMBOL(kcov_common_handle);
> >
> > +#ifdef CONFIG_KCOV_SELFTEST
> > +static void __init selftest(void)
> > +{
> > + unsigned long start;
> > +
> > + pr_err("running self test\n");
> > + /*
> > + * Test that interrupts don't produce spurious coverage.
> > + * The coverage callback filters out interrupt code, but only
> > + * after the handler updates preempt count. Some code periodically
> > + * leaks out of that section and leads to spurious coverage.
> > + * It's hard to call the actual interrupt handler directly,
> > + * so we just loop here for a bit waiting for a timer interrupt.
> > + * We set kcov_mode to enable tracing, but don't setup the area,
> > + * so any attempt to trace will crash. Note: we must not call any
> > + * potentially traced functions in this region.
> > + */
> > + start = jiffies;
> > + current->kcov_mode = KCOV_MODE_TRACE_PC;
>
> barrier();
>
> > + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
> > + ;
>
> barrier();
>
> > + current->kcov_mode = 0;
> > + pr_err("done running self test\n");
> > +}
> > +#endif
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller/20240619111309.GJ31592%40noisy.programming.kicks-ass.net.
On Wed, Jun 19, 2024 at 01:18:52PM +0200, Dmitry Vyukov wrote:
> On Wed, 19 Jun 2024 at 13:13, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Jun 11, 2024 at 09:50:31AM +0200, Dmitry Vyukov wrote:
> > > Add a boot self test that can catch sprious coverage from interrupts.
> > > The coverage callback filters out interrupt code, but only after the
> > > handler updates preempt count. Some code periodically leaks out
> > > of that section and leads to spurious coverage.
> > > Add a best-effort (but simple) test that is likely to catch such bugs.
> > > If the test is enabled on CI systems that use KCOV, they should catch
> > > any issues fast.
> > >
> > > Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> > > Reviewed-by: Alexander Potapenko <glider@google.com>
> > > Cc: x86@kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Cc: syzkaller@googlegroups.com
> > >
> > > ---
> > >
> > > Changed since v1:
> > > - renamed KCOV_TEST to KCOV_SELFTEST
> > > - improved the config description
> > > - loop for exactly 300ms in the test
> > >
> > > In my local testing w/o the previous fix,
> > > it immidiatly produced the following splat:
> > >
> > > kcov: running selftest
> > > BUG: TASK stack guard page was hit at ffffc90000147ff8
> > > Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
> > > ...
> > > kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
> > > sysvec_call_function+0x15/0xb0
> > > asm_sysvec_call_function+0x1a/0x20
> > > kcov_init+0xe4/0x130
> > > do_one_initcall+0xbc/0x470
> > > kernel_init_freeable+0x4fc/0x930
> > > kernel_init+0x1c/0x2b0
> >
> > So I'm not entirely sure how the above BUG comes about, nor how this
> > selftest tickles it. Could you elaborate?
> >
> > I've found check_kcov_mode() which has this !in_task() clause, but I'm
> > not entirely sure how failing that leads to the above mentioned failure.
>
> I've tried to explain it in the test comment, maybe I need to improve it:
>
> + * We set kcov_mode to enable tracing, but don't setup the area,
> + * so any attempt to trace will crash. Note: we must not call any
> + * potentially traced functions in this region.
Ah, I'm just slow today.. did not connect the dots. No this is fine.
> Basically, we setup current task kcov in a way that any attempt to
> trace in __sanitizer_cov_trace_pc() will crash, and then just loop
> waiting for interrupts.
>
> A more legit way to achieve the same would be to properly setup kcov
> for tracing from within the kernel, then call outermost interrupt
> entry function, then check we traced nothing. But that would require a
> non-trivial amount of new complex code, and e.g. the top-most
> interrupt entry function is not exported and is arch-specific.
Yeah, polling jiffies should be fine I suppose.
On Tue, Jun 11, 2024 at 9:50 AM 'Dmitry Vyukov' via syzkaller
<syzkaller@googlegroups.com> wrote:
>
> Add a boot self test that can catch sprious coverage from interrupts.
> The coverage callback filters out interrupt code, but only after the
> handler updates preempt count. Some code periodically leaks out
> of that section and leads to spurious coverage.
> Add a best-effort (but simple) test that is likely to catch such bugs.
> If the test is enabled on CI systems that use KCOV, they should catch
> any issues fast.
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Reviewed-by: Alexander Potapenko <glider@google.com>
> Cc: x86@kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: syzkaller@googlegroups.com
>
> ---
>
> Changed since v1:
> - renamed KCOV_TEST to KCOV_SELFTEST
> - improved the config description
> - loop for exactly 300ms in the test
>
> In my local testing w/o the previous fix,
> it immidiatly produced the following splat:
>
> kcov: running selftest
> BUG: TASK stack guard page was hit at ffffc90000147ff8
> Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
> ...
> kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
> sysvec_call_function+0x15/0xb0
> asm_sysvec_call_function+0x1a/0x20
> kcov_init+0xe4/0x130
> do_one_initcall+0xbc/0x470
> kernel_init_freeable+0x4fc/0x930
> kernel_init+0x1c/0x2b0
> ---
> kernel/kcov.c | 31 +++++++++++++++++++++++++++++++
> lib/Kconfig.debug | 8 ++++++++
> 2 files changed, 39 insertions(+)
>
> diff --git a/kernel/kcov.c b/kernel/kcov.c
> index c3124f6d5536..72a5bf55107f 100644
> --- a/kernel/kcov.c
> +++ b/kernel/kcov.c
> @@ -11,6 +11,7 @@
> #include <linux/fs.h>
> #include <linux/hashtable.h>
> #include <linux/init.h>
> +#include <linux/jiffies.h>
> #include <linux/kmsan-checks.h>
> #include <linux/mm.h>
> #include <linux/preempt.h>
> @@ -1057,6 +1058,32 @@ u64 kcov_common_handle(void)
> }
> EXPORT_SYMBOL(kcov_common_handle);
>
> +#ifdef CONFIG_KCOV_SELFTEST
> +static void __init selftest(void)
> +{
> + unsigned long start;
> +
> + pr_err("running self test\n");
> + /*
> + * Test that interrupts don't produce spurious coverage.
> + * The coverage callback filters out interrupt code, but only
> + * after the handler updates preempt count. Some code periodically
> + * leaks out of that section and leads to spurious coverage.
> + * It's hard to call the actual interrupt handler directly,
> + * so we just loop here for a bit waiting for a timer interrupt.
> + * We set kcov_mode to enable tracing, but don't setup the area,
> + * so any attempt to trace will crash. Note: we must not call any
> + * potentially traced functions in this region.
> + */
> + start = jiffies;
> + current->kcov_mode = KCOV_MODE_TRACE_PC;
> + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
> + ;
> + current->kcov_mode = 0;
> + pr_err("done running self test\n");
> +}
> +#endif
> +
> static int __init kcov_init(void)
> {
> int cpu;
> @@ -1076,6 +1103,10 @@ static int __init kcov_init(void)
> */
> debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
>
> +#ifdef CONFIG_KCOV_SELFTEST
> + selftest();
> +#endif
> +
> return 0;
> }
>
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 59b6765d86b8..695a437a52d9 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2171,6 +2171,14 @@ config KCOV_IRQ_AREA_SIZE
> soft interrupts. This specifies the size of those areas in the
> number of unsigned long words.
>
> +config KCOV_SELFTEST
> + bool "Perform short selftests on boot"
> + depends on KCOV
> + help
> + Run short KCOV coverage collection selftests on boot.
> + On test failure, causes the kernel to panic. Recommended to be
Nit: "causes the kernel to panic" => "causes a kernel panic" or "panic
the kernel"
> + enabled, ensuring critical functionality works as intended.
> +
> menuconfig RUNTIME_TESTING_MENU
> bool "Runtime Testing"
> default y
> --
> 2.45.2.505.gda0bf45e8d-goog
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
On Tue, 11 Jun 2024 at 09:50, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> Add a boot self test that can catch sprious coverage from interrupts.
> The coverage callback filters out interrupt code, but only after the
> handler updates preempt count. Some code periodically leaks out
> of that section and leads to spurious coverage.
> Add a best-effort (but simple) test that is likely to catch such bugs.
> If the test is enabled on CI systems that use KCOV, they should catch
> any issues fast.
>
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> Reviewed-by: Alexander Potapenko <glider@google.com>
> Cc: x86@kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: syzkaller@googlegroups.com
Reviewed-by: Marco Elver <elver@google.com>
> ---
>
> Changed since v1:
> - renamed KCOV_TEST to KCOV_SELFTEST
> - improved the config description
> - loop for exactly 300ms in the test
>
> In my local testing w/o the previous fix,
> it immidiatly produced the following splat:
>
> kcov: running selftest
> BUG: TASK stack guard page was hit at ffffc90000147ff8
> Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN PTI
> ...
> kvm_set_cpu_l1tf_flush_l1d+0x5/0x20
> sysvec_call_function+0x15/0xb0
> asm_sysvec_call_function+0x1a/0x20
> kcov_init+0xe4/0x130
> do_one_initcall+0xbc/0x470
> kernel_init_freeable+0x4fc/0x930
> kernel_init+0x1c/0x2b0
> ---
> kernel/kcov.c | 31 +++++++++++++++++++++++++++++++
> lib/Kconfig.debug | 8 ++++++++
> 2 files changed, 39 insertions(+)
>
> diff --git a/kernel/kcov.c b/kernel/kcov.c
> index c3124f6d5536..72a5bf55107f 100644
> --- a/kernel/kcov.c
> +++ b/kernel/kcov.c
> @@ -11,6 +11,7 @@
> #include <linux/fs.h>
> #include <linux/hashtable.h>
> #include <linux/init.h>
> +#include <linux/jiffies.h>
> #include <linux/kmsan-checks.h>
> #include <linux/mm.h>
> #include <linux/preempt.h>
> @@ -1057,6 +1058,32 @@ u64 kcov_common_handle(void)
> }
> EXPORT_SYMBOL(kcov_common_handle);
>
> +#ifdef CONFIG_KCOV_SELFTEST
> +static void __init selftest(void)
> +{
> + unsigned long start;
> +
> + pr_err("running self test\n");
> + /*
> + * Test that interrupts don't produce spurious coverage.
> + * The coverage callback filters out interrupt code, but only
> + * after the handler updates preempt count. Some code periodically
> + * leaks out of that section and leads to spurious coverage.
> + * It's hard to call the actual interrupt handler directly,
> + * so we just loop here for a bit waiting for a timer interrupt.
> + * We set kcov_mode to enable tracing, but don't setup the area,
> + * so any attempt to trace will crash. Note: we must not call any
> + * potentially traced functions in this region.
> + */
> + start = jiffies;
> + current->kcov_mode = KCOV_MODE_TRACE_PC;
> + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
> + ;
> + current->kcov_mode = 0;
> + pr_err("done running self test\n");
> +}
> +#endif
> +
> static int __init kcov_init(void)
> {
> int cpu;
> @@ -1076,6 +1103,10 @@ static int __init kcov_init(void)
> */
> debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
>
> +#ifdef CONFIG_KCOV_SELFTEST
> + selftest();
> +#endif
> +
> return 0;
> }
>
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 59b6765d86b8..695a437a52d9 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2171,6 +2171,14 @@ config KCOV_IRQ_AREA_SIZE
> soft interrupts. This specifies the size of those areas in the
> number of unsigned long words.
>
> +config KCOV_SELFTEST
> + bool "Perform short selftests on boot"
> + depends on KCOV
> + help
> + Run short KCOV coverage collection selftests on boot.
> + On test failure, causes the kernel to panic. Recommended to be
> + enabled, ensuring critical functionality works as intended.
> +
> menuconfig RUNTIME_TESTING_MENU
> bool "Runtime Testing"
> default y
> --
> 2.45.2.505.gda0bf45e8d-goog
>
© 2016 - 2026 Red Hat, Inc.