[PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()

Posted by Feng Jiang 3 weeks, 6 days ago
Introduce a benchmark to compare the architecture-optimized strlen()
implementation against the generic C version (__generic_strlen).

The benchmark uses a table-driven approach to evaluate performance
across different string lengths (short, medium, and long). It employs
ktime_get() for timing and get_random_bytes() followed by null-byte
filtering to generate test data that prevents early termination.

This helps in quantifying the performance gains of architecture-specific
optimizations on various platforms.

Suggested-by: Andy Shevchenko <andy@kernel.org>
Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
---
 lib/tests/string_kunit.c | 117 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/lib/tests/string_kunit.c b/lib/tests/string_kunit.c
index 8eb095404b95..2266954ae5e0 100644
--- a/lib/tests/string_kunit.c
+++ b/lib/tests/string_kunit.c
@@ -20,6 +20,77 @@
 #define STRING_TEST_MAX_LEN	128
 #define STRING_TEST_MAX_OFFSET	16
 
+#if defined(__HAVE_ARCH_STRLEN)
+#define STRING_BENCH_ENABLED
+#endif
+
+#ifdef STRING_BENCH_ENABLED
+/* Configuration for string benchmark scenarios */
+struct string_bench_case {
+	const char *name;
+	size_t len;
+	unsigned int iterations;
+};
+
+static const struct string_bench_case bench_cases[] = {
+	{"short", 8, 100000},
+	{"medium", 64, 100000},
+	{"long", 2048, 10000},
+};
+
+/**
+ * get_max_bench_len() - Get the maximum length from benchmark cases
+ * @cases: array of test cases
+ * @count: number of cases
+ */
+static size_t get_max_bench_len(const struct string_bench_case *cases, size_t count)
+{
+	size_t i, max_len = 0;
+
+	for (i = 0; i < count; i++) {
+		if (cases[i].len > max_len)
+			max_len = cases[i].len;
+	}
+
+	return max_len;
+}
+
+/**
+ * get_random_nonzero_bytes() - Fill buffer with random non-null bytes
+ * @buf: buffer to fill
+ * @len: number of bytes to fill
+ */
+static void get_random_nonzero_bytes(void *buf, size_t len)
+{
+	u8 *s = (u8 *)buf;
+
+	get_random_bytes(buf, len);
+
+	/* Replace null bytes to avoid early string termination */
+	for (size_t i = 0; i < len; i++) {
+		if (s[i] == '\0')
+			s[i] = 0x01;
+	}
+}
+
+static void string_bench_report(struct kunit *test, const char *func,
+		const struct string_bench_case *bc,
+		u64 time_arch, u64 time_generic)
+{
+	u64 ratio_int, ratio_frac;
+
+	/* Calculate speedup ratio with 2 decimal places. */
+	ratio_int = div64_u64(time_generic, time_arch);
+	ratio_frac = div64_u64((time_generic % time_arch) * 100, time_arch);
+
+	kunit_info(test, "%s performance (%s, len: %zu, iters: %u):\n",
+		func, bc->name, bc->len, bc->iterations);
+	kunit_info(test, "  arch-optimized: %llu ns\n", time_arch);
+	kunit_info(test, "  generic C:      %llu ns\n", time_generic);
+	kunit_info(test, "  speedup:        %llu.%02llux\n", ratio_int, ratio_frac);
+}
+#endif /* STRING_BENCH_ENABLED */
+
 static void string_test_memset16(struct kunit *test)
 {
 	unsigned i, j, k;
@@ -129,6 +200,49 @@ static void string_test_strlen(struct kunit *test)
 	}
 }
 
+#ifdef __HAVE_ARCH_STRLEN
+static void string_test_strlen_bench(struct kunit *test)
+{
+	char *buf;
+	size_t buf_len, iters;
+	ktime_t start, end;
+	u64 time_arch, time_generic;
+
+	buf_len = get_max_bench_len(bench_cases, ARRAY_SIZE(bench_cases)) + 1;
+
+	buf = kunit_kzalloc(test, buf_len, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
+
+	for (size_t i = 0; i < ARRAY_SIZE(bench_cases); i++) {
+		get_random_nonzero_bytes(buf, bench_cases[i].len);
+		buf[bench_cases[i].len] = '\0';
+
+		iters = bench_cases[i].iterations;
+
+		/* 1. Benchmark the architecture-optimized version */
+		start = ktime_get();
+		for (unsigned int j = 0; j < iters; j++) {
+			OPTIMIZER_HIDE_VAR(buf);
+			(void)strlen(buf);
+		}
+		end = ktime_get();
+		time_arch = ktime_to_ns(ktime_sub(end, start));
+
+		/* 2. Benchmark the generic C version */
+		start = ktime_get();
+		for (unsigned int j = 0; j < iters; j++) {
+			OPTIMIZER_HIDE_VAR(buf);
+			(void)__generic_strlen(buf);
+		}
+		end = ktime_get();
+		time_generic = ktime_to_ns(ktime_sub(end, start));
+
+		string_bench_report(test, "strlen", &bench_cases[i],
+				time_arch, time_generic);
+	}
+}
+#endif
+
 static void string_test_strnlen(struct kunit *test)
 {
 	char *s;
@@ -702,6 +816,9 @@ static struct kunit_case string_test_cases[] = {
 	KUNIT_CASE(string_test_memset32),
 	KUNIT_CASE(string_test_memset64),
 	KUNIT_CASE(string_test_strlen),
+#ifdef __HAVE_ARCH_STRLEN
+	KUNIT_CASE(string_test_strlen_bench),
+#endif
 	KUNIT_CASE(string_test_strnlen),
 	KUNIT_CASE(string_test_strchr),
 	KUNIT_CASE(string_test_strnchr),
-- 
2.25.1
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by kernel test robot 3 weeks, 1 day ago
Hi Feng,

kernel test robot noticed the following build errors:

[auto build test ERROR on kees/for-next/hardening]
[also build test ERROR on linus/master v6.19-rc5 next-20260116]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Feng-Jiang/lib-string-extract-generic-strlen-into-__generic_strlen/20260113-163741
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
patch link:    https://lore.kernel.org/r/20260113082748.250916-9-jiangfeng%40kylinos.cn
patch subject: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
config: i386-randconfig-015-20251207 (https://download.01.org/0day-ci/archive/20260118/202601181845.EiSSqJu7-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260118/202601181845.EiSSqJu7-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601181845.EiSSqJu7-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "__umoddi3" [lib/tests/string_kunit.ko] undefined!
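
The 64-bit '%' in string_bench_report() is what pulls in the __umoddi3
libgcc helper on 32-bit builds. A minimal sketch of one way around it,
assuming div64_u64_rem() from <linux/math64.h> is acceptable here:

	u64 rem;

	/* quotient and remainder in one call, no open-coded 64-bit modulo */
	ratio_int = div64_u64_rem(time_generic, time_arch, &rem);
	ratio_frac = div64_u64(rem * 100, time_arch);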

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Andy Shevchenko 3 weeks, 6 days ago
On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:
> Introduce a benchmark to compare the architecture-optimized strlen()
> implementation against the generic C version (__generic_strlen).
> 
> The benchmark uses a table-driven approach to evaluate performance
> across different string lengths (short, medium, and long). It employs
> ktime_get() for timing and get_random_bytes() followed by null-byte
> filtering to generate test data that prevents early termination.
> 
> This helps in quantifying the performance gains of architecture-specific
> optimizations on various platforms.

...

> +static void string_test_strlen_bench(struct kunit *test)
> +{
> +	char *buf;
> +	size_t buf_len, iters;
> +	ktime_t start, end;
> +	u64 time_arch, time_generic;
> +
> +	buf_len = get_max_bench_len(bench_cases, ARRAY_SIZE(bench_cases)) + 1;
> +
> +	buf = kunit_kzalloc(test, buf_len, GFP_KERNEL);
> +	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
> +
> +	for (size_t i = 0; i < ARRAY_SIZE(bench_cases); i++) {
> +		get_random_nonzero_bytes(buf, bench_cases[i].len);
> +		buf[bench_cases[i].len] = '\0';
> +
> +		iters = bench_cases[i].iterations;
> +
> +		/* 1. Benchmark the architecture-optimized version */
> +		start = ktime_get();
> +		for (unsigned int j = 0; j < iters; j++) {
> +			OPTIMIZER_HIDE_VAR(buf);
> +			(void)strlen(buf);

First Q: Are you sure the compiler doesn't replace this with __builtin_strlen() ?

> +		}
> +		end = ktime_get();
> +		time_arch = ktime_to_ns(ktime_sub(end, start));
> +
> +		/* 2. Benchmark the generic C version */
> +		start = ktime_get();
> +		for (unsigned int j = 0; j < iters; j++) {
> +			OPTIMIZER_HIDE_VAR(buf);
> +			(void)__generic_strlen(buf);
> +		}

Are you sure the warmed up caches do not affect the benchmark? I think you need
to flush / make caches dirty or so on each iteration.

> +		end = ktime_get();
> +		time_generic = ktime_to_ns(ktime_sub(end, start));
> +
> +		string_bench_report(test, "strlen", &bench_cases[i],
> +				time_arch, time_generic);
> +	}
> +}


-- 
With Best Regards,
Andy Shevchenko
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Feng Jiang 3 weeks, 5 days ago
On 2026/1/13 16:46, Andy Shevchenko wrote:
> On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:
>> Introduce a benchmark to compare the architecture-optimized strlen()
>> implementation against the generic C version (__generic_strlen).
>>
>> The benchmark uses a table-driven approach to evaluate performance
>> across different string lengths (short, medium, and long). It employs
>> ktime_get() for timing and get_random_bytes() followed by null-byte
>> filtering to generate test data that prevents early termination.
>>
>> This helps in quantifying the performance gains of architecture-specific
>> optimizations on various platforms.
> 
> ...
> 
>> +static void string_test_strlen_bench(struct kunit *test)
>> +{
>> +	char *buf;
>> +	size_t buf_len, iters;
>> +	ktime_t start, end;
>> +	u64 time_arch, time_generic;
>> +
>> +	buf_len = get_max_bench_len(bench_cases, ARRAY_SIZE(bench_cases)) + 1;
>> +
>> +	buf = kunit_kzalloc(test, buf_len, GFP_KERNEL);
>> +	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
>> +
>> +	for (size_t i = 0; i < ARRAY_SIZE(bench_cases); i++) {
>> +		get_random_nonzero_bytes(buf, bench_cases[i].len);
>> +		buf[bench_cases[i].len] = '\0';
>> +
>> +		iters = bench_cases[i].iterations;
>> +
>> +		/* 1. Benchmark the architecture-optimized version */
>> +		start = ktime_get();
>> +		for (unsigned int j = 0; j < iters; j++) {
>> +			OPTIMIZER_HIDE_VAR(buf);
>> +			(void)strlen(buf);
> 
> First Q: Are you sure the compiler doesn't replace this with __builtin_strlen() ?
> 
>> +		}
>> +		end = ktime_get();
>> +		time_arch = ktime_to_ns(ktime_sub(end, start));
>> +
>> +		/* 2. Benchmark the generic C version */
>> +		start = ktime_get();
>> +		for (unsigned int j = 0; j < iters; j++) {
>> +			OPTIMIZER_HIDE_VAR(buf);
>> +			(void)__generic_strlen(buf);
>> +		}
> 
> Are you sure the warmed up caches do not affect the benchmark? I think you need
> to flush / make caches dirty or so on each iteration.
> 
>> +		end = ktime_get();
>> +		time_generic = ktime_to_ns(ktime_sub(end, start));
>> +
>> +		string_bench_report(test, "strlen", &bench_cases[i],
>> +				time_arch, time_generic);
>> +	}
>> +}
> 
> 

Thank you for the catch. You are absolutely correct—the 2500x figure is heavily
distorted and does not reflect real-world performance.

I've found that by using a volatile function pointer to call the implementations
(instead of direct calls), the results returned to a realistic range. It appears
the previous benchmark logic allowed the compiler to over-optimize the test loop
in ways that skewed the data.
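
Roughly, the idea was along these lines (a sketch only, not the v3 code,
and the pointer name is made up):

    /* volatile forces the pointer to be reloaded, so the call cannot be
     * inlined or folded out of the loop. */
    size_t (* volatile strlen_fn)(const char *) = strlen;

    for (unsigned int j = 0; j < iters; j++)
        (void)strlen_fn(buf);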

I will refactor the benchmark logic in v3, specifically referencing the crc32
KUnit implementation (e.g., using warm-up loops and adding preempt_disable()
to eliminate context-switch interference) to ensure the data is robust and accurate.

-- 
With Best Regards,
Feng Jiang

Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Feng Jiang 3 weeks, 5 days ago
On 2026/1/14 14:14, Feng Jiang wrote:
> On 2026/1/13 16:46, Andy Shevchenko wrote:
>> On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:
>>> Introduce a benchmark to compare the architecture-optimized strlen()
>>> implementation against the generic C version (__generic_strlen).
>>>
>>> The benchmark uses a table-driven approach to evaluate performance
>>> across different string lengths (short, medium, and long). It employs
>>> ktime_get() for timing and get_random_bytes() followed by null-byte
>>> filtering to generate test data that prevents early termination.
>>>
>>> This helps in quantifying the performance gains of architecture-specific
>>> optimizations on various platforms.
>>
>> ...
>>
>>> +static void string_test_strlen_bench(struct kunit *test)
>>> +{
>>> +	char *buf;
>>> +	size_t buf_len, iters;
>>> +	ktime_t start, end;
>>> +	u64 time_arch, time_generic;
>>> +
>>> +	buf_len = get_max_bench_len(bench_cases, ARRAY_SIZE(bench_cases)) + 1;
>>> +
>>> +	buf = kunit_kzalloc(test, buf_len, GFP_KERNEL);
>>> +	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
>>> +
>>> +	for (size_t i = 0; i < ARRAY_SIZE(bench_cases); i++) {
>>> +		get_random_nonzero_bytes(buf, bench_cases[i].len);
>>> +		buf[bench_cases[i].len] = '\0';
>>> +
>>> +		iters = bench_cases[i].iterations;
>>> +
>>> +		/* 1. Benchmark the architecture-optimized version */
>>> +		start = ktime_get();
>>> +		for (unsigned int j = 0; j < iters; j++) {
>>> +			OPTIMIZER_HIDE_VAR(buf);
>>> +			(void)strlen(buf);
>>
>> First Q: Are you sure the compiler doesn't replace this with __builtin_strlen() ?
>>
>>> +		}
>>> +		end = ktime_get();
>>> +		time_arch = ktime_to_ns(ktime_sub(end, start));
>>> +
>>> +		/* 2. Benchmark the generic C version */
>>> +		start = ktime_get();
>>> +		for (unsigned int j = 0; j < iters; j++) {
>>> +			OPTIMIZER_HIDE_VAR(buf);
>>> +			(void)__generic_strlen(buf);
>>> +		}
>>
>> Are you sure the warmed up caches do not affect the benchmark? I think you need
>> to flush / make caches dirty or so on each iteration.
>>
>>> +		end = ktime_get();
>>> +		time_generic = ktime_to_ns(ktime_sub(end, start));
>>> +
>>> +		string_bench_report(test, "strlen", &bench_cases[i],
>>> +				time_arch, time_generic);
>>> +	}
>>> +}
>>
>>
> 
> Thank you for the catch. You are absolutely correct—the 2500x figure is heavily
> distorted and does not reflect real-world performance.
> 
> I've found that by using a volatile function pointer to call the implementations
> (instead of direct calls), the results returned to a realistic range. It appears
> the previous benchmark logic allowed the compiler to over-optimize the test loop
> in ways that skewed the data.
> 
> I will refactor the benchmark logic in v3, specifically referencing the crc32
> KUnit implementation (e.g., using warm-up loops and adding preempt_disable()
> to eliminate context-switch interference) to ensure the data is robust and accurate.
> 

Just a quick follow-up: I've also verified that using a volatile variable to store
the return value (as seen in crc_benchmark()) is equally effective at preventing
the optimization.

The core change is as follows:

    volatile size_t len;
    ...
    for (unsigned int j = 0; j < iters; j++) {
        OPTIMIZER_HIDE_VAR(buf);
        len = strlen(buf);
    }

Preliminary results with this change look much more reasonable:

    ok 4 string_test_strlen
    # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
    # string_test_strlen_bench:   arch-optimized: 4767500 ns
    # string_test_strlen_bench:   generic C:      5815800 ns
    # string_test_strlen_bench:   speedup:        1.21x
    # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
    # string_test_strlen_bench:   arch-optimized: 6573600 ns
    # string_test_strlen_bench:   generic C:      16342500 ns
    # string_test_strlen_bench:   speedup:        2.48x
    # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
    # string_test_strlen_bench:   arch-optimized: 7931000 ns
    # string_test_strlen_bench:   generic C:      35347300 ns
    # string_test_strlen_bench:   speedup:        4.45x
    ok 5 string_test_strlen_bench

I will adopt this pattern in v3, along with cache warm-up and preempt_disable(),
to stay consistent with existing kernel benchmarks and ensure robust measurements.
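
For reference, the timed section would then look roughly like this (a
sketch following the crc_benchmark() shape, not the final v3 code):

    volatile size_t len;

    /* Warm up the caches and branch predictor before timing. */
    for (unsigned int j = 0; j < 16; j++)
        len = strlen(buf);

    preempt_disable();
    start = ktime_get();
    for (unsigned int j = 0; j < iters; j++)
        len = strlen(buf);
    end = ktime_get();
    preempt_enable();

    time_arch = ktime_to_ns(ktime_sub(end, start));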

-- 
With Best Regards,
Feng Jiang

Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by David Laight 3 weeks, 5 days ago
On Wed, 14 Jan 2026 15:04:58 +0800
Feng Jiang <jiangfeng@kylinos.cn> wrote:

> On 2026/1/14 14:14, Feng Jiang wrote:
> > On 2026/1/13 16:46, Andy Shevchenko wrote:  
> >> On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:  
> >>> Introduce a benchmark to compare the architecture-optimized strlen()
> >>> implementation against the generic C version (__generic_strlen).
> >>>
> >>> The benchmark uses a table-driven approach to evaluate performance
> >>> across different string lengths (short, medium, and long). It employs
> >>> ktime_get() for timing and get_random_bytes() followed by null-byte
> >>> filtering to generate test data that prevents early termination.
> >>>
> >>> This helps in quantifying the performance gains of architecture-specific
> >>> optimizations on various platforms.  
...
> Preliminary results with this change look much more reasonable:
> 
>     ok 4 string_test_strlen
>     # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
>     # string_test_strlen_bench:   arch-optimized: 4767500 ns
>     # string_test_strlen_bench:   generic C:      5815800 ns
>     # string_test_strlen_bench:   speedup:        1.21x
>     # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
>     # string_test_strlen_bench:   arch-optimized: 6573600 ns
>     # string_test_strlen_bench:   generic C:      16342500 ns
>     # string_test_strlen_bench:   speedup:        2.48x
>     # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
>     # string_test_strlen_bench:   arch-optimized: 7931000 ns
>     # string_test_strlen_bench:   generic C:      35347300 ns

That is far too long.
In 35ms you are including a lot of timer interrupts.
You are also just testing the 'hot cache' case.
The kernel runs 'cold cache' a lot of the time - especially for instructions.

To time short loops (or even single passes) you need a data dependency
between the 'start time' and the code being tested (easy enough, just add
(time & non_compile_time_zero) to a parameter), and between the result of
the code and the 'end time' - somewhat harder (doable in x86 if you use
the pmc cycle counter).

	David


>     # string_test_strlen_bench:   speedup:        4.45x
>     ok 5 string_test_strlen_bench
> 
> I will adopt this pattern in v3, along with cache warm-up and preempt_disable(),
> to stay consistent with existing kernel benchmarks and ensure robust measurements.
>
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Feng Jiang 3 weeks, 4 days ago
On 2026/1/14 18:21, David Laight wrote:
> On Wed, 14 Jan 2026 15:04:58 +0800
> Feng Jiang <jiangfeng@kylinos.cn> wrote:
> 
>> On 2026/1/14 14:14, Feng Jiang wrote:
>>> On 2026/1/13 16:46, Andy Shevchenko wrote:  
>>>> On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:  
>>>>> Introduce a benchmark to compare the architecture-optimized strlen()
>>>>> implementation against the generic C version (__generic_strlen).
>>>>>
>>>>> The benchmark uses a table-driven approach to evaluate performance
>>>>> across different string lengths (short, medium, and long). It employs
>>>>> ktime_get() for timing and get_random_bytes() followed by null-byte
>>>>> filtering to generate test data that prevents early termination.
>>>>>
>>>>> This helps in quantifying the performance gains of architecture-specific
>>>>> optimizations on various platforms.  
> ...
>> Preliminary results with this change look much more reasonable:
>>
>>     ok 4 string_test_strlen
>>     # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
>>     # string_test_strlen_bench:   arch-optimized: 4767500 ns
>>     # string_test_strlen_bench:   generic C:      5815800 ns
>>     # string_test_strlen_bench:   speedup:        1.21x
>>     # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
>>     # string_test_strlen_bench:   arch-optimized: 6573600 ns
>>     # string_test_strlen_bench:   generic C:      16342500 ns
>>     # string_test_strlen_bench:   speedup:        2.48x
>>     # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
>>     # string_test_strlen_bench:   arch-optimized: 7931000 ns
>>     # string_test_strlen_bench:   generic C:      35347300 ns
> 
> That is far too long.
> In 35ms you are including a lot of timer interrupts.
> You are also just testing the 'hot cache' case.
> The kernel runs 'cold cache' a lot of the time - especially for instructions.
> 
> To time short loops (or even single passes) you need a data dependency
> between the 'start time' and the code being tested (easy enough, just add
> (time & non_compile_time_zero) to a parameter), and between the result of
> the code and the 'end time' - somewhat harder (doable in x86 if you use
> the pmc cycle counter).

Hi David,

I appreciate the feedback! You're absolutely right that 35ms is quite long; it
was measured in a TCG environment, and on real hardware (ARM64 KVM), it's
actually an order of magnitude faster. I'll definitely tighten the iterations
in v3 to avoid potential noise.

For the more advanced suggestions like cold cache and data dependency, I can
see how they would make the benchmark much more rigorous. My plan is to follow
the pattern in crc_benchmark() to refine the logic, as I feel this approach is
simple, easy to maintain, and provides a good enough baseline for our needs.

While I understand that simulating a cold cache would be more precise, I'm
concerned it might introduce significant complexity at this stage. I hope the
current focus on hot-path throughput is a reasonable starting point for a
general KUnit test.

-- 
With Best Regards,
Feng Jiang
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by David Laight 3 weeks, 4 days ago
On Thu, 15 Jan 2026 14:24:16 +0800
Feng Jiang <jiangfeng@kylinos.cn> wrote:

> On 2026/1/14 18:21, David Laight wrote:
> > On Wed, 14 Jan 2026 15:04:58 +0800
> > Feng Jiang <jiangfeng@kylinos.cn> wrote:
> >   
> >> On 2026/1/14 14:14, Feng Jiang wrote:  
> >>> On 2026/1/13 16:46, Andy Shevchenko wrote:    
> >>>> On Tue, Jan 13, 2026 at 04:27:42PM +0800, Feng Jiang wrote:    
> >>>>> Introduce a benchmark to compare the architecture-optimized strlen()
> >>>>> implementation against the generic C version (__generic_strlen).
> >>>>>
> >>>>> The benchmark uses a table-driven approach to evaluate performance
> >>>>> across different string lengths (short, medium, and long). It employs
> >>>>> ktime_get() for timing and get_random_bytes() followed by null-byte
> >>>>> filtering to generate test data that prevents early termination.
> >>>>>
> >>>>> This helps in quantifying the performance gains of architecture-specific
> >>>>> optimizations on various platforms.    
> > ...  
> >> Preliminary results with this change look much more reasonable:
> >>
> >>     ok 4 string_test_strlen
> >>     # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
> >>     # string_test_strlen_bench:   arch-optimized: 4767500 ns
> >>     # string_test_strlen_bench:   generic C:      5815800 ns
> >>     # string_test_strlen_bench:   speedup:        1.21x
> >>     # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
> >>     # string_test_strlen_bench:   arch-optimized: 6573600 ns
> >>     # string_test_strlen_bench:   generic C:      16342500 ns
> >>     # string_test_strlen_bench:   speedup:        2.48x
> >>     # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
> >>     # string_test_strlen_bench:   arch-optimized: 7931000 ns
> >>     # string_test_strlen_bench:   generic C:      35347300 ns  
> >>     # string_test_strlen_bench:   speedup:        4.45x
> > 
> > That is far too long.
> > In 35ms you are including a lot of timer interrupts.
> > You are also just testing the 'hot cache' case.
> > The kernel runs 'cold cache' a lot of the time - especially for instructions.
> > 
> > To time short loops (or even single passes) you need a data dependency
> > between the 'start time' and the code being tested (easy enough, just add
> > (time & non_compile_time_zero) to a parameter), and between the result of
> > the code and the 'end time' - somewhat harder (doable in x86 if you use
> > the pmc cycle counter).  
> 
> Hi David,
> 
> I appreciate the feedback! You're absolutely right that 35ms is quite long; it
> was measured in a TCG environment, and on real hardware (ARM64 KVM), it's
> actually an order of magnitude faster. I'll definitely tighten the iterations
> in v3 to avoid potential noise.

Doing time-based measurements on anything but real hardware is pointless.
(It is even problematic on some real hardware because the cpu clock speed
changes dynamically - which is why I've started using the x86 pmc to count
actual clock cycles.)

You only really need enough iterations to get enough 'ticks' from the
timer for the answer to make sense.
Other effects mean you can't really quote values to even 1% - so 100 ticks
from the timer is more than enough.
I'm not sure what the resolution of ktime_get_ns() is (it will be hardware
dependent).
You are better off running the test a few times and using the best value.

Also you don't need to do a very long test to show a x4 improvement!

To see how good an algorithm really is you need to work out the 'fixed cost'
(in clocks) and the 'cost per byte' (in clocks per byte, or 'bytes per clock'),
although both can be 'noisy' for short lengths.
The latter tells you how near to 'optimal' the algorithm is and lets you
compare results between different cpus (eg Zen-5 v i7-12xxx).
For instance the x86-64 IP checksum code (nominally 16bit add with carry)
actually runs at more than 8 bytes/clock on most cpus (IIRC it manages 12
but not 16).
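
Purely to illustrate the arithmetic with the numbers quoted above (they
are from TCG, so only the method matters, not the values): the long case
touches 2048 * 10000 = 20480000 bytes, so the arch version is doing about
20480000 / 7931000 ~ 2.6 bytes/ns and the generic one about 0.58 bytes/ns;
divide by the core clock in GHz and you get bytes/clock.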

> 
> For the more advanced suggestions like cold cache and data dependency, I can
> see how they would make the benchmark much more rigorous. My plan is to follow
> the pattern in crc_benchmark() to refine the logic, as I feel this approach is
> simple, easy to maintain, and provides a good enough baseline for our needs.
> 
> While I understand that simulating a cold cache would be more precise, I'm
> concerned it might introduce significant complexity at this stage. I hope the
> current focus on hot-path throughput is a reasonable starting point for a
> general KUnit test.
> 

I've only done 'cold cache' testing in userspace - counting the actual
clocks for each call (the first value is cold cache).

Gives a massive difference for large functions like blake2s where the
unrolled loop is somewhat faster, but for the cold-cache it is only
worth it for buffers over (about) 8k (and that might be worse if the
cpu is running at full speed which makes the memory effectively slower).

The other issue with running a test multiple times is that the branch
predictor will correctly predict all the branches.
So something like memcpy() which might have different code for different
lengths will always pick the correct one.
Branch mis-prediction seems to cost about 20 clocks on my zen-5.

Anyway, some measurements are better than none.

	David
Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Andy Shevchenko 3 weeks, 5 days ago
On Wed, Jan 14, 2026 at 03:04:58PM +0800, Feng Jiang wrote:
> On 2026/1/14 14:14, Feng Jiang wrote:
> > On 2026/1/13 16:46, Andy Shevchenko wrote:

...

> > Thank you for the catch. You are absolutely correct—the 2500x figure is heavily
> > distorted and does not reflect real-world performance.
> > 
> > I've found that by using a volatile function pointer to call the implementations
> > (instead of direct calls), the results returned to a realistic range. It appears
> > the previous benchmark logic allowed the compiler to over-optimize the test loop
> > in ways that skewed the data.
> > 
> > I will refactor the benchmark logic in v3, specifically referencing the crc32
> > KUnit implementation (e.g., using warm-up loops and adding preempt_disable()
> > to eliminate context-switch interference) to ensure the data is robust and accurate.
> > 
> 
> Just a quick follow-up: I've also verified that using a volatile variable to store
> the return value (as seen in crc_benchmark()) is equally effective at preventing
> the optimization.
> 
> The core change is as follows:
> 
>     volatile size_t len;
>     ...
>     for (unsigned int j = 0; j < iters; j++) {
>         OPTIMIZER_HIDE_VAR(buf);
>         len = strlen(buf);

But please, check for sure this is Linux kernel generic implementation (before)
and not __builtin_strlen() from GCC. (OTOH, it would be nice to benchmark that
one as well, although I think that __builtin_strlen() in general may be a slightly
better choice than the Linux kernel generic implementation.) I.o.w. be sure *what*
you test.

>     }

Or using WRITE_ONCE() :-) But that one will probably be confusing as it usually
should be paired with READ_ONCE() somewhere else in the code. So, I agree on
crc_benchmark() approach taken.

> Preliminary results with this change look much more reasonable:
> 
>     ok 4 string_test_strlen
>     # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
>     # string_test_strlen_bench:   arch-optimized: 4767500 ns
>     # string_test_strlen_bench:   generic C:      5815800 ns
>     # string_test_strlen_bench:   speedup:        1.21x
>     # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
>     # string_test_strlen_bench:   arch-optimized: 6573600 ns
>     # string_test_strlen_bench:   generic C:      16342500 ns
>     # string_test_strlen_bench:   speedup:        2.48x
>     # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
>     # string_test_strlen_bench:   arch-optimized: 7931000 ns
>     # string_test_strlen_bench:   generic C:      35347300 ns
>     # string_test_strlen_bench:   speedup:        4.45x
>     ok 5 string_test_strlen_bench
> 
> I will adopt this pattern in v3, along with cache warm-up and preempt_disable(),
> to stay consistent with existing kernel benchmarks and ensure robust measurements.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v2 08/14] lib/string_kunit: add performance benchmark for strlen()
Posted by Feng Jiang 3 weeks, 5 days ago
On 2026/1/14 15:21, Andy Shevchenko wrote:
> On Wed, Jan 14, 2026 at 03:04:58PM +0800, Feng Jiang wrote:
>> On 2026/1/14 14:14, Feng Jiang wrote:
>>> On 2026/1/13 16:46, Andy Shevchenko wrote:
> 
> ...
> 
>>> Thank you for the catch. You are absolutely correct—the 2500x figure is heavily
>>> distorted and does not reflect real-world performance.
>>>
>>> I've found that by using a volatile function pointer to call the implementations
>>> (instead of direct calls), the results returned to a realistic range. It appears
>>> the previous benchmark logic allowed the compiler to over-optimize the test loop
>>> in ways that skewed the data.
>>>
>>> I will refactor the benchmark logic in v3, specifically referencing the crc32
>>> KUnit implementation (e.g., using warm-up loops and adding preempt_disable()
>>> to eliminate context-switch interference) to ensure the data is robust and accurate.
>>>
>>
>> Just a quick follow-up: I've also verified that using a volatile variable to store
>> the return value (as seen in crc_benchmark()) is equally effective at preventing
>> the optimization.
>>
>> The core change is as follows:
>>
>>     volatile size_t len;
>>     ...
>>     for (unsigned int j = 0; j < iters; j++) {
>>         OPTIMIZER_HIDE_VAR(buf);
>>         len = strlen(buf);
> 
> But please, check for sure this is Linux kernel generic implementation (before)
> and not __builtin_strlen() from GCC. (OTOH, it would be nice to benchmark that
> one as well, although I think that __builtin_strlen() in general may be a slightly
> better choice than the Linux kernel generic implementation.) I.o.w. be sure *what*
> you test.
> 

Thanks for the reminder. I actually verified this with objdump and gdb before
submitting the patch—the calls are indeed hitting the intended arch-specific
strlen symbols, not the compiler's __builtin_strlen(). I missed mentioning this
detail in my previous email.
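
For the record, the check was essentially of this form (the cross-toolchain
prefix and object path are just examples):

    $ riscv64-linux-gnu-objdump -dr lib/tests/string_kunit.o | grep 'R_.*strlen'

which prints the relocation entries and shows whether the call sites
reference the arch strlen or __generic_strlen.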

I also just performed an additional test by explicitly calling the exported
arch-specific __pi_strlen() symbol, and the results remained consistent.

Results with riscv __pi_strlen():

    ok 4 string_test_strlen
    # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
    # string_test_strlen_bench:   arch-optimized: 4650500 ns
    # string_test_strlen_bench:   generic C:      5776000 ns
    # string_test_strlen_bench:   speedup:        1.24x
    # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
    # string_test_strlen_bench:   arch-optimized: 6895000 ns
    # string_test_strlen_bench:   generic C:      16343400 ns
    # string_test_strlen_bench:   speedup:        2.37x
    # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
    # string_test_strlen_bench:   arch-optimized: 8052800 ns
    # string_test_strlen_bench:   generic C:      35290700 ns
    # string_test_strlen_bench:   speedup:        4.38x
    ok 5 string_test_strlen_bench

>>     }
> 
> Or using WRITE_ONCE() :-) But that one will probably be confusing as it usually
> should be paired with READ_ONCE() somewhere else in the code. So, I agree on
> crc_benchmark() approach taken.
> 

Thanks for the guidance. I'll stick with the crc_benchmark() pattern to avoid any
potential confusion regarding concurrency that WRITE_ONCE() might imply.

I'm still learning the most idiomatic practices in the kernel, so I appreciate the tip.

>> Preliminary results with this change look much more reasonable:
>>
>>     ok 4 string_test_strlen
>>     # string_test_strlen_bench: strlen performance (short, len: 8, iters: 100000):
>>     # string_test_strlen_bench:   arch-optimized: 4767500 ns
>>     # string_test_strlen_bench:   generic C:      5815800 ns
>>     # string_test_strlen_bench:   speedup:        1.21x
>>     # string_test_strlen_bench: strlen performance (medium, len: 64, iters: 100000):
>>     # string_test_strlen_bench:   arch-optimized: 6573600 ns
>>     # string_test_strlen_bench:   generic C:      16342500 ns
>>     # string_test_strlen_bench:   speedup:        2.48x
>>     # string_test_strlen_bench: strlen performance (long, len: 2048, iters: 10000):
>>     # string_test_strlen_bench:   arch-optimized: 7931000 ns
>>     # string_test_strlen_bench:   generic C:      35347300 ns
>>     # string_test_strlen_bench:   speedup:        4.45x
>>     ok 5 string_test_strlen_bench
>>
>> I will adopt this pattern in v3, along with cache warm-up and preempt_disable(),
>> to stay consistent with existing kernel benchmarks and ensure robust measurements.
> 

-- 
With Best Regards,
Feng Jiang