Add optional benchmarks for memcpy() and memmove() functions.
Each benchmark is run twice: first with buffers aligned and then with
buffers unaligned, to spot unaligned accesses on platforms where they
have a noticeable performance impact.
Sample output:
# modprobe memcpy_kunit
KTAP version 1
1..1
KTAP version 1
# Subtest: memcpy
# module: memcpy_kunit
1..8
[...]
# memcpy_bench_test: memcpy: aligned copy of 400 MBytes in 22 msecs (18027 MB/s)
# memcpy_bench_test: memcpy: unaligned copy of 400 MBytes in 23 msecs (17360 MB/s)
# memcpy_bench_test.speed: slow
ok 7 memcpy_bench_test
# memmove_bench_test: memmove: aligned move of 399 MBytes in 17 msecs (23012 MB/s)
# memmove_bench_test: memmove: unaligned move of 399 MBytes in 17 msecs (22381 MB/s)
# memmove_bench_test.speed: slow
ok 8 memmove_bench_test
# memcpy: pass:8 fail:0 skip:0 total:8
# Totals: pass:8 fail:0 skip:0 total:8
ok 1 memcpy
Signed-off-by: Matteo Croce <teknoraver@meta.com>
---
lib/Kconfig.debug | 9 ++++
lib/tests/memcpy_kunit.c | 106 +++++++++++++++++++++++++++++++++++++++
2 files changed, 115 insertions(+)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939fda79..02868c4397cb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2880,6 +2880,15 @@ config MEMCPY_KUNIT_TEST
If unsure, say N.
+config MEMCPY_KUNIT_BENCHMARK
+ bool "Benchmark string functions"
+ depends on MEMCPY_KUNIT_TEST
+ help
+ A benchmark for memcpy() and memmove() functions,
+ with both aligned and unaligned buffers.
+
+ If unsure, say N.
+
config IS_SIGNED_TYPE_KUNIT_TEST
tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/lib/tests/memcpy_kunit.c b/lib/tests/memcpy_kunit.c
index d36933554e46..100cda8d4f34 100644
--- a/lib/tests/memcpy_kunit.c
+++ b/lib/tests/memcpy_kunit.c
@@ -493,6 +493,108 @@ static void memmove_overlap_test(struct kunit *test)
}
}
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+
+#define COPY_SIZE (4 * 1024 * 1024)
+#define COPIES_NUM 100
+
+static int memcpy_bench_align(struct kunit *test, bool unalign)
+{
+ u64 start, end, total_ns = 0;
+ char *buf1;
+ char *buf2;
+ int ret = 0;
+
+ buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf1)
+ return -ENOMEM;
+
+ buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf2) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (int i = 0; i < COPIES_NUM; i++) {
+ preempt_disable();
+ start = ktime_get_ns();
+ memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
+ end = ktime_get_ns();
+ preempt_enable();
+ total_ns += end - start;
+ }
+
+ /* Avoid division by zero */
+ if (!total_ns)
+ total_ns = 1;
+
+ kunit_info(test, "memcpy: %saligned copy of %lu MBytes in %lld msecs (%lld MB/s)\n",
+ unalign ? "un" : "",
+ (unsigned long)(COPIES_NUM * COPY_SIZE) / (1024 * 1024),
+ total_ns / 1000000,
+ (COPIES_NUM * COPY_SIZE * 1000000000ULL / total_ns) / (1024 * 1024));
+
+ kfree(buf2);
+
+out_free:
+ kfree(buf1);
+
+ return ret;
+}
+
+static void memcpy_bench_test(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, false), 0,
+ "aligned memcpy benchmark failed");
+ KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, true), 0,
+ "unaligned memcpy benchmark failed");
+}
+
+#define POS_SHIFT (2 * PAGE_SIZE)
+
+static int memmove_bench_align(struct kunit *test, bool unalign)
+{
+ u64 start, end, total_ns = 0;
+ char *buf;
+ int ret = 0;
+
+ buf = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for (int i = 0; i < COPIES_NUM; i++) {
+ preempt_disable();
+ start = ktime_get_ns();
+ memmove(buf + POS_SHIFT + unalign, buf, COPY_SIZE - POS_SHIFT - unalign);
+ end = ktime_get_ns();
+ preempt_enable();
+ total_ns += end - start;
+ }
+
+ if (!total_ns)
+ total_ns = 1;
+
+ kunit_info(test, "memmove: %saligned move of %lu MBytes in %lld msecs (%lld MB/s)\n",
+ unalign ? "un" : "",
+ (unsigned long)(COPIES_NUM * (COPY_SIZE - POS_SHIFT)) / (1024 * 1024),
+ total_ns / 1000000,
+ (COPIES_NUM * (COPY_SIZE - POS_SHIFT) * 1000000000ULL / total_ns) /
+ (1024 * 1024));
+
+ kfree(buf);
+
+ return ret;
+}
+
+static void memmove_bench_test(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, false), 0,
+ "aligned memmove benchmark failed");
+ KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, true), 0,
+ "unaligned memmove benchmark failed");
+}
+#endif
+
static struct kunit_case memcpy_test_cases[] = {
KUNIT_CASE(memset_test),
KUNIT_CASE(memcpy_test),
@@ -500,6 +602,10 @@ static struct kunit_case memcpy_test_cases[] = {
KUNIT_CASE_SLOW(memmove_test),
KUNIT_CASE_SLOW(memmove_large_test),
KUNIT_CASE_SLOW(memmove_overlap_test),
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+ KUNIT_CASE_SLOW(memcpy_bench_test),
+ KUNIT_CASE_SLOW(memmove_bench_test),
+#endif
{}
};
--
2.52.0
On Fri, 30 Jan 2026 00:45:39 +0100
Matteo Croce <technoboy85@gmail.com> wrote:
> Add optional benchmarks for memcpy() and memmove() functions.
> Each benchmark is run twice: first with buffers aligned and then with
> buffers unaligned, to spot unaligned accesses on platforms where they
> have a noticeable performance impact.
...
> +#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
> +
> +#define COPY_SIZE (4 * 1024 * 1024)
That is far too big.
You are timing data-cache loads from memory, not memcpy().
To avoid cache misses you probably want to keep the size below 1k.
It is also worth timing short and very short transfers (maybe 2 and 16
bytes) because the fixed overhead can matter more than the transfer
speed.
The difference between 256 and 1024 bytes is enough to (reasonably)
infer the 'cost per byte' for long buffers.
I think I'd time a simple byte copy loop for comparison purposes.
(You might need a barrier() in the loop to stop gcc changing it.)
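Something along these lines (untested sketch, just to illustrate the idea; the
helper name is made up):

/*
 * Plain byte-copy reference to compare against memcpy().
 * The barrier() stops gcc from vectorizing the loop or
 * replacing it with a call to memcpy() itself.
 */
static void byte_copy(char *dst, const char *src, size_t len)
{
        for (size_t i = 0; i < len; i++) {
                dst[i] = src[i];
                barrier();
        }
}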
Alignment wise, on some Intel x86 systems the only thing that makes a big
difference to 'rep movsb' is 32-byte aligning the destination buffer.
I don't remember what I got on the zen-5.
'rep movsb' on my zen-5 has a couple of oddities.
- There is a small penalty if the destination starts in the last cache
  line of a page.
- If (dest - src) % 4096 is between 1 and 63, everything is very
  much slower.
You might want to explicitly include something for the latter.
(I found it while getting strange timings for misaligned copies.)
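Something like this (hypothetical, names made up) would pin the offset
explicitly; the timing loop would be the same one memcpy_bench_align()
already uses:

static void memcpy_page_offset_bench(struct kunit *test)
{
        char *buf = kunit_kzalloc(test, 2 * PAGE_SIZE + 1024, GFP_KERNEL);
        char *src, *dst;

        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);

        src = buf;
        dst = buf + PAGE_SIZE + 32;     /* (dst - src) % 4096 == 32 */

        /* time this copy exactly like memcpy_bench_align() does */
        memcpy(dst, src, 512);
}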
Otherwise I got:
length    clocks
0         7
1..3f     5
40        4
41..7f    5
80..1ff   39 (except 16c, which is 4 clocks faster!)
200       38
201..23f  40
240       38
241..27f  41
280       39
The pattern then continues much the same, increasing by 1 clock every 64 bytes,
with exact multiples of 64 being a bit cheaper.
Those timings subtract off a 'test overhead' that may include some of the setup
time for 'rep movsb'.
(I need to do them again using data dependencies instead of lfence.)
David
On Fri, 30 Jan 2026 at 10:02, David Laight
<david.laight.linux@gmail.com> wrote:
I'm currently working on a RISC-V machine which doesn't support
unaligned access.
The RISC-V memcpy falls back to a byte copy when the buffers are
unaligned, so I'm trying to fix that.
This is what I'm using this benchmark for: measuring the improvement
over the current implementation.
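(For context, the usual trick on strict-alignment machines looks roughly like
the sketch below. This is illustrative only, not the actual patch: it assumes
src is misaligned, i.e. shift != 0, is little-endian only, leaves the head/tail
bytes and the aligned case to separate paths, and reads whole aligned source
words, so the tail word must remain readable.)

static void word_copy_from_misaligned(unsigned long *dst,
                                      const unsigned char *src, size_t words)
{
        const unsigned long *s = (const unsigned long *)
                ((unsigned long)src & ~(sizeof(long) - 1));
        unsigned int shift = ((unsigned long)src & (sizeof(long) - 1)) * 8;
        unsigned long cur = *s++;

        while (words--) {
                unsigned long next = *s++;

                /* merge two aligned source words into one aligned store */
                *dst++ = (cur >> shift) | (next << (BITS_PER_LONG - shift));
                cur = next;
        }
}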
These are the numbers with the stock memcpy and 4 MB buffer:
memcpy: aligned copy of 400 MBytes in 429 msecs (931 MB/s)
memcpy: unaligned copy of 400 MBytes in 1202 msecs (332 MB/s)
These are the numbers with the stock memcpy and 1 KB buffer:
memcpy: aligned copy of 100 KBytes in 39 usecs (2500 MB/s)
memcpy: unaligned copy of 100 KBytes in 125 usecs (793 MB/s)
These are the numbers with the improved memcpy and 4 MB buffer:
memcpy: aligned copy of 400 MBytes in 428 msecs (933 MB/s)
memcpy: unaligned copy of 400 MBytes in 519 msecs (770 MB/s)
These are the numbers with the improved memcpy and 1 KB buffer:
memcpy: aligned copy of 100 KBytes in 44 usecs (2222 MB/s)
memcpy: unaligned copy of 100 KBytes in 55 usecs (1786 MB/s)
If the results depended purely on load times from memory there
wouldn't be such a big difference, yet the improved version is ~2.3x
faster.
Also, when timing a big transfer I always get consistent numbers, while
with small ones they float a lot.
These are a series of runs with 4 MB size:
memcpy: aligned copy of 100 KBytes in 39 usecs (2500 MB/s)
memcpy: unaligned copy of 100 KBytes in 125 usecs (793 MB/s)
memcpy: aligned copy of 100 KBytes in 41 usecs (2381 MB/s)
memcpy: unaligned copy of 100 KBytes in 129 usecs (769 MB/s)
memcpy: aligned copy of 100 KBytes in 39 usecs (2500 MB/s)
memcpy: unaligned copy of 100 KBytes in 124 usecs (800 MB/s)
memcpy: aligned copy of 100 KBytes in 39 usecs (2500 MB/s)
memcpy: unaligned copy of 100 KBytes in 128 usecs (775 MB/s)
And these are some 1 KB runs:
memcpy: aligned copy of 100 KBytes in 49 usecs (2040 MB/s)
memcpy: unaligned copy of 100 KBytes in 61 usecs (1639 MB/s)
memcpy: aligned copy of 100 KBytes in 44 usecs (2222 MB/s)
memcpy: unaligned copy of 100 KBytes in 55 usecs (1786 MB/s)
memcpy: aligned copy of 100 KBytes in 41 usecs (2381 MB/s)
memcpy: unaligned copy of 100 KBytes in 53 usecs (1852 MB/s)
memcpy: aligned copy of 100 KBytes in 38 usecs (2564 MB/s)
memcpy: unaligned copy of 100 KBytes in 55 usecs (1786 MB/s)
So, what I could do is extend the test *also* to smaller sizes, like
2 bytes or so.
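Something along these lines (hypothetical, not part of this patch) would do
it, with a table of sizes so the fixed per-call overhead at 2 or 16 bytes
shows up next to the 256/1024 byte numbers:

static void memcpy_small_sizes_bench(struct kunit *test)
{
        static const size_t sizes[] = { 2, 16, 256, 1024 };
        char *src, *dst;

        src = kunit_kzalloc(test, 1024, GFP_KERNEL);
        dst = kunit_kzalloc(test, 1024, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, src);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dst);

        for (int i = 0; i < ARRAY_SIZE(sizes); i++) {
                u64 start, end;

                preempt_disable();
                start = ktime_get_ns();
                for (int j = 0; j < 100000; j++) {
                        memcpy(dst, src, sizes[i]);
                        barrier();      /* keep the calls from being elided */
                }
                end = ktime_get_ns();
                preempt_enable();

                kunit_info(test, "memcpy: %zu bytes x 100000 in %llu ns\n",
                           sizes[i], end - start);
        }
}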
Regards,
--
Matteo Croce
perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay