[PATCH] KUnit: memcpy: add benchmark
Posted by Matteo Croce 1 week, 2 days ago
There is a newer version of this series.
Add optional benchmarks for memcpy() and memmove() functions.
Each benchmark is run twice: first with buffers aligned and then with
buffers unaligned, to spot unaligned accesses on platforms where they
have a noticeable performance impact.
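
The benchmark is built by enabling CONFIG_MEMCPY_KUNIT_BENCHMARK and run
with the usual KUnit tooling, e.g. (one possible invocation via the
kunit.py wrapper):

	./tools/testing/kunit/kunit.py run \
		--kconfig_add CONFIG_MEMCPY_KUNIT_TEST=y \
		--kconfig_add CONFIG_MEMCPY_KUNIT_BENCHMARK=y \
		'*memcpy*'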

Signed-off-by: Matteo Croce <teknoraver@meta.com>
---
 lib/Kconfig.debug        |   9 ++++
 lib/tests/memcpy_kunit.c | 108 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939fda79..02868c4397cb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2880,6 +2880,15 @@ config MEMCPY_KUNIT_TEST
 
 	  If unsure, say N.
 
+config MEMCPY_KUNIT_BENCHMARK
+	bool "Benchmark string functions"
+	depends on MEMCPY_KUNIT_TEST
+	help
+	  A benchmark for memcpy() and memmove() functions,
+	  with both aligned and unaligned buffers.
+
+	  If unsure, say N.
+
 config IS_SIGNED_TYPE_KUNIT_TEST
 	tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/lib/tests/memcpy_kunit.c b/lib/tests/memcpy_kunit.c
index d36933554e46..33080dddc58e 100644
--- a/lib/tests/memcpy_kunit.c
+++ b/lib/tests/memcpy_kunit.c
@@ -493,6 +493,110 @@ static void memmove_overlap_test(struct kunit *test)
 	}
 }
 
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+
+#define COPY_SIZE	(4 * 1024 * 1024)
+#define COPIES_NUM	100
+
+static int memcpy_bench_align(struct kunit *test, bool unalign)
+{
+	u64 start, end, total_ns = 0;
+	char *buf1;
+	char *buf2;
+	int ret = 0;
+
+	buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
+	if (!buf1)
+		return -ENOMEM;
+
+	buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
+	if (!buf2) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	preempt_disable();
+	for (int i = 0; i < COPIES_NUM; i++) {
+		start = ktime_get_ns();
+		memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
+		end = ktime_get_ns();
+		total_ns += end - start;
+		cond_resched();
+	}
+	preempt_enable();
+
+	/* Avoid division by zero */
+	if (!total_ns)
+		total_ns = 1;
+
+	kunit_info(test, "memcpy: %saligned copy of %lu MBytes in %lld msecs (%lld MB/s)\n",
+		   unalign ? "un" : "",
+		   (unsigned long)(COPIES_NUM * COPY_SIZE) / (1024 * 1024),
+		   total_ns / 1000000,
+		   (COPIES_NUM * COPY_SIZE * 1000000000ULL / total_ns) / (1024 * 1024));
+
+	kfree(buf2);
+
+out_free:
+	kfree(buf1);
+
+	return ret;
+}
+
+static void memcpy_bench_test(struct kunit *test)
+{
+	KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, false), 0,
+			   "aligned memcpy benchmark failed");
+	KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, true), 0,
+			   "unaligned memcpy benchmark failed");
+}
+
+#define POS_SHIFT (2 * PAGE_SIZE)
+
+static int memmove_bench_align(struct kunit *test, bool unalign)
+{
+	u64 start, end, total_ns = 0;
+	char *buf;
+	int ret = 0;
+
+	buf = kzalloc(COPY_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	preempt_disable();
+	for (int i = 0; i < COPIES_NUM; i++) {
+		start = ktime_get_ns();
+		memmove(buf + POS_SHIFT + unalign, buf, COPY_SIZE - POS_SHIFT - unalign);
+		end = ktime_get_ns();
+		total_ns += end - start;
+		cond_resched();
+	}
+	preempt_enable();
+
+	if (!total_ns)
+		total_ns = 1;
+
+	kunit_info(test, "memmove: %saligned move of %lu MBytes in %lld msecs (%lld MB/s)\n",
+		   unalign ? "un" : "",
+		   (unsigned long)(COPIES_NUM * (COPY_SIZE - POS_SHIFT)) / (1024 * 1024),
+		   total_ns / 1000000,
+		   (COPIES_NUM * (COPY_SIZE - POS_SHIFT) * 1000000000ULL / total_ns) /
+			(1024 * 1024));
+
+	kfree(buf);
+
+	return ret;
+}
+
+static void memmove_bench_test(struct kunit *test)
+{
+	KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, false), 0,
+			   "aligned memmove benchmark failed");
+	KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, true), 0,
+			   "unaligned memmove benchmark failed");
+}
+#endif
+
 static struct kunit_case memcpy_test_cases[] = {
 	KUNIT_CASE(memset_test),
 	KUNIT_CASE(memcpy_test),
@@ -500,6 +604,10 @@ static struct kunit_case memcpy_test_cases[] = {
 	KUNIT_CASE_SLOW(memmove_test),
 	KUNIT_CASE_SLOW(memmove_large_test),
 	KUNIT_CASE_SLOW(memmove_overlap_test),
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+	KUNIT_CASE_SLOW(memcpy_bench_test),
+	KUNIT_CASE_SLOW(memmove_bench_test),
+#endif
 	{}
 };
 
-- 
2.52.0
Re: [PATCH] KUnit: memcpy: add benchmark
Posted by Andrew Morton 1 week, 1 day ago
On Thu, 29 Jan 2026 01:43:28 +0100 Matteo Croce <technoboy85@gmail.com> wrote:

> Add optional benchmarks for memcpy() and memmove() functions.
> Each benchmark is run twice: first with buffers aligned and then with
> buffers unaligned, to spot unaligned accesses on platforms where they
> have a noticeable performance impact.
> 
> ...
>
> +static int memcpy_bench_align(struct kunit *test, bool unalign)
> +{
> +	u64 start, end, total_ns = 0;
> +	char *buf1;
> +	char *buf2;
> +	int ret = 0;
> +
> +	buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
> +	if (!buf1)
> +		return -ENOMEM;
> +
> +	buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
> +	if (!buf2) {
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}
> +
> +	preempt_disable();
> +	for (int i = 0; i < COPIES_NUM; i++) {
> +		start = ktime_get_ns();
> +		memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
> +		end = ktime_get_ns();
> +		total_ns += end - start;
> +		cond_resched();

Is cond_resched() inside preempt_disable() actually legal?

Might be, but it doesn't make a lot of sense, does it?

> +	}
> +	preempt_enable();
> +
Re: [PATCH] KUnit: memcpy: add benchmark
Posted by Matteo Croce 1 week, 1 day ago
Il giorno ven 30 gen 2026 alle ore 00:20 Andrew Morton
<akpm@linux-foundation.org> ha scritto:
>
> On Thu, 29 Jan 2026 01:43:28 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
>
> > Add optional benchmarks for memcpy() and memmove() functions.
> > Each benchmark is run twice: first with buffers aligned and then with
> > buffers unaligned, to spot unaligned accesses on platforms where they
> > have a noticeable performance impact.
> >
> > ...
> >
> > +static int memcpy_bench_align(struct kunit *test, bool unalign)
> > +{
> > +     u64 start, end, total_ns = 0;
> > +     char *buf1;
> > +     char *buf2;
> > +     int ret = 0;
> > +
> > +     buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
> > +     if (!buf1)
> > +             return -ENOMEM;
> > +
> > +     buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
> > +     if (!buf2) {
> > +             ret = -ENOMEM;
> > +             goto out_free;
> > +     }
> > +
> > +     preempt_disable();
> > +     for (int i = 0; i < COPIES_NUM; i++) {
> > +             start = ktime_get_ns();
> > +             memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
> > +             end = ktime_get_ns();
> > +             total_ns += end - start;
> > +             cond_resched();
>
> Is cond_resched() inside preempt_disable() actually legal?
>
> Might be, but it doesn't make a lot of sense, does it?
>
> > +     }
> > +     preempt_enable();
> > +
>

Right. In a previous version I had
preempt_disable()/preempt_enable() around the two ktime_get_ns() calls,
but then I thought that enabling and disabling preemption 100 times was
too much.
I'll restore the preempt macros around the actual copy and remove
cond_resched().
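Something along these lines (untested sketch, same names as in the patch):

	for (int i = 0; i < COPIES_NUM; i++) {
		/* keep only the timed copy non-preemptible */
		preempt_disable();
		start = ktime_get_ns();
		memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
		end = ktime_get_ns();
		preempt_enable();
		/*
		 * preemption can happen naturally between iterations,
		 * so cond_resched() is no longer needed
		 */
		total_ns += end - start;
	}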
Thanks.

-- 
Matteo Croce

perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay
Re: [PATCH] KUnit: memcpy: add benchmark
Posted by Andrew Morton 1 week, 1 day ago
On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:

> > Is cond_resched() inside preempt_disable() actually legal?
> >
> > Might be, but it doesn't make a lot of sense, does it?
> >
> > > +     }
> > > +     preempt_enable();
> > > +
> >
> 
> Right. In a previous version I had
> preempt_disable()/preempt_enable() around the two ktime_get_ns() calls,
> but then I thought that enabling and disabling preemption 100 times was
> too much.
> I'll restore the preempt macros around the actual copy and remove
> cond_resched().

OK.

local_irq_save() would be more accurate.  

Does it really need to copy 4MB?  Smaller would make
local_irq_disable() more viable.

kmalloc(4MB) does seem to be pushing our luck.  I'm spotting

	./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8

which is 1MB?
Re: [PATCH] KUnit: memcpy: add benchmark
Posted by Matteo Croce 1 week, 1 day ago
Il giorno ven 30 gen 2026 alle ore 00:53 Andrew Morton
<akpm@linux-foundation.org> ha scritto:
>
> On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
>
> > > Is cond_resched() inside preempt_disable() actually legal?
> > >
> > > Might be, but it doesn't make a lot of sense, does it?
> > >
> > > > +     }
> > > > +     preempt_enable();
> > > > +
> > >
> >
> > Right. In a previous version I had
> > preempt_disable()/preempt_enable() around the two ktime_get_ns() calls,
> > but then I thought that enabling and disabling preemption 100 times was
> > too much.
> > I'll restore the preempt macros around the actual copy and remove
> > cond_resched().
>
> OK.
>
> local_irq_save() would be more accurate.
>

Right, even more strict.
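
E.g. (sketch, with the same variables as in the patch):

	unsigned long flags;

	for (int i = 0; i < COPIES_NUM; i++) {
		/* mask interrupts so only the copy itself is timed */
		local_irq_save(flags);
		start = ktime_get_ns();
		memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
		end = ktime_get_ns();
		local_irq_restore(flags);
		total_ns += end - start;
	}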

> Does it really need to copy 4MB?  Smaller would make
> local_irq_disable() more viable.
>
> kmalloc(4MB) does seem to be pushing our luck.  I'm spotting
>
>         ./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8
>
> which is 1MB?

I wanted to stay outside of the caches as much as possible.
What about using "PAGE_SIZE * (1 << CONFIG_ARCH_FORCE_MAX_ORDER)" so
it adjusts itself?

-- 
Matteo Croce

perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay
Re: [PATCH] KUnit: memcpy: add benchmark
Posted by Matteo Croce 1 week, 1 day ago
Il giorno ven 30 gen 2026 alle ore 01:04 Matteo Croce
<technoboy85@gmail.com> ha scritto:
>
> Il giorno ven 30 gen 2026 alle ore 00:53 Andrew Morton
> <akpm@linux-foundation.org> ha scritto:
> >
> > On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
> >
> > > > Is cond_resched() inside preempt_disable() actually legal?
> > > >
> > > > Might be, but it doesn't make a lot of sense, does it?
> > > >
> > > > > +     }
> > > > > +     preempt_enable();
> > > > > +
> > > >
> > >
> > > Right. In a previous version I had
> > > preempt_disable()/preempt_enable() around the two ktime_get_ns() calls,
> > > but then I thought that enabling and disabling preemption 100 times was
> > > too much.
> > > I'll restore the preempt macros around the actual copy and remove
> > > cond_resched().
> >
> > OK.
> >
> > local_irq_save() would be more accurate.
> >
>
> Right, even more strict.
>
> > Does it really need to copy 4MB?  Smaller would make
> > local_irq_disable() more viable.
> >
> > kmalloc(4MB) does seem to be pushing our luck.  I'm spotting
> >
> >         ./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8
> >
> > which is 1MB?
>
> I wanted to stay outside of the caches as much as possible.
> What about using "PAGE_SIZE * (1 << CONFIG_ARCH_FORCE_MAX_ORDER)" so
> it adjusts itself?
>

Or "PAGE_SIZE << MAX_PAGE_ORDER", if I grep it I see it's already used
here and there.
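
i.e. something like (sketch; MAX_PAGE_ORDER comes from <linux/mmzone.h>):

	/*
	 * largest buddy allocation the page allocator can hand out,
	 * so kzalloc() should be able to satisfy it on any config
	 */
	#define COPY_SIZE	(PAGE_SIZE << MAX_PAGE_ORDER)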

-- 
Matteo Croce

perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay