Add optional benchmarks for memcpy() and memmove() functions.
Each benchmark is run twice: first with buffers aligned and then with
buffers unaligned, to spot unaligned accesses on platforms where they
have a noticeable performance impact.
Signed-off-by: Matteo Croce <teknoraver@meta.com>
---
lib/Kconfig.debug | 9 ++++
lib/tests/memcpy_kunit.c | 108 +++++++++++++++++++++++++++++++++++++++
2 files changed, 117 insertions(+)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939fda79..02868c4397cb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2880,6 +2880,15 @@ config MEMCPY_KUNIT_TEST
If unsure, say N.
+config MEMCPY_KUNIT_BENCHMARK
+ bool "Benchmark string functions"
+ depends on MEMCPY_KUNIT_TEST
+ help
+ A benchmark for memcpy() and memmove() functions,
+ with both aligned and unaligned buffers.
+
+ If unsure, say N.
+
config IS_SIGNED_TYPE_KUNIT_TEST
tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/lib/tests/memcpy_kunit.c b/lib/tests/memcpy_kunit.c
index d36933554e46..33080dddc58e 100644
--- a/lib/tests/memcpy_kunit.c
+++ b/lib/tests/memcpy_kunit.c
@@ -493,6 +493,110 @@ static void memmove_overlap_test(struct kunit *test)
}
}
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+
+#define COPY_SIZE (4 * 1024 * 1024)
+#define COPIES_NUM 100
+
+static int memcpy_bench_align(struct kunit *test, bool unalign)
+{
+ u64 start, end, total_ns = 0;
+ char *buf1;
+ char *buf2;
+ int ret = 0;
+
+ buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf1)
+ return -ENOMEM;
+
+ buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf2) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ preempt_disable();
+ for (int i = 0; i < COPIES_NUM; i++) {
+ start = ktime_get_ns();
+ memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
+ end = ktime_get_ns();
+ total_ns += end - start;
+ cond_resched();
+ }
+ preempt_enable();
+
+ /* Avoid division by zero */
+ if (!total_ns)
+ total_ns = 1;
+
+ kunit_info(test, "memcpy: %saligned copy of %lu MBytes in %lld msecs (%lld MB/s)\n",
+ unalign ? "un" : "",
+ (unsigned long)(COPIES_NUM * COPY_SIZE) / (1024 * 1024),
+ total_ns / 1000000,
+ (COPIES_NUM * COPY_SIZE * 1000000000ULL / total_ns) / (1024 * 1024));
+
+ kfree(buf2);
+
+out_free:
+ kfree(buf1);
+
+ return ret;
+}
+
+static void memcpy_bench_test(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, false), 0,
+ "aligned memcpy benchmark failed");
+ KUNIT_ASSERT_EQ_MSG(test, memcpy_bench_align(test, true), 0,
+ "unaligned memcpy benchmark failed");
+}
+
+#define POS_SHIFT (2 * PAGE_SIZE)
+
+static int memmove_bench_align(struct kunit *test, bool unalign)
+{
+ u64 start, end, total_ns = 0;
+ char *buf;
+ int ret = 0;
+
+ buf = kzalloc(COPY_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ preempt_disable();
+ for (int i = 0; i < COPIES_NUM; i++) {
+ start = ktime_get_ns();
+ memmove(buf + POS_SHIFT + unalign, buf, COPY_SIZE - POS_SHIFT - unalign);
+ end = ktime_get_ns();
+ total_ns += end - start;
+ cond_resched();
+ }
+ preempt_enable();
+
+ if (!total_ns)
+ total_ns = 1;
+
+ kunit_info(test, "memmove: %saligned move of %lu MBytes in %lld msecs (%lld MB/s)\n",
+ unalign ? "un" : "",
+ (unsigned long)(COPIES_NUM * (COPY_SIZE - POS_SHIFT)) / (1024 * 1024),
+ total_ns / 1000000,
+ (COPIES_NUM * (COPY_SIZE - POS_SHIFT) * 1000000000ULL / total_ns) /
+ (1024 * 1024));
+
+ kfree(buf);
+
+ return ret;
+}
+
+static void memmove_bench_test(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, false), 0,
+ "aligned memmove benchmark failed");
+ KUNIT_ASSERT_EQ_MSG(test, memmove_bench_align(test, true), 0,
+ "unaligned memmove benchmark failed");
+}
+#endif
+
static struct kunit_case memcpy_test_cases[] = {
KUNIT_CASE(memset_test),
KUNIT_CASE(memcpy_test),
@@ -500,6 +604,10 @@ static struct kunit_case memcpy_test_cases[] = {
KUNIT_CASE_SLOW(memmove_test),
KUNIT_CASE_SLOW(memmove_large_test),
KUNIT_CASE_SLOW(memmove_overlap_test),
+#ifdef CONFIG_MEMCPY_KUNIT_BENCHMARK
+ KUNIT_CASE_SLOW(memcpy_bench_test),
+ KUNIT_CASE_SLOW(memmove_bench_test),
+#endif
{}
};
--
2.52.0
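[Editorial note: for reference, a .kunitconfig fragment along these lines should be enough to build and run the benchmark. This is a sketch based on the options the patch adds; CONFIG_KUNIT itself is assumed to be available as usual.]

CONFIG_KUNIT=y
CONFIG_MEMCPY_KUNIT_TEST=y
CONFIG_MEMCPY_KUNIT_BENCHMARK=y

The in-tree runner, tools/testing/kunit/kunit.py, can then execute the suite as usual.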
On Thu, 29 Jan 2026 01:43:28 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
> Add optional benchmarks for memcpy() and memmove() functions.
> Each benchmark is run twice: first with buffers aligned and then with
> buffers unaligned, to spot unaligned accesses on platforms where they
> have a noticeable performance impact.
>
> ...
>
> +static int memcpy_bench_align(struct kunit *test, bool unalign)
> +{
> + u64 start, end, total_ns = 0;
> + char *buf1;
> + char *buf2;
> + int ret = 0;
> +
> + buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
> + if (!buf1)
> + return -ENOMEM;
> +
> + buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
> + if (!buf2) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> +
> + preempt_disable();
> + for (int i = 0; i < COPIES_NUM; i++) {
> + start = ktime_get_ns();
> + memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
> + end = ktime_get_ns();
> + total_ns += end - start;
> + cond_resched();
Is cond_resched() inside preempt_disable() actually legal?
Might be, but it doesn't make a lot of sense, does it?
> + }
> + preempt_enable();
> +
On Fri, 30 Jan 2026 at 00:20, Andrew Morton
<akpm@linux-foundation.org> wrote:
>
> On Thu, 29 Jan 2026 01:43:28 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
>
> > Add optional benchmarks for memcpy() and memmove() functions.
> > Each benchmark is run twice: first with buffers aligned and then with
> > buffers unaligned, to spot unaligned accesses on platforms where they
> > have a noticeable performance impact.
> >
> > ...
> >
> > +static int memcpy_bench_align(struct kunit *test, bool unalign)
> > +{
> > + u64 start, end, total_ns = 0;
> > + char *buf1;
> > + char *buf2;
> > + int ret = 0;
> > +
> > + buf1 = kzalloc(COPY_SIZE, GFP_KERNEL);
> > + if (!buf1)
> > + return -ENOMEM;
> > +
> > + buf2 = kzalloc(COPY_SIZE, GFP_KERNEL);
> > + if (!buf2) {
> > + ret = -ENOMEM;
> > + goto out_free;
> > + }
> > +
> > + preempt_disable();
> > + for (int i = 0; i < COPIES_NUM; i++) {
> > + start = ktime_get_ns();
> > + memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
> > + end = ktime_get_ns();
> > + total_ns += end - start;
> > + cond_resched();
>
> Is cond_resched() inside preempt_disable() actually legal?
>
> Might be, but it doesn't make a lot of sense, does it?
>
> > + }
> > + preempt_enable();
> > +
>
Right. In a previous version I was doing
preempt_disable()/preempt_enable() around the two ktime_get_ns(), but
then I thought that enabling and disabling preemption 100 times was too
much.
I'll restore the preempt macros around the actual copy and remove
cond_resched().
Thanks.
--
Matteo Croce
perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay
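[Editorial note: a minimal sketch of the revised loop Matteo describes, with preemption disabled only around each timed copy and cond_resched() dropped. This is hypothetical v2 code, not the posted patch:]

	for (int i = 0; i < COPIES_NUM; i++) {
		/* Keep the timed region alone inside the non-preemptible window */
		preempt_disable();
		start = ktime_get_ns();
		memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
		end = ktime_get_ns();
		preempt_enable();
		total_ns += end - start;
	}

Preemption is re-enabled between iterations, so no explicit cond_resched() is needed.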
On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:

> > Is cond_resched() inside preempt_disable() actually legal?
> >
> > Might be, but it doesn't make a lot of sense, does it?
> >
> > > + }
> > > + preempt_enable();
> > > +
> >
> Right. In a previous version I was doing
> preempt_disable()/preempt_enable() around the two ktime_get_ns(), but
> then I thought that enabling and disabling preemption 100 times was too
> much.
> I'll restore the preempt macros around the actual copy and remove
> cond_resched().

OK.

local_irq_save() would be more accurate.

Does it really need to copy 4MB? Smaller would make
local_irq_disable() more viable.

kmalloc(4MB) does seem to be pushing our luck. I'm spotting

./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8

which is 1MB?
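[Editorial note: the stricter variant Andrew suggests might look like this. A sketch only; local_irq_save()/local_irq_restore() are the existing kernel primitives, and the loop body is taken from the patch above:]

	unsigned long flags;

	/* Exclude interrupt handling time from the measurement entirely */
	local_irq_save(flags);
	start = ktime_get_ns();
	memcpy(buf1 + unalign, buf2, COPY_SIZE - unalign);
	end = ktime_get_ns();
	local_irq_restore(flags);

Keeping interrupts off across a 4MB copy is why a smaller buffer would make this variant more viable.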
On Fri, 30 Jan 2026 at 00:53, Andrew Morton
<akpm@linux-foundation.org> wrote:
>
> On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
>
> > > Is cond_resched() inside preempt_disable() actually legal?
> > >
> > > Might be, but it doesn't make a lot of sense, does it?
> > >
> > > > + }
> > > > + preempt_enable();
> > > > +
> > >
> >
> > Right. In a previous version I was doing
> > preempt_disable()/preempt_enable() around the two ktime_get_ns(), but
> > then I thought that enabling and disabling preemption 100 times was too
> > much.
> > I'll restore the preempt macros around the actual copy and remove
> > cond_resched().
>
> OK.
>
> local_irq_save() would be more accurate.
>
Right, even more strict.
> Does it really need to copy 4MB? Smaller would make
> local_irq_disable() more viable.
>
> kmalloc(4MB) does seem to be pushing our luck. I'm spotting
>
> ./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8
>
> which is 1MB?
I wanted to stay outside of the caches as much as possible.
What about using "PAGE_SIZE * (1 << CONFIG_ARCH_FORCE_MAX_ORDER)" so
it adjusts itself?
--
Matteo Croce
perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay
On Fri, 30 Jan 2026 at 01:04, Matteo Croce
<technoboy85@gmail.com> wrote:
>
> On Fri, 30 Jan 2026 at 00:53, Andrew Morton
> <akpm@linux-foundation.org> wrote:
> >
> > On Fri, 30 Jan 2026 00:36:30 +0100 Matteo Croce <technoboy85@gmail.com> wrote:
> >
> > > > Is cond_resched() inside preempt_disable() actually legal?
> > > >
> > > > Might be, but it doesn't make a lot of sense, does it?
> > > >
> > > > > + }
> > > > > + preempt_enable();
> > > > > +
> > > >
> > >
> > > Right. In a previous version I was doing
> > > preempt_disable()/preempt_enable() around the two ktime_get_ns(), but
> > > then I thought that enabling and disabling preemption 100 times was too
> > > much.
> > > I'll restore the preempt macros around the actual copy and remove
> > > cond_resched().
> >
> > OK.
> >
> > local_irq_save() would be more accurate.
> >
>
> Right, even more strict.
>
> > Does it really need to copy 4MB? Smaller would make
> > local_irq_disable() more viable.
> >
> > kmalloc(4MB) does seem to be pushing our luck. I'm spotting
> >
> > ./arch/arm/configs/pxa_defconfig:CONFIG_ARCH_FORCE_MAX_ORDER=8
> >
> > which is 1MB?
>
> I wanted to stay outside of the caches as much as possible.
> What about using "PAGE_SIZE * (1 << CONFIG_ARCH_FORCE_MAX_ORDER)" so
> it adjusts itself?
>
Or "PAGE_SIZE << MAX_PAGE_ORDER", if I grep it I see it's already used
here and there.
--
Matteo Croce
perl -e 'for($t=0;;$t++){print chr($t*($t>>8|$t>>13)&255)}' |aplay
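[Editorial note: a sketch of the self-sizing define being discussed. Hypothetical; MAX_PAGE_ORDER is the existing kernel constant bounding page-allocator order:]

	/*
	 * Largest physically contiguous allocation the page allocator can
	 * satisfy: 4MB with 4K pages and the default MAX_PAGE_ORDER of 10.
	 */
	#define COPY_SIZE	(PAGE_SIZE << MAX_PAGE_ORDER)

On the pxa_defconfig case Andrew mentions (CONFIG_ARCH_FORCE_MAX_ORDER=8) this works out to PAGE_SIZE << 8 = 1MB, so the allocation stays within what the allocator can provide.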