Bring back the tasklet-based code so that the benchmarks can run in
softirq context.

Add one additional test which benchmarks the impact of
page_pool_napi_local().
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
---
.../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++-
1 file changed, 90 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
index cb6468adbda4..84683c547814 100644
--- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
+++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
@@ -9,6 +9,7 @@
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <net/page_pool/helpers.h>
#include "time_bench.h"
@@ -16,6 +17,8 @@
static int verbose = 1;
#define MY_POOL_SIZE 1024
+DEFINE_MUTEX(wait_for_tasklet);
+
/* Makes tests selectable. Useful for perf-record to analyze a single test.
* Hint: Bash shells support writing binary number like: $((2#101010)
*
@@ -31,6 +34,10 @@ enum benchmark_bit {
bit_run_bench_no_softirq01,
bit_run_bench_no_softirq02,
bit_run_bench_no_softirq03,
+ bit_run_bench_tasklet01,
+ bit_run_bench_tasklet02,
+ bit_run_bench_tasklet03,
+ bit_run_bench_tasklet04,
};
#define bit(b) (1 << (b))
@@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
kfree(array);
}
-enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };
+enum test_type {
+ type_fast_path,
+ type_napi_aware,
+ type_ptr_ring,
+ type_page_allocator,
+};
/* Depends on compile optimizing this function */
static int time_bench_page_pool(struct time_bench_record *rec, void *data,
@@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
struct page_pool *pp;
struct page *page;
+ struct napi_struct napi = {0};
struct page_pool_params pp_params = {
.order = 0,
@@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
.dev = NULL, /* Only use for DMA mapping */
.dma_dir = DMA_BIDIRECTIONAL,
};
+ struct page_pool_stats stats = {0};
pp = page_pool_create(&pp_params);
if (IS_ERR(pp)) {
@@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
else
pr_warn("%s(): Cannot use page_pool fast-path\n", func);
+ if (type == type_napi_aware) {
+ napi.list_owner = smp_processor_id();
+ page_pool_enable_direct_recycling(pp, &napi);
+ }
+
time_bench_start(rec);
/** Loop to measure **/
for (i = 0; i < rec->loops; i++) {
@@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
page_pool_recycle_direct(pp, page);
} else if (type == type_ptr_ring) {
- /* Normal return path */
+ /* Normal return path, either direct or via ptr_ring */
+ page_pool_put_page(pp, page, -1, false);
+
+ } else if (type == type_napi_aware) {
+ /* NAPI-aware recycling: uses fast-path recycling if
+ * possible.
+ */
page_pool_put_page(pp, page, -1, false);
} else if (type == type_page_allocator) {
@@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
}
}
time_bench_stop(rec, loops_cnt);
+
+ if (type == type_napi_aware) {
+ page_pool_get_stats(pp, &stats);
+ if (stats.recycle_stats.cached < rec->loops)
+ pr_warn("%s(): NAPI-aware recycling wasn't used\n",
+ func);
+ }
+
out:
page_pool_destroy(pp);
return loops_cnt;
@@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec,
return time_bench_page_pool(rec, data, type_page_allocator, __func__);
}
+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec,
+ void *data)
+{
+ return time_bench_page_pool(rec, data, type_napi_aware, __func__);
+}
+
+/* Testing page_pool requires running under softirq.
+ *
+ * Running under a tasklet satisfies this, as tasklets are built on top of
+ * softirq.
+ */
+static void pp_tasklet_handler(struct tasklet_struct *t)
+{
+ uint32_t nr_loops = loops;
+
+ if (in_serving_softirq())
+ pr_warn("%s(): in_serving_softirq fast-path\n",
+ __func__); /* True */
+ else
+ pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
+
+ if (enabled(bit_run_bench_tasklet01))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
+ NULL, time_bench_page_pool01_fast_path);
+
+ if (enabled(bit_run_bench_tasklet02))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
+ NULL, time_bench_page_pool02_ptr_ring);
+
+ if (enabled(bit_run_bench_tasklet03))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
+ time_bench_page_pool03_slow);
+
+ if (enabled(bit_run_bench_tasklet04))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
+ NULL, time_bench_page_pool04_napi_aware);
+
+ mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
+}
+DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+
+static void run_tasklet_tests(void)
+{
+ tasklet_enable(&pp_tasklet);
+ /* "Async" schedule tasklet, which runs on the CPU that schedule it */
+ tasklet_schedule(&pp_tasklet);
+}
+
static int run_benchmark_tests(void)
{
uint32_t nr_loops = loops;
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
run_benchmark_tests();
+ mutex_lock(&wait_for_tasklet);
+ run_tasklet_tests();
+ /* Sleep on mutex, waiting for tasklet to release */
+ mutex_lock(&wait_for_tasklet);
+
return 0;
}
module_init(bench_page_pool_simple_module_init);
static void __exit bench_page_pool_simple_module_exit(void)
{
+ tasklet_kill(&pp_tasklet);
+
if (verbose)
pr_info("Unloaded\n");
}
--
2.50.1
> diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> index cb6468adb..84683c547 100644
> --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> @@ -16,6 +17,8 @@
> static int verbose = 1;
> #define MY_POOL_SIZE 1024
>
> +DEFINE_MUTEX(wait_for_tasklet);
> +
[ ... ]
> +/* Testing page_pool requires running under softirq.
> + *
> + * Running under a tasklet satisfies this, as tasklets are built on top of
> + * softirq.
> + */
> +static void pp_tasklet_handler(struct tasklet_struct *t)
> +{
> + uint32_t nr_loops = loops;
> +
> + if (in_serving_softirq())
> + pr_warn("%s(): in_serving_softirq fast-path\n",
> + __func__); /* True */
> + else
> + pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
> +
> + if (enabled(bit_run_bench_tasklet01))
> + time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
> + NULL, time_bench_page_pool01_fast_path);
> +
> + if (enabled(bit_run_bench_tasklet02))
> + time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
> + NULL, time_bench_page_pool02_ptr_ring);
> +
> + if (enabled(bit_run_bench_tasklet03))
> + time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
> + time_bench_page_pool03_slow);
> +
> + if (enabled(bit_run_bench_tasklet04))
> + time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
> + NULL, time_bench_page_pool04_napi_aware);
> +
> + mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
^^^^^^^^^^^^^
Can mutex_unlock() be called from softirq context? pp_tasklet_handler()
runs as a tasklet in softirq context, but mutexes are sleeping locks that
must be acquired and released in process context by the same task. Would
a completion or spinlock be more appropriate here?
The synchronization pattern appears to be:
bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet)
bench_page_pool_simple_module_init()->tasklet_schedule(&pp_tasklet)
bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet) [blocks]
pp_tasklet_handler()->mutex_unlock(&wait_for_tasklet) [softirq context]
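
If this is a real issue, a completion would preserve the same handshake
without releasing a sleeping lock from softirq context. A minimal,
untested sketch, reusing the names already in the patch: complete() is
safe to call from softirq context, and wait_for_completion() sleeps in
the module init's process context:

    static DECLARE_COMPLETION(wait_for_tasklet);

    static void pp_tasklet_handler(struct tasklet_struct *t)
    {
            /* ... run the enabled benchmarks as before ... */

            /* Safe from softirq context, unlike mutex_unlock() */
            complete(&wait_for_tasklet);
    }

    static int __init bench_page_pool_simple_module_init(void)
    {
            /* ... */
            run_tasklet_tests();
            /* Sleep until the tasklet handler signals completion */
            wait_for_completion(&wait_for_tasklet);
            return 0;
    }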
> +}
> +DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
> +
> +static void run_tasklet_tests(void)
> +{
> + tasklet_enable(&pp_tasklet);
> + /* "Async" schedule tasklet, which runs on the CPU that schedule it */
> + tasklet_schedule(&pp_tasklet);
> +}
[ ... ]
> @@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
>
> run_benchmark_tests();
>
> + mutex_lock(&wait_for_tasklet);
> + run_tasklet_tests();
> + /* Sleep on mutex, waiting for tasklet to release */
> + mutex_lock(&wait_for_tasklet);
> +
> return 0;
> }
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19165940352