[RFC 1/2] page_pool: add benchmarking for napi-based recycling

Dragos Tatulea posted 2 patches 1 month, 1 week ago
[RFC 1/2] page_pool: add benchmarking for napi-based recycling
Posted by Dragos Tatulea 1 month, 1 week ago
This patch brings back the tasklet-based benchmark code in order
to be able to run the tests in softirq context.

One additional test is added that benchmarks the
impact of page_pool_napi_local().

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
---
 .../bench/page_pool/bench_page_pool_simple.c  | 92 ++++++++++++++++++-
 1 file changed, 90 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
index cb6468adbda4..84683c547814 100644
--- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
+++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
@@ -9,6 +9,7 @@
 #include <linux/limits.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/netdevice.h>
 #include <net/page_pool/helpers.h>
 
 #include "time_bench.h"
@@ -16,6 +17,8 @@
 static int verbose = 1;
 #define MY_POOL_SIZE 1024
 
+DEFINE_MUTEX(wait_for_tasklet);
+
 /* Makes tests selectable. Useful for perf-record to analyze a single test.
  * Hint: Bash shells support writing binary number like: $((2#101010)
  *
@@ -31,6 +34,10 @@ enum benchmark_bit {
 	bit_run_bench_no_softirq01,
 	bit_run_bench_no_softirq02,
 	bit_run_bench_no_softirq03,
+	bit_run_bench_tasklet01,
+	bit_run_bench_tasklet02,
+	bit_run_bench_tasklet03,
+	bit_run_bench_tasklet04,
 };
 
 #define bit(b)		(1 << (b))
@@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
 	kfree(array);
 }
 
-enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };
+enum test_type {
+	type_fast_path,
+	type_napi_aware,
+	type_ptr_ring,
+	type_page_allocator,
+};
 
 /* Depends on compile optimizing this function */
 static int time_bench_page_pool(struct time_bench_record *rec, void *data,
@@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 
 	struct page_pool *pp;
 	struct page *page;
+	struct napi_struct napi = {0};
 
 	struct page_pool_params pp_params = {
 		.order = 0,
@@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 		.dev = NULL, /* Only use for DMA mapping */
 		.dma_dir = DMA_BIDIRECTIONAL,
 	};
+	struct page_pool_stats stats = {0};
 
 	pp = page_pool_create(&pp_params);
 	if (IS_ERR(pp)) {
@@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 	else
 		pr_warn("%s(): Cannot use page_pool fast-path\n", func);
 
+	if (type == type_napi_aware) {
+		napi.list_owner = smp_processor_id();
+		page_pool_enable_direct_recycling(pp, &napi);
+	}
+
 	time_bench_start(rec);
 	/** Loop to measure **/
 	for (i = 0; i < rec->loops; i++) {
@@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 			page_pool_recycle_direct(pp, page);
 
 		} else if (type == type_ptr_ring) {
-			/* Normal return path */
+			/* Normal return path, either direct or via ptr_ring */
+			page_pool_put_page(pp, page, -1, false);
+
+		} else if (type == type_napi_aware) {
+			/* NAPI-aware recycling: uses fast-path recycling if
+			 * possible.
+			 */
 			page_pool_put_page(pp, page, -1, false);
 
 		} else if (type == type_page_allocator) {
@@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
 		}
 	}
 	time_bench_stop(rec, loops_cnt);
+
+	if (type == type_napi_aware) {
+		page_pool_get_stats(pp, &stats);
+		if (stats.recycle_stats.cached < rec->loops)
+			pr_warn("%s(): NAPI-aware recycling wasn't used\n",
+				func);
+	}
+
 out:
 	page_pool_destroy(pp);
 	return loops_cnt;
@@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec,
 	return time_bench_page_pool(rec, data, type_page_allocator, __func__);
 }
 
+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec,
+					     void *data)
+{
+	return time_bench_page_pool(rec, data, type_napi_aware, __func__);
+}
+
+/* Testing page_pool requires running under softirq.
+ *
+ * Running under a tasklet satisfy this, as tasklets are built on top of
+ * softirq.
+ */
+static void pp_tasklet_handler(struct tasklet_struct *t)
+{
+	uint32_t nr_loops = loops;
+
+	if (in_serving_softirq())
+		pr_warn("%s(): in_serving_softirq fast-path\n",
+			__func__); // True
+	else
+		pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
+
+	if (enabled(bit_run_bench_tasklet01))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
+				NULL, time_bench_page_pool01_fast_path);
+
+	if (enabled(bit_run_bench_tasklet02))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
+				NULL, time_bench_page_pool02_ptr_ring);
+
+	if (enabled(bit_run_bench_tasklet03))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
+				time_bench_page_pool03_slow);
+
+	if (enabled(bit_run_bench_tasklet04))
+		time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
+				NULL, time_bench_page_pool04_napi_aware);
+
+	mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
+}
+DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+
+static void run_tasklet_tests(void)
+{
+	tasklet_enable(&pp_tasklet);
+	/* "Async" schedule tasklet, which runs on the CPU that schedule it */
+	tasklet_schedule(&pp_tasklet);
+}
+
 static int run_benchmark_tests(void)
 {
 	uint32_t nr_loops = loops;
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
 
 	run_benchmark_tests();
 
+	mutex_lock(&wait_for_tasklet);
+	run_tasklet_tests();
+	/* Sleep on mutex, waiting for tasklet to release */
+	mutex_lock(&wait_for_tasklet);
+
 	return 0;
 }
 module_init(bench_page_pool_simple_module_init);
 
 static void __exit bench_page_pool_simple_module_exit(void)
 {
+	tasklet_kill(&pp_tasklet);
+
 	if (verbose)
 		pr_info("Unloaded\n");
 }
-- 
2.50.1
Re: [RFC 1/2] page_pool: add benchmarking for napi-based recycling
Posted by bot+bpf-ci@kernel.org 1 month, 1 week ago
> diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> index cb6468adb..84683c547 100644
> --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
> @@ -16,6 +17,8 @@
>  static int verbose = 1;
>  #define MY_POOL_SIZE 1024
>
> +DEFINE_MUTEX(wait_for_tasklet);
> +

[ ... ]

> +/* Testing page_pool requires running under softirq.
> + *
> + * Running under a tasklet satisfy this, as tasklets are built on top of
> + * softirq.
> + */
> +static void pp_tasklet_handler(struct tasklet_struct *t)
> +{
> +	uint32_t nr_loops = loops;
> +
> +	if (in_serving_softirq())
> +		pr_warn("%s(): in_serving_softirq fast-path\n",
> +			__func__); // True
> +	else
> +		pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
> +
> +	if (enabled(bit_run_bench_tasklet01))
> +		time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
> +				NULL, time_bench_page_pool01_fast_path);
> +
> +	if (enabled(bit_run_bench_tasklet02))
> +		time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
> +				NULL, time_bench_page_pool02_ptr_ring);
> +
> +	if (enabled(bit_run_bench_tasklet03))
> +		time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
> +				time_bench_page_pool03_slow);
> +
> +	if (enabled(bit_run_bench_tasklet04))
> +		time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
> +				NULL, time_bench_page_pool04_napi_aware);
> +
> +	mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
        ^^^^^^^^^^^^^

Can mutex_unlock() be called from softirq context? The pp_tasklet_handler()
function runs as a tasklet handler in softirq context, but mutexes are
sleeping locks that require process context. Would a completion or spinlock
be more appropriate here?

The synchronization pattern appears to be:
  bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet)
  bench_page_pool_simple_module_init()->tasklet_schedule(&pp_tasklet)
  bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet) [blocks]
  pp_tasklet_handler()->mutex_unlock(&wait_for_tasklet) [softirq context]

> +}
> +DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
> +
> +static void run_tasklet_tests(void)
> +{
> +	tasklet_enable(&pp_tasklet);
> +	/* "Async" schedule tasklet, which runs on the CPU that schedule it */
> +	tasklet_schedule(&pp_tasklet);
> +}

[ ... ]

> @@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
>
>  	run_benchmark_tests();
>
> +	mutex_lock(&wait_for_tasklet);
> +	run_tasklet_tests();
> +	/* Sleep on mutex, waiting for tasklet to release */
> +	mutex_lock(&wait_for_tasklet);
> +
>  	return 0;
>  }

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19165940352