[v1] bpf/verifier: optimize ID mapping reset in states_equal

[PATCH] bpf/verifier: optimize ID mapping reset in states_equal

Posted by wujing 3 weeks, 2 days ago

The verifier uses an ID mapping table (struct bpf_idmap) during state
equivalence checks. Currently, reset_idmap_scratch performs a full memset
on the entire map in every call.

The table size is exactly 4800 bytes (approx. 4.7KB), calculated as:
- MAX_BPF_REG = 11
- MAX_BPF_STACK = 512
- BPF_REG_SIZE = 8
- MAX_CALL_FRAMES = 8
- BPF_ID_MAP_SIZE = (11 + 512 / 8) * 8 = 600 entries
- Each entry (struct bpf_id_pair) is 8 bytes (two u32 fields)
- Total size = 600 * 8 = 4800 bytes

For complex programs with many pruning points, this constant large memset
introduces significant CPU overhead and cache pressure, especially when
only a few IDs are actually used.

This patch optimizes the reset logic by:
1. Adding 'map_cnt' to bpf_idmap to track used slots.
2. Updating 'map_cnt' in check_ids to record the high-water mark.
3. Making reset_idmap_scratch perform a partial memset based on 'map_cnt'.

This improves pruning performance and reduces redundant memory writes.

Signed-off-by: wujing <realwujing@gmail.com>
Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..562f7e63be29 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
 
 struct bpf_idmap {
 	u32 tmp_id_gen;
+	u32 map_cnt;
 	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 37ce3990c9ad..6220dde41107 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18954,6 +18954,7 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 			/* Reached an empty slot; haven't seen this id before */
 			map[i].old = old_id;
 			map[i].cur = cur_id;
+			idmap->map_cnt = i + 1;
 			return true;
 		}
 		if (map[i].old == old_id)
@@ -19471,8 +19472,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 
 static void reset_idmap_scratch(struct bpf_verifier_env *env)
 {
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	struct bpf_idmap *idmap = &env->idmap_scratch;
+
+	idmap->tmp_id_gen = env->id_gen;
+	if (idmap->map_cnt) {
+		memset(idmap->map, 0, idmap->map_cnt * sizeof(struct bpf_id_pair));
+		idmap->map_cnt = 0;
+	}
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
-- 
2.39.5

Re: [PATCH] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Eduard Zingerman 3 weeks, 1 day ago

On Thu, 2026-01-15 at 22:49 +0800, wujing wrote:
> The verifier uses an ID mapping table (struct bpf_idmap) during state
> equivalence checks. Currently, reset_idmap_scratch performs a full memset
> on the entire map in every call.
> 
> The table size is exactly 4800 bytes (approx. 4.7KB), calculated as:
> - MAX_BPF_REG = 11
> - MAX_BPF_STACK = 512
> - BPF_REG_SIZE = 8
> - MAX_CALL_FRAMES = 8
> - BPF_ID_MAP_SIZE = (11 + 512 / 8) * 8 = 600 entries
> - Each entry (struct bpf_id_pair) is 8 bytes (two u32 fields)
> - Total size = 600 * 8 = 4800 bytes
> 
> For complex programs with many pruning points, this constant large memset
> introduces significant CPU overhead and cache pressure, especially when
> only a few IDs are actually used.
> 
> This patch optimizes the reset logic by:
> 1. Adding 'map_cnt' to bpf_idmap to track used slots.
> 2. Updating 'map_cnt' in check_ids to record the high-water mark.
> 3. Making reset_idmap_scratch perform a partial memset based on 'map_cnt'.
> 
> This improves pruning performance and reduces redundant memory writes.
> 
> Signed-off-by: wujing <realwujing@gmail.com>
                 ^^^^^^
		 Please use your full name.
> Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> ---

I think this is an ok change.
Could you please collect some stats using 'perf stat' for some big selftest?

>  include/linux/bpf_verifier.h |  1 +
>  kernel/bpf/verifier.c        | 10 ++++++++--
>  2 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 130bcbd66f60..562f7e63be29 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -692,6 +692,7 @@ struct bpf_id_pair {
>  
>  struct bpf_idmap {
>  	u32 tmp_id_gen;
> +	u32 map_cnt;
>  	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
>  };
>  
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 37ce3990c9ad..6220dde41107 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -18954,6 +18954,7 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
>  			/* Reached an empty slot; haven't seen this id before */
>  			map[i].old = old_id;
>  			map[i].cur = cur_id;
> +			idmap->map_cnt = i + 1;
>  			return true;
>  		}
>  		if (map[i].old == old_id)
> @@ -19471,8 +19472,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
>  
>  static void reset_idmap_scratch(struct bpf_verifier_env *env)
>  {
> -	env->idmap_scratch.tmp_id_gen = env->id_gen;
> -	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
> +	struct bpf_idmap *idmap = &env->idmap_scratch;
> +
> +	idmap->tmp_id_gen = env->id_gen;
> +	if (idmap->map_cnt) {

Nit: this condition is not really necessary.

> +		memset(idmap->map, 0, idmap->map_cnt * sizeof(struct bpf_id_pair));
> +		idmap->map_cnt = 0;
> +	}
>  }
>  
>  static bool states_equal(struct bpf_verifier_env *env,

Re: [PATCH] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Andrii Nakryiko 3 weeks, 1 day ago

On Thu, Jan 15, 2026 at 12:36 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Thu, 2026-01-15 at 22:49 +0800, wujing wrote:
> > The verifier uses an ID mapping table (struct bpf_idmap) during state
> > equivalence checks. Currently, reset_idmap_scratch performs a full memset
> > on the entire map in every call.
> >
> > The table size is exactly 4800 bytes (approx. 4.7KB), calculated as:
> > - MAX_BPF_REG = 11
> > - MAX_BPF_STACK = 512
> > - BPF_REG_SIZE = 8
> > - MAX_CALL_FRAMES = 8
> > - BPF_ID_MAP_SIZE = (11 + 512 / 8) * 8 = 600 entries
> > - Each entry (struct bpf_id_pair) is 8 bytes (two u32 fields)
> > - Total size = 600 * 8 = 4800 bytes
> >
> > For complex programs with many pruning points, this constant large memset
> > introduces significant CPU overhead and cache pressure, especially when
> > only a few IDs are actually used.
> >
> > This patch optimizes the reset logic by:
> > 1. Adding 'map_cnt' to bpf_idmap to track used slots.
> > 2. Updating 'map_cnt' in check_ids to record the high-water mark.
> > 3. Making reset_idmap_scratch perform a partial memset based on 'map_cnt'.
> >
> > This improves pruning performance and reduces redundant memory writes.
> >
> > Signed-off-by: wujing <realwujing@gmail.com>
>                  ^^^^^^
>                  Please use your full name.
> > Signed-off-by: Qiliang Yuan <yuanql9@chinatelecom.cn>
> > ---
>
> I think this is an ok change.
> Could you please collect some stats using 'perf stat' for some big selftest?
>
> >  include/linux/bpf_verifier.h |  1 +
> >  kernel/bpf/verifier.c        | 10 ++++++++--
> >  2 files changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> > index 130bcbd66f60..562f7e63be29 100644
> > --- a/include/linux/bpf_verifier.h
> > +++ b/include/linux/bpf_verifier.h
> > @@ -692,6 +692,7 @@ struct bpf_id_pair {
> >
> >  struct bpf_idmap {
> >       u32 tmp_id_gen;
> > +     u32 map_cnt;
> >       struct bpf_id_pair map[BPF_ID_MAP_SIZE];
> >  };
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 37ce3990c9ad..6220dde41107 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -18954,6 +18954,7 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
> >                       /* Reached an empty slot; haven't seen this id before */
> >                       map[i].old = old_id;
> >                       map[i].cur = cur_id;
> > +                     idmap->map_cnt = i + 1;
> >                       return true;
> >               }
> >               if (map[i].old == old_id)
> > @@ -19471,8 +19472,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
> >
> >  static void reset_idmap_scratch(struct bpf_verifier_env *env)
> >  {
> > -     env->idmap_scratch.tmp_id_gen = env->id_gen;
> > -     memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
> > +     struct bpf_idmap *idmap = &env->idmap_scratch;
> > +
> > +     idmap->tmp_id_gen = env->id_gen;
> > +     if (idmap->map_cnt) {
>
> Nit: this condition is not really necessary.
>
> > +             memset(idmap->map, 0, idmap->map_cnt * sizeof(struct bpf_id_pair));

the whole memset is not necessary if we have map_cnt and use that when
searching IDs, no? I'd also call the field just idmap->cnt

> > +             idmap->map_cnt = 0;
> > +     }
> >  }
> >
> >  static bool states_equal(struct bpf_verifier_env *env,

[PATCH v2] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Qiliang Yuan 3 weeks, 1 day ago

The verifier uses an ID mapping table (struct bpf_idmap) during state
equivalence checks. Currently, reset_idmap_scratch performs a full memset
on the entire map (~4.7KB) in every call to states_equal.

Following suggestions from Eduard Zingerman and Andrii Nakryiko, this
patch optimizes the reset logic to avoid unnecessary memory operations:

1. Rename 'map_cnt' to 'cnt' for brevity.
2. Replace the O(N) memset() in reset_idmap_scratch() with a simple
   O(1) counter reset (idmap->cnt = 0).
3. Update check_ids() to search only up to 'idmap->cnt' entries. If no
   mapping is found, append the new mapping and increment 'cnt'.

This ensures that reset overhead is minimal and the search loop is
bounded by the number of IDs actually encountered in the current
equivalence check.

Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
stress test, 60s):

The following results, captured using perf while running veristat in parallel
across all CPU cores, show a significant reduction in instruction overhead
(~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
significantly reduces the verifier's workload during state equivalence
checks.

Metric          | Baseline      | Patched       | Delta
----------------|---------------|---------------|----------
Iterations      | 5710          | 5731          | +0.37%
Instructions    | 1.714 T       | 1.555 T       | -9.28%
Inst/Iter       | 300.2 M       | 271.3 M       | -9.63%
Cycles          | 1.436 T       | 1.335 T       | -7.03%
Branches        | 350.4 B       | 311.9 B       | -10.99%
Migrations      | 25,977        | 23,524        | -9.44%

Test Command:
  seq 1 2000000 | sudo perf stat -a -- \
    timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o

Detailed Performance Stats:

Baseline:
 Performance counter stats for 'system wide':

         6,735,538      context-switches                 #   3505.5 cs/sec  cs_per_second
      1,921,431.27 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            25,977      cpu-migrations                   #     13.5 migrations/sec  migrations_per_second
         7,268,841      page-faults                      #   3783.0 faults/sec  page_fault_per_second
    18,662,357,052      branch-misses                    #      3.9 %  branch_miss_rate         (50.14%)
   350,411,558,023      branches                         #    182.4 M/sec  branch_frequency     (66.85%)
 1,435,774,261,319      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.95%)
 1,714,154,229,503      instructions                     #      1.2 instructions  insn_per_cycle  (66.86%)
   429,445,480,497      stalled-cycles-frontend          #     0.30 frontend_cycles_idle        (66.36%)

      60.035899231 seconds time elapsed

Patched:
 Performance counter stats for 'system wide':

         6,662,371      context-switches                 #   3467.3 cs/sec  cs_per_second
      1,921,497.78 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            23,524      cpu-migrations                   #     12.2 migrations/sec  migrations_per_second
         7,783,064      page-faults                      #   4050.5 faults/sec  page_faults_per_second
    18,181,655,163      branch-misses                    #      4.3 %  branch_miss_rate         (50.15%)
   311,865,239,743      branches                         #    162.3 M/sec  branch_frequency     (66.86%)
 1,334,859,779,821      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.96%)
 1,555,086,465,845      instructions                     #      1.2 instructions  insn_per_cycle  (66.87%)
   407,666,712,045      stalled-cycles-frontend          #     0.31 frontend_cycles_idle        (66.35%)

      60.034702643 seconds time elapsed

Suggested-by: Eduard Zingerman <eddyz87@gmail.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
Hi Eduard and Andrii,

Thank you for the feedback on v1. I've optimized the ID mapping reset
logic to O(1) and updated the benchmark results as suggested.

v1 -> v2:
- Rename map_cnt to cnt (Andrii Nakryiko)
- Eliminate memset() by using cnt to bound search loop (Andrii Nakryiko)
- Remove unnecessary if() check in reset_idmap_scratch() (Eduard Zingerman)
- Use full name in Signed-off-by (Eduard Zingerman)

 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
 
 struct bpf_idmap {
 	u32 tmp_id_gen;
+	u32 cnt;
 	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..6ec6d70e5ce7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18948,18 +18948,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	if (old_id == 0) /* cur_id == 0 as well */
 		return true;
 
-	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
-		if (!map[i].old) {
-			/* Reached an empty slot; haven't seen this id before */
-			map[i].old = old_id;
-			map[i].cur = cur_id;
-			return true;
-		}
+	for (i = 0; i < idmap->cnt; i++) {
 		if (map[i].old == old_id)
 			return map[i].cur == cur_id;
 		if (map[i].cur == cur_id)
 			return false;
 	}
+
+	/* Reached the end of known mappings; haven't seen this id before */
+	if (idmap->cnt < BPF_ID_MAP_SIZE) {
+		map[idmap->cnt].old = old_id;
+		map[idmap->cnt].cur = cur_id;
+		idmap->cnt++;
+		return true;
+	}
+
 	/* We ran out of idmap slots, which should be impossible */
 	WARN_ON_ONCE(1);
 	return false;
@@ -19470,8 +19473,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 
 static void reset_idmap_scratch(struct bpf_verifier_env *env)
 {
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	struct bpf_idmap *idmap = &env->idmap_scratch;
+
+	idmap->tmp_id_gen = env->id_gen;
+	idmap->cnt = 0;
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
-- 
2.39.5

[PATCH v2] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Qiliang Yuan 3 weeks, 1 day ago

The verifier uses an ID mapping table (struct bpf_idmap) during state
equivalence checks. Currently, reset_idmap_scratch performs a full memset
on the entire map (~4.7KB) in every call to states_equal.

Following suggestions from Eduard Zingerman and Andrii Nakryiko, this
patch optimizes the reset logic to avoid unnecessary memory operations:

1. Rename 'map_cnt' to 'cnt' for brevity.
2. Replace the O(N) memset() in reset_idmap_scratch() with a simple
   O(1) counter reset (idmap->cnt = 0).
3. Update check_ids() to search only up to 'idmap->cnt' entries. If no
   mapping is found, append the new mapping and increment 'cnt'.

This ensures that reset overhead is minimal and the search loop is
bounded by the number of IDs actually encountered in the current
equivalence check.

Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
stress test, 60s):

The following results, captured using perf while running veristat in parallel
across all CPU cores, show a significant reduction in instruction overhead
(~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
significantly reduces the verifier's workload during state equivalence
checks.

Metric          | Baseline      | Patched       | Delta
----------------|---------------|---------------|----------
Iterations      | 5710          | 5731          | +0.37%
Instructions    | 1.714 T       | 1.555 T       | -9.28%
Inst/Iter       | 300.2 M       | 271.3 M       | -9.63%
Cycles          | 1.436 T       | 1.335 T       | -7.03%
Branches        | 350.4 B       | 311.9 B       | -10.99%
Migrations      | 25,977        | 23,524        | -9.44%

Test Command:
  seq 1 2000000 | sudo perf stat -a -- \
    timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o

Detailed Performance Stats:

Baseline:
 Performance counter stats for 'system wide':

         6,735,538      context-switches                 #   3505.5 cs/sec  cs_per_second
      1,921,431.27 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            25,977      cpu-migrations                   #     13.5 migrations/sec  migrations_per_second
         7,268,841      page-faults                      #   3783.0 faults/sec  page_fault_per_second
    18,662,357,052      branch-misses                    #      3.9 %  branch_miss_rate         (50.14%)
   350,411,558,023      branches                         #    182.4 M/sec  branch_frequency     (66.85%)
 1,435,774,261,319      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.95%)
 1,714,154,229,503      instructions                     #      1.2 instructions  insn_per_cycle  (66.86%)
   429,445,480,497      stalled-cycles-frontend          #     0.30 frontend_cycles_idle        (66.36%)

      60.035899231 seconds time elapsed

Patched:
 Performance counter stats for 'system wide':

         6,662,371      context-switches                 #   3467.3 cs/sec  cs_per_second
      1,921,497.78 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            23,524      cpu-migrations                   #     12.2 migrations/sec  migrations_per_second
         7,783,064      page-faults                      #   4050.5 faults/sec  page_faults_per_second
    18,181,655,163      branch-misses                    #      4.3 %  branch_miss_rate         (50.15%)
   311,865,239,743      branches                         #    162.3 M/sec  branch_frequency     (66.86%)
 1,334,859,779,821      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.96%)
 1,555,086,465,845      instructions                     #      1.2 instructions  insn_per_cycle  (66.87%)
   407,666,712,045      stalled-cycles-frontend          #     0.31 frontend_cycles_idle        (66.35%)

      60.034702643 seconds time elapsed

Suggested-by: Eduard Zingerman <eddyz87@gmail.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
Hi Eduard and Andrii,

Thank you for the feedback on v1. I've optimized the ID mapping reset
logic to O(1) and updated the benchmark results as suggested.

v1 -> v2:
- Rename map_cnt to cnt (Andrii Nakryiko)
- Eliminate memset() by using cnt to bound search loop (Andrii Nakryiko)
- Remove unnecessary if() check in reset_idmap_scratch() (Eduard Zingerman)
- Use full name in Signed-off-by (Eduard Zingerman)

 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 23 +++++++++++------------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 562f7e63be29..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,7 +692,7 @@ struct bpf_id_pair {
 
 struct bpf_idmap {
 	u32 tmp_id_gen;
-	u32 map_cnt;
+	u32 cnt;
 	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6220dde41107..c0e8604618de 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18949,19 +18949,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	if (old_id == 0) /* cur_id == 0 as well */
 		return true;
 
-	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
-		if (!map[i].old) {
-			/* Reached an empty slot; haven't seen this id before */
-			map[i].old = old_id;
-			map[i].cur = cur_id;
-			idmap->map_cnt = i + 1;
-			return true;
-		}
+	for (i = 0; i < idmap->cnt; i++) {
 		if (map[i].old == old_id)
 			return map[i].cur == cur_id;
 		if (map[i].cur == cur_id)
 			return false;
 	}
+
+	/* Reached the end of known mappings; haven't seen this id before */
+	if (idmap->cnt < BPF_ID_MAP_SIZE) {
+		map[idmap->cnt].old = old_id;
+		map[idmap->cnt].cur = cur_id;
+		idmap->cnt++;
+		return true;
+	}
+
 	/* We ran out of idmap slots, which should be impossible */
 	WARN_ON_ONCE(1);
 	return false;
@@ -19475,10 +19477,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env)
 	struct bpf_idmap *idmap = &env->idmap_scratch;
 
 	idmap->tmp_id_gen = env->id_gen;
-	if (idmap->map_cnt) {
-		memset(idmap->map, 0, idmap->map_cnt * sizeof(struct bpf_id_pair));
-		idmap->map_cnt = 0;
-	}
+	idmap->cnt = 0;
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
-- 
2.39.5

Re: [PATCH v2] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Eduard Zingerman 2 weeks, 4 days ago

On Fri, 2026-01-16 at 17:08 +0800, Qiliang Yuan wrote:

[...]

> Metric          | Baseline      | Patched       | Delta
> ----------------|---------------|---------------|----------
> Iterations      | 5710          | 5731          | +0.37%
> Instructions    | 1.714 T       | 1.555 T       | -9.28%
> Inst/Iter       | 300.2 M       | 271.3 M       | -9.63%
> Cycles          | 1.436 T       | 1.335 T       | -7.03%
> Branches        | 350.4 B       | 311.9 B       | -10.99%
> Migrations      | 25,977        | 23,524        | -9.44%
> 
> Test Command:
>   seq 1 2000000 | sudo perf stat -a -- \
>     timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o

[...]

As discussed in a separate thread, I don't think that system-wide
profiling is a good fit here. When using perf stat for collecting
pyperf180.bpf.o processing stats I don't see a big difference.
Regardless of specific statistics, I think this change should be landed.

> Suggested-by: Eduard Zingerman <eddyz87@gmail.com>

Please remove this 'Suggested-by', you found this change yourself,
Andrii suggested improving it further (this usually goes to a changelog).

> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---

Acked-by: Eduard Zingerman <eddyz87@gmail.com>

[...]

[PATCH v3] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Qiliang Yuan 2 weeks, 4 days ago

The verifier uses an ID mapping table (struct bpf_idmap) during state
equivalence checks. Currently, reset_idmap_scratch performs a full memset
on the entire map (~4.7KB) in every call to states_equal.

This ensures that reset overhead is minimal and the search loop is
bounded by the number of IDs actually encountered in the current
equivalence check.

Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
stress test, 60s):

The following results, captured using perf while running veristat in parallel
across all CPU cores, show a significant reduction in instruction overhead
(~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
significantly reduces the verifier's workload during state equivalence
checks.

Metric          | Baseline      | Patched       | Delta
----------------|---------------|---------------|----------
Iterations      | 5710          | 5731          | +0.37%
Instructions    | 1.714 T       | 1.555 T       | -9.28%
Inst/Iter       | 300.2 M       | 271.3 M       | -9.63%
Cycles          | 1.436 T       | 1.335 T       | -7.03%
Branches        | 350.4 B       | 311.9 B       | -10.99%
Migrations      | 25,977        | 23,524        | -9.44%

Test Command:
  seq 1 2000000 | sudo perf stat -a -- \
    timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o

Detailed Performance Stats:

Baseline:
 Performance counter stats for 'system wide':

         6,735,538      context-switches                 #   3505.5 cs/sec  cs_per_second
      1,921,431.27 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            25,977      cpu-migrations                   #     13.5 migrations/sec  migrations_per_second
         7,268,841      page-faults                      #   3783.0 faults/sec  page_fault_per_second
    18,662,357,052      branch-misses                    #      3.9 %  branch_miss_rate         (50.14%)
   350,411,558,023      branches                         #    182.4 M/sec  branch_frequency     (66.85%)
 1,435,774,261,319      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.95%)
 1,714,154,229,503      instructions                     #      1.2 instructions  insn_per_cycle  (66.86%)
   429,445,480,497      stalled-cycles-frontend          #     0.30 frontend_cycles_idle        (66.36%)

      60.035899231 seconds time elapsed

Patched:
 Performance counter stats for 'system wide':

         6,662,371      context-switches                 #   3467.3 cs/sec  cs_per_second
      1,921,497.78 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
            23,524      cpu-migrations                   #     12.2 migrations/sec  migrations_per_second
         7,783,064      page-faults                      #   4050.5 faults/sec  page_faults_per_second
    18,181,655,163      branch-misses                    #      4.3 %  branch_miss_rate         (50.15%)
   311,865,239,743      branches                         #    162.3 M/sec  branch_frequency     (66.86%)
 1,334,859,779,821      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.96%)
 1,555,086,465,845      instructions                     #      1.2 instructions  insn_per_cycle  (66.87%)
   407,666,712,045      stalled-cycles-frontend          #     0.31 frontend_cycles_idle        (66.35%)

      60.034702643 seconds time elapsed

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v3:
 - Remove Suggested-by tags per Eduard's feedback.
 - Add Eduard's Acked-by.
 - Credit Andrii Nakryiko for the further optimization suggestion.
 - Mention the limitation of system-wide profiling in commit message.
v2:
 - Further optimize ID mapping reset (suggested by Andrii Nakryiko) by
   using a simple counter reset and bounding the search loop.
v1:
 - Initial version using a watermark-based partial memset to optimize the
   ID mapping reset overhead.
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
 
 struct bpf_idmap {
 	u32 tmp_id_gen;
+	u32 cnt;
 	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..6ec6d70e5ce7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18948,18 +18948,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	if (old_id == 0) /* cur_id == 0 as well */
 		return true;
 
-	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
-		if (!map[i].old) {
-			/* Reached an empty slot; haven't seen this id before */
-			map[i].old = old_id;
-			map[i].cur = cur_id;
-			return true;
-		}
+	for (i = 0; i < idmap->cnt; i++) {
 		if (map[i].old == old_id)
 			return map[i].cur == cur_id;
 		if (map[i].cur == cur_id)
 			return false;
 	}
+
+	/* Reached the end of known mappings; haven't seen this id before */
+	if (idmap->cnt < BPF_ID_MAP_SIZE) {
+		map[idmap->cnt].old = old_id;
+		map[idmap->cnt].cur = cur_id;
+		idmap->cnt++;
+		return true;
+	}
+
 	/* We ran out of idmap slots, which should be impossible */
 	WARN_ON_ONCE(1);
 	return false;
@@ -19470,8 +19473,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 
 static void reset_idmap_scratch(struct bpf_verifier_env *env)
 {
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	struct bpf_idmap *idmap = &env->idmap_scratch;
+
+	idmap->tmp_id_gen = env->id_gen;
+	idmap->cnt = 0;
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
-- 
2.39.5

Re: [PATCH v3] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Alexei Starovoitov 2 weeks, 4 days ago

On Mon, Jan 19, 2026 at 5:56 PM Qiliang Yuan <realwujing@gmail.com> wrote:
>
> The verifier uses an ID mapping table (struct bpf_idmap) during state
> equivalence checks. Currently, reset_idmap_scratch performs a full memset
> on the entire map (~4.7KB) in every call to states_equal.
>
> This ensures that reset overhead is minimal and the search loop is
> bounded by the number of IDs actually encountered in the current
> equivalence check.
>
> Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
> stress test, 60s):
>
> The following results, captured using perf while running veristat in parallel
> across all CPU cores, show a significant reduction in instruction overhead
> (~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
> significantly reduces the verifier's workload during state equivalence
> checks.

You were already told by multiple people to stop doing this pointless
stress runs across all cpus.

> Metric          | Baseline      | Patched       | Delta
> ----------------|---------------|---------------|----------
> Iterations      | 5710          | 5731          | +0.37%
> Instructions    | 1.714 T       | 1.555 T       | -9.28%
> Inst/Iter       | 300.2 M       | 271.3 M       | -9.63%
> Cycles          | 1.436 T       | 1.335 T       | -7.03%
> Branches        | 350.4 B       | 311.9 B       | -10.99%
> Migrations      | 25,977        | 23,524        | -9.44%
>
> Test Command:
>   seq 1 2000000 | sudo perf stat -a -- \
>     timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o

and you were told to stop using tiny programs that don't exercise the
path you're changing in the patch.

> Detailed Performance Stats:
>
> Baseline:
>  Performance counter stats for 'system wide':
>
>          6,735,538      context-switches                 #   3505.5 cs/sec  cs_per_second
>       1,921,431.27 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
>             25,977      cpu-migrations                   #     13.5 migrations/sec  migrations_per_second
>          7,268,841      page-faults                      #   3783.0 faults/sec  page_fault_per_second
>     18,662,357,052      branch-misses                    #      3.9 %  branch_miss_rate         (50.14%)
>    350,411,558,023      branches                         #    182.4 M/sec  branch_frequency     (66.85%)
>  1,435,774,261,319      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.95%)
>  1,714,154,229,503      instructions                     #      1.2 instructions  insn_per_cycle  (66.86%)
>    429,445,480,497      stalled-cycles-frontend          #     0.30 frontend_cycles_idle        (66.36%)
>
>       60.035899231 seconds time elapsed
>
> Patched:
>  Performance counter stats for 'system wide':
>
>          6,662,371      context-switches                 #   3467.3 cs/sec  cs_per_second
>       1,921,497.78 msec cpu-clock                        #     32.0 CPUs  CPUs_utilized
>             23,524      cpu-migrations                   #     12.2 migrations/sec  migrations_per_second
>          7,783,064      page-faults                      #   4050.5 faults/sec  page_faults_per_second
>     18,181,655,163      branch-misses                    #      4.3 %  branch_miss_rate         (50.15%)
>    311,865,239,743      branches                         #    162.3 M/sec  branch_frequency     (66.86%)
>  1,334,859,779,821      cpu-cycles                       #      0.7 GHz  cycles_frequency       (66.96%)
>  1,555,086,465,845      instructions                     #      1.2 instructions  insn_per_cycle  (66.87%)
>    407,666,712,045      stalled-cycles-frontend          #     0.31 frontend_cycles_idle        (66.35%)
>
>       60.034702643 seconds time elapsed
>
> Acked-by: Eduard Zingerman <eddyz87@gmail.com>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v3:
>  - Remove Suggested-by tags per Eduard's feedback.
>  - Add Eduard's Acked-by.
>  - Credit Andrii Nakryiko for the further optimization suggestion.
>  - Mention the limitation of system-wide profiling in commit message.

Do not add "educational" information to the commit log.
The commit should describe why and what is being done and the result.
If you cannot observe the difference before/after, then don't say anything.

"significantly reduces the verifier's workload" is not true.
You were not able to measure it. Don't invent gains.

pw-bot: cr

[PATCH v4] bpf/verifier: optimize ID mapping reset in states_equal

Posted by Qiliang Yuan 2 weeks, 4 days ago

Currently, reset_idmap_scratch() performs a 4.7KB memset() in every
states_equal() call. Optimize this by using a counter to track used
ID mappings, replacing the O(N) memset() with an O(1) reset and
bounding the search loop in check_ids().

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
 
 struct bpf_idmap {
 	u32 tmp_id_gen;
+	u32 cnt;
 	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..6ec6d70e5ce7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18948,18 +18948,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	if (old_id == 0) /* cur_id == 0 as well */
 		return true;
 
-	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
-		if (!map[i].old) {
-			/* Reached an empty slot; haven't seen this id before */
-			map[i].old = old_id;
-			map[i].cur = cur_id;
-			return true;
-		}
+	for (i = 0; i < idmap->cnt; i++) {
 		if (map[i].old == old_id)
 			return map[i].cur == cur_id;
 		if (map[i].cur == cur_id)
 			return false;
 	}
+
+	/* Reached the end of known mappings; haven't seen this id before */
+	if (idmap->cnt < BPF_ID_MAP_SIZE) {
+		map[idmap->cnt].old = old_id;
+		map[idmap->cnt].cur = cur_id;
+		idmap->cnt++;
+		return true;
+	}
+
 	/* We ran out of idmap slots, which should be impossible */
 	WARN_ON_ONCE(1);
 	return false;
@@ -19470,8 +19473,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 
 static void reset_idmap_scratch(struct bpf_verifier_env *env)
 {
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	struct bpf_idmap *idmap = &env->idmap_scratch;
+
+	idmap->tmp_id_gen = env->id_gen;
+	idmap->cnt = 0;
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
-- 
2.39.5