include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-)
The verifier uses an ID mapping table (struct bpf_idmap) during state
equivalence checks. Currently, reset_idmap_scratch performs a full memset
on the entire map (~4.7KB) in every call to states_equal.
This ensures that reset overhead is minimal and the search loop is
bounded by the number of IDs actually encountered in the current
equivalence check.
Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
stress test, 60s):
The following results, captured using perf while running veristat in parallel
across all CPU cores, show a significant reduction in instruction overhead
(~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
significantly reduces the verifier's workload during state equivalence
checks.
Metric | Baseline | Patched | Delta
----------------|---------------|---------------|----------
Iterations | 5710 | 5731 | +0.37%
Instructions | 1.714 T | 1.555 T | -9.28%
Inst/Iter | 300.2 M | 271.3 M | -9.63%
Cycles | 1.436 T | 1.335 T | -7.03%
Branches | 350.4 B | 311.9 B | -10.99%
Migrations | 25,977 | 23,524 | -9.44%
Test Command:
seq 1 2000000 | sudo perf stat -a -- \
timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o
Detailed Performance Stats:
Baseline:
Performance counter stats for 'system wide':
6,735,538 context-switches # 3505.5 cs/sec cs_per_second
1,921,431.27 msec cpu-clock # 32.0 CPUs CPUs_utilized
25,977 cpu-migrations # 13.5 migrations/sec migrations_per_second
7,268,841 page-faults # 3783.0 faults/sec page_fault_per_second
18,662,357,052 branch-misses # 3.9 % branch_miss_rate (50.14%)
350,411,558,023 branches # 182.4 M/sec branch_frequency (66.85%)
1,435,774,261,319 cpu-cycles # 0.7 GHz cycles_frequency (66.95%)
1,714,154,229,503 instructions # 1.2 instructions insn_per_cycle (66.86%)
429,445,480,497 stalled-cycles-frontend # 0.30 frontend_cycles_idle (66.36%)
60.035899231 seconds time elapsed
Patched:
Performance counter stats for 'system wide':
6,662,371 context-switches # 3467.3 cs/sec cs_per_second
1,921,497.78 msec cpu-clock # 32.0 CPUs CPUs_utilized
23,524 cpu-migrations # 12.2 migrations/sec migrations_per_second
7,783,064 page-faults # 4050.5 faults/sec page_faults_per_second
18,181,655,163 branch-misses # 4.3 % branch_miss_rate (50.15%)
311,865,239,743 branches # 162.3 M/sec branch_frequency (66.86%)
1,334,859,779,821 cpu-cycles # 0.7 GHz cycles_frequency (66.96%)
1,555,086,465,845 instructions # 1.2 instructions insn_per_cycle (66.87%)
407,666,712,045 stalled-cycles-frontend # 0.31 frontend_cycles_idle (66.35%)
60.034702643 seconds time elapsed
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
v3:
- Remove Suggested-by tags per Eduard's feedback.
- Add Eduard's Acked-by.
- Credit Andrii Nakryiko for the further optimization suggestion.
- Mention the limitation of system-wide profiling in commit message.
v2:
- Further optimize ID mapping reset (suggested by Andrii Nakryiko) by
using a simple counter reset and bounding the search loop.
v1:
- Initial version using a watermark-based partial memset to optimize the
ID mapping reset overhead.
include/linux/bpf_verifier.h | 1 +
kernel/bpf/verifier.c | 23 ++++++++++++++---------
2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
struct bpf_idmap {
u32 tmp_id_gen;
+ u32 cnt;
struct bpf_id_pair map[BPF_ID_MAP_SIZE];
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..6ec6d70e5ce7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18948,18 +18948,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
if (old_id == 0) /* cur_id == 0 as well */
return true;
- for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!map[i].old) {
- /* Reached an empty slot; haven't seen this id before */
- map[i].old = old_id;
- map[i].cur = cur_id;
- return true;
- }
+ for (i = 0; i < idmap->cnt; i++) {
if (map[i].old == old_id)
return map[i].cur == cur_id;
if (map[i].cur == cur_id)
return false;
}
+
+ /* Reached the end of known mappings; haven't seen this id before */
+ if (idmap->cnt < BPF_ID_MAP_SIZE) {
+ map[idmap->cnt].old = old_id;
+ map[idmap->cnt].cur = cur_id;
+ idmap->cnt++;
+ return true;
+ }
+
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
@@ -19470,8 +19473,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ struct bpf_idmap *idmap = &env->idmap_scratch;
+
+ idmap->tmp_id_gen = env->id_gen;
+ idmap->cnt = 0;
}
static bool states_equal(struct bpf_verifier_env *env,
--
2.39.5
On Mon, Jan 19, 2026 at 5:56 PM Qiliang Yuan <realwujing@gmail.com> wrote:
>
> The verifier uses an ID mapping table (struct bpf_idmap) during state
> equivalence checks. Currently, reset_idmap_scratch performs a full memset
> on the entire map (~4.7KB) in every call to states_equal.
>
> This ensures that reset overhead is minimal and the search loop is
> bounded by the number of IDs actually encountered in the current
> equivalence check.
>
> Benchmark results (system-wide 'perf stat' during high-concurrency 'veristat'
> stress test, 60s):
>
> The following results, captured using perf while running veristat in parallel
> across all CPU cores, show a significant reduction in instruction overhead
> (~9.3%) and branch executions (~11%), confirming that the O(1) reset logic
> significantly reduces the verifier's workload during state equivalence
> checks.
You were already told by multiple people to stop doing this pointless
stress runs across all cpus.
> Metric | Baseline | Patched | Delta
> ----------------|---------------|---------------|----------
> Iterations | 5710 | 5731 | +0.37%
> Instructions | 1.714 T | 1.555 T | -9.28%
> Inst/Iter | 300.2 M | 271.3 M | -9.63%
> Cycles | 1.436 T | 1.335 T | -7.03%
> Branches | 350.4 B | 311.9 B | -10.99%
> Migrations | 25,977 | 23,524 | -9.44%
>
> Test Command:
> seq 1 2000000 | sudo perf stat -a -- \
> timeout 60s xargs -P $(nproc) -I {} ./veristat access_map_in_map.bpf.o
and you were told to stop using tiny programs that don't exercise the
path you're changing in the patch.
> Detailed Performance Stats:
>
> Baseline:
> Performance counter stats for 'system wide':
>
> 6,735,538 context-switches # 3505.5 cs/sec cs_per_second
> 1,921,431.27 msec cpu-clock # 32.0 CPUs CPUs_utilized
> 25,977 cpu-migrations # 13.5 migrations/sec migrations_per_second
> 7,268,841 page-faults # 3783.0 faults/sec page_fault_per_second
> 18,662,357,052 branch-misses # 3.9 % branch_miss_rate (50.14%)
> 350,411,558,023 branches # 182.4 M/sec branch_frequency (66.85%)
> 1,435,774,261,319 cpu-cycles # 0.7 GHz cycles_frequency (66.95%)
> 1,714,154,229,503 instructions # 1.2 instructions insn_per_cycle (66.86%)
> 429,445,480,497 stalled-cycles-frontend # 0.30 frontend_cycles_idle (66.36%)
>
> 60.035899231 seconds time elapsed
>
> Patched:
> Performance counter stats for 'system wide':
>
> 6,662,371 context-switches # 3467.3 cs/sec cs_per_second
> 1,921,497.78 msec cpu-clock # 32.0 CPUs CPUs_utilized
> 23,524 cpu-migrations # 12.2 migrations/sec migrations_per_second
> 7,783,064 page-faults # 4050.5 faults/sec page_faults_per_second
> 18,181,655,163 branch-misses # 4.3 % branch_miss_rate (50.15%)
> 311,865,239,743 branches # 162.3 M/sec branch_frequency (66.86%)
> 1,334,859,779,821 cpu-cycles # 0.7 GHz cycles_frequency (66.96%)
> 1,555,086,465,845 instructions # 1.2 instructions insn_per_cycle (66.87%)
> 407,666,712,045 stalled-cycles-frontend # 0.31 frontend_cycles_idle (66.35%)
>
> 60.034702643 seconds time elapsed
>
> Acked-by: Eduard Zingerman <eddyz87@gmail.com>
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> v3:
> - Remove Suggested-by tags per Eduard's feedback.
> - Add Eduard's Acked-by.
> - Credit Andrii Nakryiko for the further optimization suggestion.
> - Mention the limitation of system-wide profiling in commit message.
Do not add "educational" information to the commit log.
The commit should describe why and what is being done and the result.
If you cannot observe the difference before/after, then don't say anything.
"significantly reduces the verifier's workload" is not true.
You were not able to measure it. Don't invent gains.
pw-bot: cr
Currently, reset_idmap_scratch() performs a 4.7KB memset() in every
states_equal() call. Optimize this by using a counter to track used
ID mappings, replacing the O(N) memset() with an O(1) reset and
bounding the search loop in check_ids().
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
---
include/linux/bpf_verifier.h | 1 +
kernel/bpf/verifier.c | 23 ++++++++++++++---------
2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 130bcbd66f60..8355b585cd18 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -692,6 +692,7 @@ struct bpf_id_pair {
struct bpf_idmap {
u32 tmp_id_gen;
+ u32 cnt;
struct bpf_id_pair map[BPF_ID_MAP_SIZE];
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..6ec6d70e5ce7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18948,18 +18948,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
if (old_id == 0) /* cur_id == 0 as well */
return true;
- for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!map[i].old) {
- /* Reached an empty slot; haven't seen this id before */
- map[i].old = old_id;
- map[i].cur = cur_id;
- return true;
- }
+ for (i = 0; i < idmap->cnt; i++) {
if (map[i].old == old_id)
return map[i].cur == cur_id;
if (map[i].cur == cur_id)
return false;
}
+
+ /* Reached the end of known mappings; haven't seen this id before */
+ if (idmap->cnt < BPF_ID_MAP_SIZE) {
+ map[idmap->cnt].old = old_id;
+ map[idmap->cnt].cur = cur_id;
+ idmap->cnt++;
+ return true;
+ }
+
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
@@ -19470,8 +19473,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ struct bpf_idmap *idmap = &env->idmap_scratch;
+
+ idmap->tmp_id_gen = env->id_gen;
+ idmap->cnt = 0;
}
static bool states_equal(struct bpf_verifier_env *env,
--
2.39.5
© 2016 - 2026 Red Hat, Inc.