ALU sanitization was introduced to ensure that a subsequent ptr access
can never go out of bounds, even under speculation. This is required
because we currently allow speculative scalar confusion: Spectre v4
sanitization only adds a nospec after critical stores (e.g., a scalar
overwritten with a pointer).
If we add a nospec before the ALU op, none of the operands can be
subject to scalar confusion. As an ADD/SUB cannot introduce scalar
confusion itself, the result will also not be subject to scalar
confusion. Therefore, the subsequent ptr access is always safe.
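For illustration, a minimal sketch of the intended instruction sequence
(hypothetical example; "nospec" denotes the verifier-inserted speculation
barrier, i.e. a BPF_NOSPEC insn):

  r0 = <map value ptr>   /* pointer operand                              */
  r2 = <scalar>          /* may be subject to speculative scalar
                          * confusion (Spectre v4)                       */
  nospec                 /* barrier: operands now match arch. state      */
  r0 += r2               /* ADD/SUB cannot introduce scalar confusion    */
  r1 = *(u8 *)(r0 + 0)   /* subsequent ptr access cannot go out of
                          * bounds, even under speculation               */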
For REASON_STACK, we raise the error from push_stack() directly now.
This effectively only changes the error code from EACCES to
ENOMEM (which arguably also fits). Raising the push_stack() error allows
a future commit to fall back to nospec for certain errors from
push_stack() if on a speculative path.
Fall back to nospec directly for the remaining sanitization errors even
if we are not on a speculative path.
Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Henriette Herzog <henriette.herzog@rub.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
---
kernel/bpf/verifier.c | 85 ++++++-------------
.../selftests/bpf/progs/verifier_bounds.c | 5 +-
.../bpf/progs/verifier_bounds_deduction.c | 43 ++++++----
.../selftests/bpf/progs/verifier_map_ptr.c | 12 ++-
.../bpf/progs/verifier_value_ptr_arith.c | 44 ++++++----
5 files changed, 92 insertions(+), 97 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 610f9567af7c..03af82f52a02 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13809,14 +13809,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
-enum {
- REASON_BOUNDS = -1,
- REASON_TYPE = -2,
- REASON_PATHS = -3,
- REASON_LIMIT = -4,
- REASON_STACK = -5,
-};
-
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
u32 *alu_limit, bool mask_to_left)
{
@@ -13839,11 +13831,13 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
ptr_reg->umax_value) + ptr_reg->off;
break;
default:
- return REASON_TYPE;
+ /* Register has pointer with unsupported alu operation. */
+ return -EOPNOTSUPP;
}
+ /* Register tried access beyond pointer bounds. */
if (ptr_limit >= max)
- return REASON_LIMIT;
+ return -EOPNOTSUPP;
*alu_limit = ptr_limit;
return 0;
}
@@ -13864,8 +13858,12 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
*/
if (aux->alu_state &&
(aux->alu_state != alu_state ||
- aux->alu_limit != alu_limit))
- return REASON_PATHS;
+ aux->alu_limit != alu_limit)) {
+ /* Tried to perform alu op from different maps, paths or scalars */
+ aux->nospec = true;
+ aux->alu_state = 0;
+ return 0;
+ }
/* Corresponding fixup done in do_misc_fixups(). */
aux->alu_state = alu_state;
@@ -13946,16 +13944,24 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (!commit_window) {
if (!tnum_is_const(off_reg->var_off) &&
- (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
- return REASON_BOUNDS;
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) {
+ /* Register has unknown scalar with mixed signed bounds. */
+ aux->nospec = true;
+ aux->alu_state = 0;
+ return 0;
+ }
info->mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
}
err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
- if (err < 0)
- return err;
+ if (err) {
+ WARN_ON_ONCE(err != -EOPNOTSUPP);
+ aux->nospec = true;
+ aux->alu_state = 0;
+ return 0;
+ }
if (commit_window) {
/* In commit phase we narrow the masking window based on
@@ -14008,7 +14014,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
env->insn_idx);
if (!ptr_is_dst_reg && !IS_ERR(branch))
*dst_reg = tmp;
- return IS_ERR(branch) ? REASON_STACK : 0;
+ return PTR_ERR_OR_ZERO(branch);
}
static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
@@ -14024,45 +14030,6 @@ static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
}
-static int sanitize_err(struct bpf_verifier_env *env,
- const struct bpf_insn *insn, int reason,
- const struct bpf_reg_state *off_reg,
- const struct bpf_reg_state *dst_reg)
-{
- static const char *err = "pointer arithmetic with it prohibited for !root";
- const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
- u32 dst = insn->dst_reg, src = insn->src_reg;
-
- switch (reason) {
- case REASON_BOUNDS:
- verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
- off_reg == dst_reg ? dst : src, err);
- break;
- case REASON_TYPE:
- verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
- off_reg == dst_reg ? src : dst, err);
- break;
- case REASON_PATHS:
- verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
- dst, op, err);
- break;
- case REASON_LIMIT:
- verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
- dst, op, err);
- break;
- case REASON_STACK:
- verbose(env, "R%d could not be pushed for speculative verification, %s\n",
- dst, err);
- break;
- default:
- verbose(env, "verifier internal error: unknown reason (%d)\n",
- reason);
- break;
- }
-
- return -EACCES;
-}
-
/* check that stack access falls within stack limits and that 'reg' doesn't
* have a variable offset.
*
@@ -14228,7 +14195,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
&info, false);
if (ret < 0)
- return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ return ret;
}
switch (opcode) {
@@ -14356,7 +14323,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
&info, true);
if (ret < 0)
- return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ return ret;
}
return 0;
@@ -14950,7 +14917,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (sanitize_needed(opcode)) {
ret = sanitize_val_alu(env, insn);
if (ret < 0)
- return sanitize_err(env, insn, ret, NULL, NULL);
+ return ret;
}
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 0488ef05a7a4..ad405a609db4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -47,9 +47,12 @@ SEC("socket")
__description("subtraction bounds (map value) variant 2")
__failure
__msg("R0 min value is negative, either use unsigned index or do a if (index >=0) check.")
-__msg_unpriv("R1 has unknown scalar with mixed signed bounds")
+__msg_unpriv("R0 pointer arithmetic of map value goes out of range, prohibited for !root")
__naked void bounds_map_value_variant_2(void)
{
+ /* unpriv: nospec inserted to prevent "R1 has unknown scalar with mixed
+ * signed bounds".
+ */
asm volatile (" \
r1 = 0; \
*(u64*)(r10 - 8) = r1; \
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
index c506afbdd936..c2b5099fa208 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
@@ -8,21 +8,21 @@
SEC("socket")
__description("check deducing bounds from const, 1")
__failure __msg("R0 tried to subtract pointer from scalar")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__naked void deducing_bounds_from_const_1(void)
{
asm volatile (" \
r0 = 1; \
if r0 s>= 1 goto l0_%=; \
-l0_%=: r0 -= r1; \
+l0_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r0 -= r1; \
exit; \
" ::: __clobber_all);
}
SEC("socket")
__description("check deducing bounds from const, 2")
-__success __failure_unpriv
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__success __success_unpriv
__retval(1)
__naked void deducing_bounds_from_const_2(void)
{
@@ -32,7 +32,8 @@ __naked void deducing_bounds_from_const_2(void)
exit; \
l0_%=: if r0 s<= 1 goto l1_%=; \
exit; \
-l1_%=: r1 -= r0; \
+l1_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r1 -= r0; \
exit; \
" ::: __clobber_all);
}
@@ -40,21 +41,21 @@ l1_%=: r1 -= r0; \
SEC("socket")
__description("check deducing bounds from const, 3")
__failure __msg("R0 tried to subtract pointer from scalar")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__naked void deducing_bounds_from_const_3(void)
{
asm volatile (" \
r0 = 0; \
if r0 s<= 0 goto l0_%=; \
-l0_%=: r0 -= r1; \
+l0_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r0 -= r1; \
exit; \
" ::: __clobber_all);
}
SEC("socket")
__description("check deducing bounds from const, 4")
-__success __failure_unpriv
-__msg_unpriv("R6 has pointer with unsupported alu operation")
+__success __success_unpriv
__retval(0)
__naked void deducing_bounds_from_const_4(void)
{
@@ -65,7 +66,8 @@ __naked void deducing_bounds_from_const_4(void)
exit; \
l0_%=: if r0 s>= 0 goto l1_%=; \
exit; \
-l1_%=: r6 -= r0; \
+l1_%=: /* unpriv: nospec (inserted to prevent `R6 has pointer with unsupported alu operation`) */\
+ r6 -= r0; \
exit; \
" ::: __clobber_all);
}
@@ -73,12 +75,13 @@ l1_%=: r6 -= r0; \
SEC("socket")
__description("check deducing bounds from const, 5")
__failure __msg("R0 tried to subtract pointer from scalar")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__naked void deducing_bounds_from_const_5(void)
{
asm volatile (" \
r0 = 0; \
if r0 s>= 1 goto l0_%=; \
+ /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
r0 -= r1; \
l0_%=: exit; \
" ::: __clobber_all);
@@ -87,14 +90,15 @@ l0_%=: exit; \
SEC("socket")
__description("check deducing bounds from const, 6")
__failure __msg("R0 tried to subtract pointer from scalar")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__naked void deducing_bounds_from_const_6(void)
{
asm volatile (" \
r0 = 0; \
if r0 s>= 0 goto l0_%=; \
exit; \
-l0_%=: r0 -= r1; \
+l0_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r0 -= r1; \
exit; \
" ::: __clobber_all);
}
@@ -102,14 +106,15 @@ l0_%=: r0 -= r1; \
SEC("socket")
__description("check deducing bounds from const, 7")
__failure __msg("dereference of modified ctx ptr")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__flag(BPF_F_ANY_ALIGNMENT)
__naked void deducing_bounds_from_const_7(void)
{
asm volatile (" \
r0 = %[__imm_0]; \
if r0 s>= 0 goto l0_%=; \
-l0_%=: r1 -= r0; \
+l0_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r1 -= r0; \
r0 = *(u32*)(r1 + %[__sk_buff_mark]); \
exit; \
" :
@@ -121,13 +126,14 @@ l0_%=: r1 -= r0; \
SEC("socket")
__description("check deducing bounds from const, 8")
__failure __msg("negative offset ctx ptr R1 off=-1 disallowed")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__flag(BPF_F_ANY_ALIGNMENT)
__naked void deducing_bounds_from_const_8(void)
{
asm volatile (" \
r0 = %[__imm_0]; \
if r0 s>= 0 goto l0_%=; \
+ /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
r1 += r0; \
l0_%=: r0 = *(u32*)(r1 + %[__sk_buff_mark]); \
exit; \
@@ -140,13 +146,14 @@ l0_%=: r0 = *(u32*)(r1 + %[__sk_buff_mark]); \
SEC("socket")
__description("check deducing bounds from const, 9")
__failure __msg("R0 tried to subtract pointer from scalar")
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__failure_unpriv
__naked void deducing_bounds_from_const_9(void)
{
asm volatile (" \
r0 = 0; \
if r0 s>= 0 goto l0_%=; \
-l0_%=: r0 -= r1; \
+l0_%=: /* unpriv: nospec (inserted to prevent `R1 has pointer with unsupported alu operation`) */\
+ r0 -= r1; \
exit; \
" ::: __clobber_all);
}
diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
index 11a079145966..8d7cd5b82bbd 100644
--- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
@@ -110,11 +110,13 @@ __naked void ptr_read_ops_field_accepted(void)
SEC("socket")
__description("bpf_map_ptr: r = 0, map_ptr = map_ptr + r")
-__success __failure_unpriv
-__msg_unpriv("R1 has pointer with unsupported alu operation")
+__success __success_unpriv
__retval(0)
__naked void map_ptr_map_ptr_r(void)
{
+ /* unpriv: nospec inserted to prevent "R1 has pointer with unsupported
+ * alu operation".
+ */
asm volatile (" \
r0 = 0; \
*(u64*)(r10 - 8) = r0; \
@@ -134,11 +136,13 @@ __naked void map_ptr_map_ptr_r(void)
SEC("socket")
__description("bpf_map_ptr: r = 0, r = r + map_ptr")
-__success __failure_unpriv
-__msg_unpriv("R0 has pointer with unsupported alu operation")
+__success __success_unpriv
__retval(0)
__naked void _0_r_r_map_ptr(void)
{
+ /* unpriv: nospec inserted to prevent "R1 has pointer with unsupported
+ * alu operation".
+ */
asm volatile (" \
r0 = 0; \
*(u64*)(r10 - 8) = r0; \
diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
index 2acd7769bd91..d1be1b6f4674 100644
--- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
@@ -41,11 +41,13 @@ struct {
SEC("socket")
__description("map access: known scalar += value_ptr unknown vs const")
-__success __failure_unpriv
-__msg_unpriv("R1 tried to add from different maps, paths or scalars")
+__success __success_unpriv
__retval(1)
__naked void value_ptr_unknown_vs_const(void)
{
+ /* unpriv: nospec inserted to prevent "R1 tried to add from different
+ * maps, paths or scalars".
+ */
asm volatile (" \
r0 = *(u32*)(r1 + %[__sk_buff_len]); \
r1 = 0; \
@@ -79,11 +81,13 @@ l2_%=: r0 = 1; \
SEC("socket")
__description("map access: known scalar += value_ptr const vs unknown")
-__success __failure_unpriv
-__msg_unpriv("R1 tried to add from different maps, paths or scalars")
+__success __success_unpriv
__retval(1)
__naked void value_ptr_const_vs_unknown(void)
{
+ /* unpriv: nospec inserted to prevent "R1 tried to add from different
+ * maps, paths or scalars".
+ */
asm volatile (" \
r0 = *(u32*)(r1 + %[__sk_buff_len]); \
r1 = 0; \
@@ -117,11 +121,13 @@ l2_%=: r0 = 1; \
SEC("socket")
__description("map access: known scalar += value_ptr const vs const (ne)")
-__success __failure_unpriv
-__msg_unpriv("R1 tried to add from different maps, paths or scalars")
+__success __success_unpriv
__retval(1)
__naked void ptr_const_vs_const_ne(void)
{
+ /* unpriv: nospec inserted to prevent "R1 tried to add from different
+ * maps, paths or scalars".
+ */
asm volatile (" \
r0 = *(u32*)(r1 + %[__sk_buff_len]); \
r1 = 0; \
@@ -225,8 +231,7 @@ l2_%=: r0 = 1; \
SEC("socket")
__description("map access: known scalar += value_ptr unknown vs unknown (lt)")
-__success __failure_unpriv
-__msg_unpriv("R1 tried to add from different maps, paths or scalars")
+__success __success_unpriv
__retval(1)
__naked void ptr_unknown_vs_unknown_lt(void)
{
@@ -251,7 +256,8 @@ l1_%=: call %[bpf_map_lookup_elem]; \
l3_%=: r1 = 6; \
r1 = -r1; \
r1 &= 0x7; \
-l4_%=: r1 += r0; \
+l4_%=: /* unpriv: nospec (inserted to prevent `R1 tried to add from different maps, paths or scalars`) */\
+ r1 += r0; \
r0 = *(u8*)(r1 + 0); \
l2_%=: r0 = 1; \
exit; \
@@ -265,11 +271,13 @@ l2_%=: r0 = 1; \
SEC("socket")
__description("map access: known scalar += value_ptr unknown vs unknown (gt)")
-__success __failure_unpriv
-__msg_unpriv("R1 tried to add from different maps, paths or scalars")
+__success __success_unpriv
__retval(1)
__naked void ptr_unknown_vs_unknown_gt(void)
{
+ /* unpriv: nospec inserted to prevent "R1 tried to add from different
+ * maps, paths or scalars".
+ */
asm volatile (" \
r0 = *(u32*)(r1 + %[__sk_buff_len]); \
r1 = 0; \
@@ -398,11 +406,14 @@ l2_%=: r0 = 1; \
SEC("socket")
__description("map access: mixing value pointer and scalar, 1")
-__success __failure_unpriv
-__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root")
+__success __success_unpriv
__retval(0)
__naked void value_pointer_and_scalar_1(void)
{
+ /* unpriv: nospec inserted to prevent "R2 tried to add from different
+ * maps, paths or scalars, pointer arithmetic with it prohibited for
+ * !root".
+ */
asm volatile (" \
/* load map value pointer into r0 and r2 */ \
r0 = 1; \
@@ -452,11 +463,14 @@ l4_%=: /* fake-dead code; targeted from branch A to \
SEC("socket")
__description("map access: mixing value pointer and scalar, 2")
-__success __failure_unpriv
-__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root")
+__success __success_unpriv
__retval(0)
__naked void value_pointer_and_scalar_2(void)
{
+ /* unpriv: nospec inserted to prevent "R2 tried to add from different
+ * maps, paths or scalars, pointer arithmetic with it prohibited for
+ * !root".
+ */
asm volatile (" \
/* load map value pointer into r0 and r2 */ \
r0 = 1; \
--
2.48.1
This trades verification complexity for runtime overheads due to the
nospec inserted because of the EINVAL.
With increased limits this allows applying mitigations to large BPF
progs such as the Parca Continuous Profiler's prog. However, this
requires a jump-seq limit of 256k. In any case, the same principle
should apply to smaller programs therefore include it even if the limit
stays at 8k for now. Most programs in [1] only require a limit of 32k.
[1] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and
Precise Spectre Defenses for Untrusted Linux Kernel Extensions")
Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Henriette Herzog <henriette.herzog@rub.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
---
kernel/bpf/verifier.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 03af82f52a02..49c7e2608ccd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -187,6 +187,7 @@ struct bpf_verifier_stack_elem {
};
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
+#define BPF_COMPLEXITY_LIMIT_SPEC_V1_VERIFICATION (BPF_COMPLEXITY_LIMIT_JMP_SEQ / 2)
#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_KEY_POISON (1ULL << 63)
@@ -2010,6 +2011,19 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
struct bpf_verifier_stack_elem *elem;
int err;
+ if (!env->bypass_spec_v1 &&
+ cur->speculative &&
+ env->stack_size > BPF_COMPLEXITY_LIMIT_SPEC_V1_VERIFICATION) {
+ /* Avoiding nested speculative path verification because we are
+ * close to exceeding the jump sequence complexity limit. Will
+ * instead insert a speculation barrier which will impact
+ * performace. To improve performance, authors should reduce the
+ * program's complexity. Barrier will be inserted in
+ * do_check().
+ */
+ return ERR_PTR(-EINVAL);
+ }
+
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem) {
err = -ENOMEM;
--
2.48.1
On Thu, Mar 13, 2025 at 10:57 AM Luis Gerhorst <luis.gerhorst@fau.de> wrote:
>
> This trades verification complexity for runtime overheads due to the
> nospec inserted because of the EINVAL.
>
> With increased limits this allows applying mitigations to large BPF
> progs such as the Parca Continuous Profiler's prog. However, this
> requires a jump-seq limit of 256k. In any case, the same principle
> should apply to smaller programs therefore include it even if the limit
> stays at 8k for now. Most programs in [1] only require a limit of 32k.

Do you mean that without this change the verifier needs 256k
jmp limit to load Parca's prog as unpriv due to speculative
path exploration with push_stack ?

And this change uses 4k as a trade-off between prog runtime
and verification time ?

But tracing progs use bpf_probe_read_kernel(), so they're never going
to be unpriv.

> @@ -2010,6 +2011,19 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
>         struct bpf_verifier_stack_elem *elem;
>         int err;
>
> +       if (!env->bypass_spec_v1 &&
> +           cur->speculative &&

Should this be
(cur->speculative || speculative)
?

In general I'm not convinced that the approach is safe.

This recoverable EINVAL means that exploration under speculation
stops early, but there could be more branches and they won't be
sanitized with extra lfence.
So speculative execution can still happen at later insns.

Similar concern in patch 7:
+       if (state->speculative && cur_aux(env)->nospec)
+               goto process_bpf_exit;

One lfence at this insn doesn't stop speculation until the program end.
Only at this insn. The rest of the code is free to speculate.

The refactoring in patches 1-3 is nice.
Patches 4-5 are tricky and somewhat questionable, but make sense.
Patch 7 without early goto process_bpf_exit looks correct too,
Patch 8 is borderline. Feels like it's opening the door for
new vulnerabilities and space to explore for security researchers.
We disabled unpriv bpf by default and have no intentions to enable it.
Even if we land the whole thing the unpriv will stay disabled.
So trade offs don't appear favorable.
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> On Thu, Mar 13, 2025 at 10:57 AM Luis Gerhorst <luis.gerhorst@fau.de> wrote:
>> With increased limits this allows applying mitigations to large BPF
>> progs such as the Parca Continuous Profiler's prog. However, this
>> requires a jump-seq limit of 256k. In any case, the same principle
>> should apply to smaller programs therefore include it even if the limit
>> stays at 8k for now. Most programs in [1] only require a limit of 32k.
>
> Do you mean that without this change the verifier needs 256k
> jmp limit to load Parca's prog as unpriv due to speculative
> path exploration with push_stack ?
>
Only with this change is Parca loadable when manually enabling Spectre
defenses for privileged programs and setting the following limits:
- BPF_COMPLEXITY_LIMIT_JMP_SEQ=256k
- BPF_COMPLEXITY_LIMIT_SPEC_V1_VERIFICATION=128k
- BPF_COMPLEXITY_LIMIT_INSNS=32M
>
> And this change uses 4k as a trade-off between prog runtime
> and verification time ?
>
Yes, this change uses 4k to limit nested speculative path exploration.
At the top-level (i.e., on architectural paths), spawned speculative
paths are still explored.
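For reference, with the limits from this patch that is
BPF_COMPLEXITY_LIMIT_SPEC_V1_VERIFICATION = BPF_COMPLEXITY_LIMIT_JMP_SEQ / 2
= 8192 / 2 = 4096 states on the verification stack; with the Parca
configuration above it is 256k / 2 = 128k.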
>
> But tracing progs use bpf_probe_read_kernel(), so they're never going
> to be unpriv.
>
I'm sorry this was not clear. Parca is only used as an example here
to test whether this improves expressiveness in general.
If you do not see this as a favorable tradeoff (because of the code
complexity), I think it would be acceptable to drop the last patch for
now. The biggest improvement is already achieved with the other patches,
as evident from the selftests. I can do a more exhaustive analysis on
patch 11 later.
>
>> @@ -2010,6 +2011,19 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
>> struct bpf_verifier_stack_elem *elem;
>> int err;
>>
>> + if (!env->bypass_spec_v1 &&
>> + cur->speculative &&
>
> Should this be
> (cur->speculative || speculative)
> ?
No, I think it would be unsafe to add `|| speculative` here. If we were
to return -EINVAL from push_stack() when pushing a speculative path from
a non-speculative path (e.g., in check_cond_jmp_op() through
sanitize_speculative_path()), this would cause do_check() to add an
lfence before the jump-op.
Here's a minimal example program:
A = true
B = true
if A goto e
f()
if B goto e
unsafe()
e: exit
There are the following speculative and non-speculative paths
(`cur->speculative` and `speculative` referring to the value of the
push_stack() parameters):
- A = true
- B = true
- if A goto e
  - A && !cur->speculative && !speculative
    - exit
  - !A && !cur->speculative && speculative
    - f()
    - if B goto e
      - B && cur->speculative && !speculative
        - exit
      - !B && cur->speculative && speculative
        - unsafe()
`|| speculative` might cause us to only add an lfence before `if A goto
e`, which would not be sufficient. Intel recommends adding the lfence
after the jump [1].
>
> In general I'm not convinced that the approach is safe.
>
> This recoverable EINVAL means that exploration under speculation
> stops early, but there could be more branches and they won't be
> sanitized with extra lfence.
> So speculative execution can still happen at later insns.
>
`goto process_bpf_exit` only causes us to stop analyzing this particular
path, not the rest of the program.
This is based on the assumption that the lfence stops the CPU from ever
reaching those branches (if they are not reachable through other means).
>
> Similar concern in patch 7:
> + if (state->speculative && cur_aux(env)->nospec)
> + goto process_bpf_exit;
>
> One lfence at this insn doesn't stop speculation until the program end.
> Only at this insn. The rest of the code is free to speculate.
>
Taking the program above as an example, this allows us to stop before
f() if an lfence was inserted there.
Fully patched program would be:
A = true
B = true
if A goto e
lfence
f()
if B goto e
unsafe()
e: exit
In this example, all instructions after the lfence are dead code (and
with the lfence they are also dead code speculatively).
I believe this is in line with Intel's guidance [1]:
An LFENCE instruction or a serializing instruction will ensure that
no later instructions execute, even speculatively, until all prior
instructions complete locally. Developers might prefer LFENCE over a
serializing instruction because LFENCE may have lower latency.
Inserting an LFENCE instruction after a bounds check prevents later
operations from executing before the bound check completes.
With regards to the example, this implies that `if B goto e` will not
execute before `if A goto e` completes. Once `if A goto e` completes,
the CPU should find that the speculation was wrong and continue with
`exit`.
If there is any other path that leads to `if B goto e` (and therefore
`unsafe()`) without going through `if A goto e`, then an lfence might of
course still be needed there. However, I assume this other path will be
explored separately and therefore be discovered by the verifier even if
the exploration discussed here stops at the lfence. If this assumption
is wrong, please let me know.
>
> The refactoring in patches 1-3 is nice.
> Patches 4-5 are tricky and somewhat questionable, but make sense.
> Patch 7 without early goto process_bpf_exit looks correct too,
> Patch 8 is borderline. Feels like it's opening the door for
> new vulnerabilities and space to explore for security researchers.
> We disabled unpriv bpf by default and have no intentions to enable it.
> Even if we land the whole thing the unpriv will stay disabled.
> So trade offs don't appear favorable.
>
Thank you very much for having a look. Let me know whether the above
resolves your concern.
In any case, should I separate patches 1-3 into another series?
[1] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/runtime-speculative-side-channel-mitigations.html
("Managed Runtime Speculative Execution Side Channel Mitigations")
On Wed, Mar 19, 2025 at 2:06 AM Luis Gerhorst <luis.gerhorst@fau.de> wrote:
>
> Thank you very much for having a look. Let me know whether the above
> resolves your concern.
>
> In any case, should I separate patches 1-3 into another series?

Sorry for the delay. LSFMM was followed by the busy merge window.

Please rebase and repost the patches with a detailed explanation of
how lfence works internally and how it affects the algorithm.

I mistakenly thought that lfence is a load fence only, i.e. that it
forces all prior loads to complete, but not the other insns.
Since it's an absolute speculation barrier the algorithm appears sound.

My only remaining reservation is the heuristic in this patch.
If we don't do it, we wouldn't have to special case push_stack() too, right?

Let's continue discussion in the new thread.