css_iter and process_iter should be used in an RCU section. Specifically, in
sleepable progs an explicit bpf_rcu_read_lock() is needed before using these
iters. In normal bpf progs that have an implicit rcu_read_lock(), it's OK to
use them directly.
This patch checks whether we are in rcu cs before we want to invoke
bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
reject if reg->type is UNTRUSTED.
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2367483bf4c2..6a6827ba7a18 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1172,7 +1172,13 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+/* check whether we are using bpf_iter_process_*() or bpf_iter_css_*() */
+static bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta);
+
static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
struct bpf_reg_state *reg, int insn_idx,
struct btf *btf, u32 btf_id, int nr_slots)
{
@@ -1193,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_iter_need_rcu(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
+ }
st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = i == 0 ? id : 0;
st->iter.btf = btf;
@@ -1281,6 +1293,8 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
struct bpf_stack_state *slot = &state->stack[spi - i];
struct bpf_reg_state *st = &slot->spilled_ptr;
+ if (st->type & PTR_UNTRUSTED)
+ return false;
/* only main (first) slot has ref_obj_id set */
if (i == 0 && !st->ref_obj_id)
return false;
@@ -7503,13 +7517,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
return err;
}
- err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
if (err)
return err;
} else {
/* iter_next() or iter_destroy() expect initialized iter state*/
if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
- verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ verbose(env, "expected an initialized iter_%s as arg #%d or without bpf_rcu_read_lock()\n",
iter_type_str(meta->btf, btf_id), regno);
return -EINVAL;
}
@@ -10382,6 +10396,18 @@ BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_iter_css_task_new)
+BTF_SET_START(rcu_protect_kfuns_set)
+BTF_ID(func, bpf_iter_process_new)
+BTF_ID(func, bpf_iter_css_pre_new)
+BTF_ID(func, bpf_iter_css_post_new)
+BTF_SET_END(rcu_protect_kfuns_set)
+
+static inline bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return btf_id_set_contains(&rcu_protect_kfuns_set, meta->func_id);
+}
+
+
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
--
2.20.1
On Tue, Sep 12, 2023 at 12:02 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> css_iter and process_iter should be used in rcu section. Specifically, in
> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> use them directly.
>
> This patch checks whether we are in rcu cs before we want to invoke
> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> reject if reg->type is UNTRUSTED.
it would be nice to mention where this MEM_RCU is turned into
UNTRUSTED when we do rcu_read_unlock(). For someone unfamiliar with
these parts of verifier (like me) this is completely unobvious.
>
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
> kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++--
> 1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2367483bf4c2..6a6827ba7a18 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1172,7 +1172,13 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
>
> static void __mark_reg_known_zero(struct bpf_reg_state *reg);
>
> +static bool in_rcu_cs(struct bpf_verifier_env *env);
> +
> +/* check whether we are using bpf_iter_process_*() or bpf_iter_css_*() */
> +static bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta);
> +
> static int mark_stack_slots_iter(struct bpf_verifier_env *env,
> + struct bpf_kfunc_call_arg_meta *meta,
> struct bpf_reg_state *reg, int insn_idx,
> struct btf *btf, u32 btf_id, int nr_slots)
> {
> @@ -1193,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
>
> __mark_reg_known_zero(st);
> st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
> + if (is_iter_need_rcu(meta)) {
> + if (in_rcu_cs(env))
> + st->type |= MEM_RCU;
> + else
> + st->type |= PTR_UNTRUSTED;
> + }
> st->live |= REG_LIVE_WRITTEN;
> st->ref_obj_id = i == 0 ? id : 0;
> st->iter.btf = btf;
> @@ -1281,6 +1293,8 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
> struct bpf_stack_state *slot = &state->stack[spi - i];
> struct bpf_reg_state *st = &slot->spilled_ptr;
>
> + if (st->type & PTR_UNTRUSTED)
> + return false;
> /* only main (first) slot has ref_obj_id set */
> if (i == 0 && !st->ref_obj_id)
> return false;
> @@ -7503,13 +7517,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
> return err;
> }
>
> - err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
> + err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
> if (err)
> return err;
> } else {
> /* iter_next() or iter_destroy() expect initialized iter state*/
> if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
> - verbose(env, "expected an initialized iter_%s as arg #%d\n",
> + verbose(env, "expected an initialized iter_%s as arg #%d or without bpf_rcu_read_lock()\n",
> iter_type_str(meta->btf, btf_id), regno);
this message makes no sense, but even if reworded it would be
confusing for users. So maybe do the RCU check separately and report a
clear message that this iterator is expected to be within a single
continuous rcu_read_{lock+unlock} region.
I do think tracking RCU regions explicitly would make for much easier
to follow code, better messages, etc. Probably would be beneficial for
some other RCU-protected features. But that's a separate topic.
> return -EINVAL;
> }
> @@ -10382,6 +10396,18 @@ BTF_ID(func, bpf_percpu_obj_new_impl)
> BTF_ID(func, bpf_percpu_obj_drop_impl)
> BTF_ID(func, bpf_iter_css_task_new)
>
> +BTF_SET_START(rcu_protect_kfuns_set)
> +BTF_ID(func, bpf_iter_process_new)
> +BTF_ID(func, bpf_iter_css_pre_new)
> +BTF_ID(func, bpf_iter_css_post_new)
> +BTF_SET_END(rcu_protect_kfuns_set)
> +
instead of maintaining these extra special sets, why not add a KF
flag, like KF_RCU_PROTECTED?
> +static inline bool is_iter_need_rcu(struct bpf_kfunc_call_arg_meta *meta)
> +{
> + return btf_id_set_contains(&rcu_protect_kfuns_set, meta->func_id);
> +}
> +
> +
> static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
> {
> if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
> --
> 2.20.1
>
Hello.
在 2023/9/12 15:01, Chuyi Zhou 写道:
> css_iter and process_iter should be used in rcu section. Specifically, in
> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> use them directly.
>
> This patch checks whether we are in rcu cs before we want to invoke
> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> reject if reg->type is UNTRUSTED.
I use the following BPF Prog to test this patch:
SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
int iter_task_for_each_sleep(void *ctx)
{
struct task_struct *task;
struct task_struct *cur_task = bpf_get_current_task_btf();
if (cur_task->pid != target_pid)
return 0;
bpf_rcu_read_lock();
bpf_for_each(process, task) {
bpf_rcu_read_unlock();
if (task->pid == target_pid)
process_cnt += 1;
bpf_rcu_read_lock();
}
bpf_rcu_read_unlock();
return 0;
}
Unfortunately, we can pass the verifier.
Then I add some printk-messages before setting/clearing state to help debug:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d151e6b43a5f..35f3fa9471a9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
bpf_verifier_env *env,
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg
type */
if (is_iter_need_rcu(meta)) {
+ printk("mark reg_addr : %px", st);
if (in_rcu_cs(env))
st->type |= MEM_RCU;
else
@@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
} else if (rcu_unlock) {
bpf_for_each_reg_in_vstate(env->cur_state,
state, reg, ({
+ printk("clear reg_addr : %px MEM_RCU :
%d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
PTR_UNTRUSTED);
if (reg->type & MEM_RCU) {
- printk("clear reg addr : %lld",
reg);
reg->type &= ~(MEM_RCU |
PTR_MAYBE_NULL);
reg->type |= PTR_UNTRUSTED;
}
The dmesg log:
[ 393.705324] mark reg_addr : ffff88814e40e200
[ 393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
PTR_UNTRUSTED : 0
[ 393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
PTR_UNTRUSTED : 0
[ 393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
PTR_UNTRUSTED : 0
....
....
I didn't see ffff88814e40e200 is cleared as expected because
bpf_for_each_reg_in_vstate didn't find it.
It seems that when we are doing bpf_rcu_read_unlock() in the middle of the
iteration and want to clear the state through bpf_for_each_reg_in_vstate, we
cannot find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
mark_stack_slots_iter().
I thought maybe the correct answer here is operating the *iter_reg*
parameter in mark_stack_slots_iter() direcly so we can find it in
bpf_for_each_reg_in_vstate.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a6827ba7a18..53330ddf2b3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1218,6 +1218,12 @@ static int mark_stack_slots_iter(struct
bpf_verifier_env *env,
mark_stack_slot_scratched(env, spi - i);
}
+ if (is_iter_need_rcu(meta)) {
+ if (in_rcu_cs(env))
+ reg->type |= MEM_RCU;
+ else
+ reg->type |= PTR_UNTRUSTED;
+ }
return 0;
}
@@ -1307,7 +1315,8 @@ static bool is_iter_reg_valid_init(struct
bpf_verifier_env *env, struct bpf_reg_
if (slot->slot_type[j] != STACK_ITER)
return false;
}
-
+ if (reg->type & PTR_UNTRUSTED)
+ return false;
return true;
}
However, it did not work either. The reason it didn't work is that the state
of iter_reg will be cleared implicitly before
is_iter_reg_valid_init() even if we don't call bpf_rcu_read_unlock().
It would be appreciated if you could give some suggestions. Maybe it is worth
trying the solution proposed by Kumar?[1]
[1]
https://lore.kernel.org/lkml/CAP01T77cWxWNwq5HLr+Woiu7k4-P3QQfJWX1OeQJUkxW3=P4bA@mail.gmail.com/#t
在 2023/9/13 21:53, Chuyi Zhou 写道:
> Hello.
>
> 在 2023/9/12 15:01, Chuyi Zhou 写道:
>> css_iter and process_iter should be used in rcu section. Specifically, in
>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
>> use them directly.
>>
>> This patch checks whether we are in rcu cs before we want to invoke
>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
>> reject if reg->type is UNTRUSTED.
>
> I use the following BPF Prog to test this patch:
>
> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> int iter_task_for_each_sleep(void *ctx)
> {
> struct task_struct *task;
> struct task_struct *cur_task = bpf_get_current_task_btf();
>
> if (cur_task->pid != target_pid)
> return 0;
> bpf_rcu_read_lock();
> bpf_for_each(process, task) {
> bpf_rcu_read_unlock();
> if (task->pid == target_pid)
> process_cnt += 1;
> bpf_rcu_read_lock();
> }
> bpf_rcu_read_unlock();
> return 0;
> }
>
> Unfortunately, we can pass the verifier.
>
> Then I add some printk-messages before setting/clearing state to help
> debug:
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d151e6b43a5f..35f3fa9471a9 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
> bpf_verifier_env *env,
> __mark_reg_known_zero(st);
> st->type = PTR_TO_STACK; /* we don't have dedicated reg
> type */
> if (is_iter_need_rcu(meta)) {
> + printk("mark reg_addr : %px", st);
> if (in_rcu_cs(env))
> st->type |= MEM_RCU;
> else
> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
> bpf_verifier_env *env, struct bpf_insn *insn,
> return -EINVAL;
> } else if (rcu_unlock) {
> bpf_for_each_reg_in_vstate(env->cur_state,
> state, reg, ({
> + printk("clear reg_addr : %px MEM_RCU :
> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
> PTR_UNTRUSTED);
> if (reg->type & MEM_RCU) {
> - printk("clear reg addr : %lld",
> reg);
> reg->type &= ~(MEM_RCU |
> PTR_MAYBE_NULL);
> reg->type |= PTR_UNTRUSTED;
> }
>
>
> The demsg log:
>
> [ 393.705324] mark reg_addr : ffff88814e40e200
>
> [ 393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
> PTR_UNTRUSTED : 0
>
> [ 393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
> PTR_UNTRUSTED : 0
>
> [ 393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
> PTR_UNTRUSTED : 0
> ....
> ....
>
> I didn't see ffff88814e40e200 is cleared as expected because
> bpf_for_each_reg_in_vstate didn't find it.
>
> It seems when we are doing bpf_read_unlock() in the middle of iteration
> and want to clearing state through bpf_for_each_reg_in_vstate, we can
> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
> mark_stack_slots_iter().
>
bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
mark_stack_slots_iter() we have marked the slots *STACK_ITER*.
With the following change, everything seems to work OK.
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..83c5ecccadb4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -387,7 +387,7 @@ struct bpf_verifier_state {
#define bpf_get_spilled_reg(slot, frame) \
(((slot < frame->allocated_stack / BPF_REG_SIZE) && \
- (frame->stack[slot].slot_type[0] == STACK_SPILL)) \
+ (frame->stack[slot].slot_type[0] == STACK_SPILL ||
frame->stack[slot].slot_type[0] == STACK_ITER)) \
? &frame->stack[slot].spilled_ptr : NULL)
I am not sure whether this would harm some logic implicitly when using
bpf_get_spilled_reg/bpf_for_each_spilled_reg in other places. If so,
maybe we should add an extra parameter to control the picking behaviour.
#define bpf_get_spilled_reg(slot, frame, stack_type)
\
(((slot < frame->allocated_stack / BPF_REG_SIZE) && \
(frame->stack[slot].slot_type[0] == stack_type)) \
? &frame->stack[slot].spilled_ptr : NULL)
Thanks.
On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
>
>
> 在 2023/9/13 21:53, Chuyi Zhou 写道:
> > Hello.
> >
> > 在 2023/9/12 15:01, Chuyi Zhou 写道:
> >> css_iter and process_iter should be used in rcu section. Specifically, in
> >> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> >> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> >> use them directly.
> >>
> >> This patch checks whether we are in rcu cs before we want to invoke
> >> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> >> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> >> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> >> reject if reg->type is UNTRUSTED.
> >
> > I use the following BPF Prog to test this patch:
> >
> > SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> > int iter_task_for_each_sleep(void *ctx)
> > {
> > struct task_struct *task;
> > struct task_struct *cur_task = bpf_get_current_task_btf();
> >
> > if (cur_task->pid != target_pid)
> > return 0;
> > bpf_rcu_read_lock();
> > bpf_for_each(process, task) {
> > bpf_rcu_read_unlock();
> > if (task->pid == target_pid)
> > process_cnt += 1;
> > bpf_rcu_read_lock();
> > }
> > bpf_rcu_read_unlock();
> > return 0;
> > }
> >
> > Unfortunately, we can pass the verifier.
> >
> > Then I add some printk-messages before setting/clearing state to help
> > debug:
> >
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index d151e6b43a5f..35f3fa9471a9 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
> > bpf_verifier_env *env,
> > __mark_reg_known_zero(st);
> > st->type = PTR_TO_STACK; /* we don't have dedicated reg
> > type */
> > if (is_iter_need_rcu(meta)) {
> > + printk("mark reg_addr : %px", st);
> > if (in_rcu_cs(env))
> > st->type |= MEM_RCU;
> > else
> > @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
> > bpf_verifier_env *env, struct bpf_insn *insn,
> > return -EINVAL;
> > } else if (rcu_unlock) {
> > bpf_for_each_reg_in_vstate(env->cur_state,
> > state, reg, ({
> > + printk("clear reg_addr : %px MEM_RCU :
> > %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
> > PTR_UNTRUSTED);
> > if (reg->type & MEM_RCU) {
> > - printk("clear reg addr : %lld",
> > reg);
> > reg->type &= ~(MEM_RCU |
> > PTR_MAYBE_NULL);
> > reg->type |= PTR_UNTRUSTED;
> > }
> >
> >
> > The demsg log:
> >
> > [ 393.705324] mark reg_addr : ffff88814e40e200
> >
> > [ 393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> >
> > [ 393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> >
> > [ 393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
> > PTR_UNTRUSTED : 0
> > ....
> > ....
> >
> > I didn't see ffff88814e40e200 is cleared as expected because
> > bpf_for_each_reg_in_vstate didn't find it.
> >
> > It seems when we are doing bpf_read_unlock() in the middle of iteration
> > and want to clearing state through bpf_for_each_reg_in_vstate, we can
> > not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
> > mark_stack_slots_iter().
> >
>
> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
>
> With the following change, everything seems work OK.
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index a3236651ec64..83c5ecccadb4 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
>
> #define bpf_get_spilled_reg(slot, frame) \
> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
> - (frame->stack[slot].slot_type[0] == STACK_SPILL)) \
> + (frame->stack[slot].slot_type[0] == STACK_SPILL ||
> frame->stack[slot].slot_type[0] == STACK_ITER)) \
> ? &frame->stack[slot].spilled_ptr : NULL)
>
> I am not sure whether this would harm some logic implicitly when using
> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
> maybe we should add a extra parameter to control the picking behaviour.
>
> #define bpf_get_spilled_reg(slot, frame, stack_type)
> \
> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
> (frame->stack[slot].slot_type[0] == stack_type)) \
> ? &frame->stack[slot].spilled_ptr : NULL)
>
> Thanks.
I don't think it's safe to just make bpf_get_spilled_reg, and
subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
just suddenly start iterating iterator states and/or dynptrs. At least
some of existing uses of those assume they are really working just
with registers.
Hello.
在 2023/9/15 07:26, Andrii Nakryiko 写道:
> On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>>
>>
>>
>> 在 2023/9/13 21:53, Chuyi Zhou 写道:
>>> Hello.
>>>
>>> 在 2023/9/12 15:01, Chuyi Zhou 写道:
>>>> css_iter and process_iter should be used in rcu section. Specifically, in
>>>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
>>>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
>>>> use them directly.
>>>>
>>>> This patch checks whether we are in rcu cs before we want to invoke
>>>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
>>>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
>>>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
>>>> reject if reg->type is UNTRUSTED.
>>>
>>> I use the following BPF Prog to test this patch:
>>>
>>> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
>>> int iter_task_for_each_sleep(void *ctx)
>>> {
>>> struct task_struct *task;
>>> struct task_struct *cur_task = bpf_get_current_task_btf();
>>>
>>> if (cur_task->pid != target_pid)
>>> return 0;
>>> bpf_rcu_read_lock();
>>> bpf_for_each(process, task) {
>>> bpf_rcu_read_unlock();
>>> if (task->pid == target_pid)
>>> process_cnt += 1;
>>> bpf_rcu_read_lock();
>>> }
>>> bpf_rcu_read_unlock();
>>> return 0;
>>> }
>>>
>>> Unfortunately, we can pass the verifier.
>>>
>>> Then I add some printk-messages before setting/clearing state to help
>>> debug:
>>>
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index d151e6b43a5f..35f3fa9471a9 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
>>> bpf_verifier_env *env,
>>> __mark_reg_known_zero(st);
>>> st->type = PTR_TO_STACK; /* we don't have dedicated reg
>>> type */
>>> if (is_iter_need_rcu(meta)) {
>>> + printk("mark reg_addr : %px", st);
>>> if (in_rcu_cs(env))
>>> st->type |= MEM_RCU;
>>> else
>>> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
>>> bpf_verifier_env *env, struct bpf_insn *insn,
>>> return -EINVAL;
>>> } else if (rcu_unlock) {
>>> bpf_for_each_reg_in_vstate(env->cur_state,
>>> state, reg, ({
>>> + printk("clear reg_addr : %px MEM_RCU :
>>> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
>>> PTR_UNTRUSTED);
>>> if (reg->type & MEM_RCU) {
>>> - printk("clear reg addr : %lld",
>>> reg);
>>> reg->type &= ~(MEM_RCU |
>>> PTR_MAYBE_NULL);
>>> reg->type |= PTR_UNTRUSTED;
>>> }
>>>
>>>
>>> The demsg log:
>>>
>>> [ 393.705324] mark reg_addr : ffff88814e40e200
>>>
>>> [ 393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>>
>>> [ 393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>>
>>> [ 393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
>>> PTR_UNTRUSTED : 0
>>> ....
>>> ....
>>>
>>> I didn't see ffff88814e40e200 is cleared as expected because
>>> bpf_for_each_reg_in_vstate didn't find it.
>>>
>>> It seems when we are doing bpf_read_unlock() in the middle of iteration
>>> and want to clearing state through bpf_for_each_reg_in_vstate, we can
>>> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
>>> mark_stack_slots_iter().
>>>
>>
>> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
>> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
>>
>> With the following change, everything seems work OK.
>>
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index a3236651ec64..83c5ecccadb4 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
>>
>> #define bpf_get_spilled_reg(slot, frame) \
>> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
>> - (frame->stack[slot].slot_type[0] == STACK_SPILL)) \
>> + (frame->stack[slot].slot_type[0] == STACK_SPILL ||
>> frame->stack[slot].slot_type[0] == STACK_ITER)) \
>> ? &frame->stack[slot].spilled_ptr : NULL)
>>
>> I am not sure whether this would harm some logic implicitly when using
>> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
>> maybe we should add a extra parameter to control the picking behaviour.
>>
>> #define bpf_get_spilled_reg(slot, frame, stack_type)
>> \
>> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
>> (frame->stack[slot].slot_type[0] == stack_type)) \
>> ? &frame->stack[slot].spilled_ptr : NULL)
>>
>> Thanks.
>
> I don't think it's safe to just make bpf_get_spilled_reg, and
> subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
> just suddenly start iterating iterator states and/or dynptrs. At least
> some of existing uses of those assume they are really working just
> with registers.
IIUC, when we are doing bpf_rcu_read_unlock(), we do need to clear the state of
regs, including STACK_ITER.
Maybe here we only need to change the logic when using
bpf_for_each_reg_in_vstate to clear state in bpf_rcu_read_unlock() and keep
everything else unchanged?
Thanks.
On Thu, Sep 14, 2023 at 10:46 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
>
> Hello.
>
> 在 2023/9/15 07:26, Andrii Nakryiko 写道:
> > On Thu, Sep 14, 2023 at 1:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote:
> >>
> >>
> >>
> >> 在 2023/9/13 21:53, Chuyi Zhou 写道:
> >>> Hello.
> >>>
> >>> 在 2023/9/12 15:01, Chuyi Zhou 写道:
> >>>> css_iter and process_iter should be used in rcu section. Specifically, in
> >>>> sleepable progs explicit bpf_rcu_read_lock() is needed before use these
> >>>> iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
> >>>> use them directly.
> >>>>
> >>>> This patch checks whether we are in rcu cs before we want to invoke
> >>>> bpf_iter_process_new and bpf_iter_css_{pre, post}_new in
> >>>> mark_stack_slots_iter(). If the rcu protection is guaranteed, we would
> >>>> let st->type = PTR_TO_STACK | MEM_RCU. is_iter_reg_valid_init() will
> >>>> reject if reg->type is UNTRUSTED.
> >>>
> >>> I use the following BPF Prog to test this patch:
> >>>
> >>> SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
> >>> int iter_task_for_each_sleep(void *ctx)
> >>> {
> >>> struct task_struct *task;
> >>> struct task_struct *cur_task = bpf_get_current_task_btf();
> >>>
> >>> if (cur_task->pid != target_pid)
> >>> return 0;
> >>> bpf_rcu_read_lock();
> >>> bpf_for_each(process, task) {
> >>> bpf_rcu_read_unlock();
> >>> if (task->pid == target_pid)
> >>> process_cnt += 1;
> >>> bpf_rcu_read_lock();
> >>> }
> >>> bpf_rcu_read_unlock();
> >>> return 0;
> >>> }
> >>>
> >>> Unfortunately, we can pass the verifier.
> >>>
> >>> Then I add some printk-messages before setting/clearing state to help
> >>> debug:
> >>>
> >>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >>> index d151e6b43a5f..35f3fa9471a9 100644
> >>> --- a/kernel/bpf/verifier.c
> >>> +++ b/kernel/bpf/verifier.c
> >>> @@ -1200,7 +1200,7 @@ static int mark_stack_slots_iter(struct
> >>> bpf_verifier_env *env,
> >>> __mark_reg_known_zero(st);
> >>> st->type = PTR_TO_STACK; /* we don't have dedicated reg
> >>> type */
> >>> if (is_iter_need_rcu(meta)) {
> >>> + printk("mark reg_addr : %px", st);
> >>> if (in_rcu_cs(env))
> >>> st->type |= MEM_RCU;
> >>> else
> >>> @@ -11472,8 +11472,8 @@ static int check_kfunc_call(struct
> >>> bpf_verifier_env *env, struct bpf_insn *insn,
> >>> return -EINVAL;
> >>> } else if (rcu_unlock) {
> >>> bpf_for_each_reg_in_vstate(env->cur_state,
> >>> state, reg, ({
> >>> + printk("clear reg_addr : %px MEM_RCU :
> >>> %d PTR_UNTRUSTED : %d\n ", reg, reg->type & MEM_RCU, reg->type &
> >>> PTR_UNTRUSTED);
> >>> if (reg->type & MEM_RCU) {
> >>> - printk("clear reg addr : %lld",
> >>> reg);
> >>> reg->type &= ~(MEM_RCU |
> >>> PTR_MAYBE_NULL);
> >>> reg->type |= PTR_UNTRUSTED;
> >>> }
> >>>
> >>>
> >>> The demsg log:
> >>>
> >>> [ 393.705324] mark reg_addr : ffff88814e40e200
> >>>
> >>> [ 393.706883] clear reg_addr : ffff88814d5f8000 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>>
> >>> [ 393.707353] clear reg_addr : ffff88814d5f8078 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>>
> >>> [ 393.708099] clear reg_addr : ffff88814d5f80f0 MEM_RCU : 0
> >>> PTR_UNTRUSTED : 0
> >>> ....
> >>> ....
> >>>
> >>> I didn't see ffff88814e40e200 is cleared as expected because
> >>> bpf_for_each_reg_in_vstate didn't find it.
> >>>
> >>> It seems when we are doing bpf_read_unlock() in the middle of iteration
> >>> and want to clearing state through bpf_for_each_reg_in_vstate, we can
> >>> not find the previous reg which we marked MEM_RCU/PTR_UNTRUSTED in
> >>> mark_stack_slots_iter().
> >>>
> >>
> >> bpf_get_spilled_reg will skip slots if they are not STACK_SPILL, but in
> >> mark_stack_slots_iter() we has marked the slots *STACK_ITER*
> >>
> >> With the following change, everything seems work OK.
> >>
> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> >> index a3236651ec64..83c5ecccadb4 100644
> >> --- a/include/linux/bpf_verifier.h
> >> +++ b/include/linux/bpf_verifier.h
> >> @@ -387,7 +387,7 @@ struct bpf_verifier_state {
> >>
> >> #define bpf_get_spilled_reg(slot, frame) \
> >> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
> >> - (frame->stack[slot].slot_type[0] == STACK_SPILL)) \
> >> + (frame->stack[slot].slot_type[0] == STACK_SPILL ||
> >> frame->stack[slot].slot_type[0] == STACK_ITER)) \
> >> ? &frame->stack[slot].spilled_ptr : NULL)
> >>
> >> I am not sure whether this would harm some logic implicitly when using
> >> bpf_get_spilled_reg/bpf_for_each_spilled_reg in other place. If so,
> >> maybe we should add a extra parameter to control the picking behaviour.
> >>
> >> #define bpf_get_spilled_reg(slot, frame, stack_type)
> >> \
> >> (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
> >> (frame->stack[slot].slot_type[0] == stack_type)) \
> >> ? &frame->stack[slot].spilled_ptr : NULL)
> >>
> >> Thanks.
> >
> > I don't think it's safe to just make bpf_get_spilled_reg, and
> > subsequently bpf_for_each_reg_in_vstate and bpf_for_each_spilled_reg
> > just suddenly start iterating iterator states and/or dynptrs. At least
> > some of existing uses of those assume they are really working just
> > with registers.
>
> IIUC, when we are doing bpf_rcu_unlock, we do need to clear the state of
> reg including STACK_ITER.
>
> Maybe here we only need change the logic when using
> bpf_for_each_reg_in_vstate to clear state in bpf_rcu_unlock and keep
> everything else unchanged ?
Right, maybe. I see 10 uses of bpf_for_each_reg_in_vstate() in
kernel/bpf/verifier.c. Before we change the definition of
bpf_for_each_reg_in_vstate() we should validate that iterating dynptr
and iter states doesn't break any of them, that's all.
>
> Thanks.
© 2016 - 2025 Red Hat, Inc.