kernel/bpf/task_iter.c | 7 ------- 1 file changed, 7 deletions(-)
1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we
can safely iterate the task->thread_group list. Even if this task exits
right after get_pid_task() (or goto retry) and pid_alive() returns 0.
Kill the unnecessary pid_alive() check.
2. next_thread() simply can't return NULL, kill the bogus "if (!next_task)"
check.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/bpf/task_iter.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..4d1125108014 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -75,15 +75,8 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return NULL;
retry:
- if (!pid_alive(task)) {
- put_task_struct(task);
- return NULL;
- }
-
next_task = next_thread(task);
put_task_struct(task);
- if (!next_task)
- return NULL;
saved_tid = *tid;
*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
--
2.25.1.362.g51ebf55
OK, it seems that you are not going to take these preparatory
cleanups ;)
I'll resend along with the s/next_thread/__next_thread/ change.
I was going to do the last change later, but this recent discussion
https://lore.kernel.org/all/20230824143112.GA31208@redhat.com/
makes me think we should do this right now.
On 08/21, Oleg Nesterov wrote:
>
> 1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we
> can safely iterate the task->thread_group list. Even if this task exits
> right after get_pid_task() (or goto retry) and pid_alive() returns 0.
>
> Kill the unnecessary pid_alive() check.
>
> 2. next_thread() simply can't return NULL, kill the bogus "if (!next_task)"
> check.
>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/bpf/task_iter.c | 7 -------
> 1 file changed, 7 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..4d1125108014 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -75,15 +75,8 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> return NULL;
>
> retry:
> - if (!pid_alive(task)) {
> - put_task_struct(task);
> - return NULL;
> - }
> -
> next_task = next_thread(task);
> put_task_struct(task);
> - if (!next_task)
> - return NULL;
>
> saved_tid = *tid;
> *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
> --
> 2.25.1.362.g51ebf55
>
>
Oleg Nesterov <oleg@redhat.com> writes: > OK, it seems that you are not going to take these preparatory > cleanups ;) > > I'll resend along with the s/next_thread/__next_thread/ change. > I was going to do the last change later, but this recent discussion > https://lore.kernel.org/all/20230824143112.GA31208@redhat.com/ > makes me think we should do this right now. For the record I find this code confusing, and wrong. It looks like it wants to keep the task_struct pointer or possibly the struct pid pointer like proc does, but then it winds up keeping a userspace pid value and regenerating both the struct pid pointer and the struct task_struct pointer. Which means that task_group_seq_get_next is unnecessarily slow and has a built in race condition which means it could wind up iterating through a different process. This whole thing looks to be a bad (aka racy) reimplementation of first_tid and next_tid from proc. I thought the changes were to adapt to the needs of bpf, but on closer examination the code is just racy. For this code to be correct bpf_iter_seq_task_common needs to store at a minimum a struct pid pointer. Oleg your patch makes it easier to see how far this is from first_tid/next_tid in proc. Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> Eric
On 08/25, Eric W. Biederman wrote: > > For the record I find this code confusing, and wrong. Oh, yes... > and has > a built in race condition which means it could wind up iterating through > a different process. Yes, common->pid and/or common->pid_visiting can be reused but I am not going to try to fix this ;) > Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> Thanks! Oleg.
On 8/21/23 08:09, Oleg Nesterov wrote:
> 1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we
> can safely iterate the task->thread_group list. Even if this task exits
> right after get_pid_task() (or goto retry) and pid_alive() returns 0.
> Kill the unnecessary pid_alive() check.
This function will return next_task holding a refcount, and will not release
the refcount until the next time this function is called. Meanwhile,
the returned task A may be killed, and its next task B may be
killed after A as well, before calling this function again.
However, even if task B is destroyed (freed), A's next is still pointing to
task B. When this function is called again for the same iterator,
it doesn't promise that B is still there.
Does that make sense to you?
>
> 2. next_thread() simply can't return NULL, kill the bogus "if (!next_task)"
> check.
>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/bpf/task_iter.c | 7 -------
> 1 file changed, 7 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..4d1125108014 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -75,15 +75,8 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> return NULL;
>
> retry:
> - if (!pid_alive(task)) {
> - put_task_struct(task);
> - return NULL;
> - }
> -
> next_task = next_thread(task);
> put_task_struct(task);
> - if (!next_task)
> - return NULL;
>
> saved_tid = *tid;
> *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
On 08/21, Kui-Feng Lee wrote: > > > On 8/21/23 08:09, Oleg Nesterov wrote: > >1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we > > can safely iterate the task->thread_group list. Even if this task exits > > right after get_pid_task() (or goto retry) and pid_alive() returns 0 > > > Kill the unnecessary pid_alive() check. > > This function will return next_task holding a refcount, and release the > refcount until the next time calling the same function. Meanwhile, > the returned task A may be killed, and its next task B may be > killed after A as well, before calling this function again. > However, even task B is destroyed (free), A's next is still pointing to > task B. When this function is called again for the same iterator, > it doesn't promise that B is still there. Not sure I understand... OK, if we have a task pointer with incremented refcount and do not hold rcu lock, then yes, you can't remove the pid_alive() check in this code: rcu_read_lock(); if (pid_alive(task)) do_something(next_thread(task)); rcu_read_unlock(); because task and then task->next can exit and do call_rcu(delayed_put_task_struct) before we take rcu_read_lock(). But if you do something like rcu_read_lock(); task = find_task_in_some_rcu_protected_list(); do_something(next_thread(task)); rcu_read_unlock(); then next_thread(task) should be safe without pid_alive(). And iiuc task_group_seq_get_next() always does rcu_read_lock(); // the caller does lock/unlock task = get_pid_task(pid, PIDTYPE_PID); if (!task) return; next_task = next_thread(task); rcu_read_unlock(); Yes, both task and task->next can exit right after get_pid_task(), but since can only happen after we took rcu_read_lock(), delayed_put_task_struct() can't be called until we drop rcu lock. What have I missed? Oleg.
On 8/21/23 11:34, Oleg Nesterov wrote: > On 08/21, Kui-Feng Lee wrote: >> >> >> On 8/21/23 08:09, Oleg Nesterov wrote: >>> 1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we >>> can safely iterate the task->thread_group list. Even if this task exits >>> right after get_pid_task() (or goto retry) and pid_alive() returns 0 > >>> Kill the unnecessary pid_alive() check. >> >> This function will return next_task holding a refcount, and release the >> refcount until the next time calling the same function. Meanwhile, >> the returned task A may be killed, and its next task B may be >> killed after A as well, before calling this function again. >> However, even task B is destroyed (free), A's next is still pointing to >> task B. When this function is called again for the same iterator, >> it doesn't promise that B is still there. > > Not sure I understand... > > OK, if we have a task pointer with incremented refcount and do not hold > rcu lock, then yes, you can't remove the pid_alive() check in this code: > > rcu_read_lock(); > if (pid_alive(task)) > do_something(next_thread(task)); > rcu_read_unlock(); > > because task and then task->next can exit and do call_rcu(delayed_put_task_struct) > before we take rcu_read_lock(). > > But if you do something like > > rcu_read_lock(); > > task = find_task_in_some_rcu_protected_list(); > do_something(next_thread(task)); > > rcu_read_unlock(); > > then next_thread(task) should be safe without pid_alive(). > > And iiuc task_group_seq_get_next() always does > > rcu_read_lock(); // the caller does lock/unlock > > task = get_pid_task(pid, PIDTYPE_PID); > if (!task) > return; > > next_task = next_thread(task); > > rcu_read_unlock(); > > Yes, both task and task->next can exit right after get_pid_task(), but since > can only happen after we took rcu_read_lock(), delayed_put_task_struct() can't > be called until we drop rcu lock. > > What have I missed? Then, it makes sense to me! Thank you for the explanation. 
> > Oleg. >
So I still think the pid_alive() check should die... and when I look at this code again I don't understand why does it abuse task_struct->usage, I'll send another patch on top of this one. On 08/21, Oleg Nesterov wrote: > > On 08/21, Kui-Feng Lee wrote: > > > > > > On 8/21/23 08:09, Oleg Nesterov wrote: > > >1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we > > > can safely iterate the task->thread_group list. Even if this task exits > > > right after get_pid_task() (or goto retry) and pid_alive() returns 0 > > > > Kill the unnecessary pid_alive() check. > > > > This function will return next_task holding a refcount, and release the > > refcount until the next time calling the same function. Meanwhile, > > the returned task A may be killed, and its next task B may be > > killed after A as well, before calling this function again. > > However, even task B is destroyed (free), A's next is still pointing to > > task B. When this function is called again for the same iterator, > > it doesn't promise that B is still there. > > Not sure I understand... > > OK, if we have a task pointer with incremented refcount and do not hold > rcu lock, then yes, you can't remove the pid_alive() check in this code: > > rcu_read_lock(); > if (pid_alive(task)) > do_something(next_thread(task)); > rcu_read_unlock(); > > because task and then task->next can exit and do call_rcu(delayed_put_task_struct) > before we take rcu_read_lock(). > > But if you do something like > > rcu_read_lock(); > > task = find_task_in_some_rcu_protected_list(); > do_something(next_thread(task)); > > rcu_read_unlock(); > > then next_thread(task) should be safe without pid_alive(). 
> > And iiuc task_group_seq_get_next() always does > > rcu_read_lock(); // the caller does lock/unlock > > task = get_pid_task(pid, PIDTYPE_PID); > if (!task) > return; > > next_task = next_thread(task); > > rcu_read_unlock(); > > Yes, both task and task->next can exit right after get_pid_task(), but since this > can only happen after we took rcu_read_lock(), delayed_put_task_struct() can't > be called until we drop rcu lock. > > What have I missed? > > Oleg.
get_pid_task() makes no sense, the code does put_task_struct() soon after.
Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill
put_task_struct(), this allows to do get_task_struct() only once before
return.
While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)"
block, this matches the next usage of find_pid_ns() + get_pid_task() in
this function.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
---
kernel/bpf/task_iter.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4d1125108014..1589ec3faded 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -42,9 +42,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
if (!*tid) {
/* The first time, the iterator calls this function. */
pid = find_pid_ns(common->pid, common->ns);
- if (!pid)
- return NULL;
-
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task)
return NULL;
@@ -66,17 +63,12 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return task;
}
- pid = find_pid_ns(common->pid_visiting, common->ns);
- if (!pid)
- return NULL;
-
- task = get_pid_task(pid, PIDTYPE_PID);
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
if (!task)
return NULL;
retry:
next_task = next_thread(task);
- put_task_struct(task);
saved_tid = *tid;
*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
@@ -88,7 +80,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return NULL;
}
- get_task_struct(next_task);
common->pid_visiting = *tid;
if (skip_if_dup_files && task->files == task->group_leader->files) {
@@ -96,6 +87,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
goto retry;
}
+ get_task_struct(next_task);
return next_task;
}
--
2.25.1.362.g51ebf55
Hi Oleg, On 8/22/23 2:05 PM, Oleg Nesterov wrote: > get_pid_task() makes no sense, the code does put_task_struct() soon after. > Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill > put_task_struct(), this allows to do get_task_struct() only once before > return. > > While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)" > block, this matches the next usage of find_pid_ns() + get_pid_task() in > this function. > > Signed-off-by: Oleg Nesterov <oleg@redhat.com> > Acked-by: Yonghong Song <yonghong.song@linux.dev> Could you rebase this against bpf-next tree so this can run through our BPF CI? Right now the CI cannot pick the patch up due to merge conflict [0]. Thanks, Daniel [0] https://patchwork.kernel.org/project/netdevbpf/patch/20230822120549.GA22091@redhat.com/
On 08/25, Daniel Borkmann wrote: > > Could you rebase this against bpf-next tree so this can run through our BPF > CI? Right now the CI cannot pick the patch up due to merge conflict [0]. > > Thanks, > Daniel > > [0] https://patchwork.kernel.org/project/netdevbpf/patch/20230822120549.GA22091@redhat.com/ The merge failed because this patch depends on [PATCH] bpf: task_group_seq_get_next: cleanup the usage of next_thread() in this thread. But please forget. I've sent the new series. It would be nice if you can test at least 1-5, the last 6/6 depends on [PATCH 1/2] introduce __next_thread(), fix next_tid() vs exec() race https://lore.kernel.org/all/20230824143142.GA31222@redhat.com/ which was not merged yet. Oleg.
get_pid_task() makes no sense, the code does put_task_struct() soon after.
Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill
kill put_task_struct(), this allows to do get_task_struct() only once
before return.
While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)"
block, this matches the next usage of find_pid_ns() + get_pid_task() in
this function.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/bpf/task_iter.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4d1125108014..1589ec3faded 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -42,9 +42,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
if (!*tid) {
/* The first time, the iterator calls this function. */
pid = find_pid_ns(common->pid, common->ns);
- if (!pid)
- return NULL;
-
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task)
return NULL;
@@ -66,17 +63,12 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return task;
}
- pid = find_pid_ns(common->pid_visiting, common->ns);
- if (!pid)
- return NULL;
-
- task = get_pid_task(pid, PIDTYPE_PID);
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
if (!task)
return NULL;
retry:
next_task = next_thread(task);
- put_task_struct(task);
saved_tid = *tid;
*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
@@ -88,7 +80,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return NULL;
}
- get_task_struct(next_task);
common->pid_visiting = *tid;
if (skip_if_dup_files && task->files == task->group_leader->files) {
@@ -96,6 +87,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
goto retry;
}
+ get_task_struct(next_task);
return next_task;
}
--
2.25.1.362.g51ebf55
On 8/21/23 1:03 PM, Oleg Nesterov wrote: > get_pid_task() makes no sense, the code does put_task_struct() soon after. > Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill > kill put_task_struct(), this allows to do get_task_struct() only once remove the duplicated 'kill' in the above. > before return. > > While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)" > block, this matches the next usage of find_pid_ns() + get_pid_task() in > this function. > > Signed-off-by: Oleg Nesterov <oleg@redhat.com> LGTM. Acked-by: Yonghong Song <yonghong.song@linux.dev>
On 08/21, Yonghong Song wrote: > > > On 8/21/23 1:03 PM, Oleg Nesterov wrote: > >get_pid_task() makes no sense, the code does put_task_struct() soon after. > >Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill > >kill put_task_struct(), this allows to do get_task_struct() only once > > remove the duplicated 'kill' in the above. Done, > LGTM. > > Acked-by: Yonghong Song <yonghong.song@linux.dev> Thanks, I'll send V2 with your ack included in a minute. Oleg.
On 8/21/23 13:03, Oleg Nesterov wrote:
> get_pid_task() makes no sense, the code does put_task_struct() soon after.
> Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill
> kill put_task_struct(), this allows to do get_task_struct() only once
> before return.
>
> While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)"
> block, this matches the next usage of find_pid_ns() + get_pid_task() in
> this function.
>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> kernel/bpf/task_iter.c | 12 ++----------
> 1 file changed, 2 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 4d1125108014..1589ec3faded 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -42,9 +42,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> if (!*tid) {
> /* The first time, the iterator calls this function. */
> pid = find_pid_ns(common->pid, common->ns);
> - if (!pid)
> - return NULL;
> -
> task = get_pid_task(pid, PIDTYPE_TGID);
> if (!task)
> return NULL;
> @@ -66,17 +63,12 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> return task;
> }
>
> - pid = find_pid_ns(common->pid_visiting, common->ns);
> - if (!pid)
> - return NULL;
> -
> - task = get_pid_task(pid, PIDTYPE_PID);
> + task = find_task_by_pid_ns(common->pid_visiting, common->ns);
> if (!task)
> return NULL;
>
> retry:
> next_task = next_thread(task);
> - put_task_struct(task);
It called get_task_struct() against this task to hold a refcount at the
previous time calling this function. When will it release the refcount?
>
> saved_tid = *tid;
> *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
> @@ -88,7 +80,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> return NULL;
> }
>
> - get_task_struct(next_task);
> common->pid_visiting = *tid;
>
> if (skip_if_dup_files && task->files == task->group_leader->files) {
> @@ -96,6 +87,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
> goto retry;
> }
>
> + get_task_struct(next_task);
> return next_task;
> }
>
On 8/21/23 13:32, Kui-Feng Lee wrote:
>
>
> On 8/21/23 13:03, Oleg Nesterov wrote:
>> get_pid_task() makes no sense, the code does put_task_struct() soon
>> after.
>> Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill
>> kill put_task_struct(), this allows to do get_task_struct() only once
>> before return.
>>
>> While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)"
>> block, this matches the next usage of find_pid_ns() + get_pid_task() in
>> this function.
>>
>> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
>> ---
>> kernel/bpf/task_iter.c | 12 ++----------
>> 1 file changed, 2 insertions(+), 10 deletions(-)
>>
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index 4d1125108014..1589ec3faded 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -42,9 +42,6 @@ static struct task_struct
>> *task_group_seq_get_next(struct bpf_iter_seq_task_comm
>> if (!*tid) {
>> /* The first time, the iterator calls this function. */
>> pid = find_pid_ns(common->pid, common->ns);
>> - if (!pid)
>> - return NULL;
>> -
>> task = get_pid_task(pid, PIDTYPE_TGID);
>> if (!task)
>> return NULL;
>> @@ -66,17 +63,12 @@ static struct task_struct
>> *task_group_seq_get_next(struct bpf_iter_seq_task_comm
>> return task;
>> }
>> - pid = find_pid_ns(common->pid_visiting, common->ns);
>> - if (!pid)
>> - return NULL;
>> -
>> - task = get_pid_task(pid, PIDTYPE_PID);
>> + task = find_task_by_pid_ns(common->pid_visiting, common->ns);
>> if (!task)
>> return NULL;
>> retry:
>> next_task = next_thread(task);
>> - put_task_struct(task);
>
> It called get_task_struct() against this task to hold a refcount at the
> previous time calling this function. When will it release the refcount?
Oh! I missed the fact that the caller will handle it.
>
>> saved_tid = *tid;
>> *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
>> @@ -88,7 +80,6 @@ static struct task_struct
>> *task_group_seq_get_next(struct bpf_iter_seq_task_comm
>> return NULL;
>> }
>> - get_task_struct(next_task);
>> common->pid_visiting = *tid;
>> if (skip_if_dup_files && task->files ==
>> task->group_leader->files) {
>> @@ -96,6 +87,7 @@ static struct task_struct
>> *task_group_seq_get_next(struct bpf_iter_seq_task_comm
>> goto retry;
>> }
>> + get_task_struct(next_task);
>> return next_task;
>> }
© 2016 - 2025 Red Hat, Inc.