[PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Yongliang Gao 3 months, 3 weeks ago
From: Yongliang Gao <leonylgao@tencent.com>

When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause heavy pid_list->lock contention
and high system (sys) CPU usage.

For example, in a vmcore from a 288-core system, we found 267
CPUs contending on pid_list->lock:

 #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
 #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
 #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161

Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Reviewed-by: Huang Cun <cunhuang@tencent.com>
---
 kernel/trace/pid_list.c | 26 +++++++++++++-------------
 kernel/trace/pid_list.h |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..62082a4f60db 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return false;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	read_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (upper_chunk) {
 		lower_chunk = upper_chunk->data[upper2];
 		if (lower_chunk)
 			ret = test_bit(lower, lower_chunk->data);
 	}
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	read_unlock_irqrestore(&pid_list->lock, flags);
 
 	return ret;
 }
@@ -177,7 +177,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk) {
 		upper_chunk = get_upper_chunk(pid_list);
@@ -199,7 +199,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	set_bit(lower, lower_chunk->data);
 	ret = 0;
  out:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	write_unlock_irqrestore(&pid_list->lock, flags);
 	return ret;
 }
 
@@ -229,7 +229,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk)
 		goto out;
@@ -250,7 +250,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		}
 	}
  out:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	write_unlock_irqrestore(&pid_list->lock, flags);
 	return 0;
 }
 
@@ -282,7 +282,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	read_lock_irqsave(&pid_list->lock, flags);
 	for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
 		upper_chunk = pid_list->upper[upper1];
 
@@ -302,7 +302,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 	}
 
  found:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	read_unlock_irqrestore(&pid_list->lock, flags);
 	if (upper1 > UPPER_MASK)
 		return -1;
 
@@ -339,10 +339,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 	int lcnt = 0;
 
  again:
-	raw_spin_lock(&pid_list->lock);
+	write_lock(&pid_list->lock);
 	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
 	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
-	raw_spin_unlock(&pid_list->lock);
+	write_unlock(&pid_list->lock);
 
 	if (upper_count <= 0 && lower_count <= 0)
 		return;
@@ -369,7 +369,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		lcnt++;
 	}
 
-	raw_spin_lock(&pid_list->lock);
+	write_lock(&pid_list->lock);
 	if (upper) {
 		*upper_next = pid_list->upper_list;
 		pid_list->upper_list = upper;
@@ -380,7 +380,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		pid_list->lower_list = lower;
 		pid_list->free_lower_chunks += lcnt;
 	}
-	raw_spin_unlock(&pid_list->lock);
+	write_unlock(&pid_list->lock);
 
 	/*
 	 * On success of allocating all the chunks, both counters
@@ -418,7 +418,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
 
 	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
 
-	raw_spin_lock_init(&pid_list->lock);
+	rwlock_init(&pid_list->lock);
 
 	for (i = 0; i < CHUNK_ALLOC; i++) {
 		union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..da200834f4ad 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,7 +76,7 @@ union upper_chunk {
 };
 
 struct trace_pid_list {
-	raw_spinlock_t			lock;
+	rwlock_t			lock;
 	struct irq_work			refill_irqwork;
 	union upper_chunk		*upper[UPPER1_SIZE]; // 1 or 2K in size
 	union upper_chunk		*upper_list;
-- 
2.43.5
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Steven Rostedt 2 months, 4 weeks ago
On Wed, 15 Oct 2025 19:49:52 +0800
Yongliang Gao <leonylgao@gmail.com> wrote:

> diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> index 090bb5ea4a19..62082a4f60db 100644
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
>  	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
>  		return false;
>  
> -	raw_spin_lock_irqsave(&pid_list->lock, flags);
> +	read_lock_irqsave(&pid_list->lock, flags);
>  	upper_chunk = pid_list->upper[upper1];
>  	if (upper_chunk) {
>  		lower_chunk = upper_chunk->data[upper2];
>  		if (lower_chunk)
>  			ret = test_bit(lower, lower_chunk->data);
>  	}
> -	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
> +	read_unlock_irqrestore(&pid_list->lock, flags);
>  
>  	return ret;
>  }

Unfortunately you cannot do this because this is called while holding the
task pi_lock and rq locks. On PREEMPT_RT the read/write_lock_*() calls turn
into mutexes.

Sebastian, is there any equivalent of raw_read/write_locks() that can be
used?

-- Steve
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Sebastian Andrzej Siewior 2 months, 4 weeks ago
On 2025-11-10 18:38:54 [-0500], Steven Rostedt wrote:
> On Wed, 15 Oct 2025 19:49:52 +0800
> Yongliang Gao <leonylgao@gmail.com> wrote:
> 
> > diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> > index 090bb5ea4a19..62082a4f60db 100644
> > --- a/kernel/trace/pid_list.c
> > +++ b/kernel/trace/pid_list.c
> > @@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
> >  	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
> >  		return false;
> >  
> > -	raw_spin_lock_irqsave(&pid_list->lock, flags);
> > +	read_lock_irqsave(&pid_list->lock, flags);
> >  	upper_chunk = pid_list->upper[upper1];
> >  	if (upper_chunk) {
> >  		lower_chunk = upper_chunk->data[upper2];
> >  		if (lower_chunk)
> >  			ret = test_bit(lower, lower_chunk->data);
> >  	}
> > -	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
> > +	read_unlock_irqrestore(&pid_list->lock, flags);
> >  
> >  	return ret;
> >  }
> 
> Unfortunately you cannot do this because this is called while holding the
> task pi_lock and rq locks. In PREEMPT_RT() the read/write_lock_* turn into
> mutexes.
> 
> Sebastian, is there any equivalent of raw_read/write_locks() that can be
> used?

Nope, no read-write lock that can be used in atomic sections. Well,
there is RCU.

> -- Steve

Sebastian
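
As a rough illustration of the RCU option mentioned above, the read side
could look something like the sketch below. This is untested and assumes
the chunk pointers would be published with rcu_assign_pointer() and only
reclaimed after a grace period, which the current free-list code does not
do; the replies below explain why plain RCU is not a drop-in fix here.

	/* reader: no lock, relies on RCU-protected chunk pointers (assumed) */
	rcu_read_lock();
	upper_chunk = rcu_dereference(pid_list->upper[upper1]);
	if (upper_chunk) {
		lower_chunk = rcu_dereference(upper_chunk->data[upper2]);
		if (lower_chunk)
			ret = test_bit(lower, lower_chunk->data);
	}
	rcu_read_unlock();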
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Steven Rostedt 2 months, 4 weeks ago
On Tue, 11 Nov 2025 09:13:14 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> Nope, no read-write lock that can be used in atomic sections. Well,
> there is RCU.

Well, it can't simply be replaced by RCU as the write side is also a
critical path. It happens when new tasks are spawned.

Now we could possibly do some RCU like magic, and remove the lock in the
read, but it would need some care with the writes.

Something like this (untested):

bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
	union upper_chunk *upper_chunk;
	union lower_chunk *lower_chunk;
	unsigned long flags;
	unsigned int upper1;
	unsigned int upper2;
	unsigned int lower;
	bool ret = false;

	if (!pid_list)
		return false;

	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
		return false;

	upper_chunk = READ_ONCE(pid_list->upper[upper1]);
	if (upper_chunk) {
		lower_chunk = READ_ONCE(upper_chunk->data[upper2]);
		if (lower_chunk)
			ret = test_bit(lower, lower_chunk->data);
	}

	return ret;
}

Now when all the bits of a chunk are cleared, it goes to a free-list. And
when a new chunk is needed, it is acquired from that free-list. We need to
make sure that the chunk acquired in the read hasn't gone through the
free-list.

Now we could have an atomic counter in the pid_list and make this more of a
seqcount? That is, have the counter updated when a chunk goes to the free
list and also when it is taken from the free list. We could then make this:

 again:
	counter = atomic_read(&pid_list->counter);
	smp_rmb();
	upper_chunk = READ_ONCE(pid_list->upper[upper1]);
	if (upper_chunk) {
		lower_chunk = READ_ONCE(upper_chunk->data[upper2]);
		if (lower_chunk) {
			ret = test_bit(lower, lower_chunk->data);
			smp_rmb();
			if (unlikely(counter != atomic_read(&pid_list->counter))) {
				ret = false;
				goto again;
			}
		}
	}


And in the set we need:

	upper_chunk = pid_list->upper[upper1];
	if (!upper_chunk) {
		upper_chunk = get_upper_chunk(pid_list);
		if (!upper_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		atomic_inc(&pid_list->counter);
		smp_wmb();
		WRITE_ONCE(pid_list->upper[upper1], upper_chunk);
	}
	lower_chunk = upper_chunk->data[upper2];
	if (!lower_chunk) {
		lower_chunk = get_lower_chunk(pid_list);
		if (!lower_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		atomic_inc(&pid_list->counter);
		smp_wmb();
		WRITE_ONCE(upper_chunk->data[upper2], lower_chunk);
	}

and in the clear:

	if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
		put_lower_chunk(pid_list, lower_chunk);
		WRITE_ONCE(upper_chunk->data[upper2], NULL);
		smp_wmb();
		atomic_inc(&pid_list->counter);
		if (upper_empty(upper_chunk)) {
			put_upper_chunk(pid_list, upper_chunk);
			WRITE_ONCE(pid_list->upper[upper1], NULL);
			smp_wmb();
			atomic_inc(&pid_list->counter);
		}
	}

That is, the counter gets updated after setting the chunk pointer to NULL and
before assigning it a new value. On the read side, the counter is read before
looking at any of the chunks and tested after getting the result. If the
value is the same, then the chunks are for the correct PID and haven't been
recycled through a free/alloc cycle that would leave the reader looking at a
chunk for a different PID.

This would allow for the read to not take any locks.

-- Steve
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Yongliang Gao 2 months, 3 weeks ago
Hi Steven,

Thank you for your detailed response and the proposed RCU-like approach.

I've looked into using a regular seqlock instead of the current
implementation, but as you pointed out, the write side is indeed a
critical path. More importantly, I found that even with seqlock, the
write_seqlock() function internally uses spin_lock(), which on
PREEMPT_RT gets converted to a mutex. This would cause the same
issue we're trying to avoid: a potential sleep in atomic context.

bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
{
    union upper_chunk *upper_chunk;
    union lower_chunk *lower_chunk;
    unsigned int seq;
    unsigned long flags;
    unsigned int upper1;
    unsigned int upper2;
    unsigned int lower;
    bool ret = false;

    if (!pid_list)
        return false;

    if (pid_split(pid, &upper1, &upper2, &lower) < 0)
        return false;

    do {
        local_irq_save(flags);
        seq = read_seqbegin(&pid_list->lock);
        ret = false;
        upper_chunk = pid_list->upper[upper1];
        if (upper_chunk) {
            lower_chunk = upper_chunk->data[upper2];
            if (lower_chunk)
                ret = test_bit(lower, lower_chunk->data);
        }
        local_irq_restore(flags);
    } while (read_seqretry(&pid_list->lock, seq));

    return ret;
}

 BUG: sleeping function called from invalid context at
kernel/locking/spinlock_rt.c:48
 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 192, name: bash
 preempt_count: 1, expected: 0
 RCU nest depth: 0, expected: 0
 CPU: 3 UID: 0 PID: 192 Comm: bash Not tainted 6.18.0-rc5+ #84 PREEMPT_{RT,LAZY}
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
 Call Trace:
  <TASK>
  dump_stack_lvl+0x4f/0x70
  __might_resched+0x113/0x160
  rt_spin_lock+0x41/0x130
  trace_pid_list_set+0x52/0x150
  ftrace_pid_follow_sched_process_fork+0x19/0x30
  kernel_clone+0x1b8/0x3e0
  __do_sys_clone+0x65/0x90
  do_syscall_64+0x48/0xa60
  entry_SYSCALL_64_after_hwframe+0x76/0x7e

Your proposed solution using atomic counters and memory barriers
should provide the lock-free read path we need while maintaining
correctness. However, getting this right requires careful
consideration of all the memory-ordering scenarios, and I'm concerned
about introducing subtle bugs given the complexity.

Would you be willing to submit a patch implementing your approach?

On Tue, Nov 11, 2025 at 11:27 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 11 Nov 2025 09:13:14 +0100
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>
> > Nope, no read-write lock that can be used in atomic sections. Well,
> > there is RCU.
>
> Well, it can't simply be replaced by RCU as the write side is also a
> critical path. It happens when new tasks are spawned.
>
> Now we could possibly do some RCU like magic, and remove the lock in the
> read, but it would need some care with the writes.
>
> Something like this (untested):
>
> bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
> {
>         union upper_chunk *upper_chunk;
>         union lower_chunk *lower_chunk;
>         unsigned long flags;
>         unsigned int upper1;
>         unsigned int upper2;
>         unsigned int lower;
>         bool ret = false;
>
>         if (!pid_list)
>                 return false;
>
>         if (pid_split(pid, &upper1, &upper2, &lower) < 0)
>                 return false;
>
>         upper_chunk = READ_ONCE(pid_list->upper[upper1]);
>         if (upper_chunk) {
>                 lower_chunk = READ_ONCE(upper_chunk->data[upper2]);
>                 if (lower_chunk)
>                         ret = test_bit(lower, lower_chunk->data);
>         }
>
>         return ret;
> }
>
> Now when all the bits of a chunk is cleared, it goes to a free-list. And
> when a new chunk is needed, it acquires it from that free-list. We need to
> make sure that the chunk acquired in the read hasn't gone through the
> free-list.
>
> Now we could have an atomic counter in the pid_list and make this more of a
> seqcount? That is, have the counter updated when a chunk goes to the free
> list and also when it is taken from the free list. We could then make this:
>
>  again:
>         counter = atomic_read(&pid_list->counter);
>         smp_rmb();
>         upper_chunk = READ_ONCE(pid_list->upper[upper1]);
>         if (upper_chunk) {
>                 lower_chunk = READ_ONCE(upper_chunk->data[upper2]);
>                 if (lower_chunk) {
>                         ret = test_bit(lower, lower_chunk->data);
>                         smp_rmb();
>                         if (unlikely(counter != atomic_read(&pid_list->counter))) {
>                                 ret = false;
>                                 goto again;
>                         }
>                 }
>         }
>
>
> And in the set we need:
>
>         upper_chunk = pid_list->upper[upper1];
>         if (!upper_chunk) {
>                 upper_chunk = get_upper_chunk(pid_list);
>                 if (!upper_chunk) {
>                         ret = -ENOMEM;
>                         goto out;
>                 }
>                 atomic_inc(&pid_list->counter);
>                 smp_wmb();
>                 WRITE_ONCE(pid_list->upper[upper1], upper_chunk);
>         }
>         lower_chunk = upper_chunk->data[upper2];
>         if (!lower_chunk) {
>                 lower_chunk = get_lower_chunk(pid_list);
>                 if (!lower_chunk) {
>                         ret = -ENOMEM;
>                         goto out;
>                 }
>                 atomic_inc(&pid_list->counter);
>                 smp_wmb();
>                 WRITE_ONCE(upper_chunk->data[upper2], lower_chunk);
>         }
>
> and in the clear:
>
>         if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
>                 put_lower_chunk(pid_list, lower_chunk);
>                 WRITE_ONCE(upper_chunk->data[upper2], NULL);
>                 smp_wmb();
>                 atomic_inc(&pid_list->counter);
>                 if (upper_empty(upper_chunk)) {
>                         put_upper_chunk(pid_list, upper_chunk);
>                         WRITE_ONCE(pid_list->upper[upper1], NULL);
>                         smp_wmb();
>                         atomic_inc(&pid_list->counter);
>                 }
>         }
>
> That is, the counter gets updated after setting the chunk to NULL and
> before assigning it a new value. And reading it, the counter is read before
> looking at any of the chunks, and tested after getting the result. If the
> value is the same, then the chunks are for the correct PID and haven't
> swapped in a free/alloc swap where it's looking at a chunk for a different
> PID.
>
> This would allow for the read to not take any locks.
>
> -- Steve
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Steven Rostedt 2 months, 3 weeks ago
On Wed, 12 Nov 2025 13:27:10 +0800
Yongliang Gao <leonylgao@gmail.com> wrote:

> Thank you for your detailed response and the proposed RCU-like approach.
> 
> I've looked into using a regular seqlock instead of the current
> implementation, but as you pointed out, the write side is indeed a
> critical path. More importantly, I found that even with seqlock, the
> write_seqlock() function internally uses spin_lock() which on
> PREEMPT_RT gets converted to an mutex. This would cause the same
> issues we're trying to avoid - potential sleep in atomic contexts.

I believe there is a raw_read_seqcount() functionality that is safe for
PREEMPT_RT. Have you looked into using that?

-- Steve
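
For reference, a raw-spinlock-backed seqcount paired with the existing
pid_list->lock could look roughly like the sketch below, using the
read_seqcount_begin()/read_seqcount_retry() pair. This is untested; the
seqcount field and where the write sections begin and end are assumptions,
not code from any posted patch.

	struct trace_pid_list {
		raw_spinlock_t			lock;
		seqcount_raw_spinlock_t		seqcount;	/* assumed new field */
		struct irq_work			refill_irqwork;
		...
	};

	/* trace_pid_list_alloc(): */
	raw_spin_lock_init(&pid_list->lock);
	seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);

	/* write side (set/clear), still under the raw spinlock: */
	raw_spin_lock_irqsave(&pid_list->lock, flags);
	write_seqcount_begin(&pid_list->seqcount);
	/* ... update upper[]/chunk pointers and bits ... */
	write_seqcount_end(&pid_list->seqcount);
	raw_spin_unlock_irqrestore(&pid_list->lock, flags);

	/* read side (trace_pid_list_is_set()), no lock taken: */
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&pid_list->seqcount);
		ret = false;
		upper_chunk = READ_ONCE(pid_list->upper[upper1]);
		if (upper_chunk) {
			lower_chunk = READ_ONCE(upper_chunk->data[upper2]);
			if (lower_chunk)
				ret = test_bit(lower, lower_chunk->data);
		}
	} while (read_seqcount_retry(&pid_list->seqcount, seq));

Because the writer stays serialized by the raw spinlock with interrupts
disabled, the reader's retry loop remains usable from the sched_switch
path on PREEMPT_RT.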
Re: [PATCH] trace/pid_list: optimize pid_list->lock contention
Posted by Yongliang Gao 2 months, 4 weeks ago
Hi Steven, Sebastian,

Thank you for your review and explanation.
You are absolutely right. I overlooked the case of PREEMPT_RT. I will
look into other possible optimization methods later.

Best regards,
Yongliang

On Tue, Nov 11, 2025 at 4:13 PM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2025-11-10 18:38:54 [-0500], Steven Rostedt wrote:
> > On Wed, 15 Oct 2025 19:49:52 +0800
> > Yongliang Gao <leonylgao@gmail.com> wrote:
> >
> > > diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> > > index 090bb5ea4a19..62082a4f60db 100644
> > > --- a/kernel/trace/pid_list.c
> > > +++ b/kernel/trace/pid_list.c
> > > @@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
> > >     if (pid_split(pid, &upper1, &upper2, &lower) < 0)
> > >             return false;
> > >
> > > -   raw_spin_lock_irqsave(&pid_list->lock, flags);
> > > +   read_lock_irqsave(&pid_list->lock, flags);
> > >     upper_chunk = pid_list->upper[upper1];
> > >     if (upper_chunk) {
> > >             lower_chunk = upper_chunk->data[upper2];
> > >             if (lower_chunk)
> > >                     ret = test_bit(lower, lower_chunk->data);
> > >     }
> > > -   raw_spin_unlock_irqrestore(&pid_list->lock, flags);
> > > +   read_unlock_irqrestore(&pid_list->lock, flags);
> > >
> > >     return ret;
> > >  }
> >
> > Unfortunately you cannot do this because this is called while holding the
> > task pi_lock and rq locks. In PREEMPT_RT() the read/write_lock_* turn into
> > mutexes.
> >
> > Sebastian, is there any equivalent of raw_read/write_locks() that can be
> > used?
>
> Nope, no read-write lock that can be used in atomic sections. Well,
> there is RCU.
>
> > -- Steve
>
> Sebastian