From nobody Sun Feb 8 08:22:47 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C950B18EAB for ; Sat, 15 Nov 2025 14:09:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1763215756; cv=none; b=k1qbuhp1TcCJ/KUwnwBfOCCTlONk7IMIDbEH3pTlDMKvNgtSgnfikjEHchpqxeAecDsmMGJvNuHxr7jhg5sa6f2wMZ8dm3Z7aOiEqI83yyvUX1S5OAwPj6SZtNo/p2ksAFfISTnbhJ6+RxKz07UOZ54YeFBFl/6gdkG8xr2wrBg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1763215756; c=relaxed/simple; bh=muzfAoQpyW7Hl1nPIqOvXyYVicFITyPpW2MOYJ8iidg=; h=Message-ID:Date:From:To:Cc:Subject:References:MIME-Version: Content-Type; b=D7en+ZxCJpvdh8cF163ooDpI2s5y5NlBUDAU4D5LEYSmvpW0Hews3APEgmqXlSQ/qzlfk/B++YMBgttcTL2dAeOGHMykO5mzGijFj1JDXfEUrZOeF7nolOOkTNCGe/Zw/aagZptsLr29IZ4dmBasPDwh0CkULwKQozkD0goPZfA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=BjJs14dc; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="BjJs14dc" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 609CFC4CEF8; Sat, 15 Nov 2025 14:09:16 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1763215756; bh=muzfAoQpyW7Hl1nPIqOvXyYVicFITyPpW2MOYJ8iidg=; h=Date:From:To:Cc:Subject:References:From; b=BjJs14dcprpJ9/d2mTDFqq1Kl9DsNVqRppJsGW/iQS1LM2VA+Y7ZeeW25DhPPp5B6 u2+5g7ed8rCt400mTdjZOW2uSikISArHWAYfvK2Gg1SSl1CcFeg1e4GPN9WFMM6TOp /irU2OSSJe6k4aZNOQ4tQQXtLxBHD6rkXwfz56pz11WllXt9OGbeHxA39lO5euikII GrHn7/dtU0cNZWJcSJMIHaQE1u+bd5s6fdaXRGGR/re/ZBqi8DHxnFa8fIOCEPNIPI 
bn1BDiN2vjdoNcQCU7SBIA1SspCAOxgWS5GJjopJvyN/ahUdUWrkHDCzg8+WSrxxUq dxJNGU4bNa29g== Received: from rostedt by gandalf with local (Exim 4.98.2) (envelope-from ) id 1vKGy8-00000002Asq-2iyU; Sat, 15 Nov 2025 09:09:36 -0500 Message-ID: <20251115140936.505263410@kernel.org> User-Agent: quilt/0.68 Date: Sat, 15 Nov 2025 09:09:11 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton , Huang Cun , Yongliang Gao Subject: [for-next][PATCH 1/5] trace/pid_list: optimize pid_list->lock contention References: <20251115140910.386662473@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Yongliang Gao When the system has many cores and task switching is frequent, setting set_ftrace_pid can cause frequent pid_list->lock contention and high sys CPU usage. For example, in a 288-core VM environment, we observed 267 CPUs experiencing contention on pid_list->lock, with stack traces showing: #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161 Replace the read-side spinlock in trace_pid_list_is_set() with a seqcount so that readers can run concurrently, while writers remain serialized by the existing raw spinlock.
Link: https://patch.msgid.link/20251113000252.1058144-1-leonylgao@gmail.com Reviewed-by: Huang Cun Signed-off-by: Yongliang Gao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/pid_list.c | 30 +++++++++++++++++++++--------- kernel/trace/pid_list.h | 1 + 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 090bb5ea4a19..dbee72d69d0a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -3,6 +3,7 @@ * Copyright (C) 2021 VMware Inc, Steven Rostedt */ #include +#include #include #include #include "trace.h" @@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_l= ist, unsigned int pid) { union upper_chunk *upper_chunk; union lower_chunk *lower_chunk; - unsigned long flags; + unsigned int seq; unsigned int upper1; unsigned int upper2; unsigned int lower; @@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid= _list, unsigned int pid) if (pid_split(pid, &upper1, &upper2, &lower) < 0) return false; =20 - raw_spin_lock_irqsave(&pid_list->lock, flags); - upper_chunk =3D pid_list->upper[upper1]; - if (upper_chunk) { - lower_chunk =3D upper_chunk->data[upper2]; - if (lower_chunk) - ret =3D test_bit(lower, lower_chunk->data); - } - raw_spin_unlock_irqrestore(&pid_list->lock, flags); + do { + seq =3D read_seqcount_begin(&pid_list->seqcount); + ret =3D false; + upper_chunk =3D pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk =3D upper_chunk->data[upper2]; + if (lower_chunk) + ret =3D test_bit(lower, lower_chunk->data); + } + } while (read_seqcount_retry(&pid_list->seqcount, seq)); =20 return ret; } @@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list,= unsigned int pid) return -EINVAL; =20 raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk =3D pid_list->upper[upper1]; if (!upper_chunk) { upper_chunk =3D get_upper_chunk(pid_list); @@ -199,6 +203,7 @@ int 
trace_pid_list_set(struct trace_pid_list *pid_list,= unsigned int pid) set_bit(lower, lower_chunk->data); ret =3D 0; out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return ret; } @@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_lis= t, unsigned int pid) return -EINVAL; =20 raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk =3D pid_list->upper[upper1]; if (!upper_chunk) goto out; @@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_lis= t, unsigned int pid) } } out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork) =20 again: raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); upper_count =3D CHUNK_ALLOC - pid_list->free_upper_chunks; lower_count =3D CHUNK_ALLOC - pid_list->free_lower_chunks; + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); =20 if (upper_count <=3D 0 && lower_count <=3D 0) @@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) } =20 raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); if (upper) { *upper_next =3D pid_list->upper_list; pid_list->upper_list =3D upper; @@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) pid_list->lower_list =3D lower; pid_list->free_lower_chunks +=3D lcnt; } + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); =20 /* @@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); =20 raw_spin_lock_init(&pid_list->lock); + seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock); =20 for (i =3D 0; i < CHUNK_ALLOC; i++) { union upper_chunk *chunk; diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 
62e73f1ac85f..0b45fb0eadb9 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -76,6 +76,7 @@ union upper_chunk { }; =20 struct trace_pid_list { + seqcount_raw_spinlock_t seqcount; raw_spinlock_t lock; struct irq_work refill_irqwork; union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size --=20 2.51.0