From: Changwoo Min <changwoo@igalia.com>
To: tj@kernel.org, void@manifault.com, arighi@nvidia.com
Cc: kernel-dev@igalia.com, linux-kernel@vger.kernel.org, Changwoo Min <changwoo@igalia.com>
Subject: [PATCH 1/7] sched_ext: Implement event counter infrastructure and add an event
Date: Fri, 17 Jan 2025 00:15:37 +0900
Message-ID: <20250116151543.80163-2-changwoo@igalia.com>
X-Mailer: git-send-email 2.48.1
In-Reply-To: <20250116151543.80163-1-changwoo@igalia.com>
References: <20250116151543.80163-1-changwoo@igalia.com>

Collect the statistics of specific types of behavior in the sched_ext
core, which are not easily visible but still interesting to an scx
scheduler.

Also, add a core event, SCX_EVENT_INVAL_SELECT_CPU, which represents how
many times ops.select_cpu() returns a CPU that the task can't use.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
 kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bcdd1a31676..7e12d5b8322e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1458,6 +1458,66 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 	return p;
 }
 
+/*
+ * Collection of event counters.
+ */
+struct scx_event_stat {
+	/*
+	 * If ops.select_cpu() returns a CPU which can't be used by the task,
+	 * the core scheduler code silently picks a fallback CPU.
+	 */
+	u64		INVAL_SELECT_CPU;
+};
+
+#define SCX_EVENT_IDX(e)	(offsetof(struct scx_event_stat, e)/sizeof(u64))
+#define SCX_EVENT_END_IDX()	(sizeof(struct scx_event_stat)/sizeof(u64))
+#define SCX_EVENT_DEFINE(e)	SCX_EVENT_##e = SCX_EVENT_IDX(e)
+
+/*
+ * Types of event counters.
+ */
+enum scx_event_kind {
+	SCX_EVENT_BEGIN = 0,
+	SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
+	SCX_EVENT_END = SCX_EVENT_END_IDX(),
+};
+
+static const char *scx_event_stat_str[] = {
+	[SCX_EVENT_INVAL_SELECT_CPU]	= "invalid_select_cpu",
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stat
+ * @cnt: the number of times the event occurred
+ */
+#define scx_add_event(name, cnt) ({					\
+	struct scx_event_stat *__e;					\
+	__e = get_cpu_ptr(&event_stats);				\
+	WRITE_ONCE(__e->name, __e->name + (cnt));			\
+	put_cpu_ptr(&event_stats);					\
+})
+
+
+/**
+ * scx_read_event_kind - Read an event from 'e' with 'kind'
+ * @e: a pointer to an event collected by scx_bpf_event_stat()
+ * @kind: an event type defined in scx_event_kind
+ */
+#define scx_read_event_kind(e, kind) ({					\
+	u64 *__e64 = (u64 *)(e);					\
+	__e64[kind];							\
+})
+
+static void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz);
+
 static enum scx_ops_enable_state scx_ops_enable_state(void)
 {
 	return atomic_read(&scx_ops_enable_state_var);
@@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 		*ddsp_taskp = NULL;
 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
 			return cpu;
-		else
+		else {
+			scx_add_event(INVAL_SELECT_CPU, 1);
 			return prev_cpu;
+		}
 	} else {
 		bool found;
 		s32 cpu;
@@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		scx_rq_clock_invalidate(rq);
 	}
 
+	/*
+	 * Clear event counters so the next scx scheduler always gets
+	 * fresh event counter values.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
+		memset(e, 0, sizeof(*e));
+	}
+
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable(&__scx_ops_enabled);
 	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
@@ -5309,9 +5380,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		.at_jiffies = jiffies,
 	};
 	struct seq_buf s;
+	struct scx_event_stat event;
 	unsigned long flags;
 	char *buf;
-	int cpu;
+	int cpu, kind;
 
 	spin_lock_irqsave(&dump_lock, flags);
 
@@ -5417,6 +5489,16 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		rq_unlock(rq, &rf);
 	}
 
+	dump_newline(&s);
+	dump_line(&s, "Event counters");
+	dump_line(&s, "--------------");
+
+	scx_bpf_event_stat(&event, sizeof(event));
+	for (kind = SCX_EVENT_BEGIN; kind < SCX_EVENT_END; kind++) {
+		dump_line(&s, "%25s : %llu", scx_event_stat_str[kind],
+			  scx_read_event_kind(&event, kind));
+	}
+
 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
 		       trunc_marker, sizeof(trunc_marker));
@@ -7720,6 +7802,39 @@ __bpf_kfunc u64 scx_bpf_now(void)
 	return clock;
 }
 
+/*
+ * scx_bpf_event_stat - Get a system-wide event counter to @event
+ * @event: output buffer from a BPF program
+ * @event__sz: @event len, must end in '__sz' for the verifier
+ */
+__bpf_kfunc void scx_bpf_event_stat(struct scx_event_stat *event,
+				    size_t event__sz)
+{
+	struct scx_event_stat *e;
+	u64 *event64, *e64;
+	int cpu, kind, event_end;
+
+	/*
+	 * We cannot entirely trust a BPF-provided size since a BPF program
+	 * might be compiled against a different vmlinux.h, of which
+	 * scx_event_stat would be larger (a newer vmlinux.h) or smaller
+	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
+	 * memory corruption.
+	 */
+	event__sz = min(event__sz, sizeof(*event));
+	event_end = event__sz / sizeof(u64);
+
+	event64 = (u64 *)event;
+	memset(event, 0, event__sz);
+	for_each_possible_cpu(cpu) {
+		e = per_cpu_ptr(&event_stats, cpu);
+		e64 = (u64 *)e;
+		for (kind = 0; kind < event_end; kind++) {
+			event64[kind] += READ_ONCE(e64[kind]);
+		}
+	}
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7752,6 +7867,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
 #endif
 BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_event_stat, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(scx_kfunc_ids_any)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
-- 
2.48.1
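
For reference, below is a minimal BPF-side sketch (not part of the diff
above) of how an scx scheduler might consume the new kfunc. It assumes a
typical scx build where struct scx_event_stat and the scx_bpf_event_stat()
declaration come from vmlinux.h or the scheduler's compat headers; the
callback name, the hand-written kfunc declaration, and the trace-pipe
printout are illustrative only, not part of this patch.

/* Hypothetical BPF-side usage sketch; not part of this patch. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/*
 * Declaration of the kfunc added above. A real scheduler would normally
 * pick this up from vmlinux.h or a compat header instead of spelling it
 * out by hand.
 */
void scx_bpf_event_stat(struct scx_event_stat *event,
			size_t event__sz) __ksym __weak;

void BPF_STRUCT_OPS(sketch_exit, struct scx_exit_info *ei)
{
	struct scx_event_stat events;

	/* Aggregate the per-CPU counters into a system-wide view. */
	scx_bpf_event_stat(&events, sizeof(events));

	/* Report how often ops.select_cpu() returned an unusable CPU. */
	bpf_printk("invalid_select_cpu: %llu", events.INVAL_SELECT_CPU);
}

Because the kernel side clamps the copy with min(event__sz, sizeof(*event)),
such a caller stays safe even if its copy of struct scx_event_stat is older
or newer than the kernel's.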