From: Tim Chen
To: Peter Zijlstra, Ingo Molnar, K Prateek Nayak, Gautham R. Shenoy, Vincent Guittot
Cc: Tim Chen, Juri Lelli, Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider, Madadi Vineeth Reddy, Hillf Danton, Shrikanth Hegde, Jianyong Wu, Yangyu Chen, Tingyin Duan, Vern Hao, Len Brown, Aubrey Li, Zhao Liu, Chen Yu, Adam Li, Aaron Lu, Josh Don, Gavin Guo, Qais Yousef, Libo Chen, linux-kernel@vger.kernel.org
Subject: [Patch v4 08/22] sched/cache: Introduce per-CPU task LLC preference counter
Date: Wed, 1 Apr 2026 14:52:20 -0700
Message-Id: <42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com>

The lowest-level sched domain of each CPU is assigned an array in which
each element tracks the number of tasks preferring a given LLC, indexed
from 0 to max_lid. Since each CPU has its own dedicated sd, each CPU
also has a dedicated task LLC preference counter. For example,
sd->llc_counts[3] = 2 means that 2 tasks on this runqueue prefer to run
within LLC3. The load balancer can use this information to identify busy
runqueues and migrate tasks to their preferred LLC domains.

This array is reallocated at runtime during sched domain rebuild.
Introduce the buffer allocation mechanism here; the statistics
themselves will be calculated in a subsequent patch.

Note: the LLC preference statistics of each CPU are reset on sched
domain rebuild and may temporarily undercount, until the CPU becomes
idle and the count is cleared. This is a trade-off to avoid complex data
synchronization across sched domain rebuilds.
Suggested-by: Peter Zijlstra (Intel)
Suggested-by: K Prateek Nayak
Co-developed-by: Chen Yu
Signed-off-by: Chen Yu
Signed-off-by: Tim Chen
---
Notes:
    v3->v4:
        Rename pf to llc_counter to better reflect its usage;
        Record its size (max_llcs) per sched domain;
        Publish the llc_counter and its size together in
        cpu_attach_domain(). (Peter Zijlstra)

 include/linux/sched/topology.h | 13 ++++++++
 kernel/sched/topology.c        | 61 +++++++++++++++++++++++++++++++++-
 2 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a4e2fb31f2fd..73153a3d9036 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -102,6 +102,19 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
+#ifdef CONFIG_SCHED_CACHE
+	unsigned int llc_max;
+	/*
+	 * Per-LLC preference counter.
+	 * __counted_by cannot be used here because
+	 * when the percpu sched_domain is being allocated,
+	 * llc_max is unknown, and thus the actual size
+	 * of the sched_domain (including the llc_counts
+	 * elements) is undetermined.
+	 */
+	unsigned int *llc_counts;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* sched_balance_rq() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index edf6d7ec73ca..995a42cb4697 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -634,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd)
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
+
+#ifdef CONFIG_SCHED_CACHE
+	/* only the bottom sd has an llc_counts array */
+	kfree(sd->llc_counts);
+#endif
 	kfree(sd);
 }
 
@@ -753,10 +758,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	if (sd && sd_degenerate(sd)) {
 		tmp = sd;
 		sd = sd->parent;
-		destroy_sched_domain(tmp);
+
 		if (sd) {
 			struct sched_group *sg = sd->groups;
 
+#ifdef CONFIG_SCHED_CACHE
+			/* move buffer to parent as child is being destroyed */
+			sd->llc_counts = tmp->llc_counts;
+			sd->llc_max = tmp->llc_max;
+			/* make sure destroy_sched_domain() does not free it */
+			tmp->llc_counts = NULL;
+			tmp->llc_max = 0;
+#endif
 			/*
 			 * sched groups hold the flags of the child sched
 			 * domain for convenience. Clear such flags since
@@ -768,6 +781,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
 			sd->child = NULL;
 		}
+
+		destroy_sched_domain(tmp);
 	}
 
 	sched_domain_debug(sd, cpu);
@@ -793,6 +808,48 @@ enum s_alloc {
 	sa_none,
 };
 
+#ifdef CONFIG_SCHED_CACHE
+static bool alloc_sd_llc(const struct cpumask *cpu_map,
+			 struct s_data *d)
+{
+	struct sched_domain *sd;
+	unsigned int *p;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (!sd)
+			goto err;
+
+		p = kcalloc(max_lid + 1, sizeof(unsigned int), GFP_KERNEL);
+		if (!p)
+			goto err;
+
+		sd->llc_counts = p;
+		sd->llc_max = max_lid;
+	}
+
+	return true;
+err:
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d->sd, i);
+		if (sd) {
+			sd->llc_max = 0;
+			kfree(sd->llc_counts);
+			sd->llc_counts = NULL;
+		}
+	}
+
+	return false;
+}
+#else
+static bool alloc_sd_llc(const struct cpumask *cpu_map,
+			 struct s_data *d)
+{
+	return false;
+}
+#endif
+
 /*
  * Return the canonical balance CPU for this group, this is the first CPU
  * of this group that's also in the balance mask.
@@ -2759,6 +2816,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 		}
 	}
 
+	alloc_sd_llc(cpu_map, &d);
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
-- 
2.32.0