From nobody Mon Feb  9 00:57:27 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id A5AE9C54FB9
	for <linux-kernel@archiver.kernel.org>; Tue, 21 Nov 2023 07:40:59 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229890AbjKUHlA (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 21 Nov 2023 02:41:00 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37480 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229905AbjKUHk6 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 21 Nov 2023 02:40:58 -0500
Received: from mgamail.intel.com (mgamail.intel.com [134.134.136.126])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 745B497
        for <linux-kernel@vger.kernel.org>;
 Mon, 20 Nov 2023 23:40:54 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1700552454; x=1732088454;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=kDE9oNZ7PjL2CqLOzbBkD7BOUp0VBCVrH28Nu10oP9c=;
  b=EIh9LpPqo14pZR/q4qYTKUGmDwHxejQX/57dKeEjJBuxmlNLD4ZoUVPO
   DeqoewJunhpyHnV9GflEpz5UHbkDhJa+bEx5RqUfZ06+ypbTjnl1BtlIK
   tFAbbcl5VLTFFSZZwjFG7SQTMVy67DWAUhNETn4ozPGOGc1ZFxBH5xd4F
   T2ZLM0Zb7knlpF5hkKPPIAGSS2SlFbfn1rHjZ2SlXKcH1PCS1E648AOep
   4rK03qSXKudicczEf+X3J6pJetQdLCTU7eALB2gaIIoI5J7QHJhyPgG4C
   s+cGZagOJimnRHeJ9lgcy7FT5V4WOPrDXevsKeLdZ3FQMfFXF6G3zVCPX
   A==;
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="376821688"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="376821688"
Received: from fmsmga002.fm.intel.com ([10.253.24.26])
  by orsmga106.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 20 Nov 2023 23:40:43 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="884101994"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="884101994"
Received: from chenyu-dev.sh.intel.com ([10.239.62.164])
  by fmsmga002.fm.intel.com with ESMTP; 20 Nov 2023 23:40:39 -0800
From: Chen Yu <yu.c.chen@intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
        Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
        Ingo Molnar <mingo@redhat.com>,
        Vincent Guittot <vincent.guittot@linaro.org>,
        Juri Lelli <juri.lelli@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>, Aaron Lu <aaron.lu@intel.com>,
        Dietmar Eggemann <dietmar.eggemann@arm.com>,
        Steven Rostedt <rostedt@goodmis.org>,
        Mel Gorman <mgorman@suse.de>,
        K Prateek Nayak <kprateek.nayak@amd.com>,
        "Gautham R . Shenoy" <gautham.shenoy@amd.com>,
        Chen Yu <yu.chen.surf@gmail.com>, linux-kernel@vger.kernel.org,
        Chen Yu <yu.c.chen@intel.com>
Subject: [PATCH v2 1/3] sched/fair: Record the task sleeping time as the cache
 hot duration
Date: Tue, 21 Nov 2023 15:39:40 +0800
Message-Id: 
 <d42d4c0d0eb0f411084be382dc3a1d21ebb94b83.1700548379.git.yu.c.chen@intel.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <cover.1700548379.git.yu.c.chen@intel.com>
References: <cover.1700548379.git.yu.c.chen@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The cache hot duration is calculated as the average sleeping
time of a task, where the sleeping time is the time delta
between the task being dequeued and enqueued again.

The cache hot duration of a task is introduced to describe
how soon this dequeued task could be woken up. During this
cache hot period, the task's previous CPU is regarded as
still cache-hot for the task. This information will be used
by SIS_CACHE to improve cache locality for short-sleeping tasks.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Suggested-by: Aaron Lu <aaron.lu@intel.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
 include/linux/sched.h |  4 ++++
 kernel/sched/fair.c   | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..7d0fafd29345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1347,6 +1347,10 @@ struct task_struct {
 	struct callback_head		cid_work;
 #endif
=20
+	u64				last_dequeue_time;
+	u64				avg_hot_dur;	/* Average cache hot duration */
+	int				last_dequeue_cpu;
+
 	struct tlbflush_unmap_batch	tlb_ubc;
=20
 	/* Cache last used pipe for splice(): */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53e7bf2ccc44..672616503e35 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6667,6 +6667,36 @@ enqueue_task_fair(struct rq *rq, struct task_struct =
*p, int flags)
 	struct sched_entity *se =3D &p->se;
 	int idle_h_nr_running =3D task_has_idle_policy(p);
 	int task_new =3D !(flags & ENQUEUE_WAKEUP);
+	u64 last_dequeue =3D p->last_dequeue_time;
+
+	if ((flags & ENQUEUE_WAKEUP) && last_dequeue &&
+	    cpu_online(p->last_dequeue_cpu)) {
+		/*
+		 * The enqueue task_cpu(p) has already been assigned
+		 * with a new one. Need to calculate the task's sleeping
+		 * time based on its previous running CPU.
+		 */
+		u64 now =3D sched_clock_cpu(p->last_dequeue_cpu);
+
+		/*
+		 * Record the task's short sleep time. This sleep time
+		 * indicates how soon this task might be woken up again.
+		 * The task's previous running CPU is regarded as cache-hot
+		 * in the sleep time. So, define the average sleep time of
+		 * the task as its cache-hot duration. The SIS could leverage
+		 * the cache-hot duration for better idle CPU selection.
+		 * This improves cache locality for short-sleeping tasks.
+		 *
+		 * If the sleep time is longer than sysctl_sched_migration_cost,
+		 * give the cache hot duration a penalty by cutting it to half.
+		 */
+		if (now > last_dequeue) {
+			if (now - last_dequeue < sysctl_sched_migration_cost)
+				update_avg(&p->avg_hot_dur, now - last_dequeue);
+			else
+				p->avg_hot_dur >>=3D 1;
+		}
+	}
=20
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
@@ -6821,6 +6851,15 @@ static void dequeue_task_fair(struct rq *rq, struct =
task_struct *p, int flags)
=20
 dequeue_throttle:
 	util_est_update(&rq->cfs, p, task_sleep);
+
+	if (task_sleep) {
+		p->last_dequeue_time =3D sched_clock_cpu(cpu_of(rq));
+		p->last_dequeue_cpu =3D cpu_of(rq);
+	} else {
+		/* 0 indicates the dequeue is not caused by sleep */
+		p->last_dequeue_time =3D 0;
+	}
+
 	hrtick_update(rq);
 }
=20
--=20
2.25.1
From nobody Mon Feb  9 00:57:27 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id C787BC54FB9
	for <linux-kernel@archiver.kernel.org>; Tue, 21 Nov 2023 07:41:09 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229905AbjKUHlK (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 21 Nov 2023 02:41:10 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:50994 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230417AbjKUHlH (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 21 Nov 2023 02:41:07 -0500
Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.10])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2E13C121
        for <linux-kernel@vger.kernel.org>;
 Mon, 20 Nov 2023 23:41:04 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1700552464; x=1732088464;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=KjPJrAVxXZh2V7iwRKhLhm1MJKfGXzVaYnYYF6gMzhg=;
  b=mN4XMyMyr0/FtqZ9tllMoXVAby60n9EJ7zHwRnSbZwS1GJyuJoCZ+6mv
   oExDxf5bPL5t7IQk+W3R9wy+eAELogtzTGRUwi+bMazCOuqO1ZplOZTKP
   qsoUXa3N2Xkcs4THVw5Pwek+gUYEifa7PxD0N2wxHFFfUEbKcFBcygqSG
   O5csuxszB+2LvkgZQ/8CQObveHU56fbpz54Xrz0oBRHGTN2/JSlv+6FKh
   Uj3DBGC8VPnFjaEnO51/pi96qev+gOPfs/ljfZgxKVLBT8O8NiEeCpyKo
   ikjzt5GG3DM1Z/pOwVYfkY7sCwZxbtVFqXO2YS0gTsyrZ/L+lLvpQXU4w
   g==;
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="4978256"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="4978256"
Received: from fmsmga004.fm.intel.com ([10.253.24.48])
  by orvoesa102.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 20 Nov 2023 23:41:04 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="836972249"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="836972249"
Received: from chenyu-dev.sh.intel.com ([10.239.62.164])
  by fmsmga004.fm.intel.com with ESMTP; 20 Nov 2023 23:40:59 -0800
From: Chen Yu <yu.c.chen@intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
        Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
        Ingo Molnar <mingo@redhat.com>,
        Vincent Guittot <vincent.guittot@linaro.org>,
        Juri Lelli <juri.lelli@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>, Aaron Lu <aaron.lu@intel.com>,
        Dietmar Eggemann <dietmar.eggemann@arm.com>,
        Steven Rostedt <rostedt@goodmis.org>,
        Mel Gorman <mgorman@suse.de>,
        K Prateek Nayak <kprateek.nayak@amd.com>,
        "Gautham R . Shenoy" <gautham.shenoy@amd.com>,
        Chen Yu <yu.chen.surf@gmail.com>, linux-kernel@vger.kernel.org,
        Chen Yu <yu.c.chen@intel.com>
Subject: [PATCH v2 2/3] sched/fair: Calculate the cache-hot time of the idle
 CPU
Date: Tue, 21 Nov 2023 15:39:54 +0800
Message-Id: 
 <6dccbf0f54cc4ee068a157b9eebfb4b5fa3cc4af.1700548379.git.yu.c.chen@intel.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <cover.1700548379.git.yu.c.chen@intel.com>
References: <cover.1700548379.git.yu.c.chen@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

When a CPU is about to become idle due to task dequeue, use
the dequeued task's average sleep time to set the cache
hot timeout of this idle CPU. This information helps
SIS skip the cache-hot idle CPU and scan for the next
cache-cold one. When that task is woken up again, it can choose
its previous CPU and reuse its hot cache.

This is a preparation for the next patch to introduce SIS_CACHE
based task wakeup.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
 kernel/sched/fair.c     | 30 +++++++++++++++++++++++++++++-
 kernel/sched/features.h |  1 +
 kernel/sched/sched.h    |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 672616503e35..c309b3d203c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6853,8 +6853,17 @@ static void dequeue_task_fair(struct rq *rq, struct =
task_struct *p, int flags)
 	util_est_update(&rq->cfs, p, task_sleep);
=20
 	if (task_sleep) {
-		p->last_dequeue_time =3D sched_clock_cpu(cpu_of(rq));
+		u64 now =3D sched_clock_cpu(cpu_of(rq));
+
+		p->last_dequeue_time =3D now;
 		p->last_dequeue_cpu =3D cpu_of(rq);
+
+#ifdef CONFIG_SMP
+		/* this rq becomes idle, update its cache hot timeout */
+		if (sched_feat(SIS_CACHE) && !rq->nr_running &&
+		    p->avg_hot_dur)
+			rq->cache_hot_timeout =3D max(rq->cache_hot_timeout, now + p->avg_hot_d=
ur);
+#endif
 	} else {
 		/* 0 indicates the dequeue is not caused by sleep */
 		p->last_dequeue_time =3D 0;
@@ -7347,6 +7356,25 @@ static inline int select_idle_smt(struct task_struct=
 *p, int target)
=20
 #endif /* CONFIG_SCHED_SMT */
=20
+/*
+ * Return true if the idle CPU is cache-hot for someone,
+ * return false otherwise.
+ */
+static __maybe_unused bool cache_hot_cpu(int cpu, int *hot_cpu)
+{
+	if (!sched_feat(SIS_CACHE))
+		return false;
+
+	if (sched_clock_cpu(cpu) >=3D cpu_rq(cpu)->cache_hot_timeout)
+		return false;
+
+	/* record the first cache hot idle cpu as the backup */
+	if (*hot_cpu =3D=3D -1)
+		*hot_cpu =3D cpu;
+
+	return true;
+}
+
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against =
the
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3ddf84de430..0af282712cd1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -50,6 +50,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  * When doing wakeups, attempt to limit superfluous scans of the LLC domai=
n.
  */
 SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_CACHE, true)
=20
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..191ed62ef06d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1083,6 +1083,7 @@ struct rq {
 #endif
 	u64			idle_stamp;
 	u64			avg_idle;
+	u64			cache_hot_timeout;
=20
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
--=20
2.25.1
From nobody Mon Feb  9 00:57:27 2026
Return-Path: <linux-kernel-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 7DDF2C54FB9
	for <linux-kernel@archiver.kernel.org>; Tue, 21 Nov 2023 07:41:26 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230473AbjKUHl0 (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
        Tue, 21 Nov 2023 02:41:26 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:50658 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230414AbjKUHlW (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Tue, 21 Nov 2023 02:41:22 -0500
Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.10])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EA06110C
        for <linux-kernel@vger.kernel.org>;
 Mon, 20 Nov 2023 23:41:17 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1700552478; x=1732088478;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=rwOMzL38TQAcQogt+9N+0DIqcb4pS3AdZb0OQ+1utqQ=;
  b=Q43qAnClPHXpydnhHDAKJdJ1Xj5gB+bVGs62epE1F3dAWP/lRL/aLWhl
   FWycfQhrB+f0rlpzgAKQ6jNj+H+il861QvcMhJTn+JjdHGt9fqMYCIulo
   gTRDdYudTBlBRnwQLz/4IGmKVhuECwukzA0+eKy6IcSysWksrgTe8r0y9
   aqR1BNERI7uCIIHj8Q5Y5qWSwPql9VG7QPlIp2izTZGBxQZl8WTpQPMwL
   lBxKdaEAqBgQtB4MBAQam0/LrGoyxOiPO3jkLAQa3enYAkZfGzAmW++qH
   UlNsrEF5lUGygK56h0fyFzZm9TcvwZgC6kfvkFGSo98vCl9BDUO/4938V
   Q==;
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="4978292"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="4978292"
Received: from fmsmga004.fm.intel.com ([10.253.24.48])
  by orvoesa102.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 20 Nov 2023 23:41:17 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=McAfee;i="6600,9927,10900"; a="836972502"
X-IronPort-AV: E=Sophos;i="6.04,215,1695711600";
   d="scan'208";a="836972502"
Received: from chenyu-dev.sh.intel.com ([10.239.62.164])
  by fmsmga004.fm.intel.com with ESMTP; 20 Nov 2023 23:41:13 -0800
From: Chen Yu <yu.c.chen@intel.com>
To: Peter Zijlstra <peterz@infradead.org>,
        Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
        Ingo Molnar <mingo@redhat.com>,
        Vincent Guittot <vincent.guittot@linaro.org>,
        Juri Lelli <juri.lelli@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>, Aaron Lu <aaron.lu@intel.com>,
        Dietmar Eggemann <dietmar.eggemann@arm.com>,
        Steven Rostedt <rostedt@goodmis.org>,
        Mel Gorman <mgorman@suse.de>,
        K Prateek Nayak <kprateek.nayak@amd.com>,
        "Gautham R . Shenoy" <gautham.shenoy@amd.com>,
        Chen Yu <yu.chen.surf@gmail.com>, linux-kernel@vger.kernel.org,
        Chen Yu <yu.c.chen@intel.com>
Subject: [PATCH v2 3/3] sched/fair: do not scribble cache-hot CPU in
 select_idle_cpu()
Date: Tue, 21 Nov 2023 15:40:14 +0800
Message-Id: 
 <35e612eb2851693a52f7ed1ff9be5bc24011136f.1700548379.git.yu.c.chen@intel.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <cover.1700548379.git.yu.c.chen@intel.com>
References: <cover.1700548379.git.yu.c.chen@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Problem statement:
When task p is woken up, the scheduler leverages select_idle_sibling()
to find an idle CPU for it. p's previous CPU is usually a preference
because it can improve cache locality. However in many cases, the
previous CPU has already been taken by other wakees, thus p has to
find another idle CPU.

Proposal:
Introduce the SIS_CACHE. It considers the sleep time of the task for
better task placement. Based on the task's short sleeping history,
tag p's previous CPU as cache-hot. Later when p is woken up, it can
choose its previous CPU in select_idle_sibling(). When another task is
woken up, skip this cache-hot idle CPU when possible.

SIS_CACHE still prefers to choose an idle CPU during task wakeup;
the idea is to optimize the idle CPU scan sequence.

As pointed out by Prateek, this has the potential that all idle CPUs
are cache-hot and skipped. Mitigate this by returning the first
cache-hot idle CPU. Meanwhile, to reduce the time spent on scanning,
limit the number of cache-hot idle CPU checks to half of the scan
depth suggested by SIS_UTIL.

Tested on Xeon 2 x 60C/120T platforms:

netperf
=3D=3D=3D=3D=3D=3D=3D
case            	load    	baseline(std%)	compare%( std%)
TCP_RR          	60-threads	 1.00 (  1.37)	 +0.04 (  1.47)
TCP_RR          	120-threads	 1.00 (  1.77)	 -1.03 (  1.31)
TCP_RR          	180-threads	 1.00 (  2.03)	 +1.25 (  1.66)
TCP_RR          	240-threads	 1.00 ( 41.31)	+73.71 ( 22.02)
TCP_RR          	300-threads	 1.00 ( 12.79)	 -0.11 ( 15.84)

tbench
=3D=3D=3D=3D=3D=3D
case            	load    	baseline(std%)	compare%( std%)
loopback        	60-threads	 1.00 (  0.35)	 +0.40 (  0.31)
loopback        	120-threads	 1.00 (  1.94)	 -1.89 (  1.17)
loopback        	180-threads	 1.00 ( 13.59)	 +9.97 (  0.93)
loopback        	240-threads	 1.00 ( 11.68)	+42.85 (  7.28)
loopback        	300-threads	 1.00 (  4.47)	+15.12 (  1.40)

hackbench
=3D=3D=3D=3D=3D=3D=3D=3D=3D
case            	load    	baseline(std%)	compare%( std%)
process-pipe    	1-groups	 1.00 (  9.21)	 -7.88 (  2.03)
process-pipe    	2-groups	 1.00 (  7.09)	 +5.47 (  9.02)
process-pipe    	4-groups	 1.00 (  1.60)	 +1.53 (  1.70)

schbench
=3D=3D=3D=3D=3D=3D=3D=3D
case            	load    	baseline(std%)	compare%( std%)
normal          	1-mthreads	 1.00 (  0.98)	 +0.26 (  0.37)
normal          	2-mthreads	 1.00 (  3.99)	 -7.97 (  7.33)
normal          	4-mthreads	 1.00 (  3.07)	 -1.55 (  3.27)

Also did some experiments on the OLTP workload on a 112 core 2 socket
SPR machine. The OLTP workload has a mixture of threads handling
database updates on disks and handling transaction queries over network.
Around 0.7% improvement is observed with less than 0.2% run-to-run
variation.

Thanks Madadi for testing the SIS_CACHE on a power system with 96 CPUs.
The results showed a max of 29% improvement in hackbench, 13% improvement
in the producer_consumer workload, and 2% improvement in a real-life
workload named Daytrader.

Thanks Prateek for running microbenchmarks on top of the latest patch on
a 3rd Generation EPYC System:
- 2 sockets each with 64C/128T
- NPS1 (Each socket is a NUMA node)
- C2 Disabled (POLL and C1(MWAIT) remained enabled)
No consistent regression was observed in the v2 version.

Analysis:
Waking up the task on its previous CPU brings benefits because of
less task migration and higher local resource locality.

Take the netperf 240-thread case as an example: run the following
script to count the wakeup migrations within 10 seconds, and use perf
topdown to track the PMU events. The task migration and stall cycles
have been reduced a lot with SIS_CACHE:

kretfunc:select_task_rq_fair
{
        $p =3D (struct task_struct *)args->p;
        if ($p->comm =3D=3D "netperf") {
                if ($p->thread_info.cpu !=3D retval) {
                        @wakeup_migrate_netperf =3D count();
                } else {
                        @wakeup_prev_netperf =3D count();
                }
        }
}

NO_SIS_CACHE:
@wakeup_migrate_netperf: 57473509
@wakeup_prev_netperf: 14935964
RESOURCE_STALLS: 19.1% * 7.1% * 35.0%

SIS_CACHE:
@wakeup_migrate_netperf: 799
@wakeup_prev_netperf: 132937436
RESOURCE_STALLS: 5.4% * 7.5% * 39.8%

Suggested-by: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
 kernel/sched/fair.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c309b3d203c0..d149eca74fca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7360,7 +7360,7 @@ static inline int select_idle_smt(struct task_struct =
*p, int target)
  * Return true if the idle CPU is cache-hot for someone,
  * return false otherwise.
  */
-static __maybe_unused bool cache_hot_cpu(int cpu, int *hot_cpu)
+static bool cache_hot_cpu(int cpu, int *hot_cpu)
 {
 	if (!sched_feat(SIS_CACHE))
 		return false;
@@ -7383,7 +7383,7 @@ static __maybe_unused bool cache_hot_cpu(int cpu, int=
 *hot_cpu)
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd,=
 bool has_idle_core, int target)
 {
 	struct cpumask *cpus =3D this_cpu_cpumask_var_ptr(select_rq_mask);
-	int i, cpu, idle_cpu =3D -1, nr =3D INT_MAX;
+	int i, cpu, idle_cpu =3D -1, nr =3D INT_MAX, nr_hot =3D 0, hot_cpu =3D -1;
 	struct sched_domain_shared *sd_share;
=20
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
@@ -7396,6 +7396,9 @@ static int select_idle_cpu(struct task_struct *p, str=
uct sched_domain *sd, bool
 			/* overloaded LLC is unlikely to have idle cpu/core */
 			if (nr =3D=3D 1)
 				return -1;
+
+			/* max number of cache-hot idle cpu check */
+			nr_hot =3D nr >> 1;
 		}
 	}
=20
@@ -7426,18 +7429,30 @@ static int select_idle_cpu(struct task_struct *p, s=
truct sched_domain *sd, bool
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i =3D select_idle_core(p, cpu, cpus, &idle_cpu);
-			if ((unsigned int)i < nr_cpumask_bits)
+			if ((unsigned int)i < nr_cpumask_bits) {
+				if (--nr_hot >=3D 0 && cache_hot_cpu(i, &hot_cpu))
+					continue;
+
 				return i;
+			}
=20
 		} else {
 			if (--nr <=3D 0)
 				return -1;
 			idle_cpu =3D __select_idle_cpu(cpu, p);
-			if ((unsigned int)idle_cpu < nr_cpumask_bits)
+			if ((unsigned int)idle_cpu < nr_cpumask_bits) {
+				if (--nr_hot >=3D 0 && cache_hot_cpu(idle_cpu, &hot_cpu))
+					continue;
+
 				break;
+			}
 		}
 	}
=20
+	/* pick the first cache-hot CPU as the last resort */
+	if (idle_cpu =3D=3D -1 && hot_cpu !=3D -1)
+		idle_cpu =3D hot_cpu;
+
 	if (has_idle_core)
 		set_idle_cores(target, false);
=20
--=20
2.25.1