[PATCH v3] sched/cache: Reduce the overhead of task_cache_work by only scanning the visited cpus.

Luo Gengkun posted 1 patch 1 month, 3 weeks ago
include/linux/sched.h |  1 +
kernel/sched/fair.c   | 38 ++++++++++++++++++++++++++++++--------
2 files changed, 31 insertions(+), 8 deletions(-)
[PATCH v3] sched/cache: Reduce the overhead of task_cache_work by only scanning the visited cpus.
Posted by Luo Gengkun 1 month, 3 weeks ago
The overhead of task_cache_work is high, especially on multi-NUMA systems.
Currently, task_cache_work tries to find the pref_llc by scanning all cpus in
the system. However, most of these scans are meaningless, such as those for
cpus that have never been visited or were accessed a long time ago.

To address this problem, this patch introduces visited_cpus to track the
visited cpus and uses llc_epoch_affinity_timeout to evict cpus that have
timed out.

Signed-off-by: Luo Gengkun <luogengkun2@huawei.com>
---
Changes history
**v3 Changes:**
1. Remove the static key and enable this feature by default.
2. Reuse llc_epoch_affinity_timeout instead of introducing
llc_epoch_visited_timeout.
3. Move the calculation of rq->cpu_epoch - pcpu_sched->epoch into
fraction_mm_sched() to avoid race between task_cache_work() and
__update_mm_sched(). 
4. Reset work->next at the end of task_cache_work() to prevent concurrent
executions by multiple threads within the same process.


**v2 Changes:**
1. Added a pre-check before set/clear visited_cpus to avoid C2C overhead.
2. Optimized llc_epoch_visited_timeout by using a static key to minimize overhead.
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 38 ++++++++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfa4bfd099c6..f2327a13fda8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2390,6 +2390,7 @@ struct sched_cache_time {
 
 struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
+	struct cpumask visited_cpus;
 	raw_spinlock_t lock;
 	unsigned long epoch;
 	u64 nr_running_avg;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4e22696a0b1..49369f656d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1466,6 +1466,7 @@ void mm_init_sched(struct mm_struct *mm,
 	raw_spin_lock_init(&mm->sc_stat.lock);
 	mm->sc_stat.epoch = epoch;
 	mm->sc_stat.cpu = -1;
+	cpumask_clear(&mm->sc_stat.visited_cpus);
 
 	/*
 	 * The update to mm->sc_stat should not be reordered
@@ -1507,11 +1508,18 @@ static inline void __update_mm_sched(struct rq *rq,
 	}
 }
 
-static unsigned long fraction_mm_sched(struct rq *rq,
-				       struct sched_cache_time *pcpu_sched)
+static unsigned long fraction_mm_sched(int cpu, struct mm_struct *mm)
 {
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_cache_time *pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
 	guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
 
+	/* Skip the rq that has not been hit for a long time */
+	if ((rq->cpu_epoch - pcpu_sched->epoch) > llc_epoch_affinity_timeout) {
+		cpumask_clear_cpu(cpu, &mm->sc_stat.visited_cpus);
+		return 0;
+	}
+
 	__update_mm_sched(rq, pcpu_sched);
 
 	/*
@@ -1582,6 +1590,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 		pcpu_sched->runtime += delta_exec;
 		rq->cpu_runtime += delta_exec;
 		epoch = rq->cpu_epoch;
+		if (!cpumask_test_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus))
+			cpumask_set_cpu(cpu_of(rq), &mm->sc_stat.visited_cpus);
 	}
 
 	/*
@@ -1627,7 +1637,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
 
 	guard(raw_spinlock)(&mm->sc_stat.lock);
 
-	if (work->next == work) {
+	/*
+	 * Pairs with smp_store_release in task_cache_work() to ensure that
+	 * task_cache_work() has finished before re-queueing the work.
+	 */
+	if (smp_load_acquire(&work->next) == work) {
 		task_work_add(p, work, TWA_RESUME);
 		WRITE_ONCE(mm->sc_stat.epoch, epoch);
 	}
@@ -1695,6 +1709,8 @@ static inline void update_avg_scale(u64 *avg, u64 sample)
 	*avg += div64_s64(diff, divisor);
 }
 
+DEFINE_FREE(reset_work, struct callback_head *, smp_store_release(&_T->next, _T))
+
 static void task_cache_work(struct callback_head *work)
 {
 	int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
@@ -1703,11 +1719,14 @@ static void task_cache_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	unsigned long m_a_occ = 0;
 	cpumask_var_t cpus;
+	/*
+	 * Reset work->next at the end to avoid race between threads
+	 * within a process.
+	 */
+	struct callback_head *_w __free(reset_work) = work;
 
 	WARN_ON_ONCE(work != &p->cache_work);
 
-	work->next = work;
-
 	if (p->flags & PF_EXITING)
 		return;
 
@@ -1725,6 +1744,7 @@ static void task_cache_work(struct callback_head *work)
 
 	scoped_guard (cpus_read_lock) {
 		get_scan_cpumasks(cpus, p);
+		cpumask_and(cpus, cpus, &mm->sc_stat.visited_cpus);
 
 		for_each_cpu(cpu, cpus) {
 			/* XXX sched_cluster_active */
@@ -1735,9 +1755,11 @@ static void task_cache_work(struct callback_head *work)
 			if (!sd)
 				continue;
 
-			for_each_cpu(i, sched_domain_span(sd)) {
-				occ = fraction_mm_sched(cpu_rq(i),
-							per_cpu_ptr(mm->sc_stat.pcpu_sched, i));
+			for_each_cpu_and(i, sched_domain_span(sd), &mm->sc_stat.visited_cpus) {
+				occ = fraction_mm_sched(i, mm);
+				if (occ == 0)
+					continue;
+
 				a_occ += occ;
 				if (occ > m_occ) {
 					m_occ = occ;
-- 
2.34.1