1. Fix a compile error on the percpu allocation by adding the missing
__percpu annotation (see the sketch below the --- separator).
2. Enqueue to the target CPU rather than the current CPU.
3. Add a NULL check for the LLC sched domain (Libo Chen).
4. Introduce the sched feature SCHED_CACHE to control cache-aware
scheduling.
5. Fix the occupancy initialization: occ is unsigned, so initialize it
to 0 rather than -1.
6. If there is only one thread in the process, there is no need to
enable cache awareness.
7. Add __maybe_unused to __migrate_degrades_locality() to
avoid compile warnings.
8. Disable gcov profiling of task_cache_work() and
fraction_mm_sched() to avoid soft lockups caused by gcov.
9. Make CONFIG_SCHED_CACHE depend on CONFIG_SMP to avoid compile
errors on non-SMP systems such as the microblaze architecture.
10. Do not account cache-aware statistics in account_mm_sched() for
non-fair tasks, as it can be invoked by RT tasks (Shrikanth Hegde).
11. Place the cpu_epoch related fields in a dedicated cache line to
avoid false sharing with the clock_idle* fields (Shrikanth Hegde);
see the layout sketch below the --- separator.
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
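A minimal sketch of items 1 and 2 (plus the unsigned occ from item 5);
mm_sched_sketch and account_sketch are simplified, made-up stand-ins
for the real structures, and the snippet assumes a kernel build with
the usual percpu primitives (alloc_percpu, per_cpu_ptr, cpu_of):

	/* Simplified stand-in for struct mm_sched; illustration only. */
	struct mm_sched_sketch {
		u64		runtime;
		unsigned long	epoch;
		unsigned long	occ;	/* unsigned: reset to 0, not -1 */
	};

	/*
	 * Item 1: the __percpu annotation. The percpu accessors
	 * type-check their argument, so passing the result of
	 * alloc_percpu() around as a plain pointer does not build.
	 */
	static void account_sketch(struct rq *rq,
				   struct mm_sched_sketch __percpu *pcpu_sched)
	{
		/*
		 * Item 2: resolve the slot of the CPU that owns rq, not
		 * the CPU executing this code; this_cpu_ptr(pcpu_sched)
		 * would update the local CPU's slot, which is wrong
		 * whenever rq belongs to another CPU.
		 */
		struct mm_sched_sketch *s = per_cpu_ptr(pcpu_sched, cpu_of(rq));

		s->occ = 0;
	}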
include/linux/mm_types.h | 4 ++--
init/Kconfig | 4 ++++
kernel/sched/fair.c | 41 +++++++++++++++++++++++++++-------------
kernel/sched/features.h | 1 +
kernel/sched/sched.h | 2 +-
5 files changed, 36 insertions(+), 16 deletions(-)
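For item 11, a hedged layout sketch (rq_sketch is made up and carries
only the relevant subset of struct rq's fields): ____cacheline_aligned
starts cpu_epoch_lock on a fresh cache line, so the frequently written
epoch fields stop sharing a line with the clock_idle* fields:

	struct rq_sketch {
		u64		clock_idle;
		u64		clock_idle_copy;

		/*
		 * New cache line: writes to the epoch fields no longer
		 * invalidate the line holding clock_idle*.
		 */
		raw_spinlock_t	cpu_epoch_lock ____cacheline_aligned;
		u64		cpu_runtime;
		unsigned long	cpu_epoch;
		unsigned long	cpu_epoch_next;
	};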
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cf26ad8b41ab..41a598a44361 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1450,11 +1450,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CACHE
-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
+void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
{
- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
+ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
if (!pcpu_sched)
return -ENOMEM;
diff --git a/init/Kconfig b/init/Kconfig
index 27f4012347f9..4bab39a5254c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -950,6 +950,10 @@ config NUMA_BALANCING
config SCHED_CACHE
bool "Cache aware scheduler"
default y
+ depends on SMP
+ help
+ If set, the scheduler will try to aggregate tasks in the same
+ process onto a single LLC when possible.
config NUMA_BALANCING_DEFAULT_ENABLED
bool "Automatically enable NUMA aware memory/task placement"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3897cd7696d..e97ab46509e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
#define EPOCH_PERIOD (HZ/100) /* 10 ms */
#define EPOCH_OLD 5 /* 50 ms */
-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
+void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
{
unsigned long epoch;
int i;
@@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
pcpu_sched->runtime = 0;
pcpu_sched->epoch = epoch = rq->cpu_epoch;
- pcpu_sched->occ = -1;
+ pcpu_sched->occ = 0;
}
raw_spin_lock_init(&mm->mm_sched_lock);
@@ -1227,7 +1227,7 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
}
}
-static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
+static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
{
guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
@@ -1248,13 +1248,18 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
struct mm_sched *pcpu_sched;
unsigned long epoch;
+ if (!sched_feat(SCHED_CACHE))
+ return;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
/*
* init_task and kthreads don't be having no mm
*/
if (!mm || !mm->pcpu_sched)
return;
- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
+ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
__update_mm_sched(rq, pcpu_sched);
@@ -1264,12 +1269,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
}
/*
- * If this task hasn't hit task_cache_work() for a while, invalidate
+ * If this task hasn't hit task_cache_work() for a while, or it
+ * has only 1 thread, invalidate
* it's preferred state.
*/
- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
+ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD ||
+ get_nr_threads(p) <= 1) {
mm->mm_sched_cpu = -1;
- pcpu_sched->occ = -1;
+ pcpu_sched->occ = 0;
}
}
@@ -1278,6 +1285,9 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
struct callback_head *work = &p->cache_work;
struct mm_struct *mm = p->mm;
+ if (!sched_feat(SCHED_CACHE))
+ return;
+
if (!mm || !mm->pcpu_sched)
return;
@@ -1286,16 +1296,13 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
guard(raw_spinlock)(&mm->mm_sched_lock);
- if (mm->mm_sched_epoch == rq->cpu_epoch)
- return;
-
if (work->next == work) {
task_work_add(p, work, TWA_RESUME);
WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
}
}
-static void task_cache_work(struct callback_head *work)
+static void __no_profile task_cache_work(struct callback_head *work)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
@@ -1322,6 +1329,9 @@ static void task_cache_work(struct callback_head *work)
unsigned long occ, m_occ = 0, a_occ = 0;
int m_cpu = -1, nr = 0, i;
+ if (!sd)
+ continue;
+
for_each_cpu(i, sched_domain_span(sd)) {
occ = fraction_mm_sched(cpu_rq(i),
per_cpu_ptr(mm->pcpu_sched, i));
@@ -8815,6 +8825,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
struct mm_struct *mm = p->mm;
int cpu;
+ if (!sched_feat(SCHED_CACHE))
+ return prev_cpu;
+
if (!mm || p->nr_cpus_allowed == 1)
return prev_cpu;
@@ -9569,7 +9582,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
return 0;
#ifdef CONFIG_SCHED_CACHE
- if (p->mm && p->mm->pcpu_sched) {
+ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) {
/*
* XXX things like Skylake have non-inclusive L3 and might not
* like this L3 centric view. What to do about L2 stickyness ?
@@ -9647,7 +9660,9 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
}
#else
-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
+static __maybe_unused long __migrate_degrades_locality(struct task_struct *p,
+ int src_cpu, int dst_cpu,
+ bool idle)
{
return 0;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..d2af7bfd36bf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SCHED_CACHE, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
* in a single rq->lock section. Default disabled because the
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 839463027ab0..f4ab45ecca86 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1174,7 +1174,7 @@ struct rq {
u64 clock_idle_copy;
#endif
#ifdef CONFIG_SCHED_CACHE
- raw_spinlock_t cpu_epoch_lock;
+ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned;
u64 cpu_runtime;
unsigned long cpu_epoch;
unsigned long cpu_epoch_next;
--
2.25.1