sched/core: Fixes and enhancements around spurious need_resched() and idle load balancing

[PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by K Prateek Nayak 1 year, 7 months ago

From: Peter Zijlstra <peterz@infradead.org>

Since commit b2a02fc43a1f ("smp: Optimize
send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode
can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an
IPI without actually sending an interrupt. Even in cases where the IPI
handler does not queue a task on the idle CPU, do_idle() will call
__schedule() since need_resched() returns true in these cases.

Introduce and use SM_IDLE to identify call to __schedule() from
schedule_idle() and shorten the idle re-entry time by skipping
pick_next_task() when nr_running is 0 and the previous task is the idle
task.

With the SM_IDLE fast-path, the time taken to complete a fixed set of
IPIs using ipistorm improves significantly. Following are the numbers
from a dual socket 3rd Generation EPYC system (2 x 64C/128T) (boost on,
C2 disabled) running ipistorm between CPU8 and CPU16:

cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1

   ==================================================================
   Test          : ipistorm (modified)
   Units         : Normalized runtime
   Interpretation: Lower is better
   Statistic     : AMean
   ==================================================================
   kernel:				time [pct imp]
   tip:sched/core			1.00 [baseline]
   tip:sched/core + SM_IDLE		0.25 [75.11%]

[ kprateek: Commit log and testing ]

Link: https://lore.kernel.org/lkml/20240615012814.GP8774@noisy.programming.kicks-ass.net/
Not-yet-signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 kernel/sched/core.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e0c77eac65a..417d3ebbdf60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  * Constants for the sched_mode argument of __schedule().
  *
  * The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
  */
-#define SM_NONE			0x0
-#define SM_PREEMPT		0x1
-#define SM_RTLOCK_WAIT		0x2
-
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT	(~0U)
-#else
-# define SM_MASK_PREEMPT	SM_PREEMPT
-#endif
+#define SM_IDLE			(-1)
+#define SM_NONE			0
+#define SM_PREEMPT		1
+#define SM_RTLOCK_WAIT		2
 
 /*
  * __schedule() is the main scheduler function.
@@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	unsigned long prev_state;
+	bool preempt = sched_mode > 0;
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
@@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev, !!sched_mode);
+	schedule_debug(prev, preempt);
 
 	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 		hrtick_clear(rq);
 
 	local_irq_disable();
-	rcu_note_context_switch(!!sched_mode);
+	rcu_note_context_switch(preempt);
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
@@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	 * that we form a control dependency vs deactivate_task() below.
 	 */
 	prev_state = READ_ONCE(prev->__state);
-	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+	if (sched_mode == SM_IDLE) {
+		if (!rq->nr_running) {
+			next = prev;
+			goto picked;
+		}
+	} else if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
@@ -6483,6 +6482,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	}
 
 	next = pick_next_task(rq, prev, &rf);
+picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
@@ -6523,7 +6523,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+		trace_sched_switch(preempt, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
@@ -6599,7 +6599,7 @@ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
 {
 	do {
 		preempt_disable();
@@ -6644,7 +6644,7 @@ void __sched schedule_idle(void)
 	 */
 	WARN_ON_ONCE(current->__state);
 	do {
-		__schedule(SM_NONE);
+		__schedule(SM_IDLE);
 	} while (need_resched());
 }
 
-- 
2.34.1

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by Chen Yu 1 year, 6 months ago

On 2024-07-10 at 09:02:09 +0000, K Prateek Nayak wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> 
> Since commit b2a02fc43a1f ("smp: Optimize
> send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode
> can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an
> IPI without actually sending an interrupt. Even in cases where the IPI
> handler does not queue a task on the idle CPU, do_idle() will call
> __schedule() since need_resched() returns true in these cases.
> 
> Introduce and use SM_IDLE to identify call to __schedule() from
> schedule_idle() and shorten the idle re-entry time by skipping
> pick_next_task() when nr_running is 0 and the previous task is the idle
> task.
> 
> With the SM_IDLE fast-path, the time taken to complete a fixed set of
> IPIs using ipistorm improves significantly. Following are the numbers
> from a dual socket 3rd Generation EPYC system (2 x 64C/128T) (boost on,
> C2 disabled) running ipistorm between CPU8 and CPU16:
> 
> cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1
> 
>    ==================================================================
>    Test          : ipistorm (modified)
>    Units         : Normalized runtime
>    Interpretation: Lower is better
>    Statistic     : AMean
>    ==================================================================
>    kernel:				time [pct imp]
>    tip:sched/core			1.00 [baseline]
>    tip:sched/core + SM_IDLE		0.25 [75.11%]
> 
> [ kprateek: Commit log and testing ]
> 
> Link: https://lore.kernel.org/lkml/20240615012814.GP8774@noisy.programming.kicks-ass.net/
> Not-yet-signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>

Only with current patch applied on top of sched/core commit c793a62823d1,
a significant throughput/run-to-run variance improvement is observed
on an Intel 240 CPUs/ 2 Nodes server. C-states >= C1E are disabled,
CPU frequency governor is set to performance and turbo-boost disabled.

Without the patch(lower the better):

158490995
113086433
737869191
302454894
731262790
677283357
729767478
830949261
399824606
743681976

(Amean): 542467098
(Std):   257011706


With the patch(lower the better):
128060992
115646768
132734621
150330954
113143538
169875051
145010400
151589193
162165800
159963320

(Amean): 142852063
(Std):    18646313

I've launched full tests for schbench/hackbench/netperf/tbench
to see if there is any difference.

thanks,
Chenyu

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by Chen Yu 1 year, 6 months ago

On 2024-07-31 at 00:13:40 +0800, Chen Yu wrote:
> On 2024-07-10 at 09:02:09 +0000, K Prateek Nayak wrote:
> > From: Peter Zijlstra <peterz@infradead.org>
> > 
> > Since commit b2a02fc43a1f ("smp: Optimize
> > send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode
> > can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an
> > IPI without actually sending an interrupt. Even in cases where the IPI
> > handler does not queue a task on the idle CPU, do_idle() will call
> > __schedule() since need_resched() returns true in these cases.
> > 
> > Introduce and use SM_IDLE to identify call to __schedule() from
> > schedule_idle() and shorten the idle re-entry time by skipping
> > pick_next_task() when nr_running is 0 and the previous task is the idle
> > task.
> > 
> > With the SM_IDLE fast-path, the time taken to complete a fixed set of
> > IPIs using ipistorm improves significantly. Following are the numbers
> > from a dual socket 3rd Generation EPYC system (2 x 64C/128T) (boost on,
> > C2 disabled) running ipistorm between CPU8 and CPU16:
> > 
> > cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1
> > 
> >    ==================================================================
> >    Test          : ipistorm (modified)
> >    Units         : Normalized runtime
> >    Interpretation: Lower is better
> >    Statistic     : AMean
> >    ==================================================================
> >    kernel:				time [pct imp]
> >    tip:sched/core			1.00 [baseline]
> >    tip:sched/core + SM_IDLE		0.25 [75.11%]
> > 
> > [ kprateek: Commit log and testing ]
> > 
> > Link: https://lore.kernel.org/lkml/20240615012814.GP8774@noisy.programming.kicks-ass.net/
> > Not-yet-signed-off-by: Peter Zijlstra <peterz@infradead.org>
> > Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> >
> 
> Only with current patch applied on top of sched/core commit c793a62823d1,
> a significant throughput/run-to-run variance improvement is observed
> on an Intel 240 CPUs/ 2 Nodes server. C-states >= C1E are disabled,
> CPU frequency governor is set to performance and turbo-boost disabled.
> 
> Without the patch(lower the better):
> 
> 158490995
> 113086433
> 737869191
> 302454894
> 731262790
> 677283357
> 729767478
> 830949261
> 399824606
> 743681976
> 
> (Amean): 542467098
> (Std):   257011706
> 
> 
> With the patch(lower the better):
> 128060992
> 115646768
> 132734621
> 150330954
> 113143538
> 169875051
> 145010400
> 151589193
> 162165800
> 159963320
> 
> (Amean): 142852063
> (Std):    18646313
> 
> I've launched full tests for schbench/hackbench/netperf/tbench
> to see if there is any difference.
>

Tested without CONFIG_PREEMPT_RT, so issue for SM_RTLOCK_WAIT as mentioned
by Vincent might not bring any impact. There is no obvious difference
(regression) detected according to the test in the 0day environment. Overall
this patch looks good to me. Once you send a refresh version out I'll re-launch
the test.

Tested on Xeon server with 128 CPUs, 4 Numa nodes, under different

      baseline                  with-SM_IDLE

hackbench
load level (25% ~ 100%)

hackbench-pipe-process.throughput
%25:
    846099            -0.3%     843217
%50:
    972015            +0.0%     972185
%100:
   1395650            -1.3%    1376963

hackbench-pipe-threads.throughput
%25:
    746629            -0.0%     746345
%50:
    885165            -0.4%     881602
%100:
   1227790            +1.3%    1243757

hackbench-socket-process.throughput
%25:
    395784            +1.2%     400717
%50:
    441312            +0.3%     442783
%100:
    324283 ±  2%      +6.0%     343826

hackbench-socket-threads.throughput
%25:
    379700            -0.8%     376642
%50:
    425315            -0.4%     423749
%100:
    311937 ±  2%      +0.9%     314892



      baseline                  with-SM_IDLE

schbench.request_latency_90%_us

1-mthread-1-worker:
      4562            -0.0%       4560
1-mthread-16-workers:
      4564            -0.0%       4563
12.5%-mthread-1:
      4565            +0.0%       4567
12.5%-mthread-16-workers:
     39204            +0.1%      39248
25%-mthread-1-worker:
      4574            +0.0%       4574
25%-mthread-16-workers:
    161944            +0.1%     162053
50%-mthread-1-workers:
      4784 ±  5%      +0.1%       4789 ±  5%
50%-mthread-16-workers:
    659156            +0.4%     661679
100%-mthread-1-workers:
      9328            +0.0%       9329
100%-mthread-16-workers:
   2489753            -0.7%    2472140


      baseline                  with-SM_IDLE

netperf.Throughput:

25%-TCP_RR:
   2449875            +0.0%    2450622        netperf.Throughput_total_tps
25%-UDP_RR:
   2746806            +0.1%    2748935        netperf.Throughput_total_tps
25%-TCP_STREAM:
   1352061            +0.7%    1361497        netperf.Throughput_total_Mbps
25%-UDP_STREAM:
   1815205            +0.1%    1816202        netperf.Throughput_total_Mbps
50%-TCP_RR:
   3981514            -0.3%    3970327        netperf.Throughput_total_tps
50%-UDP_RR:
   4496584            -1.3%    4438363        netperf.Throughput_total_tps
50%-TCP_STREAM:
   1478872            +0.4%    1484196        netperf.Throughput_total_Mbps
50%-UDP_STREAM:
   1739540            +0.3%    1744074        netperf.Throughput_total_Mbps
75%-TCP_RR:
   3696607            -0.5%    3677044        netperf.Throughput_total_tps
75%-UDP_RR:
   4161206            +1.3%    4217274 ±  2%  netperf.Throughput_total_tps
75%-TCP_STREAM:
    895874            +5.7%     946546 ±  5%  netperf.Throughput_total_Mbps
75%-UDP_STREAM:
   4100019            -0.3%    4088367        netperf.Throughput_total_Mbps
100%-TCP_RR:
   6724456            -1.7%    6610976        netperf.Throughput_total_tps
100%-UDP_RR:
   7329959            -0.5%    7294653        netperf.Throughput_total_tps
100%-TCP_STREAM:
    808165            +0.3%     810360        netperf.Throughput_total_Mbps
100%-UDP_STREAM:
   5562651            +0.0%    5564106        netperf.Throughput_total_Mbps

thanks,
Chenyu

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by K Prateek Nayak 1 year, 6 months ago

Hello Chenyu,

Thank you for testing the series. I'll have a second version out soon.

On 8/4/2024 9:35 AM, Chen Yu wrote:
> On 2024-07-31 at 00:13:40 +0800, Chen Yu wrote:
>> On 2024-07-10 at 09:02:09 +0000, K Prateek Nayak wrote:
>>> From: Peter Zijlstra <peterz@infradead.org>
>>>
>>> Since commit b2a02fc43a1f ("smp: Optimize
>>> send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode
>>> can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an
>>> IPI without actually sending an interrupt. Even in cases where the IPI
>>> handler does not queue a task on the idle CPU, do_idle() will call
>>> __schedule() since need_resched() returns true in these cases.
>>>
>>> Introduce and use SM_IDLE to identify call to __schedule() from
>>> schedule_idle() and shorten the idle re-entry time by skipping
>>> pick_next_task() when nr_running is 0 and the previous task is the idle
>>> task.
>>>
>>> With the SM_IDLE fast-path, the time taken to complete a fixed set of
>>> IPIs using ipistorm improves significantly. Following are the numbers
>>> from a dual socket 3rd Generation EPYC system (2 x 64C/128T) (boost on,
>>> C2 disabled) running ipistorm between CPU8 and CPU16:
>>>
>>> cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1
>>>
>>>     ==================================================================
>>>     Test          : ipistorm (modified)
>>>     Units         : Normalized runtime
>>>     Interpretation: Lower is better
>>>     Statistic     : AMean
>>>     ==================================================================
>>>     kernel:				time [pct imp]
>>>     tip:sched/core			1.00 [baseline]
>>>     tip:sched/core + SM_IDLE		0.25 [75.11%]
>>>
>>> [ kprateek: Commit log and testing ]
>>>
>>> Link: https://lore.kernel.org/lkml/20240615012814.GP8774@noisy.programming.kicks-ass.net/
>>> Not-yet-signed-off-by: Peter Zijlstra <peterz@infradead.org>
>>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>>
>>
>> Only with current patch applied on top of sched/core commit c793a62823d1,
>> a significant throughput/run-to-run variance improvement is observed
>> on an Intel 240 CPUs/ 2 Nodes server. C-states >= C1E are disabled,
>> CPU frequency governor is set to performance and turbo-boost disabled.
>>
>> Without the patch(lower the better):
>>
>> 158490995
>> 113086433
>> 737869191
>> 302454894
>> 731262790
>> 677283357
>> 729767478
>> 830949261
>> 399824606
>> 743681976
>>
>> (Amean): 542467098
>> (Std):   257011706
>>
>>
>> With the patch(lower the better):
>> 128060992
>> 115646768
>> 132734621
>> 150330954
>> 113143538
>> 169875051
>> 145010400
>> 151589193
>> 162165800
>> 159963320
>>
>> (Amean): 142852063
>> (Std):    18646313
>>
>> I've launched full tests for schbench/hackbench/netperf/tbench
>> to see if there is any difference.
>>
> 
> Tested without CONFIG_PREEMPT_RT, so issue for SM_RTLOCK_WAIT as mentioned
> by Vincent might not bring any impact. There is no obvious difference
> (regression) detected according to the test in the 0day environment. Overall
> this patch looks good to me. Once you send a refresh version out I'll re-launch
> the test.

Since SM_RTLOCK_WAIT is only used by schedule_rtlock(), which is only
defined for PREEMPT_RT kernels, non RT build should have no issue. I
could spot at least one case in rtlock_slowlock_locked() where the
pre->__state is set to "TASK_RTLOCK_WAIT" and schedule_rtlock() is
called. With this patch, it would pass the "sched_mode > SM_NONE" check
and call it an involuntary context-switch but on tip,
(preempt & SM_MASK_PREEMPT) would return false and eventually it'll
call deactivate_task() to dequeue the waiting task so this does need
fixing.

 From a brief look, all calls to schedule with "SM_RTLOCK_WAIT" already
set the task->__state to a non-zero value. I'll look into this further
after the respin and see if there is some optimization possible there
but for the time being, I'll respin this with the condition changed
to:

	...
     } else if (preempt != SM_PREEMPT && prev_state) {
	...

just to keep it explicit.

Thank you again for testing this version.
-- 
Thanks and Regards,
Prateek

> 
> Tested on Xeon server with 128 CPUs, 4 Numa nodes, under different
> 
>        baseline                  with-SM_IDLE
> 
> hackbench
> load level (25% ~ 100%)
> 
> hackbench-pipe-process.throughput
> %25:
>      846099            -0.3%     843217
> %50:
>      972015            +0.0%     972185
> %100:
>     1395650            -1.3%    1376963
> 
> hackbench-pipe-threads.throughput
> %25:
>      746629            -0.0%     746345
> %50:
>      885165            -0.4%     881602
> %100:
>     1227790            +1.3%    1243757
> 
> hackbench-socket-process.throughput
> %25:
>      395784            +1.2%     400717
> %50:
>      441312            +0.3%     442783
> %100:
>      324283 ±  2%      +6.0%     343826
> 
> hackbench-socket-threads.throughput
> %25:
>      379700            -0.8%     376642
> %50:
>      425315            -0.4%     423749
> %100:
>      311937 ±  2%      +0.9%     314892
> 
> 
> 
>        baseline                  with-SM_IDLE
> 
> schbench.request_latency_90%_us
> 
> 1-mthread-1-worker:
>        4562            -0.0%       4560
> 1-mthread-16-workers:
>        4564            -0.0%       4563
> 12.5%-mthread-1:
>        4565            +0.0%       4567
> 12.5%-mthread-16-workers:
>       39204            +0.1%      39248
> 25%-mthread-1-worker:
>        4574            +0.0%       4574
> 25%-mthread-16-workers:
>      161944            +0.1%     162053
> 50%-mthread-1-workers:
>        4784 ±  5%      +0.1%       4789 ±  5%
> 50%-mthread-16-workers:
>      659156            +0.4%     661679
> 100%-mthread-1-workers:
>        9328            +0.0%       9329
> 100%-mthread-16-workers:
>     2489753            -0.7%    2472140
> 
> 
>        baseline                  with-SM_IDLE
> 
> netperf.Throughput:
> 
> 25%-TCP_RR:
>     2449875            +0.0%    2450622        netperf.Throughput_total_tps
> 25%-UDP_RR:
>     2746806            +0.1%    2748935        netperf.Throughput_total_tps
> 25%-TCP_STREAM:
>     1352061            +0.7%    1361497        netperf.Throughput_total_Mbps
> 25%-UDP_STREAM:
>     1815205            +0.1%    1816202        netperf.Throughput_total_Mbps
> 50%-TCP_RR:
>     3981514            -0.3%    3970327        netperf.Throughput_total_tps
> 50%-UDP_RR:
>     4496584            -1.3%    4438363        netperf.Throughput_total_tps
> 50%-TCP_STREAM:
>     1478872            +0.4%    1484196        netperf.Throughput_total_Mbps
> 50%-UDP_STREAM:
>     1739540            +0.3%    1744074        netperf.Throughput_total_Mbps
> 75%-TCP_RR:
>     3696607            -0.5%    3677044        netperf.Throughput_total_tps
> 75%-UDP_RR:
>     4161206            +1.3%    4217274 ±  2%  netperf.Throughput_total_tps
> 75%-TCP_STREAM:
>      895874            +5.7%     946546 ±  5%  netperf.Throughput_total_Mbps
> 75%-UDP_STREAM:
>     4100019            -0.3%    4088367        netperf.Throughput_total_Mbps
> 100%-TCP_RR:
>     6724456            -1.7%    6610976        netperf.Throughput_total_tps
> 100%-UDP_RR:
>     7329959            -0.5%    7294653        netperf.Throughput_total_tps
> 100%-TCP_STREAM:
>      808165            +0.3%     810360        netperf.Throughput_total_Mbps
> 100%-UDP_STREAM:
>     5562651            +0.0%    5564106        netperf.Throughput_total_Mbps
> 
> thanks,
> Chenyu

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by Vincent Guittot 1 year, 7 months ago

On Wed, 10 Jul 2024 at 11:03, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> From: Peter Zijlstra <peterz@infradead.org>
>
> Since commit b2a02fc43a1f ("smp: Optimize
> send_call_function_single_ipi()") an idle CPU in TIF_POLLING_NRFLAG mode
> can be pulled out of idle by setting TIF_NEED_RESCHED flag to service an
> IPI without actually sending an interrupt. Even in cases where the IPI
> handler does not queue a task on the idle CPU, do_idle() will call
> __schedule() since need_resched() returns true in these cases.
>
> Introduce and use SM_IDLE to identify call to __schedule() from
> schedule_idle() and shorten the idle re-entry time by skipping
> pick_next_task() when nr_running is 0 and the previous task is the idle
> task.
>
> With the SM_IDLE fast-path, the time taken to complete a fixed set of
> IPIs using ipistorm improves significantly. Following are the numbers
> from a dual socket 3rd Generation EPYC system (2 x 64C/128T) (boost on,
> C2 disabled) running ipistorm between CPU8 and CPU16:
>
> cmdline: insmod ipistorm.ko numipi=100000 single=1 offset=8 cpulist=8 wait=1
>
>    ==================================================================
>    Test          : ipistorm (modified)
>    Units         : Normalized runtime
>    Interpretation: Lower is better
>    Statistic     : AMean
>    ==================================================================
>    kernel:                              time [pct imp]
>    tip:sched/core                       1.00 [baseline]
>    tip:sched/core + SM_IDLE             0.25 [75.11%]
>
> [ kprateek: Commit log and testing ]
>
> Link: https://lore.kernel.org/lkml/20240615012814.GP8774@noisy.programming.kicks-ass.net/
> Not-yet-signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
>  kernel/sched/core.c | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 1e0c77eac65a..417d3ebbdf60 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   * Constants for the sched_mode argument of __schedule().
>   *
>   * The mode argument allows RT enabled kernels to differentiate a
> - * preemption from blocking on an 'sleeping' spin/rwlock. Note that
> - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
> - * optimize the AND operation out and just check for zero.
> + * preemption from blocking on an 'sleeping' spin/rwlock.
>   */
> -#define SM_NONE                        0x0
> -#define SM_PREEMPT             0x1
> -#define SM_RTLOCK_WAIT         0x2
> -
> -#ifndef CONFIG_PREEMPT_RT
> -# define SM_MASK_PREEMPT       (~0U)
> -#else
> -# define SM_MASK_PREEMPT       SM_PREEMPT
> -#endif
> +#define SM_IDLE                        (-1)
> +#define SM_NONE                        0
> +#define SM_PREEMPT             1
> +#define SM_RTLOCK_WAIT         2
>
>  /*
>   * __schedule() is the main scheduler function.
> @@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   *
>   * WARNING: must be called with preemption disabled!
>   */
> -static void __sched notrace __schedule(unsigned int sched_mode)
> +static void __sched notrace __schedule(int sched_mode)
>  {
>         struct task_struct *prev, *next;
>         unsigned long *switch_count;
>         unsigned long prev_state;
> +       bool preempt = sched_mode > 0;
>         struct rq_flags rf;
>         struct rq *rq;
>         int cpu;
> @@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>         rq = cpu_rq(cpu);
>         prev = rq->curr;
>
> -       schedule_debug(prev, !!sched_mode);
> +       schedule_debug(prev, preempt);
>
>         if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
>                 hrtick_clear(rq);
>
>         local_irq_disable();
> -       rcu_note_context_switch(!!sched_mode);
> +       rcu_note_context_switch(preempt);
>
>         /*
>          * Make sure that signal_pending_state()->signal_pending() below
> @@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>          * that we form a control dependency vs deactivate_task() below.
>          */
>         prev_state = READ_ONCE(prev->__state);
> -       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
> +       if (sched_mode == SM_IDLE) {
> +               if (!rq->nr_running) {
> +                       next = prev;
> +                       goto picked;
> +               }
> +       } else if (!preempt && prev_state) {

With CONFIG_PREEMPT_RT, it was only for SM_PREEMPT but not for SM_RTLOCK_WAIT

>                 if (signal_pending_state(prev_state, prev)) {
>                         WRITE_ONCE(prev->__state, TASK_RUNNING);
>                 } else {
> @@ -6483,6 +6482,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>         }
>
>         next = pick_next_task(rq, prev, &rf);
> +picked:
>         clear_tsk_need_resched(prev);
>         clear_preempt_need_resched();
>  #ifdef CONFIG_SCHED_DEBUG
> @@ -6523,7 +6523,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>                 migrate_disable_switch(rq, prev);
>                 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>
> -               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
> +               trace_sched_switch(preempt, prev, next, prev_state);
>
>                 /* Also unlocks the rq: */
>                 rq = context_switch(rq, prev, next, &rf);
> @@ -6599,7 +6599,7 @@ static void sched_update_worker(struct task_struct *tsk)
>         }
>  }
>
> -static __always_inline void __schedule_loop(unsigned int sched_mode)
> +static __always_inline void __schedule_loop(int sched_mode)
>  {
>         do {
>                 preempt_disable();
> @@ -6644,7 +6644,7 @@ void __sched schedule_idle(void)
>          */
>         WARN_ON_ONCE(current->__state);
>         do {
> -               __schedule(SM_NONE);
> +               __schedule(SM_IDLE);
>         } while (need_resched());
>  }
>
> --
> 2.34.1
>

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by Peter Zijlstra 1 year, 7 months ago

On Thu, Jul 11, 2024 at 10:00:15AM +0200, Vincent Guittot wrote:
> On Wed, 10 Jul 2024 at 11:03, K Prateek Nayak <kprateek.nayak@amd.com> wrote:

> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 1e0c77eac65a..417d3ebbdf60 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> >   * Constants for the sched_mode argument of __schedule().
> >   *
> >   * The mode argument allows RT enabled kernels to differentiate a
> > - * preemption from blocking on an 'sleeping' spin/rwlock. Note that
> > - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
> > - * optimize the AND operation out and just check for zero.
> > + * preemption from blocking on an 'sleeping' spin/rwlock.
> >   */
> > -#define SM_NONE                        0x0
> > -#define SM_PREEMPT             0x1
> > -#define SM_RTLOCK_WAIT         0x2
> > -
> > -#ifndef CONFIG_PREEMPT_RT
> > -# define SM_MASK_PREEMPT       (~0U)
> > -#else
> > -# define SM_MASK_PREEMPT       SM_PREEMPT
> > -#endif
> > +#define SM_IDLE                        (-1)
> > +#define SM_NONE                        0
> > +#define SM_PREEMPT             1
> > +#define SM_RTLOCK_WAIT         2
> >
> >  /*
> >   * __schedule() is the main scheduler function.
> > @@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> >   *
> >   * WARNING: must be called with preemption disabled!
> >   */
> > -static void __sched notrace __schedule(unsigned int sched_mode)
> > +static void __sched notrace __schedule(int sched_mode)
> >  {
> >         struct task_struct *prev, *next;
> >         unsigned long *switch_count;
> >         unsigned long prev_state;
> > +       bool preempt = sched_mode > 0;
> >         struct rq_flags rf;
> >         struct rq *rq;
> >         int cpu;
> > @@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> >         rq = cpu_rq(cpu);
> >         prev = rq->curr;
> >
> > -       schedule_debug(prev, !!sched_mode);
> > +       schedule_debug(prev, preempt);
> >
> >         if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
> >                 hrtick_clear(rq);
> >
> >         local_irq_disable();
> > -       rcu_note_context_switch(!!sched_mode);
> > +       rcu_note_context_switch(preempt);
> >
> >         /*
> >          * Make sure that signal_pending_state()->signal_pending() below
> > @@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> >          * that we form a control dependency vs deactivate_task() below.
> >          */
> >         prev_state = READ_ONCE(prev->__state);
> > -       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
> > +       if (sched_mode == SM_IDLE) {
> > +               if (!rq->nr_running) {
> > +                       next = prev;
> > +                       goto picked;
> > +               }
> > +       } else if (!preempt && prev_state) {
> 
> With CONFIG_PREEMPT_RT, it was only for SM_PREEMPT but not for SM_RTLOCK_WAIT

Bah, yes. But then schedule_debug() and rcu_note_context_switch() have
an argument that is called 'preempt' but is set for SM_RTLOCK_WAIT.

Now, I think the RCU think is actually correct here, it doesn't want to
consider SM_RTLOCK_WAIT as a voluntary schedule point, because spinlocks
don't either. But it is confusing as heck.

We can either write things like:

	} else if (sched_mode != SM_PREEMPT && prev_state) {

or do silly things like:

#define SM_IDLE	(-16)

keep the SM_MASK_PREEMPT trickery and do:

	} else if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {

Not sure that is actually going to matter at this point though.

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by Vincent Guittot 1 year, 7 months ago

On Thu, 11 Jul 2024 at 11:19, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Jul 11, 2024 at 10:00:15AM +0200, Vincent Guittot wrote:
> > On Wed, 10 Jul 2024 at 11:03, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 1e0c77eac65a..417d3ebbdf60 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > >   * Constants for the sched_mode argument of __schedule().
> > >   *
> > >   * The mode argument allows RT enabled kernels to differentiate a
> > > - * preemption from blocking on an 'sleeping' spin/rwlock. Note that
> > > - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
> > > - * optimize the AND operation out and just check for zero.
> > > + * preemption from blocking on an 'sleeping' spin/rwlock.
> > >   */
> > > -#define SM_NONE                        0x0
> > > -#define SM_PREEMPT             0x1
> > > -#define SM_RTLOCK_WAIT         0x2
> > > -
> > > -#ifndef CONFIG_PREEMPT_RT
> > > -# define SM_MASK_PREEMPT       (~0U)
> > > -#else
> > > -# define SM_MASK_PREEMPT       SM_PREEMPT
> > > -#endif
> > > +#define SM_IDLE                        (-1)
> > > +#define SM_NONE                        0
> > > +#define SM_PREEMPT             1
> > > +#define SM_RTLOCK_WAIT         2
> > >
> > >  /*
> > >   * __schedule() is the main scheduler function.
> > > @@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > >   *
> > >   * WARNING: must be called with preemption disabled!
> > >   */
> > > -static void __sched notrace __schedule(unsigned int sched_mode)
> > > +static void __sched notrace __schedule(int sched_mode)
> > >  {
> > >         struct task_struct *prev, *next;
> > >         unsigned long *switch_count;
> > >         unsigned long prev_state;
> > > +       bool preempt = sched_mode > 0;
> > >         struct rq_flags rf;
> > >         struct rq *rq;
> > >         int cpu;
> > > @@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> > >         rq = cpu_rq(cpu);
> > >         prev = rq->curr;
> > >
> > > -       schedule_debug(prev, !!sched_mode);
> > > +       schedule_debug(prev, preempt);
> > >
> > >         if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
> > >                 hrtick_clear(rq);
> > >
> > >         local_irq_disable();
> > > -       rcu_note_context_switch(!!sched_mode);
> > > +       rcu_note_context_switch(preempt);
> > >
> > >         /*
> > >          * Make sure that signal_pending_state()->signal_pending() below
> > > @@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
> > >          * that we form a control dependency vs deactivate_task() below.
> > >          */
> > >         prev_state = READ_ONCE(prev->__state);
> > > -       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
> > > +       if (sched_mode == SM_IDLE) {
> > > +               if (!rq->nr_running) {
> > > +                       next = prev;
> > > +                       goto picked;
> > > +               }
> > > +       } else if (!preempt && prev_state) {
> >
> > With CONFIG_PREEMPT_RT, it was only for SM_PREEMPT but not for SM_RTLOCK_WAIT
>
> Bah, yes. But then schedule_debug() and rcu_note_context_switch() have
> an argument that is called 'preempt' but is set for SM_RTLOCK_WAIT.
>
> Now, I think the RCU think is actually correct here, it doesn't want to
> consider SM_RTLOCK_WAIT as a voluntary schedule point, because spinlocks
> don't either. But it is confusing as heck.
>
> We can either write things like:
>
>         } else if (sched_mode != SM_PREEMPT && prev_state) {

this would work with something like below

#ifdef CONFIG_PREEMPT_RT
          # define SM_RTLOCK_WAIT       2
#else
         # define SM_RTLOCK_WAIT       SM_PREEMPT
#endif

>
> or do silly things like:
>
> #define SM_IDLE (-16)
>
> keep the SM_MASK_PREEMPT trickery and do:
>
>         } else if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
>
> Not sure that is actually going to matter at this point though.

Re: [PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()

Posted by K Prateek Nayak 1 year, 7 months ago

Hello Vincent, Peter,

On 7/11/2024 6:44 PM, Vincent Guittot wrote:
> On Thu, 11 Jul 2024 at 11:19, Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> On Thu, Jul 11, 2024 at 10:00:15AM +0200, Vincent Guittot wrote:
>>> On Wed, 10 Jul 2024 at 11:03, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>>
>>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>>> index 1e0c77eac65a..417d3ebbdf60 100644
>>>> --- a/kernel/sched/core.c
>>>> +++ b/kernel/sched/core.c
>>>> @@ -6343,19 +6343,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>>>>    * Constants for the sched_mode argument of __schedule().
>>>>    *
>>>>    * The mode argument allows RT enabled kernels to differentiate a
>>>> - * preemption from blocking on an 'sleeping' spin/rwlock. Note that
>>>> - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
>>>> - * optimize the AND operation out and just check for zero.
>>>> + * preemption from blocking on an 'sleeping' spin/rwlock.
>>>>    */
>>>> -#define SM_NONE                        0x0
>>>> -#define SM_PREEMPT             0x1
>>>> -#define SM_RTLOCK_WAIT         0x2
>>>> -
>>>> -#ifndef CONFIG_PREEMPT_RT
>>>> -# define SM_MASK_PREEMPT       (~0U)
>>>> -#else
>>>> -# define SM_MASK_PREEMPT       SM_PREEMPT
>>>> -#endif
>>>> +#define SM_IDLE                        (-1)
>>>> +#define SM_NONE                        0
>>>> +#define SM_PREEMPT             1
>>>> +#define SM_RTLOCK_WAIT         2
>>>>
>>>>   /*
>>>>    * __schedule() is the main scheduler function.
>>>> @@ -6396,11 +6389,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>>>>    *
>>>>    * WARNING: must be called with preemption disabled!
>>>>    */
>>>> -static void __sched notrace __schedule(unsigned int sched_mode)
>>>> +static void __sched notrace __schedule(int sched_mode)
>>>>   {
>>>>          struct task_struct *prev, *next;
>>>>          unsigned long *switch_count;
>>>>          unsigned long prev_state;
>>>> +       bool preempt = sched_mode > 0;
>>>>          struct rq_flags rf;
>>>>          struct rq *rq;
>>>>          int cpu;
>>>> @@ -6409,13 +6403,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>>>>          rq = cpu_rq(cpu);
>>>>          prev = rq->curr;
>>>>
>>>> -       schedule_debug(prev, !!sched_mode);
>>>> +       schedule_debug(prev, preempt);
>>>>
>>>>          if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
>>>>                  hrtick_clear(rq);
>>>>
>>>>          local_irq_disable();
>>>> -       rcu_note_context_switch(!!sched_mode);
>>>> +       rcu_note_context_switch(preempt);
>>>>
>>>>          /*
>>>>           * Make sure that signal_pending_state()->signal_pending() below
>>>> @@ -6449,7 +6443,12 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>>>>           * that we form a control dependency vs deactivate_task() below.
>>>>           */
>>>>          prev_state = READ_ONCE(prev->__state);
>>>> -       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
>>>> +       if (sched_mode == SM_IDLE) {
>>>> +               if (!rq->nr_running) {
>>>> +                       next = prev;
>>>> +                       goto picked;
>>>> +               }
>>>> +       } else if (!preempt && prev_state) {
>>>
>>> With CONFIG_PREEMPT_RT, it was only for SM_PREEMPT but not for SM_RTLOCK_WAIT
>>
>> Bah, yes. But then schedule_debug() and rcu_note_context_switch() have
>> an argument that is called 'preempt' but is set for SM_RTLOCK_WAIT.
>>
>> Now, I think the RCU think is actually correct here, it doesn't want to
>> consider SM_RTLOCK_WAIT as a voluntary schedule point, because spinlocks
>> don't either. But it is confusing as heck.
>>
>> We can either write things like:
>>
>>          } else if (sched_mode != SM_PREEMPT && prev_state) {
> 
> this would work with something like below
> 
> #ifdef CONFIG_PREEMPT_RT
>            # define SM_RTLOCK_WAIT       2
> #else
>           # define SM_RTLOCK_WAIT       SM_PREEMPT
> #endif

Since "SM_RTLOCK_WAIT" is only used by "schedule_rtlock()" which is only
defined for PREEMPT_RT kernels (from a quick grep on linux-6.10.y-rt),
it should just work (famous last words) and we can perhaps skip the else
part too?

With this patch, we need to have the following view of what "preempt"
should be for the components in __schedule() looking at "sched_mode":

		schedule_debug()/		SM_MASK_PREEMPT check/
		rcu_note_context_switch()	trace_sched_switch()
SM_IDLE			F				F
SM_NONE			F				F
SM_PREEMPT		T				T
SM_RTLOCK_WAIT *	T				F

   * SM_RTLOCK_WAIT  is only used in PREEMPT_RT

> 
>>
>> or do silly things like:

... and since we are talking about silly ideas, here is one:

(only build tested on tip:sched/core and linux-6.10.y-rt)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 417d3ebbdf60..d9273af69f9e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6345,10 +6345,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   * The mode argument allows RT enabled kernels to differentiate a
   * preemption from blocking on an 'sleeping' spin/rwlock.
   */
-#define SM_IDLE			(-1)
-#define SM_NONE			0
-#define SM_PREEMPT		1
-#define SM_RTLOCK_WAIT		2
+#ifdef CONFIG_PREEMPT_RT
+#define SM_RTLOCK_WAIT		(-2)
+#endif
+#define SM_IDLE			0
+#define SM_NONE			1
+#define SM_PREEMPT		2
  
  /*
   * __schedule() is the main scheduler function.
@@ -6391,10 +6393,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   */
  static void __sched notrace __schedule(int sched_mode)
  {
+	/*
+	 * For PREEMPT_RT kernel, SM_RTLOCK_WAIT is considered as
+	 * preemption by schedule_debug() and
+	 * rcu_note_context_switch().
+	 */
+	bool preempt = (unsigned int) sched_mode > SM_NONE;
  	struct task_struct *prev, *next;
  	unsigned long *switch_count;
  	unsigned long prev_state;
-	bool preempt = sched_mode > 0;
  	struct rq_flags rf;
  	struct rq *rq;
  	int cpu;
@@ -6438,6 +6445,14 @@ static void __sched notrace __schedule(int sched_mode)
  
  	switch_count = &prev->nivcsw;
  
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * PREEMPT_RT kernel do not consider SM_RTLOCK_WAIT as
+	 * preemption when looking at prev->state.
+	 */
+	preempt = sched_mode > SM_NONE;
+#endif
+
  	/*
  	 * We must load prev->state once (task_struct::state is volatile), such
  	 * that we form a control dependency vs deactivate_task() below.
--

>>
>> #define SM_IDLE (-16)
>>
>> keep the SM_MASK_PREEMPT trickery and do:
>>
>>          } else if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
>>
>> Not sure that is actually going to matter at this point though.

-- 
Thanks and Regards,
Prateek

[PATCH 1/3] sched/core: Remove the unnecessary need_resched() check in nohz_csd_func()
[PATCH 2/3] sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
[RFC PATCH 3/3] softirq: Avoid waking up ksoftirqd from flush_smp_call_function_queue()