[PATCH 17/24] sched/fair: Implement delayed dequeue

Peter Zijlstra posted 24 patches 1 year, 4 months ago
[PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
noting that lag is fundamentally a temporal measure. It should not be
carried around indefinitely.

OTOH it should also not be instantly discarded; doing so would allow a
task to game the system by purposefully (micro) sleeping at the end of
its time quantum.

Since lag is intimately tied to the virtual time base, a wall-time
based decay is also insufficient; notably, competition is required for
any of this to make sense.

Instead, delay the dequeue and keep the 'tasks' on the runqueue,
competing until they are eligible.

Strictly speaking, we only care about keeping them until the 0-lag
point, but that is a difficult proposition; instead, carry them around
until they get picked again and dequeue them at that point.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |    1 
 kernel/sched/fair.c     |   82 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/features.h |    9 +++++
 3 files changed, 81 insertions(+), 11 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_t
 		else
 			p = dl_se->server_pick_next(dl_se);
 		if (!p) {
-			WARN_ON_ONCE(1);
 			dl_se->dl_yielded = 1;
 			update_curr_dl_se(rq, dl_se, 0);
 			goto again;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,20 +5379,44 @@ static void clear_buddies(struct cfs_rq
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
+static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	int action = UPDATE_TG;
+	if (flags & DEQUEUE_DELAYED) {
+		/*
+		 * DEQUEUE_DELAYED is typically called from pick_next_entity()
+		 * at which point we've already done update_curr() and do not
+		 * want to do so again.
+		 */
+		SCHED_WARN_ON(!se->sched_delayed);
+		se->sched_delayed = 0;
+	} else {
+		bool sleep = flags & DEQUEUE_SLEEP;
+
+		/*
+		 * DELAY_DEQUEUE relies on spurious wakeups; special task
+		 * states must not suffer spurious wakeups, so exempt them.
+		 */
+		if (flags & DEQUEUE_SPECIAL)
+			sleep = false;
+
+		SCHED_WARN_ON(sleep && se->sched_delayed);
+		update_curr(cfs_rq);
 
+		if (sched_feat(DELAY_DEQUEUE) && sleep &&
+		    !entity_eligible(cfs_rq, se)) {
+			if (cfs_rq->next == se)
+				cfs_rq->next = NULL;
+			se->sched_delayed = 1;
+			return false;
+		}
+	}
+
+	int action = UPDATE_TG;
 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
 		action |= DO_DETACH;
 
 	/*
-	 * Update run-time statistics of the 'current'.
-	 */
-	update_curr(cfs_rq);
-
-	/*
 	 * When dequeuing a sched_entity, we must:
 	 *   - Update loads to have both entity and cfs_rq synced with now.
 	 *   - For group_entity, update its runnable_weight to reflect the new
@@ -5430,6 +5454,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 
 	if (cfs_rq->nr_running == 0)
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+	return true;
 }
 
 static void
@@ -5828,11 +5854,21 @@ static bool throttle_cfs_rq(struct cfs_r
 	idle_task_delta = cfs_rq->idle_h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		int flags;
+
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			goto done;
 
-		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		/*
+		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
+		 * This avoids teaching dequeue_entities() about throttled
+		 * entities and keeps things relatively simple.
+		 */
+		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+		if (se->sched_delayed)
+			flags |= DEQUEUE_DELAYED;
+		dequeue_entity(qcfs_rq, se, flags);
 
 		if (cfs_rq_is_idle(group_cfs_rq(se)))
 			idle_task_delta = cfs_rq->h_nr_running;
@@ -6918,6 +6954,7 @@ static int dequeue_entities(struct rq *r
 	bool was_sched_idle = sched_idle_rq(rq);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
+	bool task_delayed = flags & DEQUEUE_DELAYED;
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
@@ -6931,7 +6968,13 @@ static int dequeue_entities(struct rq *r
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		dequeue_entity(cfs_rq, se, flags);
+
+		if (!dequeue_entity(cfs_rq, se, flags)) {
+			if (p && &p->se == se)
+				return -1;
+
+			break;
+		}
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -6956,6 +6999,7 @@ static int dequeue_entities(struct rq *r
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
+		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
 	}
 
 	for_each_sched_entity(se) {
@@ -6985,6 +7029,17 @@ static int dequeue_entities(struct rq *r
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
+	if (p && task_delayed) {
+		SCHED_WARN_ON(!task_sleep);
+		SCHED_WARN_ON(p->on_rq != 1);
+
+		/* Fix-up what dequeue_task_fair() skipped */
+		hrtick_update(rq);
+
+		/* Fix-up what block_task() skipped. */
+		__block_task(rq, p);
+	}
+
 	return 1;
 }
 /*
@@ -6996,8 +7051,10 @@ static bool dequeue_task_fair(struct rq
 {
 	util_est_dequeue(&rq->cfs, p);
 
-	if (dequeue_entities(rq, &p->se, flags) < 0)
+	if (dequeue_entities(rq, &p->se, flags) < 0) {
+		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
 		return false;
+	}
 
 	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
 	hrtick_update(rq);
@@ -12973,6 +13030,11 @@ static void set_next_task_fair(struct rq
 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
 	}
+
+	if (!first)
+		return;
+
+	SCHED_WARN_ON(se->sched_delayed);
 }
 
 void init_cfs_rq(struct cfs_rq *cfs_rq)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
 /*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+
+/*
  * Allow wakeup-time preemption of the current task:
  */
 SCHED_FEAT(WAKEUP_PREEMPTION, true)
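
Condensed, the new decision in dequeue_entity() is roughly the following
(a paraphrase of the fair.c hunk above for orientation, not a drop-in;
entity_eligible(), sched_feat() and the DELAY_DEQUEUE feature are the real
names used in the patch, everything else is elided):

	/*
	 * Sketch: a task going to sleep while still carrying negative lag
	 * (i.e. not eligible) is left on the runqueue instead of being
	 * dequeued; it is marked sched_delayed and reaped later, when the
	 * pick path selects it again.
	 */
	if (sched_feat(DELAY_DEQUEUE) && sleep &&
	    !entity_eligible(cfs_rq, se)) {
		se->sched_delayed = 1;
		return false;		/* dequeue deferred */
	}
	/* ... otherwise fall through to the normal, immediate dequeue ... */
	return true;
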
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
Hi Peter,

On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
> 
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
> 
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
> 
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
> 
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.

This one is causing a 10-20% performance hit on our filesystem tests.

On 6.12-rc5 (so with the latest follow ons) we get:

with DELAY_DEQUEUE the bandwidth is 510 MB/s
with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s

The test is fio, something like this:

taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs

In this case it's ext4, but I'm not sure it will be FS specific.

I should have the machine and setup next week to poke further but I wanted
to mention it now just in case any one has an "aha" moment.

It seems to only affect these FS loads. Other perf tests are not showing any
issues that I am aware of.



Thanks,
Phil



> [...]

--
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Dietmar Eggemann 1 year, 1 month ago
Hi Phil,

On 01/11/2024 13:47, Phil Auld wrote:
> 
> Hi Peterm
> 
> On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>> noting that lag is fundamentally a temporal measure. It should not be
>> carried around indefinitely.
>>
>> OTOH it should also not be instantly discarded, doing so will allow a
>> task to game the system by purposefully (micro) sleeping at the end of
>> its time quantum.
>>
>> Since lag is intimately tied to the virtual time base, a wall-time
>> based decay is also insufficient, notably competition is required for
>> any of this to make sense.
>>
>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>> competing until they are eligible.
>>
>> Strictly speaking, we only care about keeping them until the 0-lag
>> point, but that is a difficult proposition, instead carry them around
>> until they get picked again, and dequeue them at that point.
> 
> This one is causing a 10-20% performance hit on our filesystem tests.
> 
> On 6.12-rc5 (so with the latest follow ons) we get:
> 
> with DELAY_DEQUEUE the bandwidth is 510 MB/s
> with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
> 
> The test is fio, something like this:
> 
> taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs

I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
- sched: psi: pass enqueue/dequeue flags to psi callbacks directly
(2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)

Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.

vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)

# sudo lshw -class disk -class storage
  *-nvme                    
       description: NVMe device
       product: GIGABYTE GP-ASM2NE6500GTTD
       vendor: Phison Electronics Corporation
       physical id: 0
       bus info: pci@0000:01:00.0
       logical name: /dev/nvme0
       version: EGFM13.2
       ...
       capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
       configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
       resources: irq:16 memory:70800000-70803fff

# mount | grep ^/dev/nvme0
/dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)

Which disk device are you using?

> 
> In this case it's ext4, but I'm not sure it will be FS specific.
> 
> I should have the machine and setup next week to poke further but I wanted
> to mention it now just in case any one has an "aha" moment.
> 
> It seems to only effect these FS loads. Other perf tests are not showing any
> issues that I am aware of.

[...]

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
Hi Dietmar,

On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
> Hi Phil,
> 
> On 01/11/2024 13:47, Phil Auld wrote:
> > 
> > Hi Peterm
> > 
> > On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
> >> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> >> noting that lag is fundamentally a temporal measure. It should not be
> >> carried around indefinitely.
> >>
> >> OTOH it should also not be instantly discarded, doing so will allow a
> >> task to game the system by purposefully (micro) sleeping at the end of
> >> its time quantum.
> >>
> >> Since lag is intimately tied to the virtual time base, a wall-time
> >> based decay is also insufficient, notably competition is required for
> >> any of this to make sense.
> >>
> >> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> >> competing until they are eligible.
> >>
> >> Strictly speaking, we only care about keeping them until the 0-lag
> >> point, but that is a difficult proposition, instead carry them around
> >> until they get picked again, and dequeue them at that point.
> > 
> > This one is causing a 10-20% performance hit on our filesystem tests.
> > 
> > On 6.12-rc5 (so with the latest follow ons) we get:
> > 
> > with DELAY_DEQUEUE the bandwidth is 510 MB/s
> > with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
> > 
> > The test is fio, something like this:
> > 
> > taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> 
> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
> 
> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
> 
> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)
> 
> # sudo lshw -class disk -class storage
>   *-nvme                    
>        description: NVMe device
>        product: GIGABYTE GP-ASM2NE6500GTTD
>        vendor: Phison Electronics Corporation
>        physical id: 0
>        bus info: pci@0000:01:00.0
>        logical name: /dev/nvme0
>        version: EGFM13.2
>        ...
>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
>        resources: irq:16 memory:70800000-70803fff
> 
> # mount | grep ^/dev/nvme0
> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
> 
> Which disk device you're using?

Most of the reports are on various NVMe drives (Samsung mostly, I think).


One thing I should add is that it's all on LVM: 


vgcreate vg /dev/nvme0n1 -y
lvcreate -n thinMeta -L 3GB vg -y
lvcreate -n thinPool -l 99%FREE vg -y
lvconvert --thinpool /dev/mapper/vg-thinPool --poolmetadata /dev/mapper/vg-thinMeta -Zn -y
lvcreate -n testLV -V 1300G --thinpool thinPool vg
wipefs -a /dev/mapper/vg-testLV
mkfs.ext4 /dev/mapper/vg-testLV -E lazy_itable_init=0,lazy_journal_init=0 -F
mount /dev/mapper/vg-testLV /testfs 


With VDO or thinpool (as above) it shows on both ext4 and xfs. With the fs on the
drive directly it's a little more variable: on some setups it shows on xfs, on some it shows
on ext4 and not vice-versa; it seems to depend on the drive or hw raid. But when
it shows, it's 100% reproducible on that setup. 

It's always the randwrite numbers. The rest look fine.

Also, as yet I'm not personally doing this testing, just looking into it and
passing on the information I have. 


Thanks for taking a look. 

Cheers,
Phil

> 
> > 
> > In this case it's ext4, but I'm not sure it will be FS specific.
> > 
> > I should have the machine and setup next week to poke further but I wanted
> > to mention it now just in case any one has an "aha" moment.
> > 
> > It seems to only effect these FS loads. Other perf tests are not showing any
> > issues that I am aware of.
> 
> [...]
> 

-- 

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Dietmar Eggemann 1 year, 1 month ago
On 04/11/2024 13:50, Phil Auld wrote:
> 
> Hi Dietmar,
> 
> On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
>> Hi Phil,
>>
>> On 01/11/2024 13:47, Phil Auld wrote:
>>>
>>> Hi Peterm

[...]

>> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
>> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
>> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
>>
>> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
>>
>> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
>> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)
>>
>> # sudo lshw -class disk -class storage
>>   *-nvme                    
>>        description: NVMe device
>>        product: GIGABYTE GP-ASM2NE6500GTTD
>>        vendor: Phison Electronics Corporation
>>        physical id: 0
>>        bus info: pci@0000:01:00.0
>>        logical name: /dev/nvme0
>>        version: EGFM13.2
>>        ...
>>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
>>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
>>        resources: irq:16 memory:70800000-70803fff
>>
>> # mount | grep ^/dev/nvme0
>> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
>>
>> Which disk device you're using?
> 
> Most of the reports are on various NVME drives (samsung mostly I think).
> 
> 
> One thing I should add is that it's all on LVM: 
> 
> 
> vgcreate vg /dev/nvme0n1 -y
> lvcreate -n thinMeta -L 3GB vg -y
> lvcreate -n thinPool -l 99%FREE vg -y
> lvconvert --thinpool /dev/mapper/vg-thinPool --poolmetadata /dev/mapper/vg-thinMeta -Zn -y
> lvcreate -n testLV -V 1300G --thinpool thinPool vg
> wipefs -a /dev/mapper/vg-testLV
> mkfs.ext4 /dev/mapper/vg-testLV -E lazy_itable_init=0,lazy_journal_init=0 -F
> mount /dev/mapper/vg-testLV /testfs 
> 
> 
> With VDO or thinpool (as above) it shows on both ext4 and xfs. With fs on
> drive directly it's a little more variable. Some it shows on xfs, some it show
> on ext4 and not vice-versa, seems to depend on the drive or hw raid. But when
> it shows it's 100% reproducible on that setup. 
> 
> It's always the randwrite numbers. The rest look fine.
> 
> Also, as yet I'm not personally doing this testing, just looking into it and
> passing on the information I have. 

One reason I don't see the difference between DELAY_DEQUEUE and
NO_DELAY_DEQUEUE could be because of the affinity of the related
nvme interrupts: 

$ cat /proc/interrupts

     CPU0 CPU1    CPU2 CPU3 CPU4    CPU5 CPU6 CPU7    CPU8 ...
132:   0    0  1523653    0   0        0   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 1-edge nvme0q1
133:   0    0        0    0   0  1338451   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 2-edge nvme0q2
134:   0    0        0    0   0        0   0    0  2252297 ... IR-PCI-MSIX-0000:01:00.0 3-edge nvme0q3

$ cat /proc/irq/132/smp_affinity_list 
0-2
cat /proc/irq/133/smp_affinity_list 
3-5
cat /proc/irq/134/smp_affinity_list 
6-8

So the 8 fio tasks from: 

# fio --cpus_allowed 1,2,3,4,5,6,7,8 --rw randwrite --bs 4k
  --runtime 8s --iodepth 32 --direct 1 --ioengine libaio
  --numjobs 8 --size 30g --name default --time_based
  --group_reporting --cpus_allowed_policy shared
  --directory /testfs

don't have to fight with per-CPU kworkers on each CPU.

e.g. 'nvme0q3 interrupt -> queue on workqueue dio/nvme0n1p2 -> 
      run iomap_dio_complete_work() in kworker/8:x'

When I trace the 'task_on_rq_queued(p) && p->se.sched_delayed &&
rq->nr_running > 1' condition in ttwu_runnable(), I only see
the per-CPU kworker in there, so p->nr_cpus_allowed == 1.

So the patch shouldn't make a difference for this scenario?

But maybe your VDO or thinpool setup creates waker/wakee pairs with
wakee->nr_cpus_allowed > 1? 

Does your machine have single-CPU smp_affinity masks for these nvme
interrupts?

[...]

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Fri, Nov 08, 2024 at 03:53:26PM +0100 Dietmar Eggemann wrote:
> On 04/11/2024 13:50, Phil Auld wrote:
> > 
> > Hi Dietmar,
> > 
> > On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
> >> Hi Phil,
> >>
> >> On 01/11/2024 13:47, Phil Auld wrote:
> >>>
> >>> Hi Peterm
> 
> [...]
> 
> >> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
> >> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
> >> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
> >>
> >> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
> >>
> >> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
> >> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)
> >>
> >> # sudo lshw -class disk -class storage
> >>   *-nvme                    
> >>        description: NVMe device
> >>        product: GIGABYTE GP-ASM2NE6500GTTD
> >>        vendor: Phison Electronics Corporation
> >>        physical id: 0
> >>        bus info: pci@0000:01:00.0
> >>        logical name: /dev/nvme0
> >>        version: EGFM13.2
> >>        ...
> >>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
> >>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
> >>        resources: irq:16 memory:70800000-70803fff
> >>
> >> # mount | grep ^/dev/nvme0
> >> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
> >>
> >> Which disk device you're using?
> > 
> > Most of the reports are on various NVME drives (samsung mostly I think).
> > 
> > 
> > One thing I should add is that it's all on LVM: 
> > 
> > 
> > vgcreate vg /dev/nvme0n1 -y
> > lvcreate -n thinMeta -L 3GB vg -y
> > lvcreate -n thinPool -l 99%FREE vg -y
> > lvconvert --thinpool /dev/mapper/vg-thinPool --poolmetadata /dev/mapper/vg-thinMeta -Zn -y
> > lvcreate -n testLV -V 1300G --thinpool thinPool vg
> > wipefs -a /dev/mapper/vg-testLV
> > mkfs.ext4 /dev/mapper/vg-testLV -E lazy_itable_init=0,lazy_journal_init=0 -F
> > mount /dev/mapper/vg-testLV /testfs 
> > 
> > 
> > With VDO or thinpool (as above) it shows on both ext4 and xfs. With fs on
> > drive directly it's a little more variable. Some it shows on xfs, some it show
> > on ext4 and not vice-versa, seems to depend on the drive or hw raid. But when
> > it shows it's 100% reproducible on that setup. 
> > 
> > It's always the randwrite numbers. The rest look fine.
> > 
> > Also, as yet I'm not personally doing this testing, just looking into it and
> > passing on the information I have. 
> 
> One reason I don't see the difference between DELAY_DEQUEUE and
> NO_DELAY_DEQUEUE could be because of the affinity of the related
> nvme interrupts: 
> 
> $ cat /proc/interrupts
> 
>      CPU0 CPU1    CPU2 CPU3 CPU4    CPU5 CPU6 CPU7    CPU8 ...
> 132:   0    0  1523653    0   0        0   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 1-edge nvme0q1
> 133:   0    0        0    0   0  1338451   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 2-edge nvme0q2
> 134:   0    0        0    0   0        0   0    0  2252297 ... IR-PCI-MSIX-0000:01:00.0 3-edge nvme0q3
> 
> $ cat /proc/irq/132/smp_affinity_list 
> 0-2
> cat /proc/irq/133/smp_affinity_list 
> 3-5
> cat /proc/irq/134/smp_affinity_list 
> 6-8
> 
> So the 8 fio tasks from: 
> 
> # fio --cpus_allowed 1,2,3,4,5,6,7,8 --rw randwrite --bs 4k
>   --runtime 8s --iodepth 32 --direct 1 --ioengine libaio
>   --numjobs 8 --size 30g --name default --time_based
>   --group_reporting --cpus_allowed_policy shared
>   --directory /testfs
> 
> don't have to fight with per-CPU kworkers on each CPU.
> 
> e.g. 'nvme0q3 interrupt -> queue on workqueue dio/nvme0n1p2 -> 
>       run iomap_dio_complete_work() in kworker/8:x'
> 
> In case I trace the 'task_on_rq_queued(p) && p->se.sched_delayed &&
> rq->nr_running > 1) condition in ttwu_runnable() condition i only see
> the per-CPU kworker in there, so p->nr_cpus_allowed == 1.
> 
> So the patch shouldn't make a difference for this scenario?
>

If the kworker is waking up an fio task it could.  I don't think
they are bound to a single cpu.

But yes if your trace is only showing the kworker there then it would
not help.  Are you actually able to reproduce the difference?


> But maybe your VDO or thinpool setup creates waker/wakee pairs with
> wakee->nr_cpus_allowed > 1? 
>

That's certainly possible but I don't know for sure. There are well more
dio kworkers on the box than cpus though, if I recall. I don't know
if they all have single-cpu affinities. 


> Does your machine has single CPU smp_affinity masks for these nvme
> interrupts?
>

I don't know. I had to give the machine back. 



Cheers,
Phil


> [...]

-- 

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Dietmar Eggemann 1 year, 1 month ago
On 08/11/2024 19:16, Phil Auld wrote:
> On Fri, Nov 08, 2024 at 03:53:26PM +0100 Dietmar Eggemann wrote:
>> On 04/11/2024 13:50, Phil Auld wrote:
>>>
>>> Hi Dietmar,
>>>
>>> On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
>>>> Hi Phil,
>>>>
>>>> On 01/11/2024 13:47, Phil Auld wrote:

[...]

>> One reason I don't see the difference between DELAY_DEQUEUE and
>> NO_DELAY_DEQUEUE could be because of the affinity of the related
>> nvme interrupts: 
>>
>> $ cat /proc/interrupts
>>
>>      CPU0 CPU1    CPU2 CPU3 CPU4    CPU5 CPU6 CPU7    CPU8 ...
>> 132:   0    0  1523653    0   0        0   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 1-edge nvme0q1
>> 133:   0    0        0    0   0  1338451   0    0       0  ... IR-PCI-MSIX-0000:01:00.0 2-edge nvme0q2
>> 134:   0    0        0    0   0        0   0    0  2252297 ... IR-PCI-MSIX-0000:01:00.0 3-edge nvme0q3
>>
>> $ cat /proc/irq/132/smp_affinity_list 
>> 0-2
>> cat /proc/irq/133/smp_affinity_list 
>> 3-5
>> cat /proc/irq/134/smp_affinity_list 
>> 6-8
>>
>> So the 8 fio tasks from: 
>>
>> # fio --cpus_allowed 1,2,3,4,5,6,7,8 --rw randwrite --bs 4k
>>   --runtime 8s --iodepth 32 --direct 1 --ioengine libaio
>>   --numjobs 8 --size 30g --name default --time_based
>>   --group_reporting --cpus_allowed_policy shared
>>   --directory /testfs
>>
>> don't have to fight with per-CPU kworkers on each CPU.
>>
>> e.g. 'nvme0q3 interrupt -> queue on workqueue dio/nvme0n1p2 -> 
>>       run iomap_dio_complete_work() in kworker/8:x'
>>
>> In case I trace the 'task_on_rq_queued(p) && p->se.sched_delayed &&
>> rq->nr_running > 1) condition in ttwu_runnable() condition i only see
>> the per-CPU kworker in there, so p->nr_cpus_allowed == 1.
>>
>> So the patch shouldn't make a difference for this scenario?
>>
> 
> If the kworker is waking up an fio task it could.  I don't think
> they are bound to a single cpu.
> 
> But yes if your trace is only showing the kworker there then it would
> not help.  Are you actually able to reproduce the difference?

No, with my setup I don't see any difference running your fio test. But
the traces also show me that there are no scenarios in which this patch
can make a difference in the scores.

>> But maybe your VDO or thinpool setup creates waker/wakee pairs with
>> wakee->nr_cpus_allowed > 1? 
>>
> 
> That's certainly possible but I don't know for sure. There are well more
> dio kworkers on the box than cpus though if I recall. I don't know
> if they all have singel cpu affinities. 

Yeah there must be more tasks (inc. kworkers) w/ 'p->nr_cpus_allowed >
1' involved.

>> Does your machine has single CPU smp_affinity masks for these nvme
>> interrupts?
>>
> 
> I don't know. I had to give the machine back.

Ah, too late then ;-)
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Christian Loehle 1 year, 1 month ago
On 11/4/24 12:50, Phil Auld wrote:
> 
> Hi Dietmar,
> 
> On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
>> Hi Phil,
>>
>> On 01/11/2024 13:47, Phil Auld wrote:
>>>
>>> Hi Peterm
>>>
>>> On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
>>>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>>>> noting that lag is fundamentally a temporal measure. It should not be
>>>> carried around indefinitely.
>>>>
>>>> OTOH it should also not be instantly discarded, doing so will allow a
>>>> task to game the system by purposefully (micro) sleeping at the end of
>>>> its time quantum.
>>>>
>>>> Since lag is intimately tied to the virtual time base, a wall-time
>>>> based decay is also insufficient, notably competition is required for
>>>> any of this to make sense.
>>>>
>>>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>>>> competing until they are eligible.
>>>>
>>>> Strictly speaking, we only care about keeping them until the 0-lag
>>>> point, but that is a difficult proposition, instead carry them around
>>>> until they get picked again, and dequeue them at that point.
>>>
>>> This one is causing a 10-20% performance hit on our filesystem tests.
>>>
>>> On 6.12-rc5 (so with the latest follow ons) we get:
>>>
>>> with DELAY_DEQUEUE the bandwidth is 510 MB/s
>>> with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
>>>
>>> The test is fio, something like this:
>>>
>>> taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
>>
>> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
>> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
>> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
>>
>> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
>>
>> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
>> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)
>>
>> # sudo lshw -class disk -class storage
>>   *-nvme                    
>>        description: NVMe device
>>        product: GIGABYTE GP-ASM2NE6500GTTD
>>        vendor: Phison Electronics Corporation
>>        physical id: 0
>>        bus info: pci@0000:01:00.0
>>        logical name: /dev/nvme0
>>        version: EGFM13.2
>>        ...
>>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
>>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
>>        resources: irq:16 memory:70800000-70803fff 
>>
>> # mount | grep ^/dev/nvme0
>> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
>>
>> Which disk device you're using?
> 
> Most of the reports are on various NVME drives (samsung mostly I think).
> 
> 
> One thing I should add is that it's all on LVM: 
> 
> 
> vgcreate vg /dev/nvme0n1 -y
> lvcreate -n thinMeta -L 3GB vg -y
> lvcreate -n thinPool -l 99%FREE vg -y
> lvconvert --thinpool /dev/mapper/vg-thinPool --poolmetadata /dev/mapper/vg-thinMeta -Zn -y
> lvcreate -n testLV -V 1300G --thinpool thinPool vg
> wipefs -a /dev/mapper/vg-testLV
> mkfs.ext4 /dev/mapper/vg-testLV -E lazy_itable_init=0,lazy_journal_init=0 -F
> mount /dev/mapper/vg-testLV /testfs 
> 
> 
> With VDO or thinpool (as above) it shows on both ext4 and xfs. With fs on
> drive directly it's a little more variable. Some it shows on xfs, some it show
> on ext4 and not vice-versa, seems to depend on the drive or hw raid. But when
> it shows it's 100% reproducible on that setup. 
> 
> It's always the randwrite numbers. The rest look fine.

Hi Phil,

Thanks for the detailed instructions. Unfortunately, even with your LVM setup on
the platforms I've tried I don't see a regression so far; all the numbers are
about equal for DELAY_DEQUEUE and NO_DELAY_DEQUEUE.

Anyway I have some follow-ups, first let me trim the fio command for readability:
fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs

dropping defaults nr_files, loops, fsync, randrepeat
fio --rw randwrite --bs 4k --runtime 1m --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --name default --time_based --group_reporting --directory /testfs

Adding the CPU affinities directly:
fio --cpus_allowed 1-8 --rw randwrite --bs 4k --runtime 1m --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --name default --time_based --group_reporting --directory /testfs

Now I was wondering about the following:
Is it actually the kworker (not another fio) being preempted? (I'm pretty sure it is)
To test: --cpus_allowed_policy split (each fio process gets its own CPU).

You wrote:
>I was thinking maybe the preemption was preventing some batching of IO completions or
>initiations. But that was wrong it seems.

So while it doesn't reproduce for me, the only thing being preempted regularly is
the kworker (running iomap_dio_complete_work). I don't quite follow the "that was
wrong it seems" part then. Could you elaborate?

Could you also post the other benchmark numbers? Do any of them score higher in IOPS?
Is --rw write the same issue if you set --bs 4k (assuming you set a larger bs for seqwrite)?

Can you set the kworkers handling completions to SCHED_BATCH too? Just to confirm.

Regards,
Christian
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
Hi Christian,

On Tue, Nov 05, 2024 at 09:53:49AM +0000 Christian Loehle wrote:
> On 11/4/24 12:50, Phil Auld wrote:
> > 
> > Hi Dietmar,
> > 
> > On Mon, Nov 04, 2024 at 10:28:37AM +0100 Dietmar Eggemann wrote:
> >> Hi Phil,
> >>
> >> On 01/11/2024 13:47, Phil Auld wrote:
> >>>
> >>> Hi Peterm
> >>>
> >>> On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
> >>>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> >>>> noting that lag is fundamentally a temporal measure. It should not be
> >>>> carried around indefinitely.
> >>>>
> >>>> OTOH it should also not be instantly discarded, doing so will allow a
> >>>> task to game the system by purposefully (micro) sleeping at the end of
> >>>> its time quantum.
> >>>>
> >>>> Since lag is intimately tied to the virtual time base, a wall-time
> >>>> based decay is also insufficient, notably competition is required for
> >>>> any of this to make sense.
> >>>>
> >>>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> >>>> competing until they are eligible.
> >>>>
> >>>> Strictly speaking, we only care about keeping them until the 0-lag
> >>>> point, but that is a difficult proposition, instead carry them around
> >>>> until they get picked again, and dequeue them at that point.
> >>>
> >>> This one is causing a 10-20% performance hit on our filesystem tests.
> >>>
> >>> On 6.12-rc5 (so with the latest follow ons) we get:
> >>>
> >>> with DELAY_DEQUEUE the bandwidth is 510 MB/s
> >>> with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
> >>>
> >>> The test is fio, something like this:
> >>>
> >>> taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> >>
> >> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
> >> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
> >> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
> >>
> >> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
> >>
> >> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
> >> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)
> >>
> >> # sudo lshw -class disk -class storage
> >>   *-nvme                    
> >>        description: NVMe device
> >>        product: GIGABYTE GP-ASM2NE6500GTTD
> >>        vendor: Phison Electronics Corporation
> >>        physical id: 0
> >>        bus info: pci@0000:01:00.0
> >>        logical name: /dev/nvme0
> >>        version: EGFM13.2
> >>        ...
> >>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
> >>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
> >>        resources: irq:16 memory:70800000-70803fff 
> >>
> >> # mount | grep ^/dev/nvme0
> >> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
> >>
> >> Which disk device you're using?
> > 
> > Most of the reports are on various NVME drives (samsung mostly I think).
> > 
> > 
> > One thing I should add is that it's all on LVM: 
> > 
> > 
> > vgcreate vg /dev/nvme0n1 -y
> > lvcreate -n thinMeta -L 3GB vg -y
> > lvcreate -n thinPool -l 99%FREE vg -y
> > lvconvert --thinpool /dev/mapper/vg-thinPool --poolmetadata /dev/mapper/vg-thinMeta -Zn -y
> > lvcreate -n testLV -V 1300G --thinpool thinPool vg
> > wipefs -a /dev/mapper/vg-testLV
> > mkfs.ext4 /dev/mapper/vg-testLV -E lazy_itable_init=0,lazy_journal_init=0 -F
> > mount /dev/mapper/vg-testLV /testfs 
> > 
> > 
> > With VDO or thinpool (as above) it shows on both ext4 and xfs. With fs on
> > drive directly it's a little more variable. Some it shows on xfs, some it show
> > on ext4 and not vice-versa, seems to depend on the drive or hw raid. But when
> > it shows it's 100% reproducible on that setup. 
> > 
> > It's always the randwrite numbers. The rest look fine.
> 
> Hi Phil,
> 
> Thanks for the detailed instructions. Unfortunately even with your LVM setup on
> the platforms I've tried I don't see a regression so far, all the numbers are
> about equal for DELAY_DEQUEUE and NO_DELAY_DEQUEUE.
>

Yeah, that's odd.

Fwiw:

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          48 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   32
  On-line CPU(s) list:    0-31
Vendor ID:                AuthenticAMD
  BIOS Vendor ID:         Advanced Micro Devices, Inc.
  Model name:             AMD EPYC 7313P 16-Core Processor
    BIOS Model name:      AMD EPYC 7313P 16-Core Processor                Unknown CPU @ 3.0GHz
    BIOS CPU family:      107
    CPU family:           25
    ...

16 SMT2 cores (siblings are 16-31)


#lsblk -N
NAME    TYPE MODEL                      SERIAL              REV TRAN   RQ-SIZE  MQ
nvme3n1 disk SAMSUNG MZQL21T9HCJR-00A07 S64GNJ0T605178 GDC5602Q nvme      1023  32
nvme2n1 disk SAMSUNG MZQL21T9HCJR-00A07 S64GNJ0T605128 GDC5602Q nvme      1023  32
nvme0n1 disk SAMSUNG MZQL21T9HCJR-00A07 S64GNJ0T605125 GDC5602Q nvme      1023  32
nvme1n1 disk SAMSUNG MZQL21T9HCJR-00A07 S64GNJ0T605127 GDC5602Q nvme      1023  32

Where nvme0n1 is the one I'm actually using.

I'm on 6.12.0-0.rc5.44.eln143.x86_64  which is v6.12-rc5 with RHEL .config.  This
should have little to no franken-kernel bits but now that I have the machine
I'll build from upstream (with the RHEL .config still) to make sure.

We did see it on all the RCs so far. 


> Anyway I have some follow-ups, first let me trim the fio command for readability:
> fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> 
> dropping defaults nr_files, loops, fsync, randrepeat
> fio --rw randwrite --bs 4k --runtime 1m --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --name default --time_based --group_reporting --directory /testfs
> 
> Adding the CPU affinities directly:
> fio --cpus_allowed 1-8 --rw randwrite --bs 4k --runtime 1m --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --name default --time_based --group_reporting --directory /testfs
>

Fair enough.  It should work the same with taskset I suppose, except for the below bit. I've
been given this by our performance team. They have a framework that produces nice html
pages with red and green results and graphs and whatnot.  Right now it's in the form of
a script that pulls the KB/s number out of the json output, which is nice and keeps me
from going crosseyed looking at the full fio run output.

> Now I was wondering about the following:
> Is it actually the kworker (not another fio) being preempted? (I'm pretty sure it is)
> To test: --cpus_allowed_policy split (each fio process gets it's own CPU).

With --cpus_allowed and --cpus_allowed_policy split, the results with DELAY_DEQUEUE are
better (540 MB/s) but with NO_DELAY_DEQUEUE they are also better (640 MB/s). It was
510 MB/s and 590 MB/s before. 

> 
> You wrote:
> >I was thinking maybe the preemption was preventing some batching of IO completions or
> >initiations. But that was wrong it seems.
> 
> So while it doesn't reproduce for me, the only thing being preempted regularly is
> the kworker (running iomap_dio_complete_work). I don't quite follow the "that was
> wrong it seems" part then. Could you elaborate?
>

I was thinking that the fio batch test, along with disabling WAKEUP_PREEMPTION,
was telling me that it wasn't the over-preemption issue, but I could also be
wrong about that...


> Could you also post the other benchmark numbers? Does any of them score higher in IOPS?
> Is --rw write the same issue if you set --bs 4k (assuming you set a larger bs for seqwrite).
>

I don't have numbers for all of the other flavors but I ran --rw write --bs 4k:

DELAY_DEQUEUE     ~590MB/s
NO_DELAY_DEQUEUE  ~840MB/s

Those results are not good for DELAY_DEQUEUE either.

> Can you set the kworkers handling completions to SCHED_BATCH too? Just to confirm.

I think I did the wrong kworkers the first time. So I'll try again to figure out which
kworkers to twiddle (or I'll just do all 227 of them...).



Thanks,
Phil


> 
> Regards,
> Christian
> 

-- 

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Dietmar Eggemann 1 year, 1 month ago
+cc Christian Loehle <christian.loehle@arm.com>

On 04/11/2024 10:28, Dietmar Eggemann wrote:
> Hi Phil,
> 
> On 01/11/2024 13:47, Phil Auld wrote:
>>
>> Hi Peterm
>>
>> On Sat, Jul 27, 2024 at 12:27:49PM +0200 Peter Zijlstra wrote:
>>> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>>> noting that lag is fundamentally a temporal measure. It should not be
>>> carried around indefinitely.
>>>
>>> OTOH it should also not be instantly discarded, doing so will allow a
>>> task to game the system by purposefully (micro) sleeping at the end of
>>> its time quantum.
>>>
>>> Since lag is intimately tied to the virtual time base, a wall-time
>>> based decay is also insufficient, notably competition is required for
>>> any of this to make sense.
>>>
>>> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>>> competing until they are eligible.
>>>
>>> Strictly speaking, we only care about keeping them until the 0-lag
>>> point, but that is a difficult proposition, instead carry them around
>>> until they get picked again, and dequeue them at that point.
>>
>> This one is causing a 10-20% performance hit on our filesystem tests.
>>
>> On 6.12-rc5 (so with the latest follow ons) we get:
>>
>> with DELAY_DEQUEUE the bandwidth is 510 MB/s
>> with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
>>
>> The test is fio, something like this:
>>
>> taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> 
> I'm not seeing this on my i7-13700K running tip sched/core (1a6151017ee5
> - sched: psi: pass enqueue/dequeue flags to psi callbacks directly
> (2024-10-26 Johannes Weiner)) (6.12.0-rc4 - based)
> 
> Using 'taskset 0xaaaaa' avoiding SMT and running only on P-cores.
                 ^^^^^^^
> 
> vanilla features: 990MB/s (mean out of 5 runs, σ:  9.38)
> NO_DELAY_DEQUEUE: 992MB/s (mean out of 5 runs, σ: 10.61)

Christian Loehle just told me that my cpumask looks odd. Should be
0xaaaa instead.

Retested:

vanilla features: 954MB/s (mean out of 5 runs, σ: 30.83)
NO_DELAY_DEQUEUE: 932MB/s (mean out of 5 runs, σ: 28.10)

Now there are only 8 CPUs (instead of 10) for the 8 (+2) fio tasks. σ
went up probably because of more wakeup/preemption latency.

> 
> # sudo lshw -class disk -class storage
>   *-nvme                    
>        description: NVMe device
>        product: GIGABYTE GP-ASM2NE6500GTTD
>        vendor: Phison Electronics Corporation
>        physical id: 0
>        bus info: pci@0000:01:00.0
>        logical name: /dev/nvme0
>        version: EGFM13.2
>        ...
>        capabilities: nvme pciexpress msix msi pm nvm_express bus_master cap_list
>        configuration: driver=nvme latency=0 nqn=nqn.2014.08.org.nvmexpress:19871987SN215108954872 GIGABYTE GP-ASM2NE6500GTTD state=live
>        resources: irq:16 memory:70800000-70803fff
> 
> # mount | grep ^/dev/nvme0
> /dev/nvme0n1p2 on / type ext4 (rw,relatime,errors=remount-ro)
> 
> Which disk device you're using?
> 
>>
>> In this case it's ext4, but I'm not sure it will be FS specific.
>>
>> I should have the machine and setup next week to poke further but I wanted
>> to mention it now just in case any one has an "aha" moment.
>>
>> It seems to only effect these FS loads. Other perf tests are not showing any
>> issues that I am aware of.
> 
> [...]
> 
> 

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Fri, Nov 01, 2024 at 08:47:15AM -0400, Phil Auld wrote:

> This one is causing a 10-20% performance hit on our filesystem tests.
> 
> On 6.12-rc5 (so with the latest follow ons) we get:
> 
> with DELAY_DEQUEUE the bandwidth is 510 MB/s
> with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
> 
> The test is fio, something like this:
> 
> taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> 
> In this case it's ext4, but I'm not sure it will be FS specific.
> 
> I should have the machine and setup next week to poke further but I wanted
> to mention it now just in case any one has an "aha" moment.
> 
> It seems to only effect these FS loads. Other perf tests are not showing any
> issues that I am aware of.

There's a number of reports -- mostly it seems to be a case of excessive
preemption hurting things.

What happens if you use:

  schedtool -B -a 1-8 -e fio ....
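
For completeness, a minimal C sketch of what the -B part of that does for a
task (a generic illustration using the standard sched_setscheduler() call, not
something from the patch; schedtool's -a additionally pins CPU affinity, which
sched_setaffinity() would cover):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		/* SCHED_BATCH ignores the priority field; it must be 0. */
		struct sched_param sp = { .sched_priority = 0 };

		if (sched_setscheduler(0 /* self */, SCHED_BATCH, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		/* ... exec the benchmark from here, e.g. execvp("fio", ...) ... */
		return 0;
	}

Used as a wrapper around the benchmark, this behaves much like schedtool -B -e.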
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Fri, Nov 01, 2024 at 01:56:59PM +0100 Peter Zijlstra wrote:
> On Fri, Nov 01, 2024 at 08:47:15AM -0400, Phil Auld wrote:
> 
> > This one is causing a 10-20% performance hit on our filesystem tests.
> > 
> > On 6.12-rc5 (so with the latest follow ons) we get:
> > 
> > with DELAY_DEQUEUE the bandwidth is 510 MB/s
> > with NO_DELAY_DEQUEUE the bandwidth is 590 MB/s
> > 
> > The test is fio, something like this:
> > 
> > taskset -c 1,2,3,4,5,6,7,8 fio --rw randwrite --bs 4k --runtime 1m --fsync 0 --iodepth 32 --direct 1 --ioengine libaio --numjobs 8 --size 30g --nrfiles 1 --loops 1 --name default --randrepeat 1 --time_based --group_reporting --directory /testfs
> > 
> > In this case it's ext4, but I'm not sure it will be FS specific.
> > 
> > I should have the machine and setup next week to poke further but I wanted
> > to mention it now just in case any one has an "aha" moment.
> > 
> > It seems to only effect these FS loads. Other perf tests are not showing any
> > issues that I am aware of.
> 
> There's a number of reports -- mostly it seems to be a case of excessive
> preemption hurting things.
> 
> What happens if you use:
> 
>   schedtool -B -a 1-8 -e fio ....
> 
>

Thanks for taking a look. 

That makes the overall performance way worse:

DELAY_DEQUEUE - 146 MB/s
NO_DELAY_DEQUEUE - 156 MB/s

I guess that does make the difference between delay and nodelay
about half. 

How is delay dequeue causing more preemption? Or is that more
for eevdf in general?  We aren't seeing any issues there except
for the delay dequeue thing.



Cheers,
Phil

--
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Fri, Nov 01, 2024 at 09:38:22AM -0400, Phil Auld wrote:

> How is delay dequeue causing more preemption? 

The thing delay dequeue does is it keeps !eligible tasks on the runqueue
until they're picked again. Them getting picked means they're eligible.
If at that point they're still not runnable, they're dequeued.

By keeping them around like this, they can earn back their lag.

The result is that the moment they get woken up again, they're going to
be eligible and are considered for preemption.


The whole thinking behind this is that while 'lag' measures the
amount of service difference from the ideal (positive lag will have less
service, while negative lag will have had too much service), this is
only true for the (constantly) competing task.

The moment a task leaves, will it still have had too much service? And
after a few seconds of inactivity?

So keeping the deactivated tasks (artificially) in the competition
until they're at least at the equal service point lets them burn off
some of that debt.

It is not dissimilar to how CFS had sleeper bonus, except that was
walltime based, while this is competition based.


Notably, this makes a significant difference for interactive tasks that
only run periodically. If they're not eligible at the point of wakeup,
they'll incur undue latency.


Now, I imagine FIO to have tasks blocking on IO, and while they're
blocked, they'll be earning their eligibility, such that when they're
woken they're good to go, preempting whatever.

Whatever doesn't seem to enjoy this.


Given BATCH makes such a terrible mess of things, I'm thinking FIO as a
whole does like preemption -- so now it's a question of figuring out
what exactly it does and doesn't like. Which is never trivial :/
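
A rough model of the eligibility test referred to above (simplified for
illustration; the kernel's entity_eligible()/avg_vruntime() do this with
load-weighted fixed-point arithmetic, and the 'toy_' names here are made up):

	/*
	 * lag_i is proportional to (V - v_i), where V is the load-weighted
	 * average vruntime of the competing entities. "Eligible" means
	 * v_i <= V, i.e. lag >= 0.
	 */
	struct toy_entity {
		unsigned long long vruntime;	/* v_i */
	};

	static int toy_entity_eligible(unsigned long long avg_vruntime, /* V */
				       const struct toy_entity *se)
	{
		return se->vruntime <= avg_vruntime;
	}

With DELAY_DEQUEUE, a task that blocks while v_i > V stays queued until the
pick path finds v_i <= V, so by construction its next wakeup starts from
non-negative lag and it is immediately a preemption candidate -- which is the
effect being discussed here.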
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Fri, Nov 01, 2024 at 03:26:49PM +0100 Peter Zijlstra wrote:
> On Fri, Nov 01, 2024 at 09:38:22AM -0400, Phil Auld wrote:
> 
> > How is delay dequeue causing more preemption? 
> 
> The thing delay dequeue does is it keeps !eligible tasks on the runqueue
> until they're picked again. Them getting picked means they're eligible.
> If at that point they're still not runnable, they're dequeued.
> 
> By keeping them around like this, they can earn back their lag.
> 
> The result is that the moment they get woken up again, they're going to
> be eligible and are considered for preemption.
> 
> 
> The whole thinking behind this is that while 'lag' measures the
> mount of service difference from the ideal (positive lag will have less
> service, while negative lag will have had too much service), this is
> only true for the (constantly) competing task.
> 
> The moment a task leaves, will it still have had too much service? And
> after a few seconds of inactivity?
> 
> So by keeping the deactivated tasks (artificially) in the competition
> until they're at least at the equal service point, lets them burn off
> some of that debt.
> 
> It is not dissimilar to how CFS had sleeper bonus, except that was
> walltime based, while this is competition based.
> 
> 
> Notably, this makes a significant difference for interactive tasks that
> only run periodically. If they're not eligible at the point of wakeup,
> they'll incur undue latency.
> 
> 
> Now, I imagine FIO to have tasks blocking on IO, and while they're
> blocked, they'll be earning their eligibility, such that when they're
> woken they're good to go, preempting whatever.
> 
> Whatever doesn't seem to enjoy this.
> 
> 
> Given BATCH makes such a terrible mess of things, I'm thinking FIO as a
> whole does like preemption -- so now it's a question of figuring out
> what exactly it does and doesn't like. Which is never trivial :/
> 

Thanks for that detailed explanation.

I can confirm that FIO does like the preemption:

NO_WAKEUP_P and DELAY    - 427 MB/s
NO_WAKEUP_P and NO_DELAY - 498 MB/s
WAKEUP_P and DELAY       - 519 MB/s
WAKEUP_P and NO_DELAY    - 590 MB/s

So it's something in the delay itself
(extra tasks in the queue? not migrating the delayed task? ...)

I'll start looking at tracing next week.


Thanks,
Phil


--
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Fri, 2024-11-01 at 10:42 -0400, Phil Auld wrote:
> On Fri, Nov 01, 2024 at 03:26:49PM +0100 Peter Zijlstra wrote:
> > On Fri, Nov 01, 2024 at 09:38:22AM -0400, Phil Auld wrote:
> >
> > > How is delay dequeue causing more preemption?
> >
> > The thing delay dequeue does is it keeps !eligible tasks on the runqueue
> > until they're picked again. Them getting picked means they're eligible.
> > If at that point they're still not runnable, they're dequeued.
> >
> > By keeping them around like this, they can earn back their lag.
> >
> > The result is that the moment they get woken up again, they're going to
> > be eligible and are considered for preemption.
> >
> >
> > The whole thinking behind this is that while 'lag' measures the
> > mount of service difference from the ideal (positive lag will have less
> > service, while negative lag will have had too much service), this is
> > only true for the (constantly) competing task.
> >
> > The moment a task leaves, will it still have had too much service? And
> > after a few seconds of inactivity?
> >
> > So by keeping the deactivated tasks (artificially) in the competition
> > until they're at least at the equal service point, lets them burn off
> > some of that debt.
> >
> > It is not dissimilar to how CFS had sleeper bonus, except that was
> > walltime based, while this is competition based.
> >
> >
> > Notably, this makes a significant difference for interactive tasks that
> > only run periodically. If they're not eligible at the point of wakeup,
> > they'll incur undue latency.
> >
> >
> > Now, I imagine FIO to have tasks blocking on IO, and while they're
> > blocked, they'll be earning their eligibility, such that when they're
> > woken they're good to go, preempting whatever.
> >
> > Whatever doesn't seem to enjoy this.
> >
> >
> > Given BATCH makes such a terrible mess of things, I'm thinking FIO as a
> > whole does like preemption -- so now it's a question of figuring out
> > what exactly it does and doesn't like. Which is never trivial :/
> >
>
> Thanks for that detailed explanation.
>
> I can confirm that FIO does like the preemption
>
> NO_WAKEUP_P and DELAY    - 427 MB/s
> NO_WAKEUP_P and NO_DELAY - 498 MB/s
> WAKEUP_P and DELAY       - 519 MB/s
> WAKEUP_P and NO_DELAY    - 590 MB/s
>
> Something in the delay itself
> (extra tasks in the queue? not migrating the delayed task? ...)

I think it's all about short term fairness and asymmetric buddies.

tbench comparison eevdf vs cfs, 100% apple to apple.

1 tbench buddy pair scheduled cross core.

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
13770 root      20   0   21424   1920   1792 S 60.13 0.012   0:33.81 3 tbench
13771 root      20   0    4720    896    768 S 46.84 0.006   0:26.10 2 tbench_srv

Note 60/47 utilization, now pinned/stacked.

6.1.114-cfs
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
 4407 root      20   0   21424   1980   1772 R 50.00 0.012   0:29.20 3 tbench
 4408 root      20   0    4720    124      0 R 50.00 0.001   0:28.76 3 tbench_srv

Note what happens to the lighter tbench_srv. Consuming red hot L2 data,
it can utilize a full 50%, but it must first preempt wide bottom buddy.

Now eevdf.  (zero source deltas other than eevdf)
6.1.114-eevdf -delay_dequeue
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
 4988 root      20   0   21424   1948   1736 R 56.44 0.012   0:32.92 3 tbench
 4989 root      20   0    4720    128      0 R 44.55 0.001   0:25.49 3 tbench_srv
6.1.114-eevdf +delay_dequeue
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
 4934 root      20   0   21424   1952   1736 R 52.00 0.012   0:30.09 3 tbench
 4935 root      20   0    4720    124      0 R 49.00 0.001   0:28.15 3 tbench_srv

As Peter noted, delay_dequeue levels the sleeper playing field.  Both
of these guys are 1:1 sleepers, but they're asymmetric in width.

Bottom line, box full of 1:1 buddies pairing up and stacking in L2.

tbench 8
6.1.114-cfs      3674.37 MB/sec
6.1.114-eevdf    3505.25 MB/sec -delay_dequeue
                 3701.66 MB/sec +delay_dequeue

For tbench, preemption = shorter turnaround = higher throughput.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
Hi Mike,

On Fri, Nov 01, 2024 at 07:08:31PM +0100 Mike Galbraith wrote:
> On Fri, 2024-11-01 at 10:42 -0400, Phil Auld wrote:
> > On Fri, Nov 01, 2024 at 03:26:49PM +0100 Peter Zijlstra wrote:
> > > On Fri, Nov 01, 2024 at 09:38:22AM -0400, Phil Auld wrote:
> > >
> > > > How is delay dequeue causing more preemption?
> > >
> > > The thing delay dequeue does is it keeps !eligible tasks on the runqueue
> > > until they're picked again. Them getting picked means they're eligible.
> > > If at that point they're still not runnable, they're dequeued.
> > >
> > > By keeping them around like this, they can earn back their lag.
> > >
> > > The result is that the moment they get woken up again, they're going to
> > > be eligible and are considered for preemption.
> > >
> > >
> > > The whole thinking behind this is that while 'lag' measures the
> > > mount of service difference from the ideal (positive lag will have less
> > > service, while negative lag will have had too much service), this is
> > > only true for the (constantly) competing task.
> > >
> > > The moment a task leaves, will it still have had too much service? And
> > > after a few seconds of inactivity?
> > >
> > > So by keeping the deactivated tasks (artificially) in the competition
> > > until they're at least at the equal service point, lets them burn off
> > > some of that debt.
> > >
> > > It is not dissimilar to how CFS had sleeper bonus, except that was
> > > walltime based, while this is competition based.
> > >
> > >
> > > Notably, this makes a significant difference for interactive tasks that
> > > only run periodically. If they're not eligible at the point of wakeup,
> > > they'll incur undue latency.
> > >
> > >
> > > Now, I imagine FIO to have tasks blocking on IO, and while they're
> > > blocked, they'll be earning their eligibility, such that when they're
> > > woken they're good to go, preempting whatever.
> > >
> > > Whatever doesn't seem to enjoy this.
> > >
> > >
> > > Given BATCH makes such a terrible mess of things, I'm thinking FIO as a
> > > whole does like preemption -- so now it's a question of figuring out
> > > what exactly it does and doesn't like. Which is never trivial :/
> > >
> >
> > Thanks for that detailed explanation.
> >
> > I can confirm that FIO does like the preemption
> >
> > NO_WAKEUP_P and DELAY    - 427 MB/s
> > NO_WAKEUP_P and NO_DELAY - 498 MB/s
> > WAKEUP_P and DELAY       - 519 MB/s
> > WAKEUP_P and NO_DELAY    - 590 MB/s
> >
> > Something in the delay itself
> > (extra tasks in the queue? not migrating the delayed task? ...)
> 
> I think it's all about short term fairness and asymmetric buddies.

Thanks for jumping in.  My jargon decoder ring seems to be failing me
so I'm not completely sure what you are saying below :)

By "buddies" you mean tasks that wake each other up and sleep,
and one runs for longer than the other, right?

> 
> tbench comparison eevdf vs cfs, 100% apple to apple.
> 
> 1 tbench buddy pair scheduled cross core.
> 
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
> 13770 root      20   0   21424   1920   1792 S 60.13 0.012   0:33.81 3 tbench
> 13771 root      20   0    4720    896    768 S 46.84 0.006   0:26.10 2 tbench_srv
 
> Note 60/47 utilization, now pinned/stacked.
> 
> 6.1.114-cfs
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
>  4407 root      20   0   21424   1980   1772 R 50.00 0.012   0:29.20 3 tbench
>  4408 root      20   0    4720    124      0 R 50.00 0.001   0:28.76 3 tbench_srv

What is the difference between these first two?  The first is on
separate cores so they don't interfere with each other? And the second is
pinned to the same core?

>
> Note what happens to the lighter tbench_srv. Consuming red hot L2 data,
> it can utilize a full 50%, but it must first preempt wide bottom buddy.
>

We've got "light" and "wide" here, which is a bit of a mixed metaphor :)
So here CFS is letting the wakee preempt the waker and providing pretty
equal fairness. And hot L2 caching is masking the asymmetry.

> Now eevdf.  (zero source deltas other than eevdf)
> 6.1.114-eevdf -delay_dequeue
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
>  4988 root      20   0   21424   1948   1736 R 56.44 0.012   0:32.92 3 tbench
>  4989 root      20   0    4720    128      0 R 44.55 0.001   0:25.49 3 tbench_srv
> 6.1.114-eevdf +delay_dequeue
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
>  4934 root      20   0   21424   1952   1736 R 52.00 0.012   0:30.09 3 tbench
>  4935 root      20   0    4720    124      0 R 49.00 0.001   0:28.15 3 tbench_srv
> 
> As Peter noted, delay_dequeue levels the sleeper playing field.  Both
> of these guys are 1:1 sleepers, but they're asymmetric in width.

With wakeup preemption off it doesn't help in my case. I was thinking
maybe the preemption was preventing some batching of IO completions or
initiations. But that was wrong it seems. 

Does it also possibly make wakeup migration less likely and thus increase
stacking?  

> Bottom line, box full of 1:1 buddies pairing up and stacking in L2.
> 
> tbench 8
> 6.1.114-cfs      3674.37 MB/sec
> 6.1.114-eevdf    3505.25 MB/sec -delay_dequeue
>                  3701.66 MB/sec +delay_dequeue
> 
> For tbench, preemption = shorter turnaround = higher throughput.

So here you have a benchmark that gets a ~5% boost from delayed_dequeue.

But I've got one that gets a 20% penalty, so I'm not exactly sure what
to make of that. Clearly FIO does not have the same pattern as tbench.

It's not a special case, though; it's one that our perf team runs
regularly to look for regressions.

I'll be able to poke at it more next week so hopefully I can see what it's
doing. 


Cheers,
Phil


> 
> 	-Mike
> 

-- 
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Fri, 2024-11-01 at 16:07 -0400, Phil Auld wrote:


> Thanks for jumping in.  My jargon decoder ring seems to be failing me
> so I'm not completely sure what you are saying below :)
>
> "buddies" you mean tasks that waking each other up and sleeping.
> And one runs for longer than the other, right?

Yeah, buddies are related waker/wakee 1:1 1:N or M:N, excluding tasks
happening to be sitting on a CPU where, say a timer fires, an IRQ leads
to a wakeup of lord knows what, lock wakeups etc etc etc. I think Peter
coined the term buddy to mean that (less typing), and it stuck.

> > 1 tbench buddy pair scheduled cross core.
> >
> >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
> > 13770 root      20   0   21424   1920   1792 S 60.13 0.012   0:33.81 3 tbench
> > 13771 root      20   0    4720    896    768 S 46.84 0.006   0:26.10 2 tbench_srv
>  
> > Note 60/47 utilization, now pinned/stacked.
> >
> > 6.1.114-cfs
> >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
> >  4407 root      20   0   21424   1980   1772 R 50.00 0.012   0:29.20 3 tbench
> >  4408 root      20   0    4720    124      0 R 50.00 0.001   0:28.76 3 tbench_srv
>
> What is the difference between these first two?  The first is on
> separate cores so they don't interfere with each other? And the second is
> pinned to the same core?

Yeah, see 'P'. Given CPU headroom, a tbench pair can consume ~107%.
They're not fully synchronous.. wouldn't be relevant here/now if they
were :)

> > Note what happens to the lighter tbench_srv. Consuming red hot L2 data,
> > it can utilize a full 50%, but it must first preempt wide bottom buddy.
> >
>
> We've got "light" and "wide" here which is a bit mixed metaphorically
> :)

Wide, skinny, feather-weight or lard-ball, they all work for me.

> So here CFS is letting the wakee preempt the waker and providing pretty
> equal fairness. And hot l2 caching is masking the assymmetry.

No, it's way simpler: preemption slices through the only thing it can
slice through, the post wakeup concurrent bits.. that otherwise sits
directly in the communication stream as a lump of latency in a latency
bound operation.

>
> With wakeup preemption off it doesn't help in my case. I was thinking
> maybe the preemption was preventing some batching of IO completions
> or
> initiations. But that was wrong it seems.

Dunno.

> Does it also possibly make wakeup migration less likely and thus increase
> stacking?

The buddy being preempted certainly won't be wakeup migrated, because
it won't sleep. Two very sleepy tasks when bw constrained become one
100% hog and one 99.99% hog when CPU constrained.

> > Bottom line, box full of 1:1 buddies pairing up and stacking in L2.
> >
> > tbench 8
> > 6.1.114-cfs      3674.37 MB/sec
> > 6.1.114-eevdf    3505.25 MB/sec -delay_dequeue
> >                  3701.66 MB/sec +delay_dequeue
> >
> > For tbench, preemption = shorter turnaround = higher throughput.
>
> So here you have a benchmark that gets a ~5% boost from
> delayed_dequeue.
>
> But I've got one that get's a 20% penalty so I'm not exactly sure what
> to make of that. Clearly FIO does not have the same pattern as tbench.

There are basically two options in sched-land, shave fastpath cycles,
or some variant of Rob Peter to pay Paul ;-)

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Sat, Nov 02, 2024 at 05:32:14AM +0100 Mike Galbraith wrote:
> On Fri, 2024-11-01 at 16:07 -0400, Phil Auld wrote:
> 
> 
> > Thanks for jumping in.  My jargon decoder ring seems to be failing me
> > so I'm not completely sure what you are saying below :)
> >
> > "buddies" you mean tasks that waking each other up and sleeping.
> > And one runs for longer than the other, right?
> 
> Yeah, buddies are related waker/wakee 1:1 1:N or M:N, excluding tasks
> happening to be sitting on a CPU where, say a timer fires, an IRQ leads
> to a wakeup of lord knows what, lock wakeups etc etc etc. I think Peter
> coined the term buddy to mean that (less typing), and it stuck.
>

Thanks!

> > > 1 tbench buddy pair scheduled cross core.
> > >
> > >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
> > > 13770 root      20   0   21424   1920   1792 S 60.13 0.012   0:33.81 3 tbench
> > > 13771 root      20   0    4720    896    768 S 46.84 0.006   0:26.10 2 tbench_srv
> >  
> > > Note 60/47 utilization, now pinned/stacked.
> > >
> > > 6.1.114-cfs
> > >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ P COMMAND
> > >  4407 root      20   0   21424   1980   1772 R 50.00 0.012   0:29.20 3 tbench
> > >  4408 root      20   0    4720    124      0 R 50.00 0.001   0:28.76 3 tbench_srv
> >
> > What is the difference between these first two?  The first is on
> > separate cores so they don't interfere with each other? And the second is
> > pinned to the same core?
> 
> Yeah, see 'P'. Given CPU headroom, a tbench pair can consume ~107%.
> They're not fully synchronous.. wouldn't be relevant here/now if they
> were :)
> 
> > > Note what happens to the lighter tbench_srv. Consuming red hot L2 data,
> > > it can utilize a full 50%, but it must first preempt wide bottom buddy.
> > >
> >
> > We've got "light" and "wide" here which is a bit mixed metaphorically
> > :)
> 
> Wide, skinny, feather-weight or lard-ball, they all work for me.
> 
> > So here CFS is letting the wakee preempt the waker and providing pretty
> > equal fairness. And hot l2 caching is masking the assymmetry.
> 
> No, it's way simpler: preemption slices through the only thing it can
> slice through, the post wakeup concurrent bits.. that otherwise sits
> directly in the communication stream as a lump of latency in a latency
> bound operation.
> 
> >
> > With wakeup preemption off it doesn't help in my case. I was thinking
> > maybe the preemption was preventing some batching of IO completions
> > or
> > initiations. But that was wrong it seems.
> 
> Dunno.
> 
> > Does it also possibly make wakeup migration less likely and thus increase
> > stacking?
> 
> The buddy being preempted certainly won't be wakeup migrated, because
> it won't sleep. Two very sleepy tasks when bw constrained becomes one
> 100% hog and one 99.99% hog when CPU constrained.
>

Not the waker who gets preempted, but the wakee may be a bit more
sticky on his current CPU and thus stack more, since he's still
in that runqueue. But that's just a mental exercise trying to
find things that are directly related to delay dequeue. No observation
other than the overall perf hit.


> > > Bottom line, box full of 1:1 buddies pairing up and stacking in L2.
> > >
> > > tbench 8
> > > 6.1.114-cfs      3674.37 MB/sec
> > > 6.1.114-eevdf    3505.25 MB/sec -delay_dequeue
> > >                  3701.66 MB/sec +delay_dequeue
> > >
> > > For tbench, preemption = shorter turnaround = higher throughput.
> >
> > So here you have a benchmark that gets a ~5% boost from
> > delayed_dequeue.
> >
> > But I've got one that get's a 20% penalty so I'm not exactly sure what
> > to make of that. Clearly FIO does not have the same pattern as tbench.
> 
> There are basically two options in sched-land, shave fastpath cycles,
> or some variant of Rob Peter to pay Paul ;-)
>

That Peter is cranky :)

Cheers,
Phil


> 	-Mike
> 

-- 
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Mon, 2024-11-04 at 08:05 -0500, Phil Auld wrote:
> On Sat, Nov 02, 2024 at 05:32:14AM +0100 Mike Galbraith wrote:
>
> >
> > The buddy being preempted certainly won't be wakeup migrated...
>
> Not the waker who gets preempted but the wakee may be a bit more
> sticky on his current cpu and thus stack more since he's still
> in that runqueue.

Ah, indeed, if wakees don't get scraped off before being awakened, they
can and do miss chances at an idle CPU according to trace_printk().

I'm undecided if overall it's boon, bane or even matters, as there is
still an ample supply of wakeup migration, but seems it can indeed
inject wakeup latency needlessly, so <sharpens stick>...

My box booted and neither became exceptionally noisy nor inexplicably
silent in.. oh, minutes now, so surely yours will be perfectly fine.

After one minute of lightly loaded box browsing, trace_printk() said:

  645   - racy peek says there is a room available
   11   - cool, reserved room is free
  206   - no vacancy or wakee pinned
38807   - SIS accommodates room seeker

The below should improve the odds, but high return seems unlikely.

---
 kernel/sched/core.c |    9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3790,7 +3790,13 @@ static int ttwu_runnable(struct task_str
 	rq = __task_rq_lock(p, &rf);
 	if (task_on_rq_queued(p)) {
 		update_rq_clock(rq);
-		if (p->se.sched_delayed)
+		/*
+		 * If wakee is mobile and the room it reserved is occupied, let it try to migrate.
+		 */
+		if (p->se.sched_delayed && rq->nr_running > 1 && cpumask_weight(p->cpus_ptr) > 1) {
+			dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+			goto out_unlock;
+		} else if (p->se.sched_delayed)
 			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
 		if (!task_on_cpu(rq, p)) {
 			/*
@@ -3802,6 +3808,7 @@ static int ttwu_runnable(struct task_str
 		ttwu_do_wakeup(p);
 		ret = 1;
 	}
+out_unlock:
 	__task_rq_unlock(rq, &rf);

 	return ret;
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Tue, Nov 05, 2024 at 05:05:12AM +0100, Mike Galbraith wrote:

> After one minute of lightly loaded box browsing, trace_printk() said:
> 
>   645   - racy peek says there is a room available
>    11   - cool, reserved room is free
>   206   - no vacancy or wakee pinned
> 38807   - SIS accommodates room seeker
> 
> The below should improve the odds, but high return seems unlikely.
> 
> ---
>  kernel/sched/core.c |    9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3790,7 +3790,13 @@ static int ttwu_runnable(struct task_str
>  	rq = __task_rq_lock(p, &rf);
>  	if (task_on_rq_queued(p)) {
>  		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> +		/*
> +		 * If wakee is mobile and the room it reserved is occupied, let it try to migrate.
> +		 */
> +		if (p->se.sched_delayed && rq->nr_running > 1 && cpumask_weight(p->cpus_ptr) > 1) {
> +			dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +			goto out_unlock;
> +		} else if (p->se.sched_delayed)
>  			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
>  		if (!task_on_cpu(rq, p)) {
>  			/*
> @@ -3802,6 +3808,7 @@ static int ttwu_runnable(struct task_str
>  		ttwu_do_wakeup(p);
>  		ret = 1;
>  	}
> +out_unlock:
>  	__task_rq_unlock(rq, &rf);
> 
>  	return ret;

So... I was trying to make that prettier and ended up with something
like this:

---
 kernel/sched/core.c  | 46 ++++++++++++++++++++++++++++------------------
 kernel/sched/sched.h |  5 +++++
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54d82c21fc8e..b083c6385e88 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3774,28 +3774,38 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
  */
 static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
-	struct rq_flags rf;
-	struct rq *rq;
-	int ret = 0;
+	CLASS(__task_rq_lock, rq_guard)(p);
+	struct rq *rq = rq_guard.rq;
 
-	rq = __task_rq_lock(p, &rf);
-	if (task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		if (p->se.sched_delayed)
-			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
-		if (!task_on_cpu(rq, p)) {
-			/*
-			 * When on_rq && !on_cpu the task is preempted, see if
-			 * it should preempt the task that is current now.
-			 */
-			wakeup_preempt(rq, p, wake_flags);
+	if (!task_on_rq_queued(p))
+		return 0;
+
+	update_rq_clock(rq);
+	if (p->se.sched_delayed) {
+		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
+
+		/*
+		 * Since sched_delayed means we cannot be current anywhere,
+		 * dequeue it here and have it fall through to the
+		 * select_task_rq() case further along the ttwu() path.
+		 */
+		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
+			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
+			return 0;
 		}
-		ttwu_do_wakeup(p);
-		ret = 1;
+
+		enqueue_task(rq, p, queue_flags);
 	}
-	__task_rq_unlock(rq, &rf);
+	if (!task_on_cpu(rq, p)) {
+		/*
+		 * When on_rq && !on_cpu the task is preempted, see if
+		 * it should preempt the task that is current now.
+		 */
+		wakeup_preempt(rq, p, wake_flags);
+	}
+	ttwu_do_wakeup(p);
 
-	return ret;
+	return 1;
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 21b1780c6695..1714ac38500f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1787,6 +1787,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+		    __task_rq_unlock(_T->rq, &_T->rf),
+		    struct rq *rq; struct rq_flags rf)
+
 DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
 		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
 		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Wed, 2024-11-06 at 14:53 +0100, Peter Zijlstra wrote:
> 
> So... I was trying to make that prettier and ended up with something
> like this:

Passing ENQUEUE_DELAYED to dequeue_task() looks funky until you check
the value, but otherwise yeah, when applied that looks better to me.

> 
> ---
>  kernel/sched/core.c  | 46 ++++++++++++++++++++++++++++------------------
>  kernel/sched/sched.h |  5 +++++
>  2 files changed, 33 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 54d82c21fc8e..b083c6385e88 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3774,28 +3774,38 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> -       struct rq_flags rf;
> -       struct rq *rq;
> -       int ret = 0;
> +       CLASS(__task_rq_lock, rq_guard)(p);
> +       struct rq *rq = rq_guard.rq;
>  
> -       rq = __task_rq_lock(p, &rf);
> -       if (task_on_rq_queued(p)) {
> -               update_rq_clock(rq);
> -               if (p->se.sched_delayed)
> -                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -               if (!task_on_cpu(rq, p)) {
> -                       /*
> -                        * When on_rq && !on_cpu the task is preempted, see if
> -                        * it should preempt the task that is current now.
> -                        */
> -                       wakeup_preempt(rq, p, wake_flags);
> +       if (!task_on_rq_queued(p))
> +               return 0;
> +
> +       update_rq_clock(rq);
> +       if (p->se.sched_delayed) {
> +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +
> +               /*
> +                * Since sched_delayed means we cannot be current anywhere,
> +                * dequeue it here and have it fall through to the
> +                * select_task_rq() case further along the ttwu() path.
> +                */
> +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> +                       dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +                       return 0;
>                 }
> -               ttwu_do_wakeup(p);
> -               ret = 1;
> +
> +               enqueue_task(rq, p, queue_flags);
>         }
> -       __task_rq_unlock(rq, &rf);
> +       if (!task_on_cpu(rq, p)) {
> +               /*
> +                * When on_rq && !on_cpu the task is preempted, see if
> +                * it should preempt the task that is current now.
> +                */
> +               wakeup_preempt(rq, p, wake_flags);
> +       }
> +       ttwu_do_wakeup(p);
>  
> -       return ret;
> +       return 1;
>  }
>  
>  #ifdef CONFIG_SMP
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 21b1780c6695..1714ac38500f 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1787,6 +1787,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
>         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
>  }
>  
> +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> +                   _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> +                   __task_rq_unlock(_T->rq, &_T->rf),
> +                   struct rq *rq; struct rq_flags rf)
> +
>  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
>                     _T->rq = task_rq_lock(_T->lock, &_T->rf),
>                     task_rq_unlock(_T->rq, _T->lock, &_T->rf),

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Wed, Nov 06, 2024 at 03:14:58PM +0100, Mike Galbraith wrote:
> On Wed, 2024-11-06 at 14:53 +0100, Peter Zijlstra wrote:
> > 
> > So... I was trying to make that prettier and ended up with something
> > like this:
> 
> Passing ENQUEUE_DELAYED to dequeue_task() looks funky until you check
> the value, but otherwise yeah, when applied that looks better to me.

Yeah, it does look funneh, but we've been doing that for a long long
while.

Still, perhaps I should rename the shared ones to QUEUE_foo and only
have the specific ones be {EN,DE}QUEUE_foo.
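
(To illustrate the flag aliasing in question -- the flag names are the ones
used above, but the values here are purely illustrative; the real
definitions live in kernel/sched/sched.h:)

/*
 * Sketch only, illustrative values: the "shared" flags deliberately
 * occupy the same bits in both the enqueue and dequeue namespaces, so
 * e.g. ENQUEUE_NOCLOCK or ENQUEUE_DELAYED handed to dequeue_task()
 * means the corresponding DEQUEUE_* thing.
 */
#define DEQUEUE_SLEEP		0x0001	/* dequeue-only */
#define DEQUEUE_NOCLOCK		0x0008	/* shared bit */
#define DEQUEUE_DELAYED		0x0200	/* shared bit */

#define ENQUEUE_NOCLOCK		DEQUEUE_NOCLOCK
#define ENQUEUE_DELAYED		DEQUEUE_DELAYED

Which is why the hunk above can build queue_flags from ENQUEUE_* values
and hand it to dequeue_task() unchanged.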
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Wed, Nov 06, 2024 at 02:53:46PM +0100, Peter Zijlstra wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 54d82c21fc8e..b083c6385e88 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3774,28 +3774,38 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> +	CLASS(__task_rq_lock, rq_guard)(p);
> +	struct rq *rq = rq_guard.rq;
>  
> +	if (!task_on_rq_queued(p))
> +		return 0;
> +
> +	update_rq_clock(rq);
> +	if (p->se.sched_delayed) {
> +		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +
> +		/*
> +		 * Since sched_delayed means we cannot be current anywhere,
> +		 * dequeue it here and have it fall through to the
> +		 * select_task_rq() case further along the ttwu() path.
> +		 */
> +		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> +			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +			return 0;
>  		}
> +
> +		enqueue_task(rq, p, queue_flags);

And then I wondered... this means that !task_on_cpu() is true for
sched_delayed, and thus we can move this in the below branch.

But also, we can probably dequeue every such task, not only
sched_delayed ones.

>  	}
> +	if (!task_on_cpu(rq, p)) {
> +		/*
> +		 * When on_rq && !on_cpu the task is preempted, see if
> +		 * it should preempt the task that is current now.
> +		 */
> +		wakeup_preempt(rq, p, wake_flags);
> +	}
> +	ttwu_do_wakeup(p);
>  
> +	return 1;
>  }


Yielding something like this on top... which boots. But since I forgot
to make it a feature, I can't actually tell at this point.. *sigh*

Anyway, more toys to poke at I suppose.


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b083c6385e88..69b19ba77598 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3781,28 +3781,32 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 		return 0;
 
 	update_rq_clock(rq);
-	if (p->se.sched_delayed) {
-		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
+	if (!task_on_cpu(rq, p)) {
+		int queue_flags = DEQUEUE_NOCLOCK;
+
+		if (p->se.sched_delayed)
+			queue_flags |= DEQUEUE_DELAYED;
 
 		/*
-		 * Since sched_delayed means we cannot be current anywhere,
-		 * dequeue it here and have it fall through to the
-		 * select_task_rq() case further along the ttwu() path.
+		 * Since we're not current anywhere *AND* hold pi_lock, dequeue
+		 * it here and have it fall through to the select_task_rq()
+		 * case further along the ttwu() path.
 		 */
 		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
 			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
 			return 0;
 		}
 
-		enqueue_task(rq, p, queue_flags);
-	}
-	if (!task_on_cpu(rq, p)) {
+		if (p->se.sched_delayed)
+			enqueue_task(rq, p, queue_flags);
+
 		/*
 		 * When on_rq && !on_cpu the task is preempted, see if
 		 * it should preempt the task that is current now.
 		 */
 		wakeup_preempt(rq, p, wake_flags);
 	}
+	SCHED_WARN_ON(p->se.sched_delayed);
 	ttwu_do_wakeup(p);
 
 	return 1;
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Wed, 2024-11-06 at 15:14 +0100, Peter Zijlstra wrote:
> On Wed, Nov 06, 2024 at 02:53:46PM +0100, Peter Zijlstra wrote:
> 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 54d82c21fc8e..b083c6385e88 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3774,28 +3774,38 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
> >   */
> >  static int ttwu_runnable(struct task_struct *p, int wake_flags)
> >  {
> > +       CLASS(__task_rq_lock, rq_guard)(p);
> > +       struct rq *rq = rq_guard.rq;
> >  
> > +       if (!task_on_rq_queued(p))
> > +               return 0;
> > +
> > +       update_rq_clock(rq);
> > +       if (p->se.sched_delayed) {
> > +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> > +
> > +               /*
> > +                * Since sched_delayed means we cannot be current anywhere,
> > +                * dequeue it here and have it fall through to the
> > +                * select_task_rq() case further along the ttwu() path.
> > +                */
> > +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> > +                       dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> > +                       return 0;
> >                 }
> > +
> > +               enqueue_task(rq, p, queue_flags);
> 
> And then I wondered... this means that !task_on_cpu() is true for
> sched_delayed, and thus we can move this in the below branch.
> 
> But also, we can probably dequeue every such task, not only
> sched_delayed ones.
> 
> >         }
> > +       if (!task_on_cpu(rq, p)) {
> > +               /*
> > +                * When on_rq && !on_cpu the task is preempted, see if
> > +                * it should preempt the task that is current now.
> > +                */
> > +               wakeup_preempt(rq, p, wake_flags);
> > +       }
> > +       ttwu_do_wakeup(p);
> >  
> > +       return 1;
> >  }
> 
> 
> Yielding something like this on top... which boots. But since I forgot
> to make it a feature, I can't actually tell at this point.. *sigh*
> 
> Anyway, more toys to poke at I suppose.
> 
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b083c6385e88..69b19ba77598 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3781,28 +3781,32 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
>                 return 0;
>  
>         update_rq_clock(rq);
> -       if (p->se.sched_delayed) {
> -               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +       if (!task_on_cpu(rq, p)) {
> +               int queue_flags = DEQUEUE_NOCLOCK;
> +
> +               if (p->se.sched_delayed)
> +                       queue_flags |= DEQUEUE_DELAYED;
>  
>                 /*
> -                * Since sched_delayed means we cannot be current anywhere,
> -                * dequeue it here and have it fall through to the
> -                * select_task_rq() case further along the ttwu() path.
> +                * Since we're not current anywhere *AND* hold pi_lock, dequeue
> +                * it here and have it fall through to the select_task_rq()
> +                * case further along the ttwu() path.
>                  */
>                 if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
>                         dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
>                         return 0;
>                 }

Hm, if we try to bounce a preempted task and fail, the wakeup_preempt()
call won't happen.

Bouncing preempted tasks is a double-edged sword.. on the one hand, it's
a huge win if the bounce works for communicating tasks that would otherwise
be talking around the not-my-buddy man-in-the-middle who did the
preempting; but on the other, when PELT has its white hat on (it also has
a black one) and has buddies pairing up nicely in an approaching
saturation scenario, bounces disturb it and add chaos.  Dunno.

>  
> -               enqueue_task(rq, p, queue_flags);
> -       }
> -       if (!task_on_cpu(rq, p)) {
> +               if (p->se.sched_delayed)
> +                       enqueue_task(rq, p, queue_flags);
> +
>                 /*
>                  * When on_rq && !on_cpu the task is preempted, see if
>                  * it should preempt the task that is current now.
>                  */
>                 wakeup_preempt(rq, p, wake_flags);
>         }
> +       SCHED_WARN_ON(p->se.sched_delayed);
>         ttwu_do_wakeup(p);
>  
>         return 1;

Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Wed, 2024-11-06 at 16:22 +0100, Mike Galbraith wrote:
>
> Hm, if we try to bounce a preempted task and fail, the wakeup_preempt()
> call won't happen.

Zzzt, wrong, falling through still leads to the bottom of a wakeup with
its preempt check...

> Bouncing preempted tasks is double edged sword..

..but that bit is pretty intriguing.  From the service latency and
utilization perspective only at decision time (prime mission), it's an
obvious win to migrate to an idle CPU.

It's also a clear win for communication latency when buddies are NOT
popular but misused end to end latency measurement tools ala TCP_RR
with only microscopic concurrency. For the other netperf modes of
operation, there's no shortage of concurrency to salvage *and get out
of the communication stream*, and I think that applies to wide swaths
of the real world.  What makes it intriguing is the cross-over point
where "stacking is the stupidest idea ever" becomes "stacking may put
my and my buddy's wide butts directly in our own communication stream,
but that's less pain than what unrelated wide butts inflict on top of
higher LLC vs L2 latency".

For UDP_STREAM (async to the bone), there is no such point; it would
seemingly prefer its buddy call from orbit, but for its more reasonable
TCP brother and ilk, there is.

Sample numbers (talk), interference is 8 unbound 88% compute instances,
box is crusty ole 8 rq i7-4790.

UDP_STREAM-1    unbound    Avg:  47135  Sum:    47135
UDP_STREAM-1    stacked    Avg:  39602  Sum:    39602
UDP_STREAM-1    cross-smt  Avg:  61599  Sum:    61599
UDP_STREAM-1    cross-core Avg:  67680  Sum:    67680

(distance very good!)

TCP_STREAM-1    unbound    Avg:  26299  Sum:    26299
TCP_STREAM-1    stacked    Avg:  27893  Sum:    27893
TCP_STREAM-1    cross-smt  Avg:  16728  Sum:    16728
TCP_STREAM-1    cross-core Avg:  13877  Sum:    13877

(idiot, distance NOT good, bouncy castle very good!)

Service latency dominates.. not quite always, and bouncing tasks about
is simultaneously the only sane thing to do and pure evil... like
everything else in sched land, making it a hard game to win :)

I built that patch out of curiosity, and yeah, set_next_task_fair()
finding a cfs_rq->curr ends play time pretty quickly.  Too bad my
service latency is a bit dinged up, bouncing preempted wakees about
promises to be interesting.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
>
> I built that patch out of curiosity, and yeah, set_next_task_fair()
> finding a cfs_rq->curr ends play time pretty quickly.

The below improved uptime, and trace_printk() says it's doing the
intended, so I suppose I'll add a feature and see what falls out.

---
 kernel/sched/core.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3794,7 +3794,7 @@ static int ttwu_runnable(struct task_str
 		int queue_flags = DEQUEUE_NOCLOCK;

 		if (p->se.sched_delayed)
-			queue_flags |= DEQUEUE_DELAYED;
+			queue_flags |= (DEQUEUE_DELAYED | DEQUEUE_SLEEP);

 		/*
 		 * Since we're not current anywhere *AND* hold pi_lock, dequeue
@@ -3802,7 +3802,7 @@ static int ttwu_runnable(struct task_str
 		 * case further along the ttwu() path.
 		 */
 		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
-			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
+			dequeue_task(rq, p, queue_flags);
 			return 0;
 		}
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> >
> > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > finding a cfs_rq->curr ends play time pretty quickly.
>
> The below improved uptime, and trace_printk() says it's doing the
> intended, so I suppose I'll add a feature and see what falls out.

From netperf, I got.. number tabulation practice.  Three runs of each
test with and without produced nothing but variance/noise.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > >
> > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > finding a cfs_rq->curr ends play time pretty quickly.
> >
> > The below improved uptime, and trace_printk() says it's doing the
> > intended, so I suppose I'll add a feature and see what falls out.
> 
> From netperf, I got.. number tabulation practice.  Three runs of each
> test with and without produced nothing but variance/noise.

Make it go away then.

If you could write a Changelog for your inspired bit and stick my cleaned
up version under it, I'd be much obliged.
[PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year, 1 month ago
On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > >
> > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > finding a cfs_rq->curr ends play time pretty quickly.
> > >
> > > The below improved uptime, and trace_printk() says it's doing the
> > > intended, so I suppose I'll add a feature and see what falls out.
> >
> > From netperf, I got.. number tabulation practice.  Three runs of each
> > test with and without produced nothing but variance/noise.
>
> Make it go away then.
>
> If you could write a Changelog for you inspired bit and stick my cleaned
> up version under it, I'd be much obliged.

Salut, much obliged for eyeball relief.

---snip---

Phil Auld (Red Hat) reported an fio benchmark regression found to have
been caused by the addition of the DELAY_DEQUEUE feature, suggested it
may be related to wakees losing the ability to migrate, and confirmed that
restoring that ability did indeed restore previous performance.

(de-uglified-a-lot-by)

Reported-by: Phil Auld <pauld@redhat.com>
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Link: https://lore.kernel.org/lkml/20241101124715.GA689589@pauld.westford.csb/
Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched/core.c  |   48 +++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    5 +++++
 2 files changed, 34 insertions(+), 19 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3783,28 +3783,38 @@ ttwu_do_activate(struct rq *rq, struct t
  */
 static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
-	struct rq_flags rf;
-	struct rq *rq;
-	int ret = 0;
-
-	rq = __task_rq_lock(p, &rf);
-	if (task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		if (p->se.sched_delayed)
-			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
-		if (!task_on_cpu(rq, p)) {
-			/*
-			 * When on_rq && !on_cpu the task is preempted, see if
-			 * it should preempt the task that is current now.
-			 */
-			wakeup_preempt(rq, p, wake_flags);
+	CLASS(__task_rq_lock, rq_guard)(p);
+	struct rq *rq = rq_guard.rq;
+
+	if (!task_on_rq_queued(p))
+		return 0;
+
+	update_rq_clock(rq);
+	if (p->se.sched_delayed) {
+		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
+
+		/*
+		 * Since sched_delayed means we cannot be current anywhere,
+		 * dequeue it here and have it fall through to the
+		 * select_task_rq() case further along the ttwu() path.
+		 */
+		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
+			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
+			return 0;
 		}
-		ttwu_do_wakeup(p);
-		ret = 1;
+
+		enqueue_task(rq, p, queue_flags);
+	}
+	if (!task_on_cpu(rq, p)) {
+		/*
+		 * When on_rq && !on_cpu the task is preempted, see if
+		 * it should preempt the task that is current now.
+		 */
+		wakeup_preempt(rq, p, wake_flags);
 	}
-	__task_rq_unlock(rq, &rf);
+	ttwu_do_wakeup(p);

-	return ret;
+	return 1;
 }

 #ifdef CONFIG_SMP
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1779,6 +1779,11 @@ task_rq_unlock(struct rq *rq, struct tas
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }

+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+		    __task_rq_unlock(_T->rq, &_T->rf),
+		    struct rq *rq; struct rq_flags rf)
+
 DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
 		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
 		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year, 1 month ago
On Fri, 2024-11-08 at 01:24 +0100, Mike Galbraith wrote:
> On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> > On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > > > 
> > > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > > finding a cfs_rq->curr ends play time pretty quickly.
> > > > 
> > > > The below improved uptime, and trace_printk() says it's doing the
> > > > intended, so I suppose I'll add a feature and see what falls out.
> > > 
> > > From netperf, I got.. number tabulation practice.  Three runs of each
> > > test with and without produced nothing but variance/noise.
> > 
> > Make it go away then.
> > 
> > If you could write a Changelog for you inspired bit and stick my cleaned
> > up version under it, I'd be much obliged.
> 
> Salut, much obliged for eyeball relief.

Unfortunate changelog placeholder below aside, I think this patch may
need to be yanked, as it trades one not readily repeatable regression for
at least one that definitely is, and likely multiple others.

(adds knob)

tbench 8

NO_MIGRATE_DELAYED    3613.49 MB/sec
MIGRATE_DELAYED       3145.59 MB/sec
NO_DELAY_DEQUEUE      3355.42 MB/sec

First line is DELAY_DEQUEUE restoring pre-EEVDF tbench throughput as
I've mentioned it doing, but $subject promptly did away with that and
then some.

I thought I might be able to do away with the reservation-like side
effect of DELAY_DEQUEUE by borrowing h_nr_delayed from...

     sched/eevdf: More PELT vs DELAYED_DEQUEUE

...for a cgroups-free test config, but a Q/D poke at idle_cpu() helped not
at all.

> ---snip---
> 
> Phil Auld (Redhat) reported an fio benchmark regression having been found
> to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> may be related to wakees losing the ability to migrate, and confirmed that
> restoration of same indeed did restore previous performance.
> 
> (de-uglified-a-lot-by)
> 
> Reported-by: Phil Auld <pauld@redhat.com>
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Link: https://lore.kernel.org/lkml/20241101124715.GA689589@pauld.westford.csb/
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>  kernel/sched/core.c  |   48 +++++++++++++++++++++++++++++-------------------
>  kernel/sched/sched.h |    5 +++++
>  2 files changed, 34 insertions(+), 19 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3783,28 +3783,38 @@ ttwu_do_activate(struct rq *rq, struct t
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> -       struct rq_flags rf;
> -       struct rq *rq;
> -       int ret = 0;
> -
> -       rq = __task_rq_lock(p, &rf);
> -       if (task_on_rq_queued(p)) {
> -               update_rq_clock(rq);
> -               if (p->se.sched_delayed)
> -                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -               if (!task_on_cpu(rq, p)) {
> -                       /*
> -                        * When on_rq && !on_cpu the task is preempted, see if
> -                        * it should preempt the task that is current now.
> -                        */
> -                       wakeup_preempt(rq, p, wake_flags);
> +       CLASS(__task_rq_lock, rq_guard)(p);
> +       struct rq *rq = rq_guard.rq;
> +
> +       if (!task_on_rq_queued(p))
> +               return 0;
> +
> +       update_rq_clock(rq);
> +       if (p->se.sched_delayed) {
> +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +
> +               /*
> +                * Since sched_delayed means we cannot be current anywhere,
> +                * dequeue it here and have it fall through to the
> +                * select_task_rq() case further along the ttwu() path.
> +                */
> +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> +                       dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +                       return 0;
>                 }
> -               ttwu_do_wakeup(p);
> -               ret = 1;
> +
> +               enqueue_task(rq, p, queue_flags);
> +       }
> +       if (!task_on_cpu(rq, p)) {
> +               /*
> +                * When on_rq && !on_cpu the task is preempted, see if
> +                * it should preempt the task that is current now.
> +                */
> +               wakeup_preempt(rq, p, wake_flags);
>         }
> -       __task_rq_unlock(rq, &rf);
> +       ttwu_do_wakeup(p);
>  
> -       return ret;
> +       return 1;
>  }
>  
>  #ifdef CONFIG_SMP
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1779,6 +1779,11 @@ task_rq_unlock(struct rq *rq, struct tas
>         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
>  }
>  
> +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> +                   _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> +                   __task_rq_unlock(_T->rq, &_T->rf),
> +                   struct rq *rq; struct rq_flags rf)
> +
>  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
>                     _T->rq = task_rq_lock(_T->lock, &_T->rf),
>                     task_rq_unlock(_T->rq, _T->lock, &_T->rf),
> 
> 

Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year, 1 month ago
On Tue, Nov 12, 2024 at 08:05:04AM +0100 Mike Galbraith wrote:
> On Fri, 2024-11-08 at 01:24 +0100, Mike Galbraith wrote:
> > On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> > > On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > > > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > > > > 
> > > > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > > > finding a cfs_rq->curr ends play time pretty quickly.
> > > > > 
> > > > > The below improved uptime, and trace_printk() says it's doing the
> > > > > intended, so I suppose I'll add a feature and see what falls out.
> > > > 
> > > > From netperf, I got.. number tabulation practice.  Three runs of each
> > > > test with and without produced nothing but variance/noise.
> > > 
> > > Make it go away then.
> > > 
> > > If you could write a Changelog for you inspired bit and stick my cleaned
> > > up version under it, I'd be much obliged.
> > 
> > Salut, much obliged for eyeball relief.
> 
> Unfortunate change log place holder below aside, I think this patch may
> need to be yanked as trading one not readily repeatable regression for
> at least one that definitely is, and likely multiple others.
> 
> (adds knob)
>

Yes, I was just coming here to reply. I have the results from the first
version of the patch (I don't think the later one fundamentally changed
enough that it will matter, but those results are still pending).

Not entirely surprisingly, we've traded a ~10% rand write regression for
a 5-10% rand read regression. This makes sense to me since the reads are
more likely to be synchronous and thus more buddy-like, and benefit
from flipping back and forth on the same CPU.

I'd probably have to take the reads over the writes in such a trade off :)

> tbench 8
> 
> NO_MIGRATE_DELAYED    3613.49 MB/sec
> MIGRATE_DELAYED       3145.59 MB/sec
> NO_DELAY_DEQUEUE      3355.42 MB/sec
> 
> First line is DELAY_DEQUEUE restoring pre-EEVDF tbench throughput as
> I've mentioned it doing, but $subject promptly did away with that and
> then some.
>

Yep, that's not pretty. 

> I thought I might be able to do away with the reservation like side
> effect of DELAY_DEQUEUE by borrowing h_nr_delayed from...
> 
>      sched/eevdf: More PELT vs DELAYED_DEQUEUE
> 
> ...for cgroups free test config, but Q/D poke at idle_cpu() helped not
> at all.
>

I wonder if the last_wakee stuff could be leveraged here (an idle thought,
so to speak). Haven't looked closely enough. 


Cheers,
Phil

> > ---snip---
> > 
> > Phil Auld (Redhat) reported an fio benchmark regression having been found
> > to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> > may be related to wakees losing the ability to migrate, and confirmed that
> > restoration of same indeed did restore previous performance.
> > 
> > (de-uglified-a-lot-by)
> > 
> > Reported-by: Phil Auld <pauld@redhat.com>
> > Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> > Link: https://lore.kernel.org/lkml/20241101124715.GA689589@pauld.westford.csb/
> > Signed-off-by: Mike Galbraith <efault@gmx.de>
> > ---
> >  kernel/sched/core.c  |   48 +++++++++++++++++++++++++++++-------------------
> >  kernel/sched/sched.h |    5 +++++
> >  2 files changed, 34 insertions(+), 19 deletions(-)
> > 
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3783,28 +3783,38 @@ ttwu_do_activate(struct rq *rq, struct t
> >   */
> >  static int ttwu_runnable(struct task_struct *p, int wake_flags)
> >  {
> > -       struct rq_flags rf;
> > -       struct rq *rq;
> > -       int ret = 0;
> > -
> > -       rq = __task_rq_lock(p, &rf);
> > -       if (task_on_rq_queued(p)) {
> > -               update_rq_clock(rq);
> > -               if (p->se.sched_delayed)
> > -                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> > -               if (!task_on_cpu(rq, p)) {
> > -                       /*
> > -                        * When on_rq && !on_cpu the task is preempted, see if
> > -                        * it should preempt the task that is current now.
> > -                        */
> > -                       wakeup_preempt(rq, p, wake_flags);
> > +       CLASS(__task_rq_lock, rq_guard)(p);
> > +       struct rq *rq = rq_guard.rq;
> > +
> > +       if (!task_on_rq_queued(p))
> > +               return 0;
> > +
> > +       update_rq_clock(rq);
> > +       if (p->se.sched_delayed) {
> > +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> > +
> > +               /*
> > +                * Since sched_delayed means we cannot be current anywhere,
> > +                * dequeue it here and have it fall through to the
> > +                * select_task_rq() case further along the ttwu() path.
> > +                */
> > +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> > +                       dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> > +                       return 0;
> >                 }
> > -               ttwu_do_wakeup(p);
> > -               ret = 1;
> > +
> > +               enqueue_task(rq, p, queue_flags);
> > +       }
> > +       if (!task_on_cpu(rq, p)) {
> > +               /*
> > +                * When on_rq && !on_cpu the task is preempted, see if
> > +                * it should preempt the task that is current now.
> > +                */
> > +               wakeup_preempt(rq, p, wake_flags);
> >         }
> > -       __task_rq_unlock(rq, &rf);
> > +       ttwu_do_wakeup(p);
> >  
> > -       return ret;
> > +       return 1;
> >  }
> >  
> >  #ifdef CONFIG_SMP
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -1779,6 +1779,11 @@ task_rq_unlock(struct rq *rq, struct tas
> >         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
> >  }
> >  
> > +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> > +                   _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> > +                   __task_rq_unlock(_T->rq, &_T->rf),
> > +                   struct rq *rq; struct rq_flags rf)
> > +
> >  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
> >                     _T->rq = task_rq_lock(_T->lock, &_T->rf),
> >                     task_rq_unlock(_T->rq, _T->lock, &_T->rf),
> > 
> > 
> 

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Peter Zijlstra 1 year, 1 month ago
On Tue, Nov 12, 2024 at 07:41:17AM -0500, Phil Auld wrote:
> On Tue, Nov 12, 2024 at 08:05:04AM +0100 Mike Galbraith wrote:
> > On Fri, 2024-11-08 at 01:24 +0100, Mike Galbraith wrote:
> > > On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> > > > On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > > > > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > > > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > > > > > 
> > > > > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > > > > finding a cfs_rq->curr ends play time pretty quickly.
> > > > > > 
> > > > > > The below improved uptime, and trace_printk() says it's doing the
> > > > > > intended, so I suppose I'll add a feature and see what falls out.
> > > > > 
> > > > > From netperf, I got.. number tabulation practice.  Three runs of each
> > > > > test with and without produced nothing but variance/noise.
> > > > 
> > > > Make it go away then.
> > > > 
> > > > If you could write a Changelog for your inspired bit and stick my cleaned
> > > > up version under it, I'd be much obliged.
> > > 
> > > Salut, much obliged for eyeball relief.
> > 
> > Unfortunate change log place holder below aside, I think this patch may
> > need to be yanked as trading one not readily repeatable regression for
> > at least one that definitely is, and likely multiple others.
> > 
> > (adds knob)
> >
> 
> Yes, I was just coming here to reply. I have the results from the first
> version of the patch (I don't think the later one fundamentally changed
> enough that it will matter but those results are still pending).
> 
> Not entirely surprisingly we've traded a ~10% rand write regression for
> 5-10% rand read regression. This makes sense to me since the reads are
> more likely to be synchronous and thus be more buddy-like and benefit
> from flipping back and forth on the same cpu.  

OK, so I'm going to make this commit disappear.
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year, 1 month ago
On Tue, 2024-11-12 at 07:41 -0500, Phil Auld wrote:
> On Tue, Nov 12, 2024 at 08:05:04AM +0100 Mike Galbraith wrote:
> >
> > Unfortunate change log place holder below aside, I think this patch may
> > need to be yanked as trading one not readily repeatable regression for
> > at least one that definitely is, and likely multiple others.
> >
> > (adds knob)
> >
>
> Yes, I was just coming here to reply. I have the results from the first
> version of the patch (I don't think the later one fundamentally changed
> enough that it will matter but those results are still pending).
>
> Not entirely surprisingly we've traded a ~10% rand write regression for
> 5-10% rand read regression. This makes sense to me since the reads are
> more likely to be synchronous and thus be more buddy-like and benefit
> from flipping back and forth on the same cpu. 

Ok, that would seem to second "shoot it".

> I'd probably have to take the reads over the writes in such a trade off :)
>
> > tbench 8
> >
> > NO_MIGRATE_DELAYED    3613.49 MB/sec
> > MIGRATE_DELAYED       3145.59 MB/sec
> > NO_DELAY_DEQUEUE      3355.42 MB/sec
> >
> > First line is DELAY_DEQUEUE restoring pre-EEVDF tbench throughput as
> > I've mentioned it doing, but $subject promptly did away with that and
> > then some.
> >
>
> Yep, that's not pretty.

Yeah, not to mention annoying.

I get the "adds bounce cache pain" aspect, but not why pre-EEVDF
wouldn't be just as heavily affected, it having nothing blocking high
frequency migration (the eternal scheduler boogieman:).

Bottom line would appear to be that these survivors should be left
where they ended up, either due to LB or more likely bog standard
prev_cpu locality, for they are part and parcel of a progression.

> > I thought I might be able to do away with the reservation like side
> > effect of DELAY_DEQUEUE by borrowing h_nr_delayed from...
> >
> >      sched/eevdf: More PELT vs DELAYED_DEQUEUE
> >
> > ...for cgroups free test config, but Q/D poke at idle_cpu() helped not
> > at all.

We don't however have to let sched_delayed block SIS though.  Rendering
them transparent in idle_cpu() did NOT wreck the progression, so
maaaybe could help your regression.

> I wonder if the last_wakee stuff could be leveraged here (an idle thought,
> so to speak). Haven't looked closely enough.

If you mean heuristics, the less of those we have, the better off we
are.. they _always_ find a way to embed their teeth in your backside.

	-Mike
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year, 1 month ago
On Tue, Nov 12, 2024 at 03:23:38PM +0100 Mike Galbraith wrote:
> On Tue, 2024-11-12 at 07:41 -0500, Phil Auld wrote:
> > On Tue, Nov 12, 2024 at 08:05:04AM +0100 Mike Galbraith wrote:
> > >
> > > Unfortunate change log place holder below aside, I think this patch may
> > > need to be yanked as trading one not readily repeatable regression for
> > > at least one that definitely is, and likely multiple others.
> > >
> > > (adds knob)
> > >
> >
> > Yes, I was just coming here to reply. I have the results from the first
> > version of the patch (I don't think the later one fundamentally changed
> > enough that it will matter but those results are still pending).
> >
> > Not entirely surprisingly we've traded a ~10% rand write regression for
> > 5-10% rand read regression. This makes sense to me since the reads are
> > more likely to be synchronous and thus be more buddy-like and benefit
> > from flipping back and forth on the same cpu. 
> 
> Ok, that would seem to second "shoot it".
>

Yes, drop it please, I think. Thanks!


> > I'd probably have to take the reads over the writes in such a trade off :)
> >
> > > tbench 8
> > >
> > > NO_MIGRATE_DELAYED    3613.49 MB/sec
> > > MIGRATE_DELAYED       3145.59 MB/sec
> > > NO_DELAY_DEQUEUE      3355.42 MB/sec
> > >
> > > First line is DELAY_DEQUEUE restoring pre-EEVDF tbench throughput as
> > > I've mentioned it doing, but $subject promptly did away with that and
> > > then some.
> > >
> >
> > Yep, that's not pretty.
> 
> Yeah, not to mention annoying.
> 
> I get the "adds bounce cache pain" aspect, but not why pre-EEVDF
> wouldn't be just as heavily affected, it having nothing blocking high
> frequency migration (the eternal scheduler boogieman:).
> 
> Bottom line would appear to be that these survivors should be left
> where they ended up, either due to LB or more likely bog standard
> prev_cpu locality, for they are part and parcel of a progression.
> 
> > > I thought I might be able to do away with the reservation like side
> > > effect of DELAY_DEQUEUE by borrowing h_nr_delayed from...
> > >
> > >      sched/eevdf: More PELT vs DELAYED_DEQUEUE
> > >
> > > ...for cgroups free test config, but Q/D poke at idle_cpu() helped not
> > > at all.
> 
> We don't however have to let sched_delayed block SIS though.  Rendering
> them transparent in idle_cpu() did NOT wreck the progression, so
> maaaybe could help your regression.
>

You mean something like:

if (rq->nr_running > rq->h_nr_delayed)
       return 0;

in idle_cpu() instead of the straight rq->nr_running check? I don't
have the h_nr_delayed stuff yet but can look for it.  I'm not sure
that will help with the delayees being sticky.  But I can try
that if I'm understanding you right.
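
(For illustration, a rough and untested sketch of the idle_cpu() change
I have in mind -- assuming a per-rq h_nr_delayed count along the lines
of the "More PELT vs DELAYED_DEQUEUE" series, which I don't have here:)

int idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (rq->curr != rq->idle)
		return 0;

	/*
	 * sched_delayed tasks are not runnable; don't let them make
	 * this CPU look busy to SIS.
	 */
	if (rq->nr_running > rq->h_nr_delayed)
		return 0;

#ifdef CONFIG_SMP
	if (rq->ttwu_pending)
		return 0;
#endif

	return 1;
}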

I'll try to dig into it some more regardless.

> > I wonder if the last_wakee stuff could be leveraged here (an idle thought,
> > so to speak). Haven't looked closely enough.
> 
> If you mean heuristics, the less of those we have, the better off we
> are.. they _always_ find a way to embed their teeth in your backside.
>

Sure, I get that. But when you have a trade-off like this, being "smarter"
about when to do the dequeue might help. But yes, that could go wrong.

I'm not a fan of knobs either but we could do your patch with the feat
and default it off.


Cheers,
Phil

> 	-Mike
> 

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year, 1 month ago
On Tue, 2024-11-12 at 10:41 -0500, Phil Auld wrote:
> On Tue, Nov 12, 2024 at 03:23:38PM +0100 Mike Galbraith wrote:
>
> >
> > We don't however have to let sched_delayed block SIS though.  Rendering
> > them transparent in idle_cpu() did NOT wreck the progression, so
> > maaaybe could help your regression.
> >
>
> You mean something like:
>
> if (rq->nr_running > rq->h_nr_delayed)
>        return 0;
>
> in idle_cpu() instead of the straight rq->nr_running check?

Yeah, close enough.


	-Mike
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-12 at 17:15 +0100, Mike Galbraith wrote:
> On Tue, 2024-11-12 at 10:41 -0500, Phil Auld wrote:
> > On Tue, Nov 12, 2024 at 03:23:38PM +0100 Mike Galbraith wrote:
> >
> > >
> > > We don't however have to let sched_delayed block SIS though.  Rendering
> > > them transparent in idle_cpu() did NOT wreck the progression, so
> > > maaaybe could help your regression.
> > >
> >
> > You mean something like:
> >
> > if (rq->nr_running > rq->h_nr_delayed)
> >        return 0;
> >
> > in idle_cpu() instead of the straight rq->nr_running check?
>
> Yeah, close enough.

The below is all you need.

Watching blockage rate during part of a netperf scaling run without, a
bit over 2/sec was the highest it got, but with, that drops to the same
zero as turning off the feature, so... relevance highly unlikely but
not quite impossible?

---
 kernel/sched/fair.c |    4 ++++
 1 file changed, 4 insertions(+)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9454,11 +9454,15 @@ int can_migrate_task(struct task_struct

 	/*
 	 * We do not migrate tasks that are:
+	 * 0) not runnable (not useful here/now, but are annoying), or
 	 * 1) throttled_lb_pair, or
 	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
 	 * 3) running (obviously), or
 	 * 4) are cache-hot on their current CPU.
 	 */
+	if (p->se.sched_delayed)
+		return 0;
+
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Thu, Nov 14, 2024 at 12:07:03PM +0100 Mike Galbraith wrote:
> On Tue, 2024-11-12 at 17:15 +0100, Mike Galbraith wrote:
> > On Tue, 2024-11-12 at 10:41 -0500, Phil Auld wrote:
> > > On Tue, Nov 12, 2024 at 03:23:38PM +0100 Mike Galbraith wrote:
> > >
> > > >
> > > > We don't however have to let sched_delayed block SIS though.  Rendering
> > > > them transparent in idle_cpu() did NOT wreck the progression, so
> > > > maaaybe could help your regression.
> > > >
> > >
> > > You mean something like:
> > >
> > > if (rq->nr_running > rq->h_nr_delayed)
> > >        return 0;
> > >
> > > in idle_cpu() instead of the straight rq->nr_running check?
> >
> > Yeah, close enough.
> 
> The below is all you need.
> 
> Watching blockage rate during part of a netperf scaling run without, a
> bit over 2/sec was the highest it got, but with, that drops to the same
> zero as turning off the feature, so... relevance highly unlikely but
> not quite impossible?
>

I'll give this a try on my issue. This'll be simpler than the other way.

Thanks!



Cheers,
Phil


> ---
>  kernel/sched/fair.c |    4 ++++
>  1 file changed, 4 insertions(+)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9454,11 +9454,15 @@ int can_migrate_task(struct task_struct
> 
>  	/*
>  	 * We do not migrate tasks that are:
> +	 * 0) not runnable (not useful here/now, but are annoying), or
>  	 * 1) throttled_lb_pair, or
>  	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
>  	 * 3) running (obviously), or
>  	 * 4) are cache-hot on their current CPU.
>  	 */
> +	if (p->se.sched_delayed)
> +		return 0;
> +
>  	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
>  		return 0;
> 
> 

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Thu, Nov 14, 2024 at 06:28:54AM -0500 Phil Auld wrote:
> On Thu, Nov 14, 2024 at 12:07:03PM +0100 Mike Galbraith wrote:
> > On Tue, 2024-11-12 at 17:15 +0100, Mike Galbraith wrote:
> > > On Tue, 2024-11-12 at 10:41 -0500, Phil Auld wrote:
> > > > On Tue, Nov 12, 2024 at 03:23:38PM +0100 Mike Galbraith wrote:
> > > >
> > > > >
> > > > > We don't however have to let sched_delayed block SIS though.  Rendering
> > > > > them transparent in idle_cpu() did NOT wreck the progression, so
> > > > > maaaybe could help your regression.
> > > > >
> > > >
> > > > You mean something like:
> > > >
> > > > if (rq->nr_running > rq->h_nr_delayed)
> > > >        return 0;
> > > >
> > > > in idle_cpu() instead of the straight rq->nr_running check?
> > >
> > > Yeah, close enough.
> > 
> > The below is all you need.
> > 
> > Watching blockage rate during part of a netperf scaling run without, a
> > bit over 2/sec was the highest it got, but with, that drops to the same
> > zero as turning off the feature, so... relevance highly unlikely but
> > not quite impossible?
> >
> 
> I'll give this a try on my issue. This'll be simpler than the other way.
>

This, below, by itself, did not help and caused a small slowdown on some
other tests.  Did this need to be on top of the wakeup change? 


Cheers,
Phil

> Thanks!
> 
> 
> 
> Cheers,
> Phil
> 
> 
> > ---
> >  kernel/sched/fair.c |    4 ++++
> >  1 file changed, 4 insertions(+)
> > 
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9454,11 +9454,15 @@ int can_migrate_task(struct task_struct
> > 
> >  	/*
> >  	 * We do not migrate tasks that are:
> > +	 * 0) not runnable (not useful here/now, but are annoying), or
> >  	 * 1) throttled_lb_pair, or
> >  	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
> >  	 * 3) running (obviously), or
> >  	 * 4) are cache-hot on their current CPU.
> >  	 */
> > +	if (p->se.sched_delayed)
> > +		return 0;
> > +
> >  	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
> >  		return 0;
> > 
> > 
> 
> -- 
> 
> 

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
>
> This, below, by itself, did not help and caused a small slowdown on some
> other tests.  Did this need to be on top of the wakeup change?

No, that made a mess.  The numbers said it was quite a reach, no
surprise it fell flat.

	-Mike
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> >
> > This, below, by itself, did not help and caused a small slowdown on some
> > other tests.  Did this need to be on top of the wakeup change?
>
> No, that made a mess.

Rashly speculating that turning mobile kthread component loose is what
helped your write regression...

You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
only turn hard working kthreads loose to try to dodge service latency.
Seems unlikely wakeup frequency * instances would combine to shred fio
the way turning tbench loose did.
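
Roughly, a sketch of how that gate might look against the earlier V1
hunk (the PF_KTHREAD check being the only new bit, and wholly untested):

	if (p->se.sched_delayed) {
		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;

		/*
		 * Sketch: only kthread wakees get dequeued here to fall
		 * through to select_task_rq(); everything else keeps the
		 * requeue-in-place behavior.
		 */
		if ((p->flags & PF_KTHREAD) &&
		    rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
			return 0;
		}

		enqueue_task(rq, p, queue_flags);
	}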

	-Mike
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Wed, Nov 20, 2024 at 07:37:39PM +0100 Mike Galbraith wrote:
> On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> > On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> > >
> > > This, below, by itself, did not help and caused a small slowdown on some
> > > other tests.  Did this need to be on top of the wakeup change?
> >
> > No, that made a mess.
> 
> Rashly speculating that turning mobile kthread component loose is what
> helped your write regression...
> 
> You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
> only turn hard working kthreads loose to try to dodge service latency.
> Seems unlikely wakeup frequency * instances would combine to shred fio
> the way turning tbench loose did.
>

Thanks, I'll try that. 


Cheers,
Phil



> 	-Mike
> 

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Thu, Nov 21, 2024 at 06:56:28AM -0500 Phil Auld wrote:
> On Wed, Nov 20, 2024 at 07:37:39PM +0100 Mike Galbraith wrote:
> > On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> > > On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> > > >
> > > > This, below, by itself, did not help and caused a small slowdown on some
> > > > other tests.  Did this need to be on top of the wakeup change?
> > >
> > > No, that made a mess.
> > 
> > Rashly speculating that turning mobile kthread component loose is what
> > helped your write regression...
> > 
> > You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
> > only turn hard working kthreads loose to try to dodge service latency.
> > Seems unlikely wakeup frequency * instances would combine to shred fio
> > the way turning tbench loose did.
> >
> 
> Thanks, I'll try that. 
>

Also, fwiw, I think there is another report here

https://lore.kernel.org/lkml/392209D9-5AC6-4FDE-8D84-FB8A82AD9AEF@oracle.com/

which seems to be the same thing but mis-bisected. I've asked them to try
with NO_DELAY_DEQUEUE just to be sure.  But it looks like a duck.


Cheers,
Phil

-- 
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Thu, Nov 21, 2024 at 07:07:04AM -0500 Phil Auld wrote:
> On Thu, Nov 21, 2024 at 06:56:28AM -0500 Phil Auld wrote:
> > On Wed, Nov 20, 2024 at 07:37:39PM +0100 Mike Galbraith wrote:
> > > On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> > > > On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> > > > >
> > > > > This, below, by itself, did not help and caused a small slowdown on some
> > > > > other tests.  Did this need to be on top of the wakeup change?
> > > >
> > > > No, that made a mess.
> > > 
> > > Rashly speculating that turning mobile kthread component loose is what
> > > helped your write regression...
> > > 
> > > You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
> > > only turn hard working kthreads loose to try to dodge service latency.
> > > Seems unlikely wakeup frequency * instances would combine to shred fio
> > > the way turning tbench loose did.
> > >
> > 
> > Thanks, I'll try that. 
> >
> 
> Also, fwiw, I think there is another report here
> 
> https://lore.kernel.org/lkml/392209D9-5AC6-4FDE-8D84-FB8A82AD9AEF@oracle.com/
> 
> which seems to be the same thing but mis-bisected. I've asked them to try
> with NO_DELAY_DEQUEUE just to be sure.  But it looks like a duck.
>

But it does not quack like one.  Their issue did not go away with
NO_DELAY_DEQUEUE so something different is causing that one.


> 
> Cheers,
> Phil
> 
> -- 
> 
> 

-- 
[PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Thu, 2024-11-21 at 06:56 -0500, Phil Auld wrote:
> On Wed, Nov 20, 2024 at 07:37:39PM +0100 Mike Galbraith wrote:
> > On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> > > On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> > > >
> > > > This, below, by itself, did not help and caused a small slowdown on some
> > > > other tests.  Did this need to be on top of the wakeup change?
> > >
> > > No, that made a mess.
> >
> > Rashly speculating that turning mobile kthread component loose is what
> > helped your write regression...
> >
> > You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
> > only turn hard working kthreads loose to try to dodge service latency.
> > Seems unlikely wakeup frequency * instances would combine to shred fio
> > the way turning tbench loose did.
> >
>
> Thanks, I'll try that.

You may still want to try that, but my box says probably not.  Playing
with your write command line, the players I see are pinned kworkers and
mobile fio instances.

Maybe try the below instead. The changelog is obsolete BS unless you
say otherwise, but while twiddled V2 will still migrate tbench a bit,
and per trace_printk() does still let all kinds of stuff wander off to
roll the SIS dice, it does not even scratch the paint of the formerly
obliterated tbench progression.

Question: did wiping off the evil leave any meaningful goodness behind?

---

sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU

Phil Auld (Redhat) reported an fio benchmark regression having been found
to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
may be related to wakees losing the ability to migrate, and confirmed that
restoration of same indeed did restore previous performance.

V2: do not rip buddies apart, convenient on/off switch

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched/core.c     |   51 ++++++++++++++++++++++++++++++------------------
 kernel/sched/features.h |    5 ++++
 kernel/sched/sched.h    |    5 ++++
 3 files changed, 42 insertions(+), 19 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3783,28 +3783,41 @@ ttwu_do_activate(struct rq *rq, struct t
  */
 static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
-	struct rq_flags rf;
-	struct rq *rq;
-	int ret = 0;
-
-	rq = __task_rq_lock(p, &rf);
-	if (task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		if (p->se.sched_delayed)
-			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
-		if (!task_on_cpu(rq, p)) {
-			/*
-			 * When on_rq && !on_cpu the task is preempted, see if
-			 * it should preempt the task that is current now.
-			 */
-			wakeup_preempt(rq, p, wake_flags);
+	CLASS(__task_rq_lock, rq_guard)(p);
+	struct rq *rq = rq_guard.rq;
+
+	if (!task_on_rq_queued(p))
+		return 0;
+
+	update_rq_clock(rq);
+	if (p->se.sched_delayed) {
+		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
+		int dequeue = sched_feat(DEQUEUE_DELAYED);
+
+		/*
+		 * Since sched_delayed means we cannot be current anywhere,
+		 * dequeue it here and have it fall through to the
+		 * select_task_rq() case further along in ttwu() path.
+		 * Note: Do not rip buddies apart else chaos follows.
+		 */
+		if (dequeue && rq->nr_running > 1 && p->nr_cpus_allowed > 1 &&
+		    !(rq->curr->last_wakee == p || p->last_wakee == rq->curr)) {
+			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
+			return 0;
 		}
-		ttwu_do_wakeup(p);
-		ret = 1;
+
+		enqueue_task(rq, p, queue_flags);
+	}
+	if (!task_on_cpu(rq, p)) {
+		/*
+		 * When on_rq && !on_cpu the task is preempted, see if
+		 * it should preempt the task that is current now.
+		 */
+		wakeup_preempt(rq, p, wake_flags);
 	}
-	__task_rq_unlock(rq, &rf);
+	ttwu_do_wakeup(p);

-	return ret;
+	return 1;
 }

 #ifdef CONFIG_SMP
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -47,6 +47,11 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
  * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
  */
 SCHED_FEAT(DELAY_DEQUEUE, true)
+/*
+ * Free ONLY non-buddy delayed tasks to wakeup-migrate to avoid taking
+ * an unnecessary latency hit.  Rending buddies asunder inflicts harm.
+ */
+SCHED_FEAT(DEQUEUE_DELAYED, true)
 SCHED_FEAT(DELAY_ZERO, true)

 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1783,6 +1783,11 @@ task_rq_unlock(struct rq *rq, struct tas
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }

+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+		    __task_rq_unlock(_T->rq, &_T->rf),
+		    struct rq *rq; struct rq_flags rf)
+
 DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
 		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
 		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Sat, Nov 23, 2024 at 09:44:40AM +0100 Mike Galbraith wrote:
> On Thu, 2024-11-21 at 06:56 -0500, Phil Auld wrote:
> > On Wed, Nov 20, 2024 at 07:37:39PM +0100 Mike Galbraith wrote:
> > > On Tue, 2024-11-19 at 12:51 +0100, Mike Galbraith wrote:
> > > > On Tue, 2024-11-19 at 06:30 -0500, Phil Auld wrote:
> > > > >
> > > > > This, below, by itself, did not help and caused a small slowdown on some
> > > > > other tests.  Did this need to be on top of the wakeup change?
> > > >
> > > > No, that made a mess.
> > >
> > > Rashly speculating that turning mobile kthread component loose is what
> > > helped your write regression...
> > >
> > > You could try adding (p->flags & PF_KTHREAD) to the wakeup patch to
> > > only turn hard working kthreads loose to try to dodge service latency.
> > > Seems unlikely wakeup frequency * instances would combine to shred fio
> > > the way turning tbench loose did.
> > >
> >
> > Thanks, I'll try that.
> 
> You may still want to try that, but my box says probably not.  Playing
> with your write command line, the players I see are pinned kworkers and
> mobile fio instances.
>

Yep. The PF_KTHREAD thing did not help. 


> Maybe try the below instead. The changelog is obsolete BS unless you
> say otherwise, but while twiddled V2 will still migrate tbench a bit,
> and per trace_printk() does still let all kinds of stuff wander off to
> roll the SIS dice, it does not even scratch the paint of the formerly
> obliterated tbench progression.
>

Will give this one a try when I get caught up from being off all week for
US turkey day. 


Thanks!

> Question: did wiping off the evil leave any meaningful goodness behind?


Is that for this patch?  

If you mean for the original patch (which subsequently broke the reads) then
no. It was more or less even for all the other tests. It fixed the randwrite
issue by moving it to randread. Everything else we run regularly was about
the same. So no extra goodness to help decide :)


Cheers,
Phil

> 
> ---
> 
> sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
> 
> Phil Auld (Redhat) reported an fio benchmark regression having been found
> to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> may be related to wakees losing the ability to migrate, and confirmed that
> restoration of same indeed did restore previous performance.
> 
> V2: do not rip buddies apart, convenient on/off switch
> 
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>  kernel/sched/core.c     |   51 ++++++++++++++++++++++++++++++------------------
>  kernel/sched/features.h |    5 ++++
>  kernel/sched/sched.h    |    5 ++++
>  3 files changed, 42 insertions(+), 19 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3783,28 +3783,41 @@ ttwu_do_activate(struct rq *rq, struct t
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> -	struct rq_flags rf;
> -	struct rq *rq;
> -	int ret = 0;
> -
> -	rq = __task_rq_lock(p, &rf);
> -	if (task_on_rq_queued(p)) {
> -		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> -			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -		if (!task_on_cpu(rq, p)) {
> -			/*
> -			 * When on_rq && !on_cpu the task is preempted, see if
> -			 * it should preempt the task that is current now.
> -			 */
> -			wakeup_preempt(rq, p, wake_flags);
> +	CLASS(__task_rq_lock, rq_guard)(p);
> +	struct rq *rq = rq_guard.rq;
> +
> +	if (!task_on_rq_queued(p))
> +		return 0;
> +
> +	update_rq_clock(rq);
> +	if (p->se.sched_delayed) {
> +		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +		int dequeue = sched_feat(DEQUEUE_DELAYED);
> +
> +		/*
> +		 * Since sched_delayed means we cannot be current anywhere,
> +		 * dequeue it here and have it fall through to the
> +		 * select_task_rq() case further along in ttwu() path.
> +		 * Note: Do not rip buddies apart else chaos follows.
> +		 */
> +		if (dequeue && rq->nr_running > 1 && p->nr_cpus_allowed > 1 &&
> +		    !(rq->curr->last_wakee == p || p->last_wakee == rq->curr)) {
> +			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +			return 0;
>  		}
> -		ttwu_do_wakeup(p);
> -		ret = 1;
> +
> +		enqueue_task(rq, p, queue_flags);
> +	}
> +	if (!task_on_cpu(rq, p)) {
> +		/*
> +		 * When on_rq && !on_cpu the task is preempted, see if
> +		 * it should preempt the task that is current now.
> +		 */
> +		wakeup_preempt(rq, p, wake_flags);
>  	}
> -	__task_rq_unlock(rq, &rf);
> +	ttwu_do_wakeup(p);
> 
> -	return ret;
> +	return 1;
>  }
> 
>  #ifdef CONFIG_SMP
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -47,6 +47,11 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
>   * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
>   */
>  SCHED_FEAT(DELAY_DEQUEUE, true)
> +/*
> + * Free ONLY non-buddy delayed tasks to wakeup-migrate to avoid taking
> + * an unnecessary latency hit.  Rending buddies asunder inflicts harm.
> + */
> +SCHED_FEAT(DEQUEUE_DELAYED, true)
>  SCHED_FEAT(DELAY_ZERO, true)
> 
>  /*
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1783,6 +1783,11 @@ task_rq_unlock(struct rq *rq, struct tas
>  	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
>  }
> 
> +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> +		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> +		    __task_rq_unlock(_T->rq, &_T->rf),
> +		    struct rq *rq; struct rq_flags rf)
> +
>  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
>  		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
>  		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
> 

-- 
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Mon, 2024-12-02 at 11:24 -0500, Phil Auld wrote:
> On Sat, Nov 23, 2024 at 09:44:40AM +0100 Mike Galbraith wrote:
>
>
> > Question: did wiping off the evil leave any meaningful goodness behind?
>
> Is that for this patch?

Yeah.  Trying it on my box with your write command line didn't improve
the confidence level either.  My box has one CPU handling IRQs and
waking pinned workers to service 8 fio instances.  Patch was useless
for that.

	-Mike
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Mon, Dec 02, 2024 at 05:55:28PM +0100 Mike Galbraith wrote:
> On Mon, 2024-12-02 at 11:24 -0500, Phil Auld wrote:
> > On Sat, Nov 23, 2024 at 09:44:40AM +0100 Mike Galbraith wrote:
> >
> >
> > > Question: did wiping off the evil leave any meaningful goodness behind?
> >
> > Is that for this patch?
> 
> Yeah.  Trying it on my box with your write command line didn't improve
> the confidence level either.  My box has one CPU handling IRQs and
> waking pinned workers to service 8 fio instances.  Patch was useless
> for that.
>

I'll give it a try. Our "box" is multiple different boxes but the results
vary somewhat.  The one I sent info about earlier in this thread is just
one of the more egregious and is the one the perf team lent me for a while.


Cheers,
Phil


> 	-Mike
> 
> 

--
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
Hi Mike et al.,

On Mon, Dec 02, 2024 at 02:12:52PM -0500 Phil Auld wrote:
> On Mon, Dec 02, 2024 at 05:55:28PM +0100 Mike Galbraith wrote:
> > On Mon, 2024-12-02 at 11:24 -0500, Phil Auld wrote:
> > > On Sat, Nov 23, 2024 at 09:44:40AM +0100 Mike Galbraith wrote:
> > >
> > >
> > > > Question: did wiping off the evil leave any meaningful goodness behind?
> > >
> > > Is that for this patch?
> > 
> > Yeah.  Trying it on my box with your write command line didn't improve
> > the confidence level either.  My box has one CPU handling IRQs and
> > waking pinned workers to service 8 fio instances.  Patch was useless
> > for that.
> >
> 
> I'll give it a try. Our "box" is multiple different boxes but the results
> vary somewhat.  The one I sent info about earlier in this thread is just
> one of the more egregious and is the one the perf team lent me for a while.
>

In our testing this has the same effect as the original dequeue-when-delayed
fix.  It solves the randwrite issue and introduces the ~10-15% randread
regression. 

Seems to be a real trade-off here. The same guys who benefit from spreading
in one case benefit from staying put in the other... 


Cheers,
Phil

--
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Mon, 2024-12-09 at 08:11 -0500, Phil Auld wrote:
>
> Hi Mike et al.,
>
> On Mon, Dec 02, 2024 at 02:12:52PM -0500 Phil Auld wrote:
> > On Mon, Dec 02, 2024 at 05:55:28PM +0100 Mike Galbraith wrote:
> > > On Mon, 2024-12-02 at 11:24 -0500, Phil Auld wrote:
> > > > On Sat, Nov 23, 2024 at 09:44:40AM +0100 Mike Galbraith wrote:
> > > >
> > > >
> > > > > Question: did wiping off the evil leave any meaningful goodness behind?
> > > >
> > > > Is that for this patch?
> > >
> > > Yeah.  Trying it on my box with your write command line didn't improve
> > > the confidence level either.  My box has one CPU handling IRQs and
> > > waking pinned workers to service 8 fio instances.  Patch was useless
> > > for that.
> > >
> >
> > I'll give it a try. Our "box" is multiple different boxes but the results
> > vary somewhat.  The one I sent info about earlier in this thread is just
> > one of the more egregious and is the one the perf team lent me for a while.
> >
>
> In our testing this has the same effect as the original dequeue-when-delayed
> fix.  It solves the randwrite issue and introduces the ~10-15% randread
> regression.
>
> Seems to be a real trade-off here. The same guys who benefit from spreading
> in one case benefit from staying put in the other...

Doing as much harm as it does good isn't the mark of a keeper.  Oh well.

	-Mike
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by K Prateek Nayak 1 year ago
Hello Mike,

On 11/23/2024 2:14 PM, Mike Galbraith wrote:
> [..snip..]
> 
> Question: did wiping off the evil leave any meaningful goodness behind?
> 
> ---
> 
> sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
> 
> Phil Auld (Redhat) reported an fio benchmark regression having been found
> to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> may be related to wakees losing the ability to migrate, and confirmed that
> restoration of same indeed did restore previous performance.
> 
> V2: do not rip buddies apart, convenient on/off switch
> 
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>   kernel/sched/core.c     |   51 ++++++++++++++++++++++++++++++------------------
>   kernel/sched/features.h |    5 ++++
>   kernel/sched/sched.h    |    5 ++++
>   3 files changed, 42 insertions(+), 19 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3783,28 +3783,41 @@ ttwu_do_activate(struct rq *rq, struct t
>    */
>   static int ttwu_runnable(struct task_struct *p, int wake_flags)
>   {
> -	struct rq_flags rf;
> -	struct rq *rq;
> -	int ret = 0;
> -
> -	rq = __task_rq_lock(p, &rf);
> -	if (task_on_rq_queued(p)) {
> -		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> -			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -		if (!task_on_cpu(rq, p)) {
> -			/*
> -			 * When on_rq && !on_cpu the task is preempted, see if
> -			 * it should preempt the task that is current now.
> -			 */
> -			wakeup_preempt(rq, p, wake_flags);
> +	CLASS(__task_rq_lock, rq_guard)(p);
> +	struct rq *rq = rq_guard.rq;
> +
> +	if (!task_on_rq_queued(p))
> +		return 0;
> +
> +	update_rq_clock(rq);
> +	if (p->se.sched_delayed) {
> +		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +		int dequeue = sched_feat(DEQUEUE_DELAYED);
> +
> +		/*
> +		 * Since sched_delayed means we cannot be current anywhere,
> +		 * dequeue it here and have it fall through to the
> +		 * select_task_rq() case further along in ttwu() path.
> +		 * Note: Do not rip buddies apart else chaos follows.
> +		 */
> +		if (dequeue && rq->nr_running > 1 && p->nr_cpus_allowed > 1 &&

Do we really care if DEQUEUE_DELAYED is enabled / disabled here when we
encounter a delayed task?

> +		    !(rq->curr->last_wakee == p || p->last_wakee == rq->curr)) {

Technically, we are still looking at the last wakeup here since
record_wakee() is only called later. If we care about 1:1 buddies,
should we just check "current == p->last_wakee"? Otherwise, there is a
good chance "p" has an m:n waker-wakee relationship, in which case
perhaps a "want_affine"-like heuristic can help?

For science, I was wondering if the decision to dequeue + migrate or
requeue the delayed task can be put off until after the whole
select_task_rq() target selection (note: without the h_nr_delayed
stuff, some of that wake_affine_idle() logic falls apart). Hackbench
(which saw some regression with EEVDF Complete) seems to like it
somewhat, but it still falls behind NO_DELAY_DEQUEUE.

    ==================================================================
     Test          : hackbench
     Units         : Normalized time in seconds
     Interpretation: Lower is better
     Statistic     : AMean
     ==================================================================
     NO_DELAY_DEQUEUE	     Mike's v2	  Full ttwu + requeue/migrate
     5.76	           5.72  (  1% )        5.82  ( -1% )
     6.53	           6.56  (  0% )        6.65  ( -2% )
     6.79	           7.04  ( -4% )        7.02  ( -3% )
     6.91	           7.04  ( -2% )        7.03  ( -2% )
     7.63	           8.05  ( -6% )        7.88  ( -3% )

Only subtle changes in IBS profiles; there aren't any obvious shifts
in hotspots with hackbench at least. Not sure if it is just the act of
needing to do a dequeue + enqueue from the wakeup context that adds to
the overall regression.

-- 
Thanks and Regards,
Prateek

> +			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +			return 0;
>   		}
> -		ttwu_do_wakeup(p);
> -		ret = 1;
> +
> +		enqueue_task(rq, p, queue_flags);
> +	}
> +	if (!task_on_cpu(rq, p)) {
> +		/*
> +		 * When on_rq && !on_cpu the task is preempted, see if
> +		 * it should preempt the task that is current now.
> +		 */
> +		wakeup_preempt(rq, p, wake_flags);
>   	}
> -	__task_rq_unlock(rq, &rf);
> +	ttwu_do_wakeup(p);
> 
> -	return ret;
> +	return 1;
>   }
> 
>   #ifdef CONFIG_SMP
> [..snip..]
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-26 at 11:02 +0530, K Prateek Nayak wrote:
> Hello Mike,
> 
> On 11/23/2024 2:14 PM, Mike Galbraith wrote:
> > [..snip..]
> > 
> > Question: did wiping off the evil leave any meaningful goodness behind?
> > 
> > ---
> > 
> > sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
> > 
> > Phil Auld (Redhat) reported an fio benchmark regression having been found
> > to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> > may be related to wakees losing the ability to migrate, and confirmed that
> > restoration of same indeed did restore previous performance.
> > 
> > V2: do not rip buddies apart, convenient on/off switch
> > 
> > Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> > Signed-off-by: Mike Galbraith <efault@gmx.de>
> > ---
> >   kernel/sched/core.c     |   51 ++++++++++++++++++++++++++++++------------------
> >   kernel/sched/features.h |    5 ++++
> >   kernel/sched/sched.h    |    5 ++++
> >   3 files changed, 42 insertions(+), 19 deletions(-)
> > 
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3783,28 +3783,41 @@ ttwu_do_activate(struct rq *rq, struct t
> >    */
> >   static int ttwu_runnable(struct task_struct *p, int wake_flags)
> >   {
> > -       struct rq_flags rf;
> > -       struct rq *rq;
> > -       int ret = 0;
> > -
> > -       rq = __task_rq_lock(p, &rf);
> > -       if (task_on_rq_queued(p)) {
> > -               update_rq_clock(rq);
> > -               if (p->se.sched_delayed)
> > -                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> > -               if (!task_on_cpu(rq, p)) {
> > -                       /*
> > -                        * When on_rq && !on_cpu the task is preempted, see if
> > -                        * it should preempt the task that is current now.
> > -                        */
> > -                       wakeup_preempt(rq, p, wake_flags);
> > +       CLASS(__task_rq_lock, rq_guard)(p);
> > +       struct rq *rq = rq_guard.rq;
> > +
> > +       if (!task_on_rq_queued(p))
> > +               return 0;
> > +
> > +       update_rq_clock(rq);
> > +       if (p->se.sched_delayed) {
> > +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> > +               int dequeue = sched_feat(DEQUEUE_DELAYED);
> > +
> > +               /*
> > +                * Since sched_delayed means we cannot be current anywhere,
> > +                * dequeue it here and have it fall through to the
> > +                * select_task_rq() case further along in ttwu() path.
> > +                * Note: Do not rip buddies apart else chaos follows.
> > +                */
> > +               if (dequeue && rq->nr_running > 1 && p->nr_cpus_allowed > 1 &&
> 
> Do we really care if DEQUEUE_DELAYED is enabled / disabled here when we
> encounter a delayed task?

The switch is for test convenience.

> > +                   !(rq->curr->last_wakee == p || p->last_wakee == rq->curr)) {
> 
> Technically, we are still looking at the last wakeup here since
> record_wakee() is only called later. If we care about 1:1 buddies,
> should we just see "current == p->last_wakee", otherwise, there is a
> good chance "p" has a m:n waker-wakee relationship in which case
> perhaps a "want_affine" like heuristic can help?

The intent is to blunt the instrument a bit. Paul should have a highly
active interrupt source, which will give wakeup credit to whatever is
sitting on that CPU, breaking 1:1 connections.. a little bit.   That's
why it still migrates tbench buddies, but NOT at a rate that turns a
tbench progression into a new low regression.  The hope is that the
load shift caused by that active interrupt source is enough to give
Paul's regression some of the help it demonstrated wanting, without the
collateral damage.  It might now be so weak as to not meet the
"meaningful" in my question, in which case it lands on the ginormous
pile of meh, sorta works, but why would anyone care.

> For science, I was wondering if the decision to dequeue + migrate or
> requeue the delayed task can be put off until after the whole
> select_task_rq() target selection (note: without the h_nr_delayed
> stuff, some of that wake_affine_idle() logic falls apart). Hackbench
> (which saw some regression with EEVDF Complete) seem to like it
> somewhat, but it still falls behind NO_DELAY_DEQUEUE.

You can, with a few more fast path cycles and some duplication, none of
which looks very desirable.
 
>     ==================================================================
>      Test          : hackbench
>      Units         : Normalized time in seconds
>      Interpretation: Lower is better
>      Statistic     : AMean
>      ==================================================================
>      NO_DELAY_DEQUEUE        Mike's v2    Full ttwu + requeue/migrate
>      5.76                  5.72  (  1% )        5.82  ( -1% )
>      6.53                  6.56  (  0% )        6.65  ( -2% )
>      6.79                  7.04  ( -4% )        7.02  ( -3% )
>      6.91                  7.04  ( -2% )        7.03  ( -2% )
>      7.63                  8.05  ( -6% )        7.88  ( -3% )
> 
> Only subtle changes in IBS profiles; there aren't any obvious shift
> in hotspots with hackbench at least. Not sure if it is just the act of
> needing to do a dequeue + enqueue from the wakeup context that adds to
> the overall regression.

Those numbers say to me that hackbench doesn't care deeply.  That
works for me, because I don't care deeply about nutty fork bombs ;-)

	-Mike
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-26 at 07:30 +0100, Mike Galbraith wrote:
>
> The intent is to blunt the instrument a bit. Paul should have a highly
> active interrupt source, which will give wakeup credit to whatever is
> sitting on that CPU, breaking 1:1 connections.. a little bit.   That's
> why it still migrates tbench buddies, but NOT at a rate that turns a
> tbench progression into a new low regression.

BTW, the reason for the tbench wreckage being so bad is that when the
box is near saturation, not only are a portion of the surviving
sched_delayed tasks affine wakeups (always the optimal configuration
for this fast mover client/server pair in an L3 equipped box), they are
exclusively affine wakeups.  That is most definitely gonna hurt.

When saturating that becomes the best option for a lot of client/server
pairs, even those with a lot of concurrency.  Turning them loose to
migrate at that time is far more likely than not to hurt a LOT, so V1
was doomed.

> The hope is that the
> load shift caused by that active interrupt source is enough to give
> Paul's regression some of the help it demonstrated wanting, without the
> collateral damage.  It might now be so weak as to not meet the
> "meaningful" in my question, in which case it lands on the ginormous
> pile of meh, sorta works, but why would anyone care.

In my IO challenged box, patch is useless to fio, nothing can help a
load where all of the IO action, and wimpy action at that, is nailed to
one CPU.  I can see it helping other latency sensitive stuff, like say
1:N mother of all work and/or control threads (and ilk), but if Phil's
problematic box looks anything like this box.. nah, it's a long reach.

	-Mike
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year ago
On Tue, 2024-11-26 at 07:30 +0100, Mike Galbraith wrote:
>
> The intent is to blunt the instrument a bit. Paul should have

Yeah I did... ahem, I meant of course Phil.

	-Mike
Re: [PATCH V2] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year ago
On Tue, Nov 26, 2024 at 10:42:37AM +0100 Mike Galbraith wrote:
> On Tue, 2024-11-26 at 07:30 +0100, Mike Galbraith wrote:
> >
> > The intent is to blunt the instrument a bit. Paul should have
> 
> Yeah I did... ahem, I meant of course Phil.
>

Heh, you are not alone, Mike  :)


> 	-Mike
> 

--
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Xuewen Yan 1 year, 1 month ago
Hi Mike and Peter,

On Fri, Nov 8, 2024 at 8:28 AM Mike Galbraith <efault@gmx.de> wrote:
>
> On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> > On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > > >
> > > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > > finding a cfs_rq->curr ends play time pretty quickly.
> > > >
> > > > The below improved uptime, and trace_printk() says it's doing the
> > > > intended, so I suppose I'll add a feature and see what falls out.
> > >
> > > From netperf, I got.. number tabulation practice.  Three runs of each
> > > test with and without produced nothing but variance/noise.
> >
> > Make it go away then.
> >
> > If you could write a Changelog for your inspired bit and stick my cleaned
> > up version under it, I'd be much obliged.
>
> Salut, much obliged for eyeball relief.
>
> ---snip---
>
> Phil Auld (Redhat) reported an fio benchmark regression having been found
> to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> may be related to wakees losing the ability to migrate, and confirmed that
> restoration of same indeed did restore previous performance.
>
> (de-uglified-a-lot-by)
>
> Reported-by: Phil Auld <pauld@redhat.com>
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Link: https://lore.kernel.org/lkml/20241101124715.GA689589@pauld.westford.csb/
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>  kernel/sched/core.c  |   48 +++++++++++++++++++++++++++++-------------------
>  kernel/sched/sched.h |    5 +++++
>  2 files changed, 34 insertions(+), 19 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3783,28 +3783,38 @@ ttwu_do_activate(struct rq *rq, struct t
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> -       struct rq_flags rf;
> -       struct rq *rq;
> -       int ret = 0;
> -
> -       rq = __task_rq_lock(p, &rf);
> -       if (task_on_rq_queued(p)) {
> -               update_rq_clock(rq);
> -               if (p->se.sched_delayed)
> -                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -               if (!task_on_cpu(rq, p)) {
> -                       /*
> -                        * When on_rq && !on_cpu the task is preempted, see if
> -                        * it should preempt the task that is current now.
> -                        */
> -                       wakeup_preempt(rq, p, wake_flags);
> +       CLASS(__task_rq_lock, rq_guard)(p);
> +       struct rq *rq = rq_guard.rq;
> +
> +       if (!task_on_rq_queued(p))
> +               return 0;
> +
> +       update_rq_clock(rq);
> +       if (p->se.sched_delayed) {
> +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +
> +               /*
> +                * Since sched_delayed means we cannot be current anywhere,
> +                * dequeue it here and have it fall through to the
> +                * select_task_rq() case further along the ttwu() path.
> +                */
> +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {

For a sched_asym_cpucapacity system, do we need to consider
task_fits_cpu_capacity there?


> +                       dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +                       return 0;
>                 }
> -               ttwu_do_wakeup(p);
> -               ret = 1;
> +
> +               enqueue_task(rq, p, queue_flags);
> +       }
> +       if (!task_on_cpu(rq, p)) {
> +               /*
> +                * When on_rq && !on_cpu the task is preempted, see if
> +                * it should preempt the task that is current now.
> +                */
> +               wakeup_preempt(rq, p, wake_flags);
>         }
> -       __task_rq_unlock(rq, &rf);
> +       ttwu_do_wakeup(p);
>
> -       return ret;
> +       return 1;
>  }
>
>  #ifdef CONFIG_SMP
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1779,6 +1779,11 @@ task_rq_unlock(struct rq *rq, struct tas
>         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
>  }
>
> +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> +                   _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> +                   __task_rq_unlock(_T->rq, &_T->rf),
> +                   struct rq *rq; struct rq_flags rf)
> +
>  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
>                     _T->rq = task_rq_lock(_T->lock, &_T->rf),
>                     task_rq_unlock(_T->rq, _T->lock, &_T->rf),
>
>
>
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Mike Galbraith 1 year, 1 month ago
On Mon, 2024-11-11 at 10:46 +0800, Xuewen Yan wrote:
> > 
> > +       if (p->se.sched_delayed) {
> > +               int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> > +
> > +               /*
> > +                * Since sched_delayed means we cannot be current anywhere,
> > +                * dequeue it here and have it fall through to the
> > +                * select_task_rq() case further along the ttwu() path.
> > +                */
> > +               if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> 
> For sched_asym_cpucapacity system, need we consider the
> task_fits_cpu_capacity there?

I don't think so.  Wakeup placement logic is what we're deflecting the
wakee toward; this is not the right spot to add any complexity.


	-Mike
Re: [PATCH] sched/fair: Dequeue sched_delayed tasks when waking to a busy CPU
Posted by Phil Auld 1 year, 1 month ago
On Fri, Nov 08, 2024 at 01:24:35AM +0100 Mike Galbraith wrote:
> On Thu, 2024-11-07 at 15:09 +0100, Peter Zijlstra wrote:
> > On Thu, Nov 07, 2024 at 03:02:36PM +0100, Mike Galbraith wrote:
> > > On Thu, 2024-11-07 at 10:46 +0100, Mike Galbraith wrote:
> > > > On Thu, 2024-11-07 at 05:03 +0100, Mike Galbraith wrote:
> > > > >
> > > > > I built that patch out of curiosity, and yeah, set_next_task_fair()
> > > > > finding a cfs_rq->curr ends play time pretty quickly.
> > > >
> > > > The below improved uptime, and trace_printk() says it's doing the
> > > > intended, so I suppose I'll add a feature and see what falls out.
> > >
> > > From netperf, I got.. number tabulation practice.  Three runs of each
> > > test with and without produced nothing but variance/noise.
> >
> > Make it go away then.
> >
> > If you could write a Changelog for your inspired bit and stick my cleaned
> > up version under it, I'd be much obliged.
> 
> Salut, much obliged for eyeball relief.
>

Thanks Mike (and Peter).  We have our full perf tests running on Mike's
original version of this patch. Results probably Monday (there's a long
queue). We'll see if this blows up anything else then.  I'll queue up a
build with this cleaned up version as well but the results will be late
next week, probably.

At that point maybe some or all of these:

Suggested-by: Phil Auld <pauld@redhat.com> 
Reviewed-by: Phil Auld <pauld@redhat.com>
Tested-by: Jirka Hladky <jhladky@redhat.com>


Cheers,
Phil


> ---snip---
> 
> Phil Auld (Redhat) reported an fio benchmark regression having been found
> to have been caused by addition of the DELAY_DEQUEUE feature, suggested it
> may be related to wakees losing the ability to migrate, and confirmed that
> restoration of same indeed did restore previous performance.
> 
> (de-uglified-a-lot-by)
> 
> Reported-by: Phil Auld <pauld@redhat.com>
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Link: https://lore.kernel.org/lkml/20241101124715.GA689589@pauld.westford.csb/
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>  kernel/sched/core.c  |   48 +++++++++++++++++++++++++++++-------------------
>  kernel/sched/sched.h |    5 +++++
>  2 files changed, 34 insertions(+), 19 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3783,28 +3783,38 @@ ttwu_do_activate(struct rq *rq, struct t
>   */
>  static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  {
> -	struct rq_flags rf;
> -	struct rq *rq;
> -	int ret = 0;
> -
> -	rq = __task_rq_lock(p, &rf);
> -	if (task_on_rq_queued(p)) {
> -		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> -			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> -		if (!task_on_cpu(rq, p)) {
> -			/*
> -			 * When on_rq && !on_cpu the task is preempted, see if
> -			 * it should preempt the task that is current now.
> -			 */
> -			wakeup_preempt(rq, p, wake_flags);
> +	CLASS(__task_rq_lock, rq_guard)(p);
> +	struct rq *rq = rq_guard.rq;
> +
> +	if (!task_on_rq_queued(p))
> +		return 0;
> +
> +	update_rq_clock(rq);
> +	if (p->se.sched_delayed) {
> +		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> +
> +		/*
> +		 * Since sched_delayed means we cannot be current anywhere,
> +		 * dequeue it here and have it fall through to the
> +		 * select_task_rq() case further along the ttwu() path.
> +		 */
> +		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> +			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> +			return 0;
>  		}
> -		ttwu_do_wakeup(p);
> -		ret = 1;
> +
> +		enqueue_task(rq, p, queue_flags);
> +	}
> +	if (!task_on_cpu(rq, p)) {
> +		/*
> +		 * When on_rq && !on_cpu the task is preempted, see if
> +		 * it should preempt the task that is current now.
> +		 */
> +		wakeup_preempt(rq, p, wake_flags);
>  	}
> -	__task_rq_unlock(rq, &rf);
> +	ttwu_do_wakeup(p);
> 
> -	return ret;
> +	return 1;
>  }
> 
>  #ifdef CONFIG_SMP
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1779,6 +1779,11 @@ task_rq_unlock(struct rq *rq, struct tas
>  	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
>  }
> 
> +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
> +		    _T->rq = __task_rq_lock(_T->lock, &_T->rf),
> +		    __task_rq_unlock(_T->rq, &_T->rf),
> +		    struct rq *rq; struct rq_flags rf)
> +
>  DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
>  		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
>  		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
> 
> 
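
As an aside for readers unfamiliar with the guard conversion above: CLASS() /
DEFINE_LOCK_GUARD_1() build on the compiler's cleanup attribute, so the rq
lock taken at the top of ttwu_runnable() is dropped automatically on every
return path. A minimal, self-contained userspace sketch of the same pattern
follows (pthreads analogy with illustrative names, not the kernel's
cleanup.h implementation):

	#include <pthread.h>
	#include <stdio.h>

	struct mutex_guard {
		pthread_mutex_t *lock;
	};

	/* Destructor: runs when the guard variable goes out of scope. */
	static void mutex_guard_release(struct mutex_guard *g)
	{
		pthread_mutex_unlock(g->lock);
	}

	/* Take the lock and arrange for it to be dropped at end of scope. */
	#define WITH_LOCK_GUARD(m)						\
		pthread_mutex_lock(m);						\
		struct mutex_guard __scope_guard				\
			__attribute__((cleanup(mutex_guard_release))) = { .lock = (m) }

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static int counter;

	static int bump(int limit)
	{
		WITH_LOCK_GUARD(&lock);

		if (counter >= limit)
			return -1;	/* unlock happens automatically here */

		return ++counter;	/* ...and here */
	}

	int main(void)
	{
		printf("bump: %d\n", bump(10));
		return 0;
	}

The benefit in ttwu_runnable() is the same: the early "return 0" exits no
longer need an explicit __task_rq_unlock().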

-- 
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 1 month ago
On Wed, Nov 06, 2024 at 03:14:20PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 06, 2024 at 02:53:46PM +0100, Peter Zijlstra wrote:
> 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 54d82c21fc8e..b083c6385e88 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3774,28 +3774,38 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
> >   */
> >  static int ttwu_runnable(struct task_struct *p, int wake_flags)
> >  {
> > +	CLASS(__task_rq_lock, rq_guard)(p);
> > +	struct rq *rq = rq_guard.rq;
> >  
> > +	if (!task_on_rq_queued(p))
> > +		return 0;
> > +
> > +	update_rq_clock(rq);
> > +	if (p->se.sched_delayed) {
> > +		int queue_flags = ENQUEUE_DELAYED | ENQUEUE_NOCLOCK;
> > +
> > +		/*
> > +		 * Since sched_delayed means we cannot be current anywhere,
> > +		 * dequeue it here and have it fall through to the
> > +		 * select_task_rq() case further along the ttwu() path.
> > +		 */
> > +		if (rq->nr_running > 1 && p->nr_cpus_allowed > 1) {
> > +			dequeue_task(rq, p, DEQUEUE_SLEEP | queue_flags);
> > +			return 0;
> >  		}
> > +
> > +		enqueue_task(rq, p, queue_flags);
> 
> And then I wondered... this means that !task_on_cpu() is true for
> sched_delayed, and thus we can move this in the below branch.
> 
> But also, we can probably dequeue every such task, not only
> sched_delayed ones.
> 
> >  	}
> > +	if (!task_on_cpu(rq, p)) {
> > +		/*
> > +		 * When on_rq && !on_cpu the task is preempted, see if
> > +		 * it should preempt the task that is current now.
> > +		 */
> > +		wakeup_preempt(rq, p, wake_flags);
> > +	}
> > +	ttwu_do_wakeup(p);
> >  
> > +	return 1;
> >  }
> 
> 
> Yielding something like this on top... which boots. But since I forgot
> to make it a feature, I can't actually tell at this point.. *sigh*

It dies real fast, so clearly I'm missing something. Oh well.
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Tue, Nov 05, 2024 at 05:05:12AM +0100 Mike Galbraith wrote:
> On Mon, 2024-11-04 at 08:05 -0500, Phil Auld wrote:
> > On Sat, Nov 02, 2024 at 05:32:14AM +0100 Mike Galbraith wrote:
> >
> > >
> > > The buddy being preempted certainly won't be wakeup migrated...
> >
> > Not the waker who gets preempted but the wakee may be a bit more
> > sticky on his current cpu and thus stack more since he's still
> > in that runqueue.
> 
> Ah, indeed, if wakees don't get scraped off before being awakened, they
> can and do miss chances at an idle CPU according to trace_printk().
> 
> I'm undecided if overall it's boon, bane or even matters, as there is
> still an ample supply of wakeup migration, but seems it can indeed
> inject wakeup latency needlessly, so <sharpens stick>...
> 
> My box booted and neither become exceptionally noisy nor inexplicably
> silent in.. oh, minutes now, so surely yours will be perfectly fine.
> 
> After one minute of lightly loaded box browsing, trace_printk() said:
> 
>   645   - racy peek says there is a room available
>    11   - cool, reserved room is free
>   206   - no vacancy or wakee pinned
> 38807   - SIS accommodates room seeker
> 
> The below should improve the odds, but high return seems unlikely.
>

Thanks, I'll give it a spin with the nr_cpus_allowed bit.


Cheers,
Phil



> ---
>  kernel/sched/core.c |    9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3790,7 +3790,13 @@ static int ttwu_runnable(struct task_str
>  	rq = __task_rq_lock(p, &rf);
>  	if (task_on_rq_queued(p)) {
>  		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> +		/*
> +		 * If wakee is mobile and the room it reserved is occupied, let it try to migrate.
> +		 */
> +		if (p->se.sched_delayed && rq->nr_running > 1 && cpumask_weight(p->cpus_ptr) > 1) {
> +			dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +			goto out_unlock;
> +		} else if (p->se.sched_delayed)
>  			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
>  		if (!task_on_cpu(rq, p)) {
>  			/*
> @@ -3802,6 +3808,7 @@ static int ttwu_runnable(struct task_str
>  		ttwu_do_wakeup(p);
>  		ret = 1;
>  	}
> +out_unlock:
>  	__task_rq_unlock(rq, &rf);
> 
>  	return ret;
> 
> 

--
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Phil Auld 1 year, 1 month ago
On Tue, Nov 05, 2024 at 10:20:10AM -0500 Phil Auld wrote:
> On Tue, Nov 05, 2024 at 05:05:12AM +0100 Mike Galbraith wrote:
> > On Mon, 2024-11-04 at 08:05 -0500, Phil Auld wrote:
> > > On Sat, Nov 02, 2024 at 05:32:14AM +0100 Mike Galbraith wrote:
> > >
> > > >
> > > > The buddy being preempted certainly won't be wakeup migrated...
> > >
> > > Not the waker who gets preempted but the wakee may be a bit more
> > > sticky on his current cpu and thus stack more since he's still
> > > in that runqueue.
> > 
> > Ah, indeed, if wakees don't get scraped off before being awakened, they
> > can and do miss chances at an idle CPU according to trace_printk().
> > 
> > I'm undecided if overall it's boon, bane or even matters, as there is
> > still an ample supply of wakeup migration, but seems it can indeed
> > inject wakeup latency needlessly, so <sharpens stick>...
> > 
> > My box booted and neither become exceptionally noisy nor inexplicably
> > silent in.. oh, minutes now, so surely yours will be perfectly fine.
> > 
> > After one minute of lightly loaded box browsing, trace_printk() said:
> > 
> >   645   - racy peek says there is a room available
> >    11   - cool, reserved room is free
> >   206   - no vacancy or wakee pinned
> > 38807   - SIS accommodates room seeker
> > 
> > The below should improve the odds, but high return seems unlikely.
> >
> 
> Thanks, I'll give it a spin with the nr_cpus_allowed bit.
>

Well, that worked pretty well. It actually makes DELAY_DEQUEUE a little better
than NO_DELAY_DEQUEUE:

DELAY_DEQUEUE     ~595MB/s
NO_DELAY_DEQUEUE  ~581MB/s

I left the cpumask_weight() because vim isn't happy with my terminal to that
machine for some reason I have not found yet, so I couldn't actually edit the
darn thing. This is not my normal build setup. But I'll spin up a real build
with this patch and throw it over the wall to the perf team to have them run
their full battery of tests on it.

Probably "Paul" will be cranky now. 


Thanks,
Phil


> 
> Cheers,
> Phil
> 
> 
> 
> > ---
> >  kernel/sched/core.c |    9 ++++++++-
> >  1 file changed, 8 insertions(+), 1 deletion(-)
> > 
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3790,7 +3790,13 @@ static int ttwu_runnable(struct task_str
> >  	rq = __task_rq_lock(p, &rf);
> >  	if (task_on_rq_queued(p)) {
> >  		update_rq_clock(rq);
> > -		if (p->se.sched_delayed)
> > +		/*
> > +		 * If wakee is mobile and the room it reserved is occupied, let it try to migrate.
> > +		 */
> > +		if (p->se.sched_delayed && rq->nr_running > 1 && cpumask_weight(p->cpus_ptr) > 1) {
> > +			dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> > +			goto out_unlock;
> > +		} else if (p->se.sched_delayed)
> >  			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> >  		if (!task_on_cpu(rq, p)) {
> >  			/*
> > @@ -3802,6 +3808,7 @@ static int ttwu_runnable(struct task_str
> >  		ttwu_do_wakeup(p);
> >  		ret = 1;
> >  	}
> > +out_unlock:
> >  	__task_rq_unlock(rq, &rf);
> > 
> >  	return ret;
> > 
> > 
> 
> -- 
> 
> 

--
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Tue, 2024-11-05 at 14:05 -0500, Phil Auld wrote:
>
> Well that worked pretty well. It actually makes DELAY_DEQUEUE a little better
> than NO_DELAY_DEQUEUE
>
> DELAY_DEQUEUE     ~595MB/s
> NO_DELAY_DEQUEUE  ~581MB/s

Hrmph, not the expected result, but sharp stick's mission was to
confirm/deny that delta's relevance, so job well done.. kindling.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by K Prateek Nayak 1 year, 1 month ago
Hello Mike,

On 11/5/2024 9:35 AM, Mike Galbraith wrote:
> On Mon, 2024-11-04 at 08:05 -0500, Phil Auld wrote:
>> On Sat, Nov 02, 2024 at 05:32:14AM +0100 Mike Galbraith wrote:
>>
>>>
>>> The buddy being preempted certainly won't be wakeup migrated...
>>
>> Not the waker who gets preempted but the wakee may be a bit more
>> sticky on his current cpu and thus stack more since he's still
>> in that runqueue.
> 
> Ah, indeed, if wakees don't get scraped off before being awakened, they
> can and do miss chances at an idle CPU according to trace_printk().
> 
> I'm undecided if overall it's boon, bane or even matters, as there is
> still an ample supply of wakeup migration, but seems it can indeed
> inject wakeup latency needlessly, so <sharpens stick>...

I had tried this out a while back but I was indiscriminately doing a
DEQUEUE_DELAYED and letting delayed tasks go through a full ttwu cycle
which did not yield any improvements on hackbench. Your approach to
selectively do it might indeed be better (more thoughts below)
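
For reference, the indiscriminate variant amounts to roughly the below in
ttwu_runnable(), with no check of rq->nr_running or the task's affinity (a
reconstructed sketch, not the exact patch that was benchmarked):

	/*
	 * Sketch only: dequeue every delayed task at wakeup and let it fall
	 * through to select_task_rq() further along the ttwu() path, whether
	 * or not the rq is busy or the task can migrate at all.
	 */
	if (p->se.sched_delayed) {
		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
		goto out_unlock;
	}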

> 
> My box booted and neither become exceptionally noisy nor inexplicably
> silent in.. oh, minutes now, so surely yours will be perfectly fine.
> 
> After one minute of lightly loaded box browsing, trace_printk() said:
> 
>    645   - racy peek says there is a room available
>     11   - cool, reserved room is free
>    206   - no vacancy or wakee pinned
> 38807   - SIS accommodates room seeker
> 
> The below should improve the odds, but high return seems unlikely.
> 
> ---
>   kernel/sched/core.c |    9 ++++++++-
>   1 file changed, 8 insertions(+), 1 deletion(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3790,7 +3790,13 @@ static int ttwu_runnable(struct task_str
>   	rq = __task_rq_lock(p, &rf);
>   	if (task_on_rq_queued(p)) {
>   		update_rq_clock(rq);
> -		if (p->se.sched_delayed)
> +		/*
> +		 * If wakee is mobile and the room it reserved is occupied, let it try to migrate.
> +		 */
> +		if (p->se.sched_delayed && rq->nr_running > 1 && cpumask_weight(p->cpus_ptr) > 1) {

Would checking "p->nr_cpus_allowed > 1" be enough instead of doing a
"cpumask_weight(p->cpus_ptr) > 1"?

I was thinking, since the task is indeed delayed, there has to be more
than one task on the runqueue, right, since a single task by itself cannot
be ineligible and be marked for delayed dequeue? The only time we
encounter a delayed task with "rq->nr_running == 1" is if the other
tasks have been fully dequeued and pick_next_task() is in the process of
picking off all the delayed tasks, but since that is done with the rq
lock held in schedule(), is it even possible for
"rq->nr_running > 1" to be false here?

> +			dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +			goto out_unlock;
> +		} else if (p->se.sched_delayed)
>   			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
>   		if (!task_on_cpu(rq, p)) {
>   			/*
> @@ -3802,6 +3808,7 @@ static int ttwu_runnable(struct task_str
>   		ttwu_do_wakeup(p);
>   		ret = 1;
>   	}
> +out_unlock:
>   	__task_rq_unlock(rq, &rf);
> 
>   	return ret;
> 
> 

-- 
Thanks and Regards,
Prateek
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 1 month ago
On Tue, 2024-11-05 at 09:52 +0530, K Prateek Nayak wrote:
> Hello Mike,

Greetings,

> Would checking "p->nr_cpus_allowed > 1" be enough instead of doing a
> "cpumask_weight(p->cpus_ptr) > 1"?

Yeah (thwap).

> I was thinking, since the task is indeed delayed, there has to be more
> than one task on the runqueue right since a single task by itself cannot
> be ineligible and be marked for delayed dequeue?

But they migrate via LB, and idle balance unlocks the rq.
trace_printk() just verified that they do still both land with
sched_delayed intact and with nr_running = 1.

> The only time we
> encounter a delayed task with "rq->nr_running == 1" is if the other
> tasks have been fully dequeued and pick_next_task() is in the process of
> picking off all the delayed task, but since that is done with the rq
> lock held in schedule(), it is even possible for the
> "rq->nr_running > 1" to be false here?

I don't see how, the rq being looked at is locked.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by K Prateek Nayak 1 year, 1 month ago
Hello Mike,

On 11/5/2024 12:16 PM, Mike Galbraith wrote:
> On Tue, 2024-11-05 at 09:52 +0530, K Prateek Nayak wrote:
>> Hello Mike,
> 
> Greetings,
> 
>> Would checking "p->nr_cpus_allowed > 1" be enough instead of doing a
>> "cpumask_weight(p->cpus_ptr) > 1"?
> 
> Yeah (thwap).
> 
>> I was thinking, since the task is indeed delayed, there has to be more
>> than one task on the runqueue right since a single task by itself cannot
>> be ineligible and be marked for delayed dequeue?
> 
> But they migrate via LB, and idle balance unlocks the rq.
> trace_printk() just verified that they do still both land with
> sched_delayed intact and with nr_running = 1.

Ah! You are right! Thank you for clarifying. Since the sharp stick seems
to be working, let me go throw a bunch of workloads at it and report
back :)

-- 
Thanks and Regards,
Prateek

> 
>> The only time we
>> encounter a delayed task with "rq->nr_running == 1" is if the other
>> tasks have been fully dequeued and pick_next_task() is in the process of
>> picking off all the delayed task, but since that is done with the rq
>> lock held in schedule(), it is even possible for the
>> "rq->nr_running > 1" to be false here?
> 
> I don't see how, the rq being looked at is locked.
> 
> 	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Marek Szyprowski 1 year, 3 months ago
On 27.07.2024 12:27, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

This patch landed recently in linux-next as commit 152e11f6df29 
("sched/fair: Implement delayed dequeue"). In my tests on some of the 
ARM 32bit boards it causes a regression in rtcwake tool behavior - from 
time to time this simple call never ends:

# time rtcwake -s 10 -m on

Reverting this commit (together with its compile dependencies) on top of
linux-next fixes this issue. Let me know how I can help debug this
issue.

> ---
>   kernel/sched/deadline.c |    1
>   kernel/sched/fair.c     |   82 ++++++++++++++++++++++++++++++++++++++++++------
>   kernel/sched/features.h |    9 +++++
>   3 files changed, 81 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_t
>   		else
>   			p = dl_se->server_pick_next(dl_se);
>   		if (!p) {
> -			WARN_ON_ONCE(1);
>   			dl_se->dl_yielded = 1;
>   			update_curr_dl_se(rq, dl_se, 0);
>   			goto again;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5379,20 +5379,44 @@ static void clear_buddies(struct cfs_rq
>   
>   static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>   
> -static void
> +static bool
>   dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>   {
> -	int action = UPDATE_TG;
> +	if (flags & DEQUEUE_DELAYED) {
> +		/*
> +		 * DEQUEUE_DELAYED is typically called from pick_next_entity()
> +		 * at which point we've already done update_curr() and do not
> +		 * want to do so again.
> +		 */
> +		SCHED_WARN_ON(!se->sched_delayed);
> +		se->sched_delayed = 0;
> +	} else {
> +		bool sleep = flags & DEQUEUE_SLEEP;
> +
> +		/*
> +		 * DELAY_DEQUEUE relies on spurious wakeups, special task
> +		 * states must not suffer spurious wakeups, excempt them.
> +		 */
> +		if (flags & DEQUEUE_SPECIAL)
> +			sleep = false;
> +
> +		SCHED_WARN_ON(sleep && se->sched_delayed);
> +		update_curr(cfs_rq);
>   
> +		if (sched_feat(DELAY_DEQUEUE) && sleep &&
> +		    !entity_eligible(cfs_rq, se)) {
> +			if (cfs_rq->next == se)
> +				cfs_rq->next = NULL;
> +			se->sched_delayed = 1;
> +			return false;
> +		}
> +	}
> +
> +	int action = UPDATE_TG;
>   	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
>   		action |= DO_DETACH;
>   
>   	/*
> -	 * Update run-time statistics of the 'current'.
> -	 */
> -	update_curr(cfs_rq);
> -
> -	/*
>   	 * When dequeuing a sched_entity, we must:
>   	 *   - Update loads to have both entity and cfs_rq synced with now.
>   	 *   - For group_entity, update its runnable_weight to reflect the new
> @@ -5430,6 +5454,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>   
>   	if (cfs_rq->nr_running == 0)
>   		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +
> +	return true;
>   }
>   
>   static void
> @@ -5828,11 +5854,21 @@ static bool throttle_cfs_rq(struct cfs_r
>   	idle_task_delta = cfs_rq->idle_h_nr_running;
>   	for_each_sched_entity(se) {
>   		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> +		int flags;
> +
>   		/* throttled entity or throttle-on-deactivate */
>   		if (!se->on_rq)
>   			goto done;
>   
> -		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
> +		/*
> +		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
> +		 * This avoids teaching dequeue_entities() about throttled
> +		 * entities and keeps things relatively simple.
> +		 */
> +		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> +		if (se->sched_delayed)
> +			flags |= DEQUEUE_DELAYED;
> +		dequeue_entity(qcfs_rq, se, flags);
>   
>   		if (cfs_rq_is_idle(group_cfs_rq(se)))
>   			idle_task_delta = cfs_rq->h_nr_running;
> @@ -6918,6 +6954,7 @@ static int dequeue_entities(struct rq *r
>   	bool was_sched_idle = sched_idle_rq(rq);
>   	int rq_h_nr_running = rq->cfs.h_nr_running;
>   	bool task_sleep = flags & DEQUEUE_SLEEP;
> +	bool task_delayed = flags & DEQUEUE_DELAYED;
>   	struct task_struct *p = NULL;
>   	int idle_h_nr_running = 0;
>   	int h_nr_running = 0;
> @@ -6931,7 +6968,13 @@ static int dequeue_entities(struct rq *r
>   
>   	for_each_sched_entity(se) {
>   		cfs_rq = cfs_rq_of(se);
> -		dequeue_entity(cfs_rq, se, flags);
> +
> +		if (!dequeue_entity(cfs_rq, se, flags)) {
> +			if (p && &p->se == se)
> +				return -1;
> +
> +			break;
> +		}
>   
>   		cfs_rq->h_nr_running -= h_nr_running;
>   		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> @@ -6956,6 +6999,7 @@ static int dequeue_entities(struct rq *r
>   			break;
>   		}
>   		flags |= DEQUEUE_SLEEP;
> +		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
>   	}
>   
>   	for_each_sched_entity(se) {
> @@ -6985,6 +7029,17 @@ static int dequeue_entities(struct rq *r
>   	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
>   		rq->next_balance = jiffies;
>   
> +	if (p && task_delayed) {
> +		SCHED_WARN_ON(!task_sleep);
> +		SCHED_WARN_ON(p->on_rq != 1);
> +
> +		/* Fix-up what dequeue_task_fair() skipped */
> +		hrtick_update(rq);
> +
> +		/* Fix-up what block_task() skipped. */
> +		__block_task(rq, p);
> +	}
> +
>   	return 1;
>   }
>   /*
> @@ -6996,8 +7051,10 @@ static bool dequeue_task_fair(struct rq
>   {
>   	util_est_dequeue(&rq->cfs, p);
>   
> -	if (dequeue_entities(rq, &p->se, flags) < 0)
> +	if (dequeue_entities(rq, &p->se, flags) < 0) {
> +		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
>   		return false;
> +	}
>   
>   	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>   	hrtick_update(rq);
> @@ -12973,6 +13030,11 @@ static void set_next_task_fair(struct rq
>   		/* ensure bandwidth has been allocated on our new cfs_rq */
>   		account_cfs_rq_runtime(cfs_rq, 0);
>   	}
> +
> +	if (!first)
> +		return;
> +
> +	SCHED_WARN_ON(se->sched_delayed);
>   }
>   
>   void init_cfs_rq(struct cfs_rq *cfs_rq)
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
>   SCHED_FEAT(CACHE_HOT_BUDDY, true)
>   
>   /*
> + * Delay dequeueing tasks until they get selected or woken.
> + *
> + * By delaying the dequeue for non-eligible tasks, they remain in the
> + * competition and can burn off their negative lag. When they get selected
> + * they'll have positive lag by definition.
> + */
> +SCHED_FEAT(DELAY_DEQUEUE, true)
> +
> +/*
>    * Allow wakeup-time preemption of the current task:
>    */
>   SCHED_FEAT(WAKEUP_PREEMPTION, true)
>
>
>
Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Sean Christopherson 1 year, 2 months ago
+KVM

On Thu, Aug 29, 2024, Marek Szyprowski wrote:
> On 27.07.2024 12:27, Peter Zijlstra wrote:
> > Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> > noting that lag is fundamentally a temporal measure. It should not be
> > carried around indefinitely.
> >
> > OTOH it should also not be instantly discarded, doing so will allow a
> > task to game the system by purposefully (micro) sleeping at the end of
> > its time quantum.
> >
> > Since lag is intimately tied to the virtual time base, a wall-time
> > based decay is also insufficient, notably competition is required for
> > any of this to make sense.
> >
> > Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> > competing until they are eligible.
> >
> > Strictly speaking, we only care about keeping them until the 0-lag
> > point, but that is a difficult proposition, instead carry them around
> > until they get picked again, and dequeue them at that point.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> This patch landed recently in linux-next as commit 152e11f6df29 
> ("sched/fair: Implement delayed dequeue"). In my tests on some of the 
> ARM 32bit boards it causes a regression in rtcwake tool behavior - from 
> time to time this simple call never ends:
> 
> # time rtcwake -s 10 -m on
> 
> Reverting this commit (together with its compile dependencies) on top of 
> linux-next fixes this issue. Let me know how can I help debugging this 
> issue.

This commit broke KVM's posted interrupt handling (and other things), and the root
cause may be the same underlying issue.

TL;DR: Code that checks task_struct.on_rq may be broken by this commit.

KVM's breakage boils down to the preempt notifiers, i.e. kvm_sched_out(), being
invoked with current->on_rq "true" after KVM has explicitly called schedule().
kvm_sched_out() uses current->on_rq to determine if the vCPU is being preempted
(voluntarily or not, doesn't matter), and so waiting until some later point in
time to call __block_task() causes KVM to think the task was preempted, when in
reality it was not.

  static void kvm_sched_out(struct preempt_notifier *pn,
 			  struct task_struct *next)
  {
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->scheduled_out, true);

	if (current->on_rq && vcpu->wants_to_run) {  <================
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
  }

KVM uses vcpu->preempted for a variety of things, but the most visibly problematic
use is waking a vCPU from (virtual) HLT via posted interrupt wakeup.  When a vCPU
HLTs, KVM ultimately calls schedule() to schedule out the vCPU until it receives
a wake event.

When a device or another vCPU can post an interrupt as a wake event, KVM mucks
with the blocking vCPU's posted interrupt descriptor so that posted interrupts
that should be wake events get delivered on a dedicated host IRQ vector, so that
KVM can kick and wake the target vCPU.

But when vcpu->preempted is true, KVM suppresses posted interrupt notifications,
knowing that the vCPU will be scheduled back in.  Because a vCPU (task) can be
preempted while KVM is emulating HLT, KVM keys off vcpu->preempted to set PID.SN,
and doesn't exempt the blocking case.  In short, KVM uses vcpu->preempted, i.e.
current->on_rq, to differentiate between the vCPU getting preempted and KVM
executing schedule().

As a result, the false positive for vcpu->preempted causes KVM to suppress posted
interrupt notifications and the target vCPU never gets its wake event.
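
In other words, the classification KVM needs looks roughly like this
(hypothetical helper, shown only to make the distinction explicit -- before
this commit, p->on_rq alone was enough):

	/*
	 * Sketch of the distinction kvm_sched_out() wants to draw: a task
	 * that blocked via schedule() can now still be p->on_rq with
	 * se.sched_delayed set, and must be treated as blocked, not
	 * preempted.  Hypothetical helper name.
	 */
	static inline bool vcpu_task_was_preempted(struct task_struct *p)
	{
		return p->on_rq && !p->se.sched_delayed;
	}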


Peter,

Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
was before this commit.

@@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 
        WRITE_ONCE(vcpu->scheduled_out, true);
 
-       if (current->on_rq && vcpu->wants_to_run) {
+       if (se_runnable(&current->se) && vcpu->wants_to_run) {
                WRITE_ONCE(vcpu->preempted, true);
                WRITE_ONCE(vcpu->ready, true);
        }
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 2 months ago
On Wed, Oct 09, 2024 at 07:49:54PM -0700, Sean Christopherson wrote:

> TL;DR: Code that checks task_struct.on_rq may be broken by this commit.

Correct, and while I did look at quite a few, I did miss KVM used it,
damn.

> Peter,
> 
> Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> was before this commit.
> 
> @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
>  
>         WRITE_ONCE(vcpu->scheduled_out, true);
>  
> -       if (current->on_rq && vcpu->wants_to_run) {
> +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
>                 WRITE_ONCE(vcpu->preempted, true);
>                 WRITE_ONCE(vcpu->ready, true);
>         }

se_runnable() isn't quite right, but yes, a helper along those lines is
probably best. Let me try and grep more to see if there are others I
missed as well :/
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 2 months ago
On Thu, Oct 10, 2024 at 10:19:40AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 09, 2024 at 07:49:54PM -0700, Sean Christopherson wrote:
> 
> > TL;DR: Code that checks task_struct.on_rq may be broken by this commit.
> 
> Correct, and while I did look at quite a few, I did miss KVM used it,
> damn.
> 
> > Peter,
> > 
> > Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> > but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> > posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> > was before this commit.
> > 
> > @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
> >  
> >         WRITE_ONCE(vcpu->scheduled_out, true);
> >  
> > -       if (current->on_rq && vcpu->wants_to_run) {
> > +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
> >                 WRITE_ONCE(vcpu->preempted, true);
> >                 WRITE_ONCE(vcpu->ready, true);
> >         }
> 
> se_runnable() isn't quite right, but yes, a helper along those lines is
> probably best. Let me try and grep more to see if there's others I
> missed as well :/

How's the below? I remember looking at the freezer thing before and
deciding it isn't a correctness thing, but given I added the helper, I
changed it anyway. I've added a bunch of comments and the perf thing is
similar to KVM, it wants to know about preemptions so that had to change
too.

---
 include/linux/sched.h         |  5 +++++
 kernel/events/core.c          |  2 +-
 kernel/freezer.c              |  7 ++++++-
 kernel/rcu/tasks.h            |  9 +++++++++
 kernel/sched/core.c           | 12 +++++++++---
 kernel/time/tick-sched.c      |  5 +++++
 kernel/trace/trace_selftest.c |  2 +-
 virt/kvm/kvm_main.c           |  2 +-
 8 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0053f0664847..2b1f454e4575 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2134,6 +2134,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+static inline bool task_is_runnable(struct task_struct *p)
+{
+	return p->on_rq && !p->se.sched_delayed;
+}
+
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4287cb..cdd09769e6c5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
-	if (!sched_in && task->on_rq) {
+	if (!sched_in && task_is_runnable(task)) {
 		switch_event.event_id.header.misc |=
 				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
 	}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 44bbd7dbd2c8..8d530d0949ff 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 {
 	unsigned int state = READ_ONCE(p->__state);
 
-	if (p->on_rq)
+	/*
+	 * Allow freezing the sched_delayed tasks; they will not execute until
+	 * ttwu() fixes them up, so it is safe to swap their state now, instead
+	 * of waiting for them to get fully dequeued.
+	 */
+	if (task_is_runnable(p))
 		return 0;
 
 	if (p != current && task_curr(p))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 6333f4ccf024..4d7ee95df06e 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -985,6 +985,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
 	if (!READ_ONCE(t->on_rq))
 		return false;
 
+	/*
+	 * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+	 * since it is a spurious state (it will transition into the
+	 * traditional blocked state or get woken up without outside
+	 * dependencies), not considering it such should only affect timing.
+	 *
+	 * Be conservative for now and not include it.
+	 */
+
 	/*
 	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
 	 * quiescent states. But CPU boot code performed by the idle task
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0bacc5cd3693..be5c04eb5ba0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  *   ON_RQ_MIGRATING state is used for migration without holding both
  *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
  *
+ *   Additionally it is possible to be ->on_rq but still be considered not
+ *   runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ *   but will be dequeued as soon as they get picked again. See the
+ *   task_is_runnable() helper.
+ *
  * p->on_cpu <- { 0, 1 }:
  *
  *   is set by prepare_task() and cleared by finish_task() such that it will be
@@ -4358,9 +4363,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
  * @arg: Argument to function.
  *
  * Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it.  This function can use ->on_rq and task_curr()
- * to work out what the state is, if required.  Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it.  This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required.  Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
  *
  * Returns:
  *   Whatever @func returns
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..59efa14ce185 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -435,6 +435,11 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 	 *   tick_nohz_task_switch()
 	 *     LOAD p->tick_dep_mask
 	 */
+	// XXX given a task picks up the dependency on schedule(), should we
+	// only care about tasks that are currently on the CPU instead of all
+	// that are on the runqueue?
+	//
+	// That is, does this want to be: task_on_cpu() / task_curr()?
 	if (!sched_task_on_rq(tsk))
 		return;
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd7e778..1469dd8075fa 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	/* reset the max latency */
 	tr->max_latency = 0;
 
-	while (p->on_rq) {
+	while (task_is_runnable(p)) {
 		/*
 		 * Sleep to make sure the -deadline thread is asleep too.
 		 * On virtual machines we can't rely on timings,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 05cbb2548d99..0c666f1870af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6387,7 +6387,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 
 	WRITE_ONCE(vcpu->scheduled_out, true);
 
-	if (current->on_rq && vcpu->wants_to_run) {
+	if (task_is_runnable(current) && vcpu->wants_to_run) {
 		WRITE_ONCE(vcpu->preempted, true);
 		WRITE_ONCE(vcpu->ready, true);
 	}
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Sean Christopherson 1 year, 2 months ago
On Thu, Oct 10, 2024, Peter Zijlstra wrote:
> On Thu, Oct 10, 2024 at 10:19:40AM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 09, 2024 at 07:49:54PM -0700, Sean Christopherson wrote:
> > 
> > > TL;DR: Code that checks task_struct.on_rq may be broken by this commit.
> > 
> > Correct, and while I did look at quite a few, I did miss KVM used it,
> > damn.
> > 
> > > Peter,
> > > 
> > > Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> > > but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> > > posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> > > was before this commit.
> > > 
> > > @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
> > >  
> > >         WRITE_ONCE(vcpu->scheduled_out, true);
> > >  
> > > -       if (current->on_rq && vcpu->wants_to_run) {
> > > +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
> > >                 WRITE_ONCE(vcpu->preempted, true);
> > >                 WRITE_ONCE(vcpu->ready, true);
> > >         }
> > 
> > se_runnable() isn't quite right, but yes, a helper along those lines is
> > probably best. Let me try and grep more to see if there's others I
> > missed as well :/
> 
> How's the below? I remember looking at the freezer thing before and
> deciding it isn't a correctness thing, but given I added the helper, I
> changed it anyway. I've added a bunch of comments and the perf thing is
> similar to KVM, it wants to know about preemptions so that had to change
> too.

Fixes KVM's woes!  Thanks!
[tip: sched/urgent] sched/fair: Fix external p->on_rq users
Posted by tip-bot2 for Peter Zijlstra 1 year, 2 months ago
The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     cd9626e9ebc77edec33023fe95dab4b04ffc819d
Gitweb:        https://git.kernel.org/tip/cd9626e9ebc77edec33023fe95dab4b04ffc819d
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 10 Oct 2024 11:38:10 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 14 Oct 2024 09:14:35 +02:00

sched/fair: Fix external p->on_rq users

Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement
delayed dequeue") KVM's preemption notifiers have started
mis-classifying preemption vs blocking.

Notably p->on_rq is no longer sufficient to determine if a task is
runnable or blocked -- the aforementioned commit introduces tasks that
remain on the runqueue even though they will not run again, and
should be considered blocked for many cases.

Add the task_is_runnable() helper to classify things and audit all
external users of the p->on_rq state. Also add a few comments.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Reported-by: Sean Christopherson <seanjc@google.com>
Tested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net
---
 include/linux/sched.h         |  5 +++++
 kernel/events/core.c          |  2 +-
 kernel/freezer.c              |  7 ++++++-
 kernel/rcu/tasks.h            |  9 +++++++++
 kernel/sched/core.c           | 12 +++++++++---
 kernel/time/tick-sched.c      |  6 ++++++
 kernel/trace/trace_selftest.c |  2 +-
 virt/kvm/kvm_main.c           |  2 +-
 8 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6ee425..8a9517e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+static inline bool task_is_runnable(struct task_struct *p)
+{
+	return p->on_rq && !p->se.sched_delayed;
+}
+
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4..cdd0976 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
-	if (!sched_in && task->on_rq) {
+	if (!sched_in && task_is_runnable(task)) {
 		switch_event.event_id.header.misc |=
 				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
 	}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 44bbd7d..8d530d0 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 {
 	unsigned int state = READ_ONCE(p->__state);
 
-	if (p->on_rq)
+	/*
+	 * Allow freezing the sched_delayed tasks; they will not execute until
+	 * ttwu() fixes them up, so it is safe to swap their state now, instead
+	 * of waiting for them to get fully dequeued.
+	 */
+	if (task_is_runnable(p))
 		return 0;
 
 	if (p != current && task_curr(p))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 6333f4c..4d7ee95 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
 		return false;
 
 	/*
+	 * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+	 * since it is a spurious state (it will transition into the
+	 * traditional blocked state or get woken up without outside
+	 * dependencies), not considering it such should only affect timing.
+	 *
+	 * Be conservative for now and not include it.
+	 */
+
+	/*
 	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
 	 * quiescent states. But CPU boot code performed by the idle task
 	 * isn't a quiescent state.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71232f8..7db711b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  *   ON_RQ_MIGRATING state is used for migration without holding both
  *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
  *
+ *   Additionally it is possible to be ->on_rq but still be considered not
+ *   runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ *   but will be dequeued as soon as they get picked again. See the
+ *   task_is_runnable() helper.
+ *
  * p->on_cpu <- { 0, 1 }:
  *
  *   is set by prepare_task() and cleared by finish_task() such that it will be
@@ -4317,9 +4322,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
  * @arg: Argument to function.
  *
  * Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it.  This function can use ->on_rq and task_curr()
- * to work out what the state is, if required.  Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it.  This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required.  Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
  *
  * Returns:
  *   Whatever @func returns
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184..f203f00 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 	 *   smp_mb__after_spin_lock()
 	 *   tick_nohz_task_switch()
 	 *     LOAD p->tick_dep_mask
+	 *
+	 * XXX given a task picks up the dependency on schedule(), should we
+	 * only care about tasks that are currently on the CPU instead of all
+	 * that are on the runqueue?
+	 *
+	 * That is, does this want to be: task_on_cpu() / task_curr()?
 	 */
 	if (!sched_task_on_rq(tsk))
 		return;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd..1469dd8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	/* reset the max latency */
 	tr->max_latency = 0;
 
-	while (p->on_rq) {
+	while (task_is_runnable(p)) {
 		/*
 		 * Sleep to make sure the -deadline thread is asleep too.
 		 * On virtual machines we can't rely on timings,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 05cbb25..0c666f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6387,7 +6387,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 
 	WRITE_ONCE(vcpu->scheduled_out, true);
 
-	if (current->on_rq && vcpu->wants_to_run) {
+	if (task_is_runnable(current) && vcpu->wants_to_run) {
 		WRITE_ONCE(vcpu->preempted, true);
 		WRITE_ONCE(vcpu->ready, true);
 	}
[tip: sched/urgent] sched: Fix external p->on_rq users
Posted by tip-bot2 for Peter Zijlstra 1 year, 2 months ago
The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     1cc2f68c016ad3ac8b3a0495797dd61e19a10025
Gitweb:        https://git.kernel.org/tip/1cc2f68c016ad3ac8b3a0495797dd61e19a10025
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 10 Oct 2024 11:38:10 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 11 Oct 2024 10:49:33 +02:00

sched: Fix external p->on_rq users

Sean noted that ever since commit 152e11f6df29 ("sched/fair: Implement
delayed dequeue") KVM's preemption notifiers have started
mis-classifying preemption vs blocking.

Notably p->on_rq is no longer sufficient to determine if a task is
runnable or blocked -- the aforementioned commit introduces tasks that
remain on the runqueue even though they will not run again, and
should be considered blocked for many cases.

Add the task_is_runnable() helper to classify things and audit all
external users of the p->on_rq state. Also add a few comments.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Reported-by: Sean Christopherson <seanjc@google.com>
Tested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241010091843.GK33184@noisy.programming.kicks-ass.net
---
 include/linux/sched.h         |  5 +++++
 kernel/events/core.c          |  2 +-
 kernel/freezer.c              |  7 ++++++-
 kernel/rcu/tasks.h            |  9 +++++++++
 kernel/sched/core.c           | 12 +++++++++---
 kernel/time/tick-sched.c      |  5 +++++
 kernel/trace/trace_selftest.c |  2 +-
 virt/kvm/kvm_main.c           |  2 +-
 8 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6ee425..8a9517e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+static inline bool task_is_runnable(struct task_struct *p)
+{
+	return p->on_rq && !p->se.sched_delayed;
+}
+
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4..cdd0976 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
-	if (!sched_in && task->on_rq) {
+	if (!sched_in && task_is_runnable(task)) {
 		switch_event.event_id.header.misc |=
 				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
 	}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 44bbd7d..8d530d0 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 {
 	unsigned int state = READ_ONCE(p->__state);
 
-	if (p->on_rq)
+	/*
+	 * Allow freezing the sched_delayed tasks; they will not execute until
+	 * ttwu() fixes them up, so it is safe to swap their state now, instead
+	 * of waiting for them to get fully dequeued.
+	 */
+	if (task_is_runnable(p))
 		return 0;
 
 	if (p != current && task_curr(p))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 6333f4c..4d7ee95 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
 		return false;
 
 	/*
+	 * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+	 * since it is a spurious state (it will transition into the
+	 * traditional blocked state or get woken up without outside
+	 * dependencies), not considering it such should only affect timing.
+	 *
+	 * Be conservative for now and not include it.
+	 */
+
+	/*
 	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
 	 * quiescent states. But CPU boot code performed by the idle task
 	 * isn't a quiescent state.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71232f8..7db711b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  *   ON_RQ_MIGRATING state is used for migration without holding both
  *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
  *
+ *   Additionally it is possible to be ->on_rq but still be considered not
+ *   runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ *   but will be dequeued as soon as they get picked again. See the
+ *   task_is_runnable() helper.
+ *
  * p->on_cpu <- { 0, 1 }:
  *
  *   is set by prepare_task() and cleared by finish_task() such that it will be
@@ -4317,9 +4322,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
  * @arg: Argument to function.
  *
  * Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it.  This function can use ->on_rq and task_curr()
- * to work out what the state is, if required.  Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it.  This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required.  Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
  *
  * Returns:
  *   Whatever @func returns
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184..59efa14 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -435,6 +435,11 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 	 *   tick_nohz_task_switch()
 	 *     LOAD p->tick_dep_mask
 	 */
+	// XXX given a task picks up the dependency on schedule(), should we
+	// only care about tasks that are currently on the CPU instead of all
+	// that are on the runqueue?
+	//
+	// That is, does this want to be: task_on_cpu() / task_curr()?
 	if (!sched_task_on_rq(tsk))
 		return;
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd..1469dd8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	/* reset the max latency */
 	tr->max_latency = 0;
 
-	while (p->on_rq) {
+	while (task_is_runnable(p)) {
 		/*
 		 * Sleep to make sure the -deadline thread is asleep too.
 		 * On virtual machines we can't rely on timings,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 05cbb25..0c666f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6387,7 +6387,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 
 	WRITE_ONCE(vcpu->scheduled_out, true);
 
-	if (current->on_rq && vcpu->wants_to_run) {
+	if (task_is_runnable(current) && vcpu->wants_to_run) {
 		WRITE_ONCE(vcpu->preempted, true);
 		WRITE_ONCE(vcpu->ready, true);
 	}
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 2 months ago
On Wed, 2024-10-09 at 19:49 -0700, Sean Christopherson wrote:
>
> Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> was before this commit.
>
> @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
>  
>         WRITE_ONCE(vcpu->scheduled_out, true);
>  
> -       if (current->on_rq && vcpu->wants_to_run) {
> +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
>                 WRITE_ONCE(vcpu->preempted, true);
>                 WRITE_ONCE(vcpu->ready, true);
>         }

Why is that deemed "obviously not appropriate"?  ->on_rq in and of
itself meaning only "on rq" doesn't seem like a bad thing.

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Sean Christopherson 1 year, 2 months ago
On Thu, Oct 10, 2024, Mike Galbraith wrote:
> On Wed, 2024-10-09 at 19:49 -0700, Sean Christopherson wrote:
> >
> > Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> > but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> > posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> > was before this commit.
> >
> > @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
> >  
> >         WRITE_ONCE(vcpu->scheduled_out, true);
> >  
> > -       if (current->on_rq && vcpu->wants_to_run) {
> > +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
> >                 WRITE_ONCE(vcpu->preempted, true);
> >                 WRITE_ONCE(vcpu->ready, true);
> >         }
> 
> Why is that deemed "obviously not appropriate"?  ->on_rq in and of
> itself meaning only "on rq" doesn't seem like a bad thing.

Doh, my wording was unclear.  I didn't mean the logic was inappropriate, I meant
that KVM shouldn't be poking into an internal sched/ helper.
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Mike Galbraith 1 year, 2 months ago
On Thu, 2024-10-10 at 09:18 -0700, Sean Christopherson wrote:
> On Thu, Oct 10, 2024, Mike Galbraith wrote:
> > On Wed, 2024-10-09 at 19:49 -0700, Sean Christopherson wrote:
> > >
> > > Any thoughts on how best to handle this?  The below hack-a-fix resolves the issue,
> > > but it's obviously not appropriate.  KVM uses vcpu->preempted for more than just
> > > posted interrupts, so KVM needs equivalent functionality to current->on-rq as it
> > > was before this commit.
> > >
> > > @@ -6387,7 +6390,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
> > >  
> > >         WRITE_ONCE(vcpu->scheduled_out, true);
> > >  
> > > -       if (current->on_rq && vcpu->wants_to_run) {
> > > +       if (se_runnable(&current->se) && vcpu->wants_to_run) {
> > >                 WRITE_ONCE(vcpu->preempted, true);
> > >                 WRITE_ONCE(vcpu->ready, true);
> > >         }
> >
> > Why is that deemed "obviously not appropriate"?  ->on_rq in and of
> > itself meaning only "on rq" doesn't seem like a bad thing.
>
> Doh, my wording was unclear.  I didn't mean the logic was inappropriate, I meant
> that KVM shouldn't be poking into an internal sched/ helper.

Ah, confusion all better.  (yeah, swiping others' toys is naughty)

	-Mike
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Luis Machado 1 year, 3 months ago
Hi Peter,

On 7/27/24 11:27, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
> 
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
> 
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
> 
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
> 
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/deadline.c |    1 
>  kernel/sched/fair.c     |   82 ++++++++++++++++++++++++++++++++++++++++++------
>  kernel/sched/features.h |    9 +++++
>  3 files changed, 81 insertions(+), 11 deletions(-)
> 
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_t
>  		else
>  			p = dl_se->server_pick_next(dl_se);
>  		if (!p) {
> -			WARN_ON_ONCE(1);
>  			dl_se->dl_yielded = 1;
>  			update_curr_dl_se(rq, dl_se, 0);
>  			goto again;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5379,20 +5379,44 @@ static void clear_buddies(struct cfs_rq
>  
>  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>  
> -static void
> +static bool
>  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
> -	int action = UPDATE_TG;
> +	if (flags & DEQUEUE_DELAYED) {
> +		/*
> +		 * DEQUEUE_DELAYED is typically called from pick_next_entity()
> +		 * at which point we've already done update_curr() and do not
> +		 * want to do so again.
> +		 */
> +		SCHED_WARN_ON(!se->sched_delayed);
> +		se->sched_delayed = 0;
> +	} else {
> +		bool sleep = flags & DEQUEUE_SLEEP;
> +
> +		/*
> +		 * DELAY_DEQUEUE relies on spurious wakeups, special task
> +		 * states must not suffer spurious wakeups, excempt them.
> +		 */
> +		if (flags & DEQUEUE_SPECIAL)
> +			sleep = false;
> +
> +		SCHED_WARN_ON(sleep && se->sched_delayed);
> +		update_curr(cfs_rq);
>  
> +		if (sched_feat(DELAY_DEQUEUE) && sleep &&
> +		    !entity_eligible(cfs_rq, se)) {
> +			if (cfs_rq->next == se)
> +				cfs_rq->next = NULL;
> +			se->sched_delayed = 1;
> +			return false;
> +		}
> +	}
> +
> +	int action = UPDATE_TG;
>  	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
>  		action |= DO_DETACH;
>  
>  	/*
> -	 * Update run-time statistics of the 'current'.
> -	 */
> -	update_curr(cfs_rq);
> -
> -	/*
>  	 * When dequeuing a sched_entity, we must:
>  	 *   - Update loads to have both entity and cfs_rq synced with now.
>  	 *   - For group_entity, update its runnable_weight to reflect the new
> @@ -5430,6 +5454,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>  
>  	if (cfs_rq->nr_running == 0)
>  		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +
> +	return true;
>  }
>  
>  static void
> @@ -5828,11 +5854,21 @@ static bool throttle_cfs_rq(struct cfs_r
>  	idle_task_delta = cfs_rq->idle_h_nr_running;
>  	for_each_sched_entity(se) {
>  		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> +		int flags;
> +
>  		/* throttled entity or throttle-on-deactivate */
>  		if (!se->on_rq)
>  			goto done;
>  
> -		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
> +		/*
> +		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
> +		 * This avoids teaching dequeue_entities() about throttled
> +		 * entities and keeps things relatively simple.
> +		 */
> +		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> +		if (se->sched_delayed)
> +			flags |= DEQUEUE_DELAYED;
> +		dequeue_entity(qcfs_rq, se, flags);
>  
>  		if (cfs_rq_is_idle(group_cfs_rq(se)))
>  			idle_task_delta = cfs_rq->h_nr_running;
> @@ -6918,6 +6954,7 @@ static int dequeue_entities(struct rq *r
>  	bool was_sched_idle = sched_idle_rq(rq);
>  	int rq_h_nr_running = rq->cfs.h_nr_running;
>  	bool task_sleep = flags & DEQUEUE_SLEEP;
> +	bool task_delayed = flags & DEQUEUE_DELAYED;
>  	struct task_struct *p = NULL;
>  	int idle_h_nr_running = 0;
>  	int h_nr_running = 0;
> @@ -6931,7 +6968,13 @@ static int dequeue_entities(struct rq *r
>  
>  	for_each_sched_entity(se) {
>  		cfs_rq = cfs_rq_of(se);
> -		dequeue_entity(cfs_rq, se, flags);
> +
> +		if (!dequeue_entity(cfs_rq, se, flags)) {
> +			if (p && &p->se == se)
> +				return -1;
> +
> +			break;
> +		}
>  
>  		cfs_rq->h_nr_running -= h_nr_running;
>  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> @@ -6956,6 +6999,7 @@ static int dequeue_entities(struct rq *r
>  			break;
>  		}
>  		flags |= DEQUEUE_SLEEP;
> +		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
>  	}
>  
>  	for_each_sched_entity(se) {
> @@ -6985,6 +7029,17 @@ static int dequeue_entities(struct rq *r
>  	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
>  		rq->next_balance = jiffies;
>  
> +	if (p && task_delayed) {
> +		SCHED_WARN_ON(!task_sleep);
> +		SCHED_WARN_ON(p->on_rq != 1);
> +
> +		/* Fix-up what dequeue_task_fair() skipped */
> +		hrtick_update(rq);
> +
> +		/* Fix-up what block_task() skipped. */
> +		__block_task(rq, p);
> +	}
> +
>  	return 1;
>  }
>  /*
> @@ -6996,8 +7051,10 @@ static bool dequeue_task_fair(struct rq
>  {
>  	util_est_dequeue(&rq->cfs, p);
>  
> -	if (dequeue_entities(rq, &p->se, flags) < 0)
> +	if (dequeue_entities(rq, &p->se, flags) < 0) {
> +		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
>  		return false;
> +	}
>  
>  	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>  	hrtick_update(rq);
> @@ -12973,6 +13030,11 @@ static void set_next_task_fair(struct rq
>  		/* ensure bandwidth has been allocated on our new cfs_rq */
>  		account_cfs_rq_runtime(cfs_rq, 0);
>  	}
> +
> +	if (!first)
> +		return;
> +
> +	SCHED_WARN_ON(se->sched_delayed);
>  }
>  
>  void init_cfs_rq(struct cfs_rq *cfs_rq)
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
>  SCHED_FEAT(CACHE_HOT_BUDDY, true)
>  
>  /*
> + * Delay dequeueing tasks until they get selected or woken.
> + *
> + * By delaying the dequeue for non-eligible tasks, they remain in the
> + * competition and can burn off their negative lag. When they get selected
> + * they'll have positive lag by definition.
> + */
> +SCHED_FEAT(DELAY_DEQUEUE, true)
> +
> +/*
>   * Allow wakeup-time preemption of the current task:
>   */
>  SCHED_FEAT(WAKEUP_PREEMPTION, true)
> 
> 
> 

Just a heads-up: I'm chasing some odd behavior on the big.LITTLE/Pixel 6 platform, where
I sometimes see runs with frequency spikes that persist for extended periods (multiple
seconds), in particular on the little cores, which leads to higher energy use.

I'm still trying to understand why that happens, but it looks like the utilization values
are sometimes stuck at high values. I just want to make sure the delayed dequeue changes
aren't interfering with the util calculations.

Unfortunately the benchmark is Android-specific, so it is hard to provide a reasonable
reproducer for Linux.
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Valentin Schneider 1 year, 4 months ago
On 27/07/24 12:27, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
>

Question from a lazy student who just caught up to the current state of
EEVDF...

IIUC this makes it so time spent sleeping increases an entity's lag, rather
than it being frozen & restored via the place_entity() magic.

So entities with negative lag get closer to their 0-lag point, after which
they can get picked & dequeued if still not runnable.

However, don't entities with positive lag get *further* away from their
0-lag point?
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Peter Zijlstra 1 year, 4 months ago
On Fri, Aug 02, 2024 at 04:39:08PM +0200, Valentin Schneider wrote:
> 
> On 27/07/24 12:27, Peter Zijlstra wrote:
> > Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> > noting that lag is fundamentally a temporal measure. It should not be
> > carried around indefinitely.
> >
> > OTOH it should also not be instantly discarded, doing so will allow a
> > task to game the system by purposefully (micro) sleeping at the end of
> > its time quantum.
> >
> > Since lag is intimately tied to the virtual time base, a wall-time
> > based decay is also insufficient, notably competition is required for
> > any of this to make sense.
> >
> > Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> > competing until they are eligible.
> >
> > Strictly speaking, we only care about keeping them until the 0-lag
> > point, but that is a difficult proposition, instead carry them around
> > until they get picked again, and dequeue them at that point.
> >
> 
> Question from a lazy student who just caught up to the current state of
> EEVDF...
> 
> IIUC this makes it so time spent sleeping increases an entity's lag, rather
> than it being frozen & restored via the place_entity() magic.
> 
> So entities with negative lag get closer to their 0-lag point, after which
> they can get picked & dequeued if still not runnable.

Right.

> However, don't entities with positive lag get *further* away from their
> 0-lag point?

Which is why we only delay the dequeue when !eligible, IOW when lag is
negative.

The next patch additionally truncates lag to 0 (for delayed entities),
so they can never earn extra time.
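
Put differently, with the simplified definition lag_i = V - v_i (V being the
load-weighted average vruntime of the queue), "eligible" means lag_i >= 0, and
DELAY_DEQUEUE only holds back entities whose lag is still negative.  A rough
sketch of that decision, using illustrative names and plain integer types
rather than the kernel's entity_eligible() fixed-point arithmetic:

	/* lag = V - v_i; eligible means the entity is not behind on service. */
	static inline int eligible(long long V, long long v_i)
	{
		return V - v_i >= 0;
	}

	/*
	 * Only ordinary sleeps of non-eligible (negative lag) entities are
	 * delayed; eligible sleepers are dequeued immediately and keep their
	 * lag for the next placement, as before this patch.
	 */
	static inline int delay_dequeue(long long V, long long v_i, int sleeping)
	{
		return sleeping && !eligible(V, v_i);
	}

So a positive-lag entity is never kept around by this feature, and a
negative-lag one drifts toward its 0-lag point while it waits, with the next
patch clamping any overshoot back to 0.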
Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
Posted by Valentin Schneider 1 year, 4 months ago
On 02/08/24 16:59, Peter Zijlstra wrote:
> On Fri, Aug 02, 2024 at 04:39:08PM +0200, Valentin Schneider wrote:
>>
>> On 27/07/24 12:27, Peter Zijlstra wrote:
>> > Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
>> > noting that lag is fundamentally a temporal measure. It should not be
>> > carried around indefinitely.
>> >
>> > OTOH it should also not be instantly discarded, doing so will allow a
>> > task to game the system by purposefully (micro) sleeping at the end of
>> > its time quantum.
>> >
>> > Since lag is intimately tied to the virtual time base, a wall-time
>> > based decay is also insufficient, notably competition is required for
>> > any of this to make sense.
>> >
>> > Instead, delay the dequeue and keep the 'tasks' on the runqueue,
>> > competing until they are eligible.
>> >
>> > Strictly speaking, we only care about keeping them until the 0-lag
>> > point, but that is a difficult proposition, instead carry them around
>> > until they get picked again, and dequeue them at that point.
>> >
>>
>> Question from a lazy student who just caught up to the current state of
>> EEVDF...
>>
>> IIUC this makes it so time spent sleeping increases an entity's lag, rather
>> than it being frozen & restored via the place_entity() magic.
>>
>> So entities with negative lag get closer to their 0-lag point, after which
>> they can get picked & dequeued if still not runnable.
>
> Right.
>
>> However, don't entities with positive lag get *further* away from their
>> 0-lag point?
>
> Which is why we only delay the dequeue when !eligible, IOW when lag is
> negative.
>
> The next patch additionally truncates lag to 0 (for delayed entities),
> so they can never earn extra time.

Gotcha, thanks for pointing that out, I think I'm (slowly) getting it :D
[tip: sched/core] sched/fair: Implement delayed dequeue
Posted by tip-bot2 for Peter Zijlstra 1 year, 3 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     152e11f6df293e816a6a37c69757033cdc72667d
Gitweb:        https://git.kernel.org/tip/152e11f6df293e816a6a37c69757033cdc72667d
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 23 May 2024 12:25:32 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 17 Aug 2024 11:06:44 +02:00

sched/fair: Implement delayed dequeue

Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
noting that lag is fundamentally a temporal measure. It should not be
carried around indefinitely.

OTOH it should also not be instantly discarded, doing so will allow a
task to game the system by purposefully (micro) sleeping at the end of
its time quantum.

Since lag is intimately tied to the virtual time base, a wall-time
based decay is also insufficient, notably competition is required for
any of this to make sense.

Instead, delay the dequeue and keep the 'tasks' on the runqueue,
competing until they are eligible.

Strictly speaking, we only care about keeping them until the 0-lag
point, but that is a difficult proposition, instead carry them around
until they get picked again, and dequeue them at that point.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.226163742@infradead.org
---
 kernel/sched/deadline.c |  1 +-
 kernel/sched/fair.c     | 80 +++++++++++++++++++++++++++++++++++-----
 kernel/sched/features.h |  9 +++++-
 3 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index bbaeace..0f2df67 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2428,7 +2428,6 @@ again:
 		else
 			p = dl_se->server_pick_next(dl_se);
 		if (!p) {
-			WARN_ON_ONCE(1);
 			dl_se->dl_yielded = 1;
 			update_curr_dl_se(rq, dl_se, 0);
 			goto again;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25b14df..da5065a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,20 +5379,39 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
+static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	int action = UPDATE_TG;
+	update_curr(cfs_rq);
+
+	if (flags & DEQUEUE_DELAYED) {
+		SCHED_WARN_ON(!se->sched_delayed);
+	} else {
+		bool sleep = flags & DEQUEUE_SLEEP;
 
+		/*
+		 * DELAY_DEQUEUE relies on spurious wakeups, special task
+		 * states must not suffer spurious wakeups, excempt them.
+		 */
+		if (flags & DEQUEUE_SPECIAL)
+			sleep = false;
+
+		SCHED_WARN_ON(sleep && se->sched_delayed);
+
+		if (sched_feat(DELAY_DEQUEUE) && sleep &&
+		    !entity_eligible(cfs_rq, se)) {
+			if (cfs_rq->next == se)
+				cfs_rq->next = NULL;
+			se->sched_delayed = 1;
+			return false;
+		}
+	}
+
+	int action = UPDATE_TG;
 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
 		action |= DO_DETACH;
 
 	/*
-	 * Update run-time statistics of the 'current'.
-	 */
-	update_curr(cfs_rq);
-
-	/*
 	 * When dequeuing a sched_entity, we must:
 	 *   - Update loads to have both entity and cfs_rq synced with now.
 	 *   - For group_entity, update its runnable_weight to reflect the new
@@ -5428,8 +5447,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
 		update_min_vruntime(cfs_rq);
 
+	if (flags & DEQUEUE_DELAYED)
+		se->sched_delayed = 0;
+
 	if (cfs_rq->nr_running == 0)
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+	return true;
 }
 
 static void
@@ -5828,11 +5852,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	idle_task_delta = cfs_rq->idle_h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		int flags;
+
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			goto done;
 
-		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		/*
+		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
+		 * This avoids teaching dequeue_entities() about throttled
+		 * entities and keeps things relatively simple.
+		 */
+		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+		if (se->sched_delayed)
+			flags |= DEQUEUE_DELAYED;
+		dequeue_entity(qcfs_rq, se, flags);
 
 		if (cfs_rq_is_idle(group_cfs_rq(se)))
 			idle_task_delta = cfs_rq->h_nr_running;
@@ -6918,6 +6952,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	bool was_sched_idle = sched_idle_rq(rq);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
+	bool task_delayed = flags & DEQUEUE_DELAYED;
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
@@ -6931,7 +6966,13 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		dequeue_entity(cfs_rq, se, flags);
+
+		if (!dequeue_entity(cfs_rq, se, flags)) {
+			if (p && &p->se == se)
+				return -1;
+
+			break;
+		}
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -6956,6 +6997,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
+		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
 	}
 
 	for_each_sched_entity(se) {
@@ -6985,6 +7027,17 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
+	if (p && task_delayed) {
+		SCHED_WARN_ON(!task_sleep);
+		SCHED_WARN_ON(p->on_rq != 1);
+
+		/* Fix-up what dequeue_task_fair() skipped */
+		hrtick_update(rq);
+
+		/* Fix-up what block_task() skipped. */
+		__block_task(rq, p);
+	}
+
 	return 1;
 }
 
@@ -6997,8 +7050,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	util_est_dequeue(&rq->cfs, p);
 
-	if (dequeue_entities(rq, &p->se, flags) < 0)
+	if (dequeue_entities(rq, &p->se, flags) < 0) {
+		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
 		return false;
+	}
 
 	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
 	hrtick_update(rq);
@@ -12971,6 +13026,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
 	}
+
+	if (!first)
+		return;
+
+	SCHED_WARN_ON(se->sched_delayed);
 }
 
 void init_cfs_rq(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 97fb2d4..1feaa7b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
 /*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+
+/*
  * Allow wakeup-time preemption of the current task:
  */
 SCHED_FEAT(WAKEUP_PREEMPTION, true)